diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4230565838180857, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00042305658381808567, + "grad_norm": 81.84548950195312, + "learning_rate": 0.0, + "loss": 812.9326, + "step": 1 + }, + { + "epoch": 0.0008461131676361713, + "grad_norm": 102.70423126220703, + "learning_rate": 2.8169014084507045e-08, + "loss": 774.5251, + "step": 2 + }, + { + "epoch": 0.001269169751454257, + "grad_norm": 128.83238220214844, + "learning_rate": 5.633802816901409e-08, + "loss": 733.8044, + "step": 3 + }, + { + "epoch": 0.0016922263352723427, + "grad_norm": 128.73910522460938, + "learning_rate": 8.450704225352113e-08, + "loss": 734.627, + "step": 4 + }, + { + "epoch": 0.0021152829190904283, + "grad_norm": 105.89934539794922, + "learning_rate": 1.1267605633802818e-07, + "loss": 775.4291, + "step": 5 + }, + { + "epoch": 0.002538339502908514, + "grad_norm": 85.26824951171875, + "learning_rate": 1.4084507042253522e-07, + "loss": 816.5836, + "step": 6 + }, + { + "epoch": 0.0029613960867265997, + "grad_norm": 98.91043853759766, + "learning_rate": 1.6901408450704225e-07, + "loss": 796.295, + "step": 7 + }, + { + "epoch": 0.0033844526705446853, + "grad_norm": 128.0021514892578, + "learning_rate": 1.9718309859154932e-07, + "loss": 736.9498, + "step": 8 + }, + { + "epoch": 0.003807509254362771, + "grad_norm": 128.68179321289062, + "learning_rate": 2.2535211267605636e-07, + "loss": 715.6018, + "step": 9 + }, + { + "epoch": 0.004230565838180857, + "grad_norm": 88.86766815185547, + "learning_rate": 2.535211267605634e-07, + "loss": 814.3213, + "step": 10 + }, + { + "epoch": 0.004653622421998942, + "grad_norm": 85.91423797607422, + "learning_rate": 2.8169014084507043e-07, + "loss": 813.2338, + "step": 11 + }, + { + "epoch": 0.005076679005817028, + "grad_norm": 133.02713012695312, + "learning_rate": 3.0985915492957747e-07, + "loss": 712.9101, + "step": 12 + }, + { + "epoch": 0.005499735589635114, + "grad_norm": 85.18633270263672, + "learning_rate": 3.380281690140845e-07, + "loss": 813.5604, + "step": 13 + }, + { + "epoch": 0.005922792173453199, + "grad_norm": 124.5801773071289, + "learning_rate": 3.661971830985916e-07, + "loss": 734.9832, + "step": 14 + }, + { + "epoch": 0.006345848757271285, + "grad_norm": 76.61686706542969, + "learning_rate": 3.9436619718309864e-07, + "loss": 833.8665, + "step": 15 + }, + { + "epoch": 0.006768905341089371, + "grad_norm": 148.28494262695312, + "learning_rate": 4.225352112676057e-07, + "loss": 695.165, + "step": 16 + }, + { + "epoch": 0.007191961924907456, + "grad_norm": 96.79639434814453, + "learning_rate": 4.507042253521127e-07, + "loss": 795.7985, + "step": 17 + }, + { + "epoch": 0.007615018508725542, + "grad_norm": 88.83817291259766, + "learning_rate": 4.788732394366198e-07, + "loss": 816.8214, + "step": 18 + }, + { + "epoch": 0.008038075092543628, + "grad_norm": 112.4771728515625, + "learning_rate": 5.070422535211268e-07, + "loss": 753.9221, + "step": 19 + }, + { + "epoch": 0.008461131676361713, + "grad_norm": 104.54440307617188, + "learning_rate": 5.352112676056338e-07, + "loss": 776.382, + "step": 20 + }, + { + "epoch": 0.008884188260179799, + "grad_norm": 73.44041442871094, + "learning_rate": 5.633802816901409e-07, + "loss": 835.2264, + "step": 21 + }, + { + "epoch": 0.009307244843997885, + "grad_norm": 111.25773620605469, + "learning_rate": 5.915492957746479e-07, + "loss": 756.1508, + "step": 22 + }, + { + "epoch": 0.00973030142781597, + "grad_norm": 83.49649047851562, + "learning_rate": 6.197183098591549e-07, + "loss": 816.5864, + "step": 23 + }, + { + "epoch": 0.010153358011634056, + "grad_norm": 109.28431701660156, + "learning_rate": 6.47887323943662e-07, + "loss": 754.9388, + "step": 24 + }, + { + "epoch": 0.010576414595452142, + "grad_norm": 116.55452728271484, + "learning_rate": 6.76056338028169e-07, + "loss": 735.3099, + "step": 25 + }, + { + "epoch": 0.010999471179270227, + "grad_norm": 63.16034698486328, + "learning_rate": 7.042253521126762e-07, + "loss": 874.1632, + "step": 26 + }, + { + "epoch": 0.011422527763088313, + "grad_norm": 110.44268798828125, + "learning_rate": 7.323943661971832e-07, + "loss": 753.7839, + "step": 27 + }, + { + "epoch": 0.011845584346906399, + "grad_norm": 117.79016876220703, + "learning_rate": 7.605633802816901e-07, + "loss": 737.6011, + "step": 28 + }, + { + "epoch": 0.012268640930724484, + "grad_norm": 115.26844787597656, + "learning_rate": 7.887323943661973e-07, + "loss": 734.0729, + "step": 29 + }, + { + "epoch": 0.01269169751454257, + "grad_norm": 92.99131774902344, + "learning_rate": 8.169014084507043e-07, + "loss": 776.9102, + "step": 30 + }, + { + "epoch": 0.013114754098360656, + "grad_norm": 82.01249694824219, + "learning_rate": 8.450704225352114e-07, + "loss": 792.6701, + "step": 31 + }, + { + "epoch": 0.013537810682178741, + "grad_norm": 98.73196411132812, + "learning_rate": 8.732394366197183e-07, + "loss": 776.9266, + "step": 32 + }, + { + "epoch": 0.013960867265996827, + "grad_norm": 91.21691131591797, + "learning_rate": 9.014084507042254e-07, + "loss": 775.0609, + "step": 33 + }, + { + "epoch": 0.014383923849814913, + "grad_norm": 129.7372283935547, + "learning_rate": 9.295774647887325e-07, + "loss": 716.7698, + "step": 34 + }, + { + "epoch": 0.014806980433632998, + "grad_norm": 98.59812927246094, + "learning_rate": 9.577464788732395e-07, + "loss": 735.5928, + "step": 35 + }, + { + "epoch": 0.015230037017451084, + "grad_norm": 58.540382385253906, + "learning_rate": 9.859154929577465e-07, + "loss": 853.8718, + "step": 36 + }, + { + "epoch": 0.01565309360126917, + "grad_norm": 70.80773162841797, + "learning_rate": 1.0140845070422536e-06, + "loss": 788.3241, + "step": 37 + }, + { + "epoch": 0.016076150185087255, + "grad_norm": 72.62784576416016, + "learning_rate": 1.0422535211267606e-06, + "loss": 770.8654, + "step": 38 + }, + { + "epoch": 0.01649920676890534, + "grad_norm": 78.5606918334961, + "learning_rate": 1.0704225352112677e-06, + "loss": 755.0453, + "step": 39 + }, + { + "epoch": 0.016922263352723427, + "grad_norm": 67.18627166748047, + "learning_rate": 1.098591549295775e-06, + "loss": 790.9852, + "step": 40 + }, + { + "epoch": 0.017345319936541512, + "grad_norm": 58.59244155883789, + "learning_rate": 1.1267605633802817e-06, + "loss": 815.5032, + "step": 41 + }, + { + "epoch": 0.017768376520359598, + "grad_norm": 104.85185241699219, + "learning_rate": 1.1549295774647888e-06, + "loss": 673.9536, + "step": 42 + }, + { + "epoch": 0.018191433104177684, + "grad_norm": 60.07638168334961, + "learning_rate": 1.1830985915492958e-06, + "loss": 794.3215, + "step": 43 + }, + { + "epoch": 0.01861448968799577, + "grad_norm": 66.23779296875, + "learning_rate": 1.211267605633803e-06, + "loss": 777.0001, + "step": 44 + }, + { + "epoch": 0.019037546271813855, + "grad_norm": 76.81977844238281, + "learning_rate": 1.2394366197183099e-06, + "loss": 752.4107, + "step": 45 + }, + { + "epoch": 0.01946060285563194, + "grad_norm": 77.48674011230469, + "learning_rate": 1.267605633802817e-06, + "loss": 731.7759, + "step": 46 + }, + { + "epoch": 0.019883659439450026, + "grad_norm": 74.13082885742188, + "learning_rate": 1.295774647887324e-06, + "loss": 732.6713, + "step": 47 + }, + { + "epoch": 0.020306716023268112, + "grad_norm": 67.71472930908203, + "learning_rate": 1.323943661971831e-06, + "loss": 754.5653, + "step": 48 + }, + { + "epoch": 0.020729772607086198, + "grad_norm": 48.328861236572266, + "learning_rate": 1.352112676056338e-06, + "loss": 771.5323, + "step": 49 + }, + { + "epoch": 0.021152829190904283, + "grad_norm": 43.20874786376953, + "learning_rate": 1.3802816901408453e-06, + "loss": 773.3832, + "step": 50 + }, + { + "epoch": 0.02157588577472237, + "grad_norm": 36.09504318237305, + "learning_rate": 1.4084507042253523e-06, + "loss": 830.1559, + "step": 51 + }, + { + "epoch": 0.021998942358540455, + "grad_norm": 40.12434387207031, + "learning_rate": 1.4366197183098594e-06, + "loss": 773.9634, + "step": 52 + }, + { + "epoch": 0.02242199894235854, + "grad_norm": 31.96895980834961, + "learning_rate": 1.4647887323943664e-06, + "loss": 849.7255, + "step": 53 + }, + { + "epoch": 0.022845055526176626, + "grad_norm": 31.673309326171875, + "learning_rate": 1.4929577464788732e-06, + "loss": 850.11, + "step": 54 + }, + { + "epoch": 0.023268112109994712, + "grad_norm": 41.284149169921875, + "learning_rate": 1.5211267605633803e-06, + "loss": 752.267, + "step": 55 + }, + { + "epoch": 0.023691168693812797, + "grad_norm": 40.81191635131836, + "learning_rate": 1.5492957746478873e-06, + "loss": 749.9826, + "step": 56 + }, + { + "epoch": 0.024114225277630883, + "grad_norm": 46.4893913269043, + "learning_rate": 1.5774647887323946e-06, + "loss": 710.7681, + "step": 57 + }, + { + "epoch": 0.02453728186144897, + "grad_norm": 37.99464797973633, + "learning_rate": 1.6056338028169016e-06, + "loss": 747.9175, + "step": 58 + }, + { + "epoch": 0.024960338445267054, + "grad_norm": 27.126907348632812, + "learning_rate": 1.6338028169014086e-06, + "loss": 852.4036, + "step": 59 + }, + { + "epoch": 0.02538339502908514, + "grad_norm": 32.76125717163086, + "learning_rate": 1.6619718309859157e-06, + "loss": 770.4146, + "step": 60 + }, + { + "epoch": 0.025806451612903226, + "grad_norm": 26.590463638305664, + "learning_rate": 1.6901408450704227e-06, + "loss": 811.9281, + "step": 61 + }, + { + "epoch": 0.02622950819672131, + "grad_norm": 27.449390411376953, + "learning_rate": 1.7183098591549297e-06, + "loss": 789.7933, + "step": 62 + }, + { + "epoch": 0.026652564780539397, + "grad_norm": 29.129018783569336, + "learning_rate": 1.7464788732394366e-06, + "loss": 771.5277, + "step": 63 + }, + { + "epoch": 0.027075621364357483, + "grad_norm": 33.60978317260742, + "learning_rate": 1.774647887323944e-06, + "loss": 731.0164, + "step": 64 + }, + { + "epoch": 0.02749867794817557, + "grad_norm": 24.35285758972168, + "learning_rate": 1.8028169014084509e-06, + "loss": 792.4971, + "step": 65 + }, + { + "epoch": 0.027921734531993654, + "grad_norm": 29.933168411254883, + "learning_rate": 1.8309859154929579e-06, + "loss": 728.1086, + "step": 66 + }, + { + "epoch": 0.02834479111581174, + "grad_norm": 30.648229598999023, + "learning_rate": 1.859154929577465e-06, + "loss": 708.9996, + "step": 67 + }, + { + "epoch": 0.028767847699629825, + "grad_norm": 28.025320053100586, + "learning_rate": 1.887323943661972e-06, + "loss": 829.2945, + "step": 68 + }, + { + "epoch": 0.02919090428344791, + "grad_norm": 26.298429489135742, + "learning_rate": 1.915492957746479e-06, + "loss": 749.3297, + "step": 69 + }, + { + "epoch": 0.029613960867265997, + "grad_norm": 29.36769676208496, + "learning_rate": 1.943661971830986e-06, + "loss": 687.0703, + "step": 70 + }, + { + "epoch": 0.030037017451084082, + "grad_norm": 24.958955764770508, + "learning_rate": 1.971830985915493e-06, + "loss": 751.1055, + "step": 71 + }, + { + "epoch": 0.030460074034902168, + "grad_norm": 25.022201538085938, + "learning_rate": 2.0000000000000003e-06, + "loss": 826.5974, + "step": 72 + }, + { + "epoch": 0.030883130618720254, + "grad_norm": 23.380407333374023, + "learning_rate": 2.028169014084507e-06, + "loss": 846.5952, + "step": 73 + }, + { + "epoch": 0.03130618720253834, + "grad_norm": 24.58793067932129, + "learning_rate": 2.0563380281690144e-06, + "loss": 749.4149, + "step": 74 + }, + { + "epoch": 0.03172924378635643, + "grad_norm": 19.74152946472168, + "learning_rate": 2.0845070422535212e-06, + "loss": 850.069, + "step": 75 + }, + { + "epoch": 0.03215230037017451, + "grad_norm": 17.751310348510742, + "learning_rate": 2.1126760563380285e-06, + "loss": 867.8693, + "step": 76 + }, + { + "epoch": 0.0325753569539926, + "grad_norm": 20.678585052490234, + "learning_rate": 2.1408450704225353e-06, + "loss": 785.7188, + "step": 77 + }, + { + "epoch": 0.03299841353781068, + "grad_norm": 18.48772621154785, + "learning_rate": 2.169014084507042e-06, + "loss": 789.459, + "step": 78 + }, + { + "epoch": 0.03342147012162877, + "grad_norm": 17.77286148071289, + "learning_rate": 2.19718309859155e-06, + "loss": 790.9608, + "step": 79 + }, + { + "epoch": 0.033844526705446853, + "grad_norm": 18.79627799987793, + "learning_rate": 2.2253521126760566e-06, + "loss": 769.3281, + "step": 80 + }, + { + "epoch": 0.03426758328926494, + "grad_norm": 19.375093460083008, + "learning_rate": 2.2535211267605635e-06, + "loss": 727.7484, + "step": 81 + }, + { + "epoch": 0.034690639873083025, + "grad_norm": 15.10090160369873, + "learning_rate": 2.2816901408450707e-06, + "loss": 805.2005, + "step": 82 + }, + { + "epoch": 0.035113696456901114, + "grad_norm": 18.192068099975586, + "learning_rate": 2.3098591549295775e-06, + "loss": 726.6041, + "step": 83 + }, + { + "epoch": 0.035536753040719196, + "grad_norm": 14.974076271057129, + "learning_rate": 2.338028169014085e-06, + "loss": 788.6285, + "step": 84 + }, + { + "epoch": 0.035959809624537285, + "grad_norm": 16.76145362854004, + "learning_rate": 2.3661971830985916e-06, + "loss": 745.9722, + "step": 85 + }, + { + "epoch": 0.03638286620835537, + "grad_norm": 18.84196662902832, + "learning_rate": 2.3943661971830984e-06, + "loss": 686.3864, + "step": 86 + }, + { + "epoch": 0.03680592279217346, + "grad_norm": 15.284069061279297, + "learning_rate": 2.422535211267606e-06, + "loss": 784.7472, + "step": 87 + }, + { + "epoch": 0.03722897937599154, + "grad_norm": 14.29327392578125, + "learning_rate": 2.450704225352113e-06, + "loss": 785.7626, + "step": 88 + }, + { + "epoch": 0.03765203595980963, + "grad_norm": 15.331565856933594, + "learning_rate": 2.4788732394366198e-06, + "loss": 745.7901, + "step": 89 + }, + { + "epoch": 0.03807509254362771, + "grad_norm": 17.515869140625, + "learning_rate": 2.507042253521127e-06, + "loss": 686.7051, + "step": 90 + }, + { + "epoch": 0.0384981491274458, + "grad_norm": 14.446605682373047, + "learning_rate": 2.535211267605634e-06, + "loss": 766.524, + "step": 91 + }, + { + "epoch": 0.03892120571126388, + "grad_norm": 12.427390098571777, + "learning_rate": 2.563380281690141e-06, + "loss": 807.5848, + "step": 92 + }, + { + "epoch": 0.03934426229508197, + "grad_norm": 13.477662086486816, + "learning_rate": 2.591549295774648e-06, + "loss": 789.5703, + "step": 93 + }, + { + "epoch": 0.03976731887890005, + "grad_norm": 14.324225425720215, + "learning_rate": 2.619718309859155e-06, + "loss": 744.8116, + "step": 94 + }, + { + "epoch": 0.04019037546271814, + "grad_norm": 12.983494758605957, + "learning_rate": 2.647887323943662e-06, + "loss": 804.8558, + "step": 95 + }, + { + "epoch": 0.040613432046536224, + "grad_norm": 15.812573432922363, + "learning_rate": 2.676056338028169e-06, + "loss": 725.4119, + "step": 96 + }, + { + "epoch": 0.04103648863035431, + "grad_norm": 12.906014442443848, + "learning_rate": 2.704225352112676e-06, + "loss": 806.4986, + "step": 97 + }, + { + "epoch": 0.041459545214172396, + "grad_norm": 14.5569486618042, + "learning_rate": 2.7323943661971837e-06, + "loss": 726.3054, + "step": 98 + }, + { + "epoch": 0.041882601797990485, + "grad_norm": 12.483677864074707, + "learning_rate": 2.7605633802816906e-06, + "loss": 828.4179, + "step": 99 + }, + { + "epoch": 0.04230565838180857, + "grad_norm": 15.042231559753418, + "learning_rate": 2.7887323943661974e-06, + "loss": 727.3077, + "step": 100 + }, + { + "epoch": 0.042728714965626656, + "grad_norm": 16.66362190246582, + "learning_rate": 2.8169014084507046e-06, + "loss": 665.2318, + "step": 101 + }, + { + "epoch": 0.04315177154944474, + "grad_norm": 12.847768783569336, + "learning_rate": 2.8450704225352115e-06, + "loss": 785.1121, + "step": 102 + }, + { + "epoch": 0.04357482813326283, + "grad_norm": 14.952692985534668, + "learning_rate": 2.8732394366197187e-06, + "loss": 708.7941, + "step": 103 + }, + { + "epoch": 0.04399788471708091, + "grad_norm": 12.519607543945312, + "learning_rate": 2.9014084507042255e-06, + "loss": 784.6961, + "step": 104 + }, + { + "epoch": 0.044420941300899, + "grad_norm": 11.40876579284668, + "learning_rate": 2.929577464788733e-06, + "loss": 826.3976, + "step": 105 + }, + { + "epoch": 0.04484399788471708, + "grad_norm": 14.628512382507324, + "learning_rate": 2.9577464788732396e-06, + "loss": 726.8975, + "step": 106 + }, + { + "epoch": 0.04526705446853517, + "grad_norm": 11.975522994995117, + "learning_rate": 2.9859154929577465e-06, + "loss": 806.7027, + "step": 107 + }, + { + "epoch": 0.04569011105235325, + "grad_norm": 13.41118335723877, + "learning_rate": 3.0140845070422537e-06, + "loss": 745.8456, + "step": 108 + }, + { + "epoch": 0.04611316763617134, + "grad_norm": 11.88784408569336, + "learning_rate": 3.0422535211267605e-06, + "loss": 807.1974, + "step": 109 + }, + { + "epoch": 0.046536224219989424, + "grad_norm": 12.050515174865723, + "learning_rate": 3.0704225352112678e-06, + "loss": 783.1707, + "step": 110 + }, + { + "epoch": 0.04695928080380751, + "grad_norm": 12.62401008605957, + "learning_rate": 3.0985915492957746e-06, + "loss": 767.4214, + "step": 111 + }, + { + "epoch": 0.047382337387625595, + "grad_norm": 13.698180198669434, + "learning_rate": 3.1267605633802823e-06, + "loss": 725.1587, + "step": 112 + }, + { + "epoch": 0.047805393971443684, + "grad_norm": 11.507564544677734, + "learning_rate": 3.154929577464789e-06, + "loss": 807.4629, + "step": 113 + }, + { + "epoch": 0.048228450555261766, + "grad_norm": 12.66966724395752, + "learning_rate": 3.1830985915492964e-06, + "loss": 763.887, + "step": 114 + }, + { + "epoch": 0.048651507139079855, + "grad_norm": 13.759469985961914, + "learning_rate": 3.211267605633803e-06, + "loss": 727.2871, + "step": 115 + }, + { + "epoch": 0.04907456372289794, + "grad_norm": 10.869978904724121, + "learning_rate": 3.2394366197183104e-06, + "loss": 823.7092, + "step": 116 + }, + { + "epoch": 0.04949762030671603, + "grad_norm": 12.202546119689941, + "learning_rate": 3.2676056338028173e-06, + "loss": 784.2358, + "step": 117 + }, + { + "epoch": 0.04992067689053411, + "grad_norm": 11.720606803894043, + "learning_rate": 3.295774647887324e-06, + "loss": 804.8806, + "step": 118 + }, + { + "epoch": 0.0503437334743522, + "grad_norm": 11.251771926879883, + "learning_rate": 3.3239436619718313e-06, + "loss": 804.4562, + "step": 119 + }, + { + "epoch": 0.05076679005817028, + "grad_norm": 10.281838417053223, + "learning_rate": 3.352112676056338e-06, + "loss": 829.6753, + "step": 120 + }, + { + "epoch": 0.05118984664198837, + "grad_norm": 12.925240516662598, + "learning_rate": 3.3802816901408454e-06, + "loss": 765.2947, + "step": 121 + }, + { + "epoch": 0.05161290322580645, + "grad_norm": 14.645537376403809, + "learning_rate": 3.4084507042253522e-06, + "loss": 702.7504, + "step": 122 + }, + { + "epoch": 0.05203595980962454, + "grad_norm": 11.410442352294922, + "learning_rate": 3.4366197183098595e-06, + "loss": 805.8836, + "step": 123 + }, + { + "epoch": 0.05245901639344262, + "grad_norm": 12.8775053024292, + "learning_rate": 3.4647887323943663e-06, + "loss": 767.8827, + "step": 124 + }, + { + "epoch": 0.05288207297726071, + "grad_norm": 15.571783065795898, + "learning_rate": 3.492957746478873e-06, + "loss": 681.9991, + "step": 125 + }, + { + "epoch": 0.053305129561078794, + "grad_norm": 12.192140579223633, + "learning_rate": 3.5211267605633804e-06, + "loss": 763.0199, + "step": 126 + }, + { + "epoch": 0.05372818614489688, + "grad_norm": 14.904058456420898, + "learning_rate": 3.549295774647888e-06, + "loss": 680.9163, + "step": 127 + }, + { + "epoch": 0.054151242728714966, + "grad_norm": 12.380144119262695, + "learning_rate": 3.577464788732395e-06, + "loss": 762.0075, + "step": 128 + }, + { + "epoch": 0.054574299312533055, + "grad_norm": 12.839605331420898, + "learning_rate": 3.6056338028169017e-06, + "loss": 763.3351, + "step": 129 + }, + { + "epoch": 0.05499735589635114, + "grad_norm": 12.572240829467773, + "learning_rate": 3.633802816901409e-06, + "loss": 764.4438, + "step": 130 + }, + { + "epoch": 0.055420412480169226, + "grad_norm": 14.65994644165039, + "learning_rate": 3.6619718309859158e-06, + "loss": 724.8147, + "step": 131 + }, + { + "epoch": 0.05584346906398731, + "grad_norm": 10.371657371520996, + "learning_rate": 3.690140845070423e-06, + "loss": 843.1233, + "step": 132 + }, + { + "epoch": 0.0562665256478054, + "grad_norm": 10.114032745361328, + "learning_rate": 3.71830985915493e-06, + "loss": 838.7578, + "step": 133 + }, + { + "epoch": 0.05668958223162348, + "grad_norm": 11.934911727905273, + "learning_rate": 3.746478873239437e-06, + "loss": 782.7778, + "step": 134 + }, + { + "epoch": 0.05711263881544157, + "grad_norm": 13.536404609680176, + "learning_rate": 3.774647887323944e-06, + "loss": 746.2087, + "step": 135 + }, + { + "epoch": 0.05753569539925965, + "grad_norm": 11.777791023254395, + "learning_rate": 3.8028169014084508e-06, + "loss": 783.6028, + "step": 136 + }, + { + "epoch": 0.05795875198307774, + "grad_norm": 12.140782356262207, + "learning_rate": 3.830985915492958e-06, + "loss": 785.7461, + "step": 137 + }, + { + "epoch": 0.05838180856689582, + "grad_norm": 12.780108451843262, + "learning_rate": 3.859154929577465e-06, + "loss": 740.6241, + "step": 138 + }, + { + "epoch": 0.05880486515071391, + "grad_norm": 12.778249740600586, + "learning_rate": 3.887323943661972e-06, + "loss": 759.5258, + "step": 139 + }, + { + "epoch": 0.059227921734531994, + "grad_norm": 13.203436851501465, + "learning_rate": 3.915492957746479e-06, + "loss": 740.8657, + "step": 140 + }, + { + "epoch": 0.05965097831835008, + "grad_norm": 13.304123878479004, + "learning_rate": 3.943661971830986e-06, + "loss": 761.1189, + "step": 141 + }, + { + "epoch": 0.060074034902168165, + "grad_norm": 12.840725898742676, + "learning_rate": 3.971830985915493e-06, + "loss": 743.9786, + "step": 142 + }, + { + "epoch": 0.060497091485986254, + "grad_norm": 10.575838088989258, + "learning_rate": 4.000000000000001e-06, + "loss": 826.9847, + "step": 143 + }, + { + "epoch": 0.060920148069804336, + "grad_norm": 10.273885726928711, + "learning_rate": 4.028169014084508e-06, + "loss": 845.1307, + "step": 144 + }, + { + "epoch": 0.061343204653622425, + "grad_norm": 12.817646026611328, + "learning_rate": 4.056338028169014e-06, + "loss": 764.0731, + "step": 145 + }, + { + "epoch": 0.06176626123744051, + "grad_norm": 13.322304725646973, + "learning_rate": 4.0845070422535216e-06, + "loss": 721.0749, + "step": 146 + }, + { + "epoch": 0.0621893178212586, + "grad_norm": 11.851049423217773, + "learning_rate": 4.112676056338029e-06, + "loss": 781.6534, + "step": 147 + }, + { + "epoch": 0.06261237440507668, + "grad_norm": 12.586398124694824, + "learning_rate": 4.140845070422535e-06, + "loss": 762.0431, + "step": 148 + }, + { + "epoch": 0.06303543098889476, + "grad_norm": 11.579684257507324, + "learning_rate": 4.1690140845070425e-06, + "loss": 780.2112, + "step": 149 + }, + { + "epoch": 0.06345848757271286, + "grad_norm": 11.349550247192383, + "learning_rate": 4.19718309859155e-06, + "loss": 802.6907, + "step": 150 + }, + { + "epoch": 0.06388154415653094, + "grad_norm": 12.061245918273926, + "learning_rate": 4.225352112676057e-06, + "loss": 782.1167, + "step": 151 + }, + { + "epoch": 0.06430460074034902, + "grad_norm": 12.726497650146484, + "learning_rate": 4.253521126760563e-06, + "loss": 759.1027, + "step": 152 + }, + { + "epoch": 0.0647276573241671, + "grad_norm": 14.955892562866211, + "learning_rate": 4.281690140845071e-06, + "loss": 698.6662, + "step": 153 + }, + { + "epoch": 0.0651507139079852, + "grad_norm": 12.189628601074219, + "learning_rate": 4.309859154929578e-06, + "loss": 781.8915, + "step": 154 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 12.787518501281738, + "learning_rate": 4.338028169014084e-06, + "loss": 742.3318, + "step": 155 + }, + { + "epoch": 0.06599682707562136, + "grad_norm": 9.815357208251953, + "learning_rate": 4.3661971830985915e-06, + "loss": 842.6252, + "step": 156 + }, + { + "epoch": 0.06641988365943945, + "grad_norm": 10.738014221191406, + "learning_rate": 4.3943661971831e-06, + "loss": 822.4574, + "step": 157 + }, + { + "epoch": 0.06684294024325754, + "grad_norm": 12.376725196838379, + "learning_rate": 4.422535211267606e-06, + "loss": 785.0488, + "step": 158 + }, + { + "epoch": 0.06726599682707562, + "grad_norm": 11.86491584777832, + "learning_rate": 4.450704225352113e-06, + "loss": 781.5145, + "step": 159 + }, + { + "epoch": 0.06768905341089371, + "grad_norm": 13.894014358520508, + "learning_rate": 4.4788732394366205e-06, + "loss": 699.2943, + "step": 160 + }, + { + "epoch": 0.06811210999471179, + "grad_norm": 14.1129732131958, + "learning_rate": 4.507042253521127e-06, + "loss": 703.138, + "step": 161 + }, + { + "epoch": 0.06853516657852989, + "grad_norm": 11.298645973205566, + "learning_rate": 4.535211267605634e-06, + "loss": 805.1056, + "step": 162 + }, + { + "epoch": 0.06895822316234797, + "grad_norm": 14.156214714050293, + "learning_rate": 4.5633802816901414e-06, + "loss": 720.2867, + "step": 163 + }, + { + "epoch": 0.06938127974616605, + "grad_norm": 13.371084213256836, + "learning_rate": 4.591549295774648e-06, + "loss": 719.2059, + "step": 164 + }, + { + "epoch": 0.06980433632998413, + "grad_norm": 11.292455673217773, + "learning_rate": 4.619718309859155e-06, + "loss": 803.7174, + "step": 165 + }, + { + "epoch": 0.07022739291380223, + "grad_norm": 13.342081069946289, + "learning_rate": 4.647887323943662e-06, + "loss": 738.744, + "step": 166 + }, + { + "epoch": 0.07065044949762031, + "grad_norm": 14.620978355407715, + "learning_rate": 4.67605633802817e-06, + "loss": 718.3683, + "step": 167 + }, + { + "epoch": 0.07107350608143839, + "grad_norm": 11.831371307373047, + "learning_rate": 4.704225352112676e-06, + "loss": 780.2458, + "step": 168 + }, + { + "epoch": 0.07149656266525647, + "grad_norm": 11.213191986083984, + "learning_rate": 4.732394366197183e-06, + "loss": 802.7579, + "step": 169 + }, + { + "epoch": 0.07191961924907457, + "grad_norm": 13.74271297454834, + "learning_rate": 4.7605633802816905e-06, + "loss": 720.8162, + "step": 170 + }, + { + "epoch": 0.07234267583289265, + "grad_norm": 16.020933151245117, + "learning_rate": 4.788732394366197e-06, + "loss": 739.6012, + "step": 171 + }, + { + "epoch": 0.07276573241671073, + "grad_norm": 13.694025993347168, + "learning_rate": 4.816901408450705e-06, + "loss": 741.5752, + "step": 172 + }, + { + "epoch": 0.07318878900052882, + "grad_norm": 12.236172676086426, + "learning_rate": 4.845070422535212e-06, + "loss": 763.8745, + "step": 173 + }, + { + "epoch": 0.07361184558434691, + "grad_norm": 13.075498580932617, + "learning_rate": 4.873239436619719e-06, + "loss": 760.1675, + "step": 174 + }, + { + "epoch": 0.074034902168165, + "grad_norm": 11.457498550415039, + "learning_rate": 4.901408450704226e-06, + "loss": 803.55, + "step": 175 + }, + { + "epoch": 0.07445795875198308, + "grad_norm": 13.509984970092773, + "learning_rate": 4.929577464788733e-06, + "loss": 760.4025, + "step": 176 + }, + { + "epoch": 0.07488101533580116, + "grad_norm": 12.341206550598145, + "learning_rate": 4.9577464788732395e-06, + "loss": 762.9824, + "step": 177 + }, + { + "epoch": 0.07530407191961926, + "grad_norm": 12.090161323547363, + "learning_rate": 4.985915492957747e-06, + "loss": 804.2631, + "step": 178 + }, + { + "epoch": 0.07572712850343734, + "grad_norm": 17.76517677307129, + "learning_rate": 5.014084507042254e-06, + "loss": 696.7214, + "step": 179 + }, + { + "epoch": 0.07615018508725542, + "grad_norm": 11.294623374938965, + "learning_rate": 5.042253521126761e-06, + "loss": 802.3047, + "step": 180 + }, + { + "epoch": 0.0765732416710735, + "grad_norm": 11.407654762268066, + "learning_rate": 5.070422535211268e-06, + "loss": 798.0818, + "step": 181 + }, + { + "epoch": 0.0769962982548916, + "grad_norm": 14.942896842956543, + "learning_rate": 5.098591549295775e-06, + "loss": 739.9935, + "step": 182 + }, + { + "epoch": 0.07741935483870968, + "grad_norm": 11.34852409362793, + "learning_rate": 5.126760563380282e-06, + "loss": 801.6583, + "step": 183 + }, + { + "epoch": 0.07784241142252776, + "grad_norm": 15.629440307617188, + "learning_rate": 5.154929577464789e-06, + "loss": 678.8229, + "step": 184 + }, + { + "epoch": 0.07826546800634585, + "grad_norm": 13.148126602172852, + "learning_rate": 5.183098591549296e-06, + "loss": 741.8548, + "step": 185 + }, + { + "epoch": 0.07868852459016394, + "grad_norm": 11.921072959899902, + "learning_rate": 5.211267605633803e-06, + "loss": 780.3279, + "step": 186 + }, + { + "epoch": 0.07911158117398202, + "grad_norm": 16.826881408691406, + "learning_rate": 5.23943661971831e-06, + "loss": 659.399, + "step": 187 + }, + { + "epoch": 0.0795346377578001, + "grad_norm": 12.955828666687012, + "learning_rate": 5.267605633802817e-06, + "loss": 762.2226, + "step": 188 + }, + { + "epoch": 0.07995769434161819, + "grad_norm": 15.199082374572754, + "learning_rate": 5.295774647887324e-06, + "loss": 677.9695, + "step": 189 + }, + { + "epoch": 0.08038075092543628, + "grad_norm": 14.151132583618164, + "learning_rate": 5.323943661971831e-06, + "loss": 700.547, + "step": 190 + }, + { + "epoch": 0.08080380750925437, + "grad_norm": 11.990769386291504, + "learning_rate": 5.352112676056338e-06, + "loss": 778.9265, + "step": 191 + }, + { + "epoch": 0.08122686409307245, + "grad_norm": 11.620555877685547, + "learning_rate": 5.380281690140845e-06, + "loss": 783.0181, + "step": 192 + }, + { + "epoch": 0.08164992067689053, + "grad_norm": 13.583348274230957, + "learning_rate": 5.408450704225352e-06, + "loss": 744.9356, + "step": 193 + }, + { + "epoch": 0.08207297726070863, + "grad_norm": 10.4830904006958, + "learning_rate": 5.43661971830986e-06, + "loss": 821.1246, + "step": 194 + }, + { + "epoch": 0.08249603384452671, + "grad_norm": 13.16250991821289, + "learning_rate": 5.4647887323943675e-06, + "loss": 761.0168, + "step": 195 + }, + { + "epoch": 0.08291909042834479, + "grad_norm": 13.73855972290039, + "learning_rate": 5.492957746478874e-06, + "loss": 763.1394, + "step": 196 + }, + { + "epoch": 0.08334214701216287, + "grad_norm": 11.830713272094727, + "learning_rate": 5.521126760563381e-06, + "loss": 786.0801, + "step": 197 + }, + { + "epoch": 0.08376520359598097, + "grad_norm": 14.222375869750977, + "learning_rate": 5.549295774647888e-06, + "loss": 719.3934, + "step": 198 + }, + { + "epoch": 0.08418826017979905, + "grad_norm": 11.704263687133789, + "learning_rate": 5.577464788732395e-06, + "loss": 801.6824, + "step": 199 + }, + { + "epoch": 0.08461131676361713, + "grad_norm": 11.42605209350586, + "learning_rate": 5.605633802816902e-06, + "loss": 802.4886, + "step": 200 + }, + { + "epoch": 0.08503437334743522, + "grad_norm": 17.302059173583984, + "learning_rate": 5.633802816901409e-06, + "loss": 680.4895, + "step": 201 + }, + { + "epoch": 0.08545742993125331, + "grad_norm": 15.966385841369629, + "learning_rate": 5.6619718309859165e-06, + "loss": 659.8467, + "step": 202 + }, + { + "epoch": 0.0858804865150714, + "grad_norm": 13.223852157592773, + "learning_rate": 5.690140845070423e-06, + "loss": 738.3452, + "step": 203 + }, + { + "epoch": 0.08630354309888948, + "grad_norm": 12.700986862182617, + "learning_rate": 5.71830985915493e-06, + "loss": 759.3812, + "step": 204 + }, + { + "epoch": 0.08672659968270756, + "grad_norm": 13.884270668029785, + "learning_rate": 5.7464788732394374e-06, + "loss": 739.0624, + "step": 205 + }, + { + "epoch": 0.08714965626652565, + "grad_norm": 16.061424255371094, + "learning_rate": 5.774647887323944e-06, + "loss": 656.0363, + "step": 206 + }, + { + "epoch": 0.08757271285034374, + "grad_norm": 12.873085021972656, + "learning_rate": 5.802816901408451e-06, + "loss": 738.4911, + "step": 207 + }, + { + "epoch": 0.08799576943416182, + "grad_norm": 11.68387508392334, + "learning_rate": 5.830985915492958e-06, + "loss": 798.8392, + "step": 208 + }, + { + "epoch": 0.0884188260179799, + "grad_norm": 12.689035415649414, + "learning_rate": 5.859154929577466e-06, + "loss": 754.3231, + "step": 209 + }, + { + "epoch": 0.088841882601798, + "grad_norm": 9.941793441772461, + "learning_rate": 5.887323943661972e-06, + "loss": 843.1752, + "step": 210 + }, + { + "epoch": 0.08926493918561608, + "grad_norm": 14.372147560119629, + "learning_rate": 5.915492957746479e-06, + "loss": 720.0258, + "step": 211 + }, + { + "epoch": 0.08968799576943416, + "grad_norm": 14.841486930847168, + "learning_rate": 5.9436619718309865e-06, + "loss": 716.4156, + "step": 212 + }, + { + "epoch": 0.09011105235325224, + "grad_norm": 12.936238288879395, + "learning_rate": 5.971830985915493e-06, + "loss": 763.2375, + "step": 213 + }, + { + "epoch": 0.09053410893707034, + "grad_norm": 14.032865524291992, + "learning_rate": 6e-06, + "loss": 739.2999, + "step": 214 + }, + { + "epoch": 0.09095716552088842, + "grad_norm": 11.31862735748291, + "learning_rate": 6.028169014084507e-06, + "loss": 798.2567, + "step": 215 + }, + { + "epoch": 0.0913802221047065, + "grad_norm": 15.051209449768066, + "learning_rate": 6.056338028169015e-06, + "loss": 699.4674, + "step": 216 + }, + { + "epoch": 0.09180327868852459, + "grad_norm": 12.853492736816406, + "learning_rate": 6.084507042253521e-06, + "loss": 757.322, + "step": 217 + }, + { + "epoch": 0.09222633527234268, + "grad_norm": 13.888556480407715, + "learning_rate": 6.112676056338028e-06, + "loss": 738.9712, + "step": 218 + }, + { + "epoch": 0.09264939185616076, + "grad_norm": 12.94190502166748, + "learning_rate": 6.1408450704225356e-06, + "loss": 758.505, + "step": 219 + }, + { + "epoch": 0.09307244843997885, + "grad_norm": 11.555241584777832, + "learning_rate": 6.169014084507042e-06, + "loss": 800.7646, + "step": 220 + }, + { + "epoch": 0.09349550502379693, + "grad_norm": 12.766619682312012, + "learning_rate": 6.197183098591549e-06, + "loss": 757.0752, + "step": 221 + }, + { + "epoch": 0.09391856160761503, + "grad_norm": 12.642359733581543, + "learning_rate": 6.2253521126760565e-06, + "loss": 779.2925, + "step": 222 + }, + { + "epoch": 0.09434161819143311, + "grad_norm": 12.971923828125, + "learning_rate": 6.2535211267605646e-06, + "loss": 757.6266, + "step": 223 + }, + { + "epoch": 0.09476467477525119, + "grad_norm": 11.587763786315918, + "learning_rate": 6.281690140845072e-06, + "loss": 780.3951, + "step": 224 + }, + { + "epoch": 0.09518773135906927, + "grad_norm": 12.811110496520996, + "learning_rate": 6.309859154929578e-06, + "loss": 779.9502, + "step": 225 + }, + { + "epoch": 0.09561078794288737, + "grad_norm": 11.17767333984375, + "learning_rate": 6.3380281690140855e-06, + "loss": 797.8807, + "step": 226 + }, + { + "epoch": 0.09603384452670545, + "grad_norm": 12.834802627563477, + "learning_rate": 6.366197183098593e-06, + "loss": 754.6746, + "step": 227 + }, + { + "epoch": 0.09645690111052353, + "grad_norm": 11.9449462890625, + "learning_rate": 6.394366197183099e-06, + "loss": 779.6824, + "step": 228 + }, + { + "epoch": 0.09687995769434161, + "grad_norm": 13.82591438293457, + "learning_rate": 6.422535211267606e-06, + "loss": 742.3206, + "step": 229 + }, + { + "epoch": 0.09730301427815971, + "grad_norm": 12.917394638061523, + "learning_rate": 6.450704225352114e-06, + "loss": 738.9241, + "step": 230 + }, + { + "epoch": 0.09772607086197779, + "grad_norm": 12.958208084106445, + "learning_rate": 6.478873239436621e-06, + "loss": 759.2827, + "step": 231 + }, + { + "epoch": 0.09814912744579588, + "grad_norm": 12.81097412109375, + "learning_rate": 6.507042253521127e-06, + "loss": 783.6682, + "step": 232 + }, + { + "epoch": 0.09857218402961396, + "grad_norm": 14.196828842163086, + "learning_rate": 6.5352112676056345e-06, + "loss": 737.3646, + "step": 233 + }, + { + "epoch": 0.09899524061343205, + "grad_norm": 9.792341232299805, + "learning_rate": 6.563380281690142e-06, + "loss": 841.9721, + "step": 234 + }, + { + "epoch": 0.09941829719725014, + "grad_norm": 14.710625648498535, + "learning_rate": 6.591549295774648e-06, + "loss": 735.1588, + "step": 235 + }, + { + "epoch": 0.09984135378106822, + "grad_norm": 11.597647666931152, + "learning_rate": 6.619718309859155e-06, + "loss": 802.2897, + "step": 236 + }, + { + "epoch": 0.1002644103648863, + "grad_norm": 11.403594017028809, + "learning_rate": 6.647887323943663e-06, + "loss": 800.1327, + "step": 237 + }, + { + "epoch": 0.1006874669487044, + "grad_norm": 14.67492961883545, + "learning_rate": 6.67605633802817e-06, + "loss": 717.5559, + "step": 238 + }, + { + "epoch": 0.10111052353252248, + "grad_norm": 14.060994148254395, + "learning_rate": 6.704225352112676e-06, + "loss": 762.9032, + "step": 239 + }, + { + "epoch": 0.10153358011634056, + "grad_norm": 12.9485502243042, + "learning_rate": 6.7323943661971836e-06, + "loss": 760.3637, + "step": 240 + }, + { + "epoch": 0.10195663670015864, + "grad_norm": 13.912680625915527, + "learning_rate": 6.760563380281691e-06, + "loss": 736.4286, + "step": 241 + }, + { + "epoch": 0.10237969328397674, + "grad_norm": 13.293014526367188, + "learning_rate": 6.788732394366197e-06, + "loss": 757.4353, + "step": 242 + }, + { + "epoch": 0.10280274986779482, + "grad_norm": 13.04476261138916, + "learning_rate": 6.8169014084507045e-06, + "loss": 757.9034, + "step": 243 + }, + { + "epoch": 0.1032258064516129, + "grad_norm": 11.37155532836914, + "learning_rate": 6.845070422535212e-06, + "loss": 819.3074, + "step": 244 + }, + { + "epoch": 0.10364886303543099, + "grad_norm": 9.879289627075195, + "learning_rate": 6.873239436619719e-06, + "loss": 842.1132, + "step": 245 + }, + { + "epoch": 0.10407191961924908, + "grad_norm": 9.874364852905273, + "learning_rate": 6.901408450704225e-06, + "loss": 840.7875, + "step": 246 + }, + { + "epoch": 0.10449497620306716, + "grad_norm": 12.859636306762695, + "learning_rate": 6.929577464788733e-06, + "loss": 756.2938, + "step": 247 + }, + { + "epoch": 0.10491803278688525, + "grad_norm": 11.37687873840332, + "learning_rate": 6.95774647887324e-06, + "loss": 797.7767, + "step": 248 + }, + { + "epoch": 0.10534108937070333, + "grad_norm": 13.692133903503418, + "learning_rate": 6.985915492957746e-06, + "loss": 759.5692, + "step": 249 + }, + { + "epoch": 0.10576414595452142, + "grad_norm": 13.65575885772705, + "learning_rate": 7.0140845070422535e-06, + "loss": 737.8914, + "step": 250 + }, + { + "epoch": 0.1061872025383395, + "grad_norm": 12.590005874633789, + "learning_rate": 7.042253521126761e-06, + "loss": 780.3461, + "step": 251 + }, + { + "epoch": 0.10661025912215759, + "grad_norm": 14.149081230163574, + "learning_rate": 7.070422535211268e-06, + "loss": 716.4764, + "step": 252 + }, + { + "epoch": 0.10703331570597567, + "grad_norm": 10.632115364074707, + "learning_rate": 7.098591549295776e-06, + "loss": 822.5111, + "step": 253 + }, + { + "epoch": 0.10745637228979377, + "grad_norm": 12.453465461730957, + "learning_rate": 7.1267605633802825e-06, + "loss": 757.7155, + "step": 254 + }, + { + "epoch": 0.10787942887361185, + "grad_norm": 11.052130699157715, + "learning_rate": 7.15492957746479e-06, + "loss": 820.4286, + "step": 255 + }, + { + "epoch": 0.10830248545742993, + "grad_norm": 10.628396987915039, + "learning_rate": 7.183098591549297e-06, + "loss": 821.5991, + "step": 256 + }, + { + "epoch": 0.10872554204124801, + "grad_norm": 15.458640098571777, + "learning_rate": 7.211267605633803e-06, + "loss": 654.8982, + "step": 257 + }, + { + "epoch": 0.10914859862506611, + "grad_norm": 12.94022274017334, + "learning_rate": 7.239436619718311e-06, + "loss": 758.9969, + "step": 258 + }, + { + "epoch": 0.10957165520888419, + "grad_norm": 12.632230758666992, + "learning_rate": 7.267605633802818e-06, + "loss": 775.6349, + "step": 259 + }, + { + "epoch": 0.10999471179270227, + "grad_norm": 13.002449035644531, + "learning_rate": 7.295774647887325e-06, + "loss": 754.9686, + "step": 260 + }, + { + "epoch": 0.11041776837652036, + "grad_norm": 17.93729591369629, + "learning_rate": 7.3239436619718316e-06, + "loss": 672.6771, + "step": 261 + }, + { + "epoch": 0.11084082496033845, + "grad_norm": 15.656952857971191, + "learning_rate": 7.352112676056339e-06, + "loss": 675.699, + "step": 262 + }, + { + "epoch": 0.11126388154415653, + "grad_norm": 12.789374351501465, + "learning_rate": 7.380281690140846e-06, + "loss": 775.5643, + "step": 263 + }, + { + "epoch": 0.11168693812797462, + "grad_norm": 12.42507553100586, + "learning_rate": 7.4084507042253525e-06, + "loss": 798.407, + "step": 264 + }, + { + "epoch": 0.1121099947117927, + "grad_norm": 12.29761791229248, + "learning_rate": 7.43661971830986e-06, + "loss": 778.4043, + "step": 265 + }, + { + "epoch": 0.1125330512956108, + "grad_norm": 14.056609153747559, + "learning_rate": 7.464788732394367e-06, + "loss": 716.1398, + "step": 266 + }, + { + "epoch": 0.11295610787942888, + "grad_norm": 12.214421272277832, + "learning_rate": 7.492957746478874e-06, + "loss": 799.4088, + "step": 267 + }, + { + "epoch": 0.11337916446324696, + "grad_norm": 11.814835548400879, + "learning_rate": 7.521126760563381e-06, + "loss": 795.6407, + "step": 268 + }, + { + "epoch": 0.11380222104706504, + "grad_norm": 14.104567527770996, + "learning_rate": 7.549295774647888e-06, + "loss": 734.4398, + "step": 269 + }, + { + "epoch": 0.11422527763088314, + "grad_norm": 12.371315956115723, + "learning_rate": 7.577464788732395e-06, + "loss": 800.4613, + "step": 270 + }, + { + "epoch": 0.11464833421470122, + "grad_norm": 13.998788833618164, + "learning_rate": 7.6056338028169015e-06, + "loss": 757.4918, + "step": 271 + }, + { + "epoch": 0.1150713907985193, + "grad_norm": 12.564826011657715, + "learning_rate": 7.633802816901409e-06, + "loss": 756.5626, + "step": 272 + }, + { + "epoch": 0.11549444738233738, + "grad_norm": 12.614603996276855, + "learning_rate": 7.661971830985916e-06, + "loss": 754.3975, + "step": 273 + }, + { + "epoch": 0.11591750396615548, + "grad_norm": 14.148496627807617, + "learning_rate": 7.690140845070423e-06, + "loss": 715.9868, + "step": 274 + }, + { + "epoch": 0.11634056054997356, + "grad_norm": 17.046178817749023, + "learning_rate": 7.71830985915493e-06, + "loss": 654.8709, + "step": 275 + }, + { + "epoch": 0.11676361713379164, + "grad_norm": 12.191910743713379, + "learning_rate": 7.746478873239436e-06, + "loss": 777.5788, + "step": 276 + }, + { + "epoch": 0.11718667371760973, + "grad_norm": 14.816165924072266, + "learning_rate": 7.774647887323943e-06, + "loss": 717.7572, + "step": 277 + }, + { + "epoch": 0.11760973030142782, + "grad_norm": 14.237536430358887, + "learning_rate": 7.80281690140845e-06, + "loss": 736.8791, + "step": 278 + }, + { + "epoch": 0.1180327868852459, + "grad_norm": 13.653615951538086, + "learning_rate": 7.830985915492958e-06, + "loss": 734.8318, + "step": 279 + }, + { + "epoch": 0.11845584346906399, + "grad_norm": 14.138933181762695, + "learning_rate": 7.859154929577465e-06, + "loss": 716.6235, + "step": 280 + }, + { + "epoch": 0.11887890005288207, + "grad_norm": 13.666022300720215, + "learning_rate": 7.887323943661972e-06, + "loss": 737.1309, + "step": 281 + }, + { + "epoch": 0.11930195663670017, + "grad_norm": 12.850542068481445, + "learning_rate": 7.91549295774648e-06, + "loss": 778.734, + "step": 282 + }, + { + "epoch": 0.11972501322051825, + "grad_norm": 11.598752975463867, + "learning_rate": 7.943661971830987e-06, + "loss": 798.0325, + "step": 283 + }, + { + "epoch": 0.12014806980433633, + "grad_norm": 13.594483375549316, + "learning_rate": 7.971830985915494e-06, + "loss": 736.7729, + "step": 284 + }, + { + "epoch": 0.12057112638815441, + "grad_norm": 14.832201957702637, + "learning_rate": 8.000000000000001e-06, + "loss": 695.967, + "step": 285 + }, + { + "epoch": 0.12099418297197251, + "grad_norm": 11.722511291503906, + "learning_rate": 8.028169014084509e-06, + "loss": 795.8473, + "step": 286 + }, + { + "epoch": 0.12141723955579059, + "grad_norm": 11.44096851348877, + "learning_rate": 8.056338028169016e-06, + "loss": 793.1028, + "step": 287 + }, + { + "epoch": 0.12184029613960867, + "grad_norm": 12.772346496582031, + "learning_rate": 8.084507042253521e-06, + "loss": 754.6519, + "step": 288 + }, + { + "epoch": 0.12226335272342675, + "grad_norm": 13.239483833312988, + "learning_rate": 8.112676056338029e-06, + "loss": 760.6239, + "step": 289 + }, + { + "epoch": 0.12268640930724485, + "grad_norm": 11.81961441040039, + "learning_rate": 8.140845070422536e-06, + "loss": 796.1074, + "step": 290 + }, + { + "epoch": 0.12310946589106293, + "grad_norm": 12.267008781433105, + "learning_rate": 8.169014084507043e-06, + "loss": 777.7317, + "step": 291 + }, + { + "epoch": 0.12353252247488102, + "grad_norm": 13.151246070861816, + "learning_rate": 8.19718309859155e-06, + "loss": 756.4695, + "step": 292 + }, + { + "epoch": 0.1239555790586991, + "grad_norm": 11.524453163146973, + "learning_rate": 8.225352112676058e-06, + "loss": 816.7649, + "step": 293 + }, + { + "epoch": 0.1243786356425172, + "grad_norm": 14.034421920776367, + "learning_rate": 8.253521126760565e-06, + "loss": 733.7847, + "step": 294 + }, + { + "epoch": 0.12480169222633528, + "grad_norm": 12.834213256835938, + "learning_rate": 8.28169014084507e-06, + "loss": 796.6864, + "step": 295 + }, + { + "epoch": 0.12522474881015336, + "grad_norm": 11.805051803588867, + "learning_rate": 8.309859154929578e-06, + "loss": 815.3569, + "step": 296 + }, + { + "epoch": 0.12564780539397144, + "grad_norm": 13.60484504699707, + "learning_rate": 8.338028169014085e-06, + "loss": 738.9814, + "step": 297 + }, + { + "epoch": 0.12607086197778952, + "grad_norm": 12.630477905273438, + "learning_rate": 8.366197183098592e-06, + "loss": 778.5721, + "step": 298 + }, + { + "epoch": 0.1264939185616076, + "grad_norm": 16.00406265258789, + "learning_rate": 8.3943661971831e-06, + "loss": 652.7642, + "step": 299 + }, + { + "epoch": 0.12691697514542571, + "grad_norm": 12.212175369262695, + "learning_rate": 8.422535211267607e-06, + "loss": 797.1302, + "step": 300 + }, + { + "epoch": 0.1273400317292438, + "grad_norm": 13.555891036987305, + "learning_rate": 8.450704225352114e-06, + "loss": 756.9495, + "step": 301 + }, + { + "epoch": 0.12776308831306188, + "grad_norm": 15.193621635437012, + "learning_rate": 8.47887323943662e-06, + "loss": 713.0088, + "step": 302 + }, + { + "epoch": 0.12818614489687996, + "grad_norm": 13.157082557678223, + "learning_rate": 8.507042253521127e-06, + "loss": 779.2067, + "step": 303 + }, + { + "epoch": 0.12860920148069804, + "grad_norm": 13.295742988586426, + "learning_rate": 8.535211267605634e-06, + "loss": 779.5781, + "step": 304 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 11.40140151977539, + "learning_rate": 8.563380281690141e-06, + "loss": 797.5276, + "step": 305 + }, + { + "epoch": 0.1294553146483342, + "grad_norm": 15.688788414001465, + "learning_rate": 8.591549295774648e-06, + "loss": 693.9215, + "step": 306 + }, + { + "epoch": 0.1298783712321523, + "grad_norm": 12.788575172424316, + "learning_rate": 8.619718309859156e-06, + "loss": 817.5286, + "step": 307 + }, + { + "epoch": 0.1303014278159704, + "grad_norm": 13.39333724975586, + "learning_rate": 8.647887323943663e-06, + "loss": 778.3508, + "step": 308 + }, + { + "epoch": 0.13072448439978848, + "grad_norm": 12.953949928283691, + "learning_rate": 8.676056338028169e-06, + "loss": 778.9214, + "step": 309 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 15.441649436950684, + "learning_rate": 8.704225352112676e-06, + "loss": 736.5083, + "step": 310 + }, + { + "epoch": 0.13157059756742465, + "grad_norm": 14.27033805847168, + "learning_rate": 8.732394366197183e-06, + "loss": 757.1612, + "step": 311 + }, + { + "epoch": 0.13199365415124273, + "grad_norm": 13.073638916015625, + "learning_rate": 8.760563380281692e-06, + "loss": 776.9095, + "step": 312 + }, + { + "epoch": 0.1324167107350608, + "grad_norm": 13.515632629394531, + "learning_rate": 8.7887323943662e-06, + "loss": 776.4768, + "step": 313 + }, + { + "epoch": 0.1328397673188789, + "grad_norm": 13.743244171142578, + "learning_rate": 8.816901408450705e-06, + "loss": 755.4343, + "step": 314 + }, + { + "epoch": 0.13326282390269698, + "grad_norm": 16.14875030517578, + "learning_rate": 8.845070422535212e-06, + "loss": 711.0346, + "step": 315 + }, + { + "epoch": 0.13368588048651509, + "grad_norm": 13.213085174560547, + "learning_rate": 8.87323943661972e-06, + "loss": 757.5919, + "step": 316 + }, + { + "epoch": 0.13410893707033317, + "grad_norm": 15.552072525024414, + "learning_rate": 8.901408450704227e-06, + "loss": 673.1417, + "step": 317 + }, + { + "epoch": 0.13453199365415125, + "grad_norm": 13.443818092346191, + "learning_rate": 8.929577464788734e-06, + "loss": 756.0958, + "step": 318 + }, + { + "epoch": 0.13495505023796933, + "grad_norm": 11.72736644744873, + "learning_rate": 8.957746478873241e-06, + "loss": 796.7432, + "step": 319 + }, + { + "epoch": 0.13537810682178741, + "grad_norm": 14.146912574768066, + "learning_rate": 8.985915492957748e-06, + "loss": 736.0888, + "step": 320 + }, + { + "epoch": 0.1358011634056055, + "grad_norm": 15.127228736877441, + "learning_rate": 9.014084507042254e-06, + "loss": 713.463, + "step": 321 + }, + { + "epoch": 0.13622421998942358, + "grad_norm": 12.653226852416992, + "learning_rate": 9.042253521126761e-06, + "loss": 818.5015, + "step": 322 + }, + { + "epoch": 0.13664727657324166, + "grad_norm": 12.538908004760742, + "learning_rate": 9.070422535211268e-06, + "loss": 797.3392, + "step": 323 + }, + { + "epoch": 0.13707033315705977, + "grad_norm": 12.776373863220215, + "learning_rate": 9.098591549295776e-06, + "loss": 775.7761, + "step": 324 + }, + { + "epoch": 0.13749338974087785, + "grad_norm": 13.518254280090332, + "learning_rate": 9.126760563380283e-06, + "loss": 753.9362, + "step": 325 + }, + { + "epoch": 0.13791644632469593, + "grad_norm": 17.87234878540039, + "learning_rate": 9.15492957746479e-06, + "loss": 652.7172, + "step": 326 + }, + { + "epoch": 0.13833950290851402, + "grad_norm": 17.0050106048584, + "learning_rate": 9.183098591549296e-06, + "loss": 674.3735, + "step": 327 + }, + { + "epoch": 0.1387625594923321, + "grad_norm": 13.370560646057129, + "learning_rate": 9.211267605633803e-06, + "loss": 778.4688, + "step": 328 + }, + { + "epoch": 0.13918561607615018, + "grad_norm": 16.789766311645508, + "learning_rate": 9.23943661971831e-06, + "loss": 736.4812, + "step": 329 + }, + { + "epoch": 0.13960867265996826, + "grad_norm": 13.75463581085205, + "learning_rate": 9.267605633802817e-06, + "loss": 754.6477, + "step": 330 + }, + { + "epoch": 0.14003172924378635, + "grad_norm": 13.878124237060547, + "learning_rate": 9.295774647887325e-06, + "loss": 777.736, + "step": 331 + }, + { + "epoch": 0.14045478582760446, + "grad_norm": 16.170642852783203, + "learning_rate": 9.323943661971832e-06, + "loss": 713.9561, + "step": 332 + }, + { + "epoch": 0.14087784241142254, + "grad_norm": 14.423304557800293, + "learning_rate": 9.35211267605634e-06, + "loss": 757.4655, + "step": 333 + }, + { + "epoch": 0.14130089899524062, + "grad_norm": 13.21603012084961, + "learning_rate": 9.380281690140845e-06, + "loss": 794.8549, + "step": 334 + }, + { + "epoch": 0.1417239555790587, + "grad_norm": 11.370606422424316, + "learning_rate": 9.408450704225352e-06, + "loss": 821.2043, + "step": 335 + }, + { + "epoch": 0.14214701216287678, + "grad_norm": 15.90064525604248, + "learning_rate": 9.43661971830986e-06, + "loss": 735.8975, + "step": 336 + }, + { + "epoch": 0.14257006874669487, + "grad_norm": 13.71650218963623, + "learning_rate": 9.464788732394366e-06, + "loss": 752.1182, + "step": 337 + }, + { + "epoch": 0.14299312533051295, + "grad_norm": 14.384450912475586, + "learning_rate": 9.492957746478874e-06, + "loss": 735.8753, + "step": 338 + }, + { + "epoch": 0.14341618191433103, + "grad_norm": 12.603275299072266, + "learning_rate": 9.521126760563381e-06, + "loss": 798.2675, + "step": 339 + }, + { + "epoch": 0.14383923849814914, + "grad_norm": 15.95854377746582, + "learning_rate": 9.549295774647888e-06, + "loss": 732.4976, + "step": 340 + }, + { + "epoch": 0.14426229508196722, + "grad_norm": 13.709829330444336, + "learning_rate": 9.577464788732394e-06, + "loss": 733.7445, + "step": 341 + }, + { + "epoch": 0.1446853516657853, + "grad_norm": 12.397858619689941, + "learning_rate": 9.605633802816903e-06, + "loss": 796.848, + "step": 342 + }, + { + "epoch": 0.1451084082496034, + "grad_norm": 16.04680633544922, + "learning_rate": 9.63380281690141e-06, + "loss": 733.8109, + "step": 343 + }, + { + "epoch": 0.14553146483342147, + "grad_norm": 15.829607963562012, + "learning_rate": 9.661971830985917e-06, + "loss": 691.0699, + "step": 344 + }, + { + "epoch": 0.14595452141723955, + "grad_norm": 12.21288776397705, + "learning_rate": 9.690140845070424e-06, + "loss": 797.8896, + "step": 345 + }, + { + "epoch": 0.14637757800105763, + "grad_norm": 16.243478775024414, + "learning_rate": 9.71830985915493e-06, + "loss": 712.0507, + "step": 346 + }, + { + "epoch": 0.14680063458487572, + "grad_norm": 14.1298246383667, + "learning_rate": 9.746478873239437e-06, + "loss": 777.03, + "step": 347 + }, + { + "epoch": 0.14722369116869383, + "grad_norm": 12.762916564941406, + "learning_rate": 9.774647887323945e-06, + "loss": 774.7269, + "step": 348 + }, + { + "epoch": 0.1476467477525119, + "grad_norm": 13.856205940246582, + "learning_rate": 9.802816901408452e-06, + "loss": 754.0123, + "step": 349 + }, + { + "epoch": 0.14806980433633, + "grad_norm": 14.769192695617676, + "learning_rate": 9.830985915492959e-06, + "loss": 713.2305, + "step": 350 + }, + { + "epoch": 0.14849286092014807, + "grad_norm": 15.015844345092773, + "learning_rate": 9.859154929577466e-06, + "loss": 734.4545, + "step": 351 + }, + { + "epoch": 0.14891591750396616, + "grad_norm": 12.819107055664062, + "learning_rate": 9.887323943661974e-06, + "loss": 795.6561, + "step": 352 + }, + { + "epoch": 0.14933897408778424, + "grad_norm": 15.649892807006836, + "learning_rate": 9.915492957746479e-06, + "loss": 732.0286, + "step": 353 + }, + { + "epoch": 0.14976203067160232, + "grad_norm": 14.045530319213867, + "learning_rate": 9.943661971830986e-06, + "loss": 755.2662, + "step": 354 + }, + { + "epoch": 0.1501850872554204, + "grad_norm": 13.347284317016602, + "learning_rate": 9.971830985915494e-06, + "loss": 774.8124, + "step": 355 + }, + { + "epoch": 0.1506081438392385, + "grad_norm": 12.295364379882812, + "learning_rate": 1e-05, + "loss": 797.9017, + "step": 356 + }, + { + "epoch": 0.1510312004230566, + "grad_norm": 13.199535369873047, + "learning_rate": 9.999999812288176e-06, + "loss": 777.722, + "step": 357 + }, + { + "epoch": 0.15145425700687468, + "grad_norm": 13.100055694580078, + "learning_rate": 9.999999249152713e-06, + "loss": 774.4236, + "step": 358 + }, + { + "epoch": 0.15187731359069276, + "grad_norm": 15.409270286560059, + "learning_rate": 9.999998310593657e-06, + "loss": 712.9058, + "step": 359 + }, + { + "epoch": 0.15230037017451084, + "grad_norm": 13.784954071044922, + "learning_rate": 9.999996996611075e-06, + "loss": 754.6187, + "step": 360 + }, + { + "epoch": 0.15272342675832892, + "grad_norm": 13.666350364685059, + "learning_rate": 9.999995307205068e-06, + "loss": 794.953, + "step": 361 + }, + { + "epoch": 0.153146483342147, + "grad_norm": 16.89323616027832, + "learning_rate": 9.999993242375762e-06, + "loss": 711.3347, + "step": 362 + }, + { + "epoch": 0.1535695399259651, + "grad_norm": 15.849589347839355, + "learning_rate": 9.999990802123313e-06, + "loss": 730.9784, + "step": 363 + }, + { + "epoch": 0.1539925965097832, + "grad_norm": 11.785365104675293, + "learning_rate": 9.999987986447906e-06, + "loss": 835.5739, + "step": 364 + }, + { + "epoch": 0.15441565309360128, + "grad_norm": 17.82701301574707, + "learning_rate": 9.999984795349747e-06, + "loss": 714.7216, + "step": 365 + }, + { + "epoch": 0.15483870967741936, + "grad_norm": 15.360261917114258, + "learning_rate": 9.99998122882908e-06, + "loss": 733.6466, + "step": 366 + }, + { + "epoch": 0.15526176626123744, + "grad_norm": 14.08161735534668, + "learning_rate": 9.999977286886171e-06, + "loss": 794.0718, + "step": 367 + }, + { + "epoch": 0.15568482284505553, + "grad_norm": 18.318384170532227, + "learning_rate": 9.999972969521317e-06, + "loss": 714.5942, + "step": 368 + }, + { + "epoch": 0.1561078794288736, + "grad_norm": 11.950384140014648, + "learning_rate": 9.99996827673484e-06, + "loss": 813.4524, + "step": 369 + }, + { + "epoch": 0.1565309360126917, + "grad_norm": 12.740276336669922, + "learning_rate": 9.999963208527095e-06, + "loss": 797.1209, + "step": 370 + }, + { + "epoch": 0.15695399259650977, + "grad_norm": 13.769516944885254, + "learning_rate": 9.999957764898463e-06, + "loss": 815.7757, + "step": 371 + }, + { + "epoch": 0.15737704918032788, + "grad_norm": 13.970303535461426, + "learning_rate": 9.99995194584935e-06, + "loss": 771.6652, + "step": 372 + }, + { + "epoch": 0.15780010576414596, + "grad_norm": 19.20124626159668, + "learning_rate": 9.999945751380192e-06, + "loss": 674.2137, + "step": 373 + }, + { + "epoch": 0.15822316234796405, + "grad_norm": 12.929895401000977, + "learning_rate": 9.999939181491458e-06, + "loss": 794.8874, + "step": 374 + }, + { + "epoch": 0.15864621893178213, + "grad_norm": 12.376791000366211, + "learning_rate": 9.999932236183641e-06, + "loss": 797.9962, + "step": 375 + }, + { + "epoch": 0.1590692755156002, + "grad_norm": 13.68117618560791, + "learning_rate": 9.99992491545726e-06, + "loss": 774.4408, + "step": 376 + }, + { + "epoch": 0.1594923320994183, + "grad_norm": 15.729691505432129, + "learning_rate": 9.999917219312865e-06, + "loss": 731.6411, + "step": 377 + }, + { + "epoch": 0.15991538868323638, + "grad_norm": 12.815239906311035, + "learning_rate": 9.999909147751035e-06, + "loss": 792.6683, + "step": 378 + }, + { + "epoch": 0.16033844526705446, + "grad_norm": 16.535804748535156, + "learning_rate": 9.999900700772376e-06, + "loss": 731.1664, + "step": 379 + }, + { + "epoch": 0.16076150185087257, + "grad_norm": 14.127059936523438, + "learning_rate": 9.999891878377521e-06, + "loss": 753.6534, + "step": 380 + }, + { + "epoch": 0.16118455843469065, + "grad_norm": 14.731239318847656, + "learning_rate": 9.999882680567134e-06, + "loss": 733.2232, + "step": 381 + }, + { + "epoch": 0.16160761501850873, + "grad_norm": 14.512930870056152, + "learning_rate": 9.999873107341905e-06, + "loss": 754.3052, + "step": 382 + }, + { + "epoch": 0.16203067160232681, + "grad_norm": 12.53246021270752, + "learning_rate": 9.999863158702552e-06, + "loss": 796.3177, + "step": 383 + }, + { + "epoch": 0.1624537281861449, + "grad_norm": 16.268728256225586, + "learning_rate": 9.999852834649823e-06, + "loss": 712.7704, + "step": 384 + }, + { + "epoch": 0.16287678476996298, + "grad_norm": 14.170899391174316, + "learning_rate": 9.999842135184494e-06, + "loss": 753.7166, + "step": 385 + }, + { + "epoch": 0.16329984135378106, + "grad_norm": 13.529441833496094, + "learning_rate": 9.999831060307367e-06, + "loss": 752.6455, + "step": 386 + }, + { + "epoch": 0.16372289793759914, + "grad_norm": 14.87386417388916, + "learning_rate": 9.999819610019272e-06, + "loss": 733.3545, + "step": 387 + }, + { + "epoch": 0.16414595452141725, + "grad_norm": 12.453298568725586, + "learning_rate": 9.999807784321074e-06, + "loss": 796.3146, + "step": 388 + }, + { + "epoch": 0.16456901110523534, + "grad_norm": 15.096817016601562, + "learning_rate": 9.999795583213653e-06, + "loss": 733.1597, + "step": 389 + }, + { + "epoch": 0.16499206768905342, + "grad_norm": 16.404541015625, + "learning_rate": 9.999783006697932e-06, + "loss": 692.5626, + "step": 390 + }, + { + "epoch": 0.1654151242728715, + "grad_norm": 17.051618576049805, + "learning_rate": 9.999770054774853e-06, + "loss": 712.984, + "step": 391 + }, + { + "epoch": 0.16583818085668958, + "grad_norm": 16.117685317993164, + "learning_rate": 9.999756727445388e-06, + "loss": 689.8359, + "step": 392 + }, + { + "epoch": 0.16626123744050766, + "grad_norm": 12.309391021728516, + "learning_rate": 9.999743024710539e-06, + "loss": 818.4892, + "step": 393 + }, + { + "epoch": 0.16668429402432575, + "grad_norm": 14.341193199157715, + "learning_rate": 9.999728946571333e-06, + "loss": 756.1593, + "step": 394 + }, + { + "epoch": 0.16710735060814383, + "grad_norm": 15.790420532226562, + "learning_rate": 9.999714493028829e-06, + "loss": 692.5813, + "step": 395 + }, + { + "epoch": 0.16753040719196194, + "grad_norm": 14.742142677307129, + "learning_rate": 9.999699664084111e-06, + "loss": 733.5026, + "step": 396 + }, + { + "epoch": 0.16795346377578002, + "grad_norm": 15.687649726867676, + "learning_rate": 9.99968445973829e-06, + "loss": 755.0579, + "step": 397 + }, + { + "epoch": 0.1683765203595981, + "grad_norm": 12.860382080078125, + "learning_rate": 9.999668879992513e-06, + "loss": 773.8835, + "step": 398 + }, + { + "epoch": 0.16879957694341619, + "grad_norm": 15.734672546386719, + "learning_rate": 9.999652924847947e-06, + "loss": 713.6204, + "step": 399 + }, + { + "epoch": 0.16922263352723427, + "grad_norm": 13.652953147888184, + "learning_rate": 9.999636594305789e-06, + "loss": 773.9716, + "step": 400 + }, + { + "epoch": 0.16964569011105235, + "grad_norm": 15.122949600219727, + "learning_rate": 9.999619888367268e-06, + "loss": 774.1875, + "step": 401 + }, + { + "epoch": 0.17006874669487043, + "grad_norm": 11.619685173034668, + "learning_rate": 9.999602807033634e-06, + "loss": 832.9962, + "step": 402 + }, + { + "epoch": 0.17049180327868851, + "grad_norm": 16.464195251464844, + "learning_rate": 9.999585350306173e-06, + "loss": 712.769, + "step": 403 + }, + { + "epoch": 0.17091485986250662, + "grad_norm": 13.01963996887207, + "learning_rate": 9.999567518186197e-06, + "loss": 795.5116, + "step": 404 + }, + { + "epoch": 0.1713379164463247, + "grad_norm": 16.92215919494629, + "learning_rate": 9.999549310675041e-06, + "loss": 713.1183, + "step": 405 + }, + { + "epoch": 0.1717609730301428, + "grad_norm": 15.008792877197266, + "learning_rate": 9.999530727774075e-06, + "loss": 733.1235, + "step": 406 + }, + { + "epoch": 0.17218402961396087, + "grad_norm": 12.612340927124023, + "learning_rate": 9.99951176948469e-06, + "loss": 796.8601, + "step": 407 + }, + { + "epoch": 0.17260708619777895, + "grad_norm": 12.101532936096191, + "learning_rate": 9.999492435808315e-06, + "loss": 812.4819, + "step": 408 + }, + { + "epoch": 0.17303014278159703, + "grad_norm": 19.601043701171875, + "learning_rate": 9.999472726746401e-06, + "loss": 670.74, + "step": 409 + }, + { + "epoch": 0.17345319936541512, + "grad_norm": 14.761680603027344, + "learning_rate": 9.999452642300425e-06, + "loss": 753.7325, + "step": 410 + }, + { + "epoch": 0.1738762559492332, + "grad_norm": 14.371439933776855, + "learning_rate": 9.999432182471895e-06, + "loss": 774.1177, + "step": 411 + }, + { + "epoch": 0.1742993125330513, + "grad_norm": 13.613729476928711, + "learning_rate": 9.999411347262349e-06, + "loss": 793.9263, + "step": 412 + }, + { + "epoch": 0.1747223691168694, + "grad_norm": 14.80978012084961, + "learning_rate": 9.99939013667335e-06, + "loss": 733.7786, + "step": 413 + }, + { + "epoch": 0.17514542570068747, + "grad_norm": 15.264474868774414, + "learning_rate": 9.999368550706494e-06, + "loss": 711.6267, + "step": 414 + }, + { + "epoch": 0.17556848228450556, + "grad_norm": 13.602452278137207, + "learning_rate": 9.999346589363397e-06, + "loss": 791.01, + "step": 415 + }, + { + "epoch": 0.17599153886832364, + "grad_norm": 14.27392578125, + "learning_rate": 9.999324252645713e-06, + "loss": 773.1117, + "step": 416 + }, + { + "epoch": 0.17641459545214172, + "grad_norm": 12.81863021850586, + "learning_rate": 9.999301540555113e-06, + "loss": 793.8004, + "step": 417 + }, + { + "epoch": 0.1768376520359598, + "grad_norm": 14.69222354888916, + "learning_rate": 9.999278453093306e-06, + "loss": 731.239, + "step": 418 + }, + { + "epoch": 0.17726070861977788, + "grad_norm": 16.393163681030273, + "learning_rate": 9.999254990262026e-06, + "loss": 710.8708, + "step": 419 + }, + { + "epoch": 0.177683765203596, + "grad_norm": 13.967550277709961, + "learning_rate": 9.999231152063035e-06, + "loss": 753.8186, + "step": 420 + }, + { + "epoch": 0.17810682178741408, + "grad_norm": 14.1089448928833, + "learning_rate": 9.99920693849812e-06, + "loss": 749.7565, + "step": 421 + }, + { + "epoch": 0.17852987837123216, + "grad_norm": 16.998958587646484, + "learning_rate": 9.999182349569101e-06, + "loss": 727.8883, + "step": 422 + }, + { + "epoch": 0.17895293495505024, + "grad_norm": 12.627494812011719, + "learning_rate": 9.999157385277827e-06, + "loss": 794.7938, + "step": 423 + }, + { + "epoch": 0.17937599153886832, + "grad_norm": 12.859837532043457, + "learning_rate": 9.999132045626165e-06, + "loss": 772.3897, + "step": 424 + }, + { + "epoch": 0.1797990481226864, + "grad_norm": 16.37617301940918, + "learning_rate": 9.999106330616025e-06, + "loss": 733.641, + "step": 425 + }, + { + "epoch": 0.1802221047065045, + "grad_norm": 16.082563400268555, + "learning_rate": 9.999080240249334e-06, + "loss": 714.193, + "step": 426 + }, + { + "epoch": 0.18064516129032257, + "grad_norm": 13.796841621398926, + "learning_rate": 9.999053774528054e-06, + "loss": 774.2094, + "step": 427 + }, + { + "epoch": 0.18106821787414068, + "grad_norm": 13.699914932250977, + "learning_rate": 9.999026933454169e-06, + "loss": 753.9722, + "step": 428 + }, + { + "epoch": 0.18149127445795876, + "grad_norm": 11.605844497680664, + "learning_rate": 9.998999717029694e-06, + "loss": 809.7978, + "step": 429 + }, + { + "epoch": 0.18191433104177684, + "grad_norm": 13.584386825561523, + "learning_rate": 9.998972125256675e-06, + "loss": 773.8148, + "step": 430 + }, + { + "epoch": 0.18233738762559493, + "grad_norm": 11.695263862609863, + "learning_rate": 9.99894415813718e-06, + "loss": 814.0646, + "step": 431 + }, + { + "epoch": 0.182760444209413, + "grad_norm": 20.750173568725586, + "learning_rate": 9.998915815673315e-06, + "loss": 773.4464, + "step": 432 + }, + { + "epoch": 0.1831835007932311, + "grad_norm": 16.518861770629883, + "learning_rate": 9.998887097867204e-06, + "loss": 733.1015, + "step": 433 + }, + { + "epoch": 0.18360655737704917, + "grad_norm": 12.798626899719238, + "learning_rate": 9.998858004721004e-06, + "loss": 792.5321, + "step": 434 + }, + { + "epoch": 0.18402961396086726, + "grad_norm": 14.560208320617676, + "learning_rate": 9.998828536236899e-06, + "loss": 730.3467, + "step": 435 + }, + { + "epoch": 0.18445267054468537, + "grad_norm": 12.710073471069336, + "learning_rate": 9.998798692417103e-06, + "loss": 772.3323, + "step": 436 + }, + { + "epoch": 0.18487572712850345, + "grad_norm": 13.742936134338379, + "learning_rate": 9.998768473263856e-06, + "loss": 795.3016, + "step": 437 + }, + { + "epoch": 0.18529878371232153, + "grad_norm": 14.985366821289062, + "learning_rate": 9.998737878779425e-06, + "loss": 736.309, + "step": 438 + }, + { + "epoch": 0.1857218402961396, + "grad_norm": 15.177434921264648, + "learning_rate": 9.998706908966109e-06, + "loss": 707.1273, + "step": 439 + }, + { + "epoch": 0.1861448968799577, + "grad_norm": 12.304004669189453, + "learning_rate": 9.998675563826236e-06, + "loss": 837.0641, + "step": 440 + }, + { + "epoch": 0.18656795346377578, + "grad_norm": 21.088029861450195, + "learning_rate": 9.998643843362154e-06, + "loss": 711.1479, + "step": 441 + }, + { + "epoch": 0.18699101004759386, + "grad_norm": 14.690875053405762, + "learning_rate": 9.99861174757625e-06, + "loss": 734.1656, + "step": 442 + }, + { + "epoch": 0.18741406663141194, + "grad_norm": 14.47488784790039, + "learning_rate": 9.998579276470931e-06, + "loss": 728.27, + "step": 443 + }, + { + "epoch": 0.18783712321523005, + "grad_norm": 13.837357521057129, + "learning_rate": 9.998546430048634e-06, + "loss": 792.6006, + "step": 444 + }, + { + "epoch": 0.18826017979904813, + "grad_norm": 10.948613166809082, + "learning_rate": 9.998513208311829e-06, + "loss": 835.2747, + "step": 445 + }, + { + "epoch": 0.18868323638286622, + "grad_norm": 13.532142639160156, + "learning_rate": 9.998479611263007e-06, + "loss": 773.8485, + "step": 446 + }, + { + "epoch": 0.1891062929666843, + "grad_norm": 16.726871490478516, + "learning_rate": 9.998445638904694e-06, + "loss": 733.7336, + "step": 447 + }, + { + "epoch": 0.18952934955050238, + "grad_norm": 13.909221649169922, + "learning_rate": 9.998411291239439e-06, + "loss": 772.0204, + "step": 448 + }, + { + "epoch": 0.18995240613432046, + "grad_norm": 13.340675354003906, + "learning_rate": 9.998376568269819e-06, + "loss": 796.2129, + "step": 449 + }, + { + "epoch": 0.19037546271813854, + "grad_norm": 20.17201042175293, + "learning_rate": 9.998341469998443e-06, + "loss": 649.0154, + "step": 450 + }, + { + "epoch": 0.19079851930195663, + "grad_norm": 12.232992172241211, + "learning_rate": 9.998305996427945e-06, + "loss": 815.289, + "step": 451 + }, + { + "epoch": 0.19122157588577474, + "grad_norm": 14.767935752868652, + "learning_rate": 9.998270147560991e-06, + "loss": 776.1144, + "step": 452 + }, + { + "epoch": 0.19164463246959282, + "grad_norm": 12.352226257324219, + "learning_rate": 9.998233923400273e-06, + "loss": 813.2661, + "step": 453 + }, + { + "epoch": 0.1920676890534109, + "grad_norm": 16.502901077270508, + "learning_rate": 9.998197323948508e-06, + "loss": 711.3968, + "step": 454 + }, + { + "epoch": 0.19249074563722898, + "grad_norm": 17.42971420288086, + "learning_rate": 9.998160349208446e-06, + "loss": 713.4203, + "step": 455 + }, + { + "epoch": 0.19291380222104706, + "grad_norm": 14.089303970336914, + "learning_rate": 9.998122999182862e-06, + "loss": 754.8633, + "step": 456 + }, + { + "epoch": 0.19333685880486515, + "grad_norm": 13.425054550170898, + "learning_rate": 9.998085273874562e-06, + "loss": 792.4691, + "step": 457 + }, + { + "epoch": 0.19375991538868323, + "grad_norm": 15.47014331817627, + "learning_rate": 9.998047173286378e-06, + "loss": 732.3478, + "step": 458 + }, + { + "epoch": 0.1941829719725013, + "grad_norm": 16.045291900634766, + "learning_rate": 9.99800869742117e-06, + "loss": 748.8778, + "step": 459 + }, + { + "epoch": 0.19460602855631942, + "grad_norm": 13.96675968170166, + "learning_rate": 9.997969846281827e-06, + "loss": 775.6492, + "step": 460 + }, + { + "epoch": 0.1950290851401375, + "grad_norm": 13.909330368041992, + "learning_rate": 9.997930619871267e-06, + "loss": 749.5533, + "step": 461 + }, + { + "epoch": 0.19545214172395559, + "grad_norm": 14.924839973449707, + "learning_rate": 9.997891018192433e-06, + "loss": 773.8635, + "step": 462 + }, + { + "epoch": 0.19587519830777367, + "grad_norm": 13.335284233093262, + "learning_rate": 9.997851041248303e-06, + "loss": 775.519, + "step": 463 + }, + { + "epoch": 0.19629825489159175, + "grad_norm": 18.169382095336914, + "learning_rate": 9.997810689041875e-06, + "loss": 669.1969, + "step": 464 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 16.133991241455078, + "learning_rate": 9.99776996157618e-06, + "loss": 688.9814, + "step": 465 + }, + { + "epoch": 0.19714436805922791, + "grad_norm": 15.445608139038086, + "learning_rate": 9.997728858854277e-06, + "loss": 729.0157, + "step": 466 + }, + { + "epoch": 0.197567424643046, + "grad_norm": 15.433357238769531, + "learning_rate": 9.997687380879248e-06, + "loss": 729.525, + "step": 467 + }, + { + "epoch": 0.1979904812268641, + "grad_norm": 14.787826538085938, + "learning_rate": 9.997645527654215e-06, + "loss": 731.5955, + "step": 468 + }, + { + "epoch": 0.1984135378106822, + "grad_norm": 16.245075225830078, + "learning_rate": 9.997603299182312e-06, + "loss": 733.3505, + "step": 469 + }, + { + "epoch": 0.19883659439450027, + "grad_norm": 16.868227005004883, + "learning_rate": 9.997560695466714e-06, + "loss": 689.8536, + "step": 470 + }, + { + "epoch": 0.19925965097831835, + "grad_norm": 14.078987121582031, + "learning_rate": 9.99751771651062e-06, + "loss": 730.205, + "step": 471 + }, + { + "epoch": 0.19968270756213644, + "grad_norm": 13.627495765686035, + "learning_rate": 9.997474362317258e-06, + "loss": 795.2089, + "step": 472 + }, + { + "epoch": 0.20010576414595452, + "grad_norm": 10.83949089050293, + "learning_rate": 9.997430632889881e-06, + "loss": 834.3681, + "step": 473 + }, + { + "epoch": 0.2005288207297726, + "grad_norm": 13.961080551147461, + "learning_rate": 9.997386528231772e-06, + "loss": 770.5109, + "step": 474 + }, + { + "epoch": 0.20095187731359068, + "grad_norm": 13.958606719970703, + "learning_rate": 9.997342048346245e-06, + "loss": 751.8972, + "step": 475 + }, + { + "epoch": 0.2013749338974088, + "grad_norm": 14.925124168395996, + "learning_rate": 9.997297193236638e-06, + "loss": 732.5073, + "step": 476 + }, + { + "epoch": 0.20179799048122687, + "grad_norm": 12.921204566955566, + "learning_rate": 9.99725196290632e-06, + "loss": 790.8522, + "step": 477 + }, + { + "epoch": 0.20222104706504496, + "grad_norm": 11.71634578704834, + "learning_rate": 9.997206357358689e-06, + "loss": 815.1984, + "step": 478 + }, + { + "epoch": 0.20264410364886304, + "grad_norm": 10.824254035949707, + "learning_rate": 9.997160376597164e-06, + "loss": 837.952, + "step": 479 + }, + { + "epoch": 0.20306716023268112, + "grad_norm": 17.767045974731445, + "learning_rate": 9.997114020625201e-06, + "loss": 690.7749, + "step": 480 + }, + { + "epoch": 0.2034902168164992, + "grad_norm": 12.854984283447266, + "learning_rate": 9.99706728944628e-06, + "loss": 791.934, + "step": 481 + }, + { + "epoch": 0.20391327340031729, + "grad_norm": 14.076663970947266, + "learning_rate": 9.99702018306391e-06, + "loss": 748.1455, + "step": 482 + }, + { + "epoch": 0.20433632998413537, + "grad_norm": 15.128349304199219, + "learning_rate": 9.996972701481627e-06, + "loss": 730.4189, + "step": 483 + }, + { + "epoch": 0.20475938656795348, + "grad_norm": 15.626911163330078, + "learning_rate": 9.996924844702998e-06, + "loss": 706.7859, + "step": 484 + }, + { + "epoch": 0.20518244315177156, + "grad_norm": 15.059734344482422, + "learning_rate": 9.996876612731615e-06, + "loss": 735.6243, + "step": 485 + }, + { + "epoch": 0.20560549973558964, + "grad_norm": 12.081915855407715, + "learning_rate": 9.996828005571099e-06, + "loss": 812.7684, + "step": 486 + }, + { + "epoch": 0.20602855631940772, + "grad_norm": 15.49333667755127, + "learning_rate": 9.996779023225101e-06, + "loss": 709.7596, + "step": 487 + }, + { + "epoch": 0.2064516129032258, + "grad_norm": 12.673453330993652, + "learning_rate": 9.996729665697298e-06, + "loss": 790.1187, + "step": 488 + }, + { + "epoch": 0.2068746694870439, + "grad_norm": 12.858985900878906, + "learning_rate": 9.996679932991395e-06, + "loss": 795.0846, + "step": 489 + }, + { + "epoch": 0.20729772607086197, + "grad_norm": 12.948575973510742, + "learning_rate": 9.99662982511113e-06, + "loss": 787.5475, + "step": 490 + }, + { + "epoch": 0.20772078265468005, + "grad_norm": 13.713641166687012, + "learning_rate": 9.99657934206026e-06, + "loss": 752.131, + "step": 491 + }, + { + "epoch": 0.20814383923849816, + "grad_norm": 13.754651069641113, + "learning_rate": 9.99652848384258e-06, + "loss": 751.3858, + "step": 492 + }, + { + "epoch": 0.20856689582231625, + "grad_norm": 17.989845275878906, + "learning_rate": 9.996477250461907e-06, + "loss": 709.8654, + "step": 493 + }, + { + "epoch": 0.20898995240613433, + "grad_norm": 13.00173568725586, + "learning_rate": 9.996425641922088e-06, + "loss": 791.731, + "step": 494 + }, + { + "epoch": 0.2094130089899524, + "grad_norm": 14.950695991516113, + "learning_rate": 9.996373658226995e-06, + "loss": 729.3202, + "step": 495 + }, + { + "epoch": 0.2098360655737705, + "grad_norm": 14.290796279907227, + "learning_rate": 9.996321299380536e-06, + "loss": 752.03, + "step": 496 + }, + { + "epoch": 0.21025912215758857, + "grad_norm": 17.02768898010254, + "learning_rate": 9.99626856538664e-06, + "loss": 688.7895, + "step": 497 + }, + { + "epoch": 0.21068217874140666, + "grad_norm": 14.67369556427002, + "learning_rate": 9.996215456249268e-06, + "loss": 748.0872, + "step": 498 + }, + { + "epoch": 0.21110523532522474, + "grad_norm": 15.39163875579834, + "learning_rate": 9.996161971972405e-06, + "loss": 729.3165, + "step": 499 + }, + { + "epoch": 0.21152829190904285, + "grad_norm": 14.391759872436523, + "learning_rate": 9.996108112560068e-06, + "loss": 753.8647, + "step": 500 + }, + { + "epoch": 0.21195134849286093, + "grad_norm": 16.187179565429688, + "learning_rate": 9.996053878016302e-06, + "loss": 709.9928, + "step": 501 + }, + { + "epoch": 0.212374405076679, + "grad_norm": 14.946179389953613, + "learning_rate": 9.995999268345179e-06, + "loss": 728.0349, + "step": 502 + }, + { + "epoch": 0.2127974616604971, + "grad_norm": 16.942272186279297, + "learning_rate": 9.995944283550799e-06, + "loss": 688.722, + "step": 503 + }, + { + "epoch": 0.21322051824431518, + "grad_norm": 11.699006080627441, + "learning_rate": 9.995888923637288e-06, + "loss": 809.8322, + "step": 504 + }, + { + "epoch": 0.21364357482813326, + "grad_norm": 13.436891555786133, + "learning_rate": 9.995833188608806e-06, + "loss": 771.9644, + "step": 505 + }, + { + "epoch": 0.21406663141195134, + "grad_norm": 15.341656684875488, + "learning_rate": 9.99577707846954e-06, + "loss": 730.9764, + "step": 506 + }, + { + "epoch": 0.21448968799576942, + "grad_norm": 12.649733543395996, + "learning_rate": 9.995720593223697e-06, + "loss": 770.3612, + "step": 507 + }, + { + "epoch": 0.21491274457958753, + "grad_norm": 12.433646202087402, + "learning_rate": 9.995663732875522e-06, + "loss": 811.577, + "step": 508 + }, + { + "epoch": 0.21533580116340562, + "grad_norm": 15.445535659790039, + "learning_rate": 9.995606497429283e-06, + "loss": 750.7621, + "step": 509 + }, + { + "epoch": 0.2157588577472237, + "grad_norm": 13.717183113098145, + "learning_rate": 9.995548886889277e-06, + "loss": 751.2988, + "step": 510 + }, + { + "epoch": 0.21618191433104178, + "grad_norm": 14.1709566116333, + "learning_rate": 9.995490901259833e-06, + "loss": 771.0392, + "step": 511 + }, + { + "epoch": 0.21660497091485986, + "grad_norm": 13.377311706542969, + "learning_rate": 9.9954325405453e-06, + "loss": 793.7984, + "step": 512 + }, + { + "epoch": 0.21702802749867794, + "grad_norm": 13.593575477600098, + "learning_rate": 9.995373804750064e-06, + "loss": 771.6621, + "step": 513 + }, + { + "epoch": 0.21745108408249603, + "grad_norm": 16.287029266357422, + "learning_rate": 9.995314693878535e-06, + "loss": 751.2656, + "step": 514 + }, + { + "epoch": 0.2178741406663141, + "grad_norm": 15.225509643554688, + "learning_rate": 9.995255207935148e-06, + "loss": 729.1929, + "step": 515 + }, + { + "epoch": 0.21829719725013222, + "grad_norm": 14.088748931884766, + "learning_rate": 9.995195346924372e-06, + "loss": 770.9734, + "step": 516 + }, + { + "epoch": 0.2187202538339503, + "grad_norm": 13.854455947875977, + "learning_rate": 9.9951351108507e-06, + "loss": 792.4874, + "step": 517 + }, + { + "epoch": 0.21914331041776838, + "grad_norm": 11.280946731567383, + "learning_rate": 9.995074499718658e-06, + "loss": 809.2042, + "step": 518 + }, + { + "epoch": 0.21956636700158647, + "grad_norm": 16.207279205322266, + "learning_rate": 9.995013513532795e-06, + "loss": 774.3907, + "step": 519 + }, + { + "epoch": 0.21998942358540455, + "grad_norm": 12.287997245788574, + "learning_rate": 9.994952152297688e-06, + "loss": 812.1009, + "step": 520 + }, + { + "epoch": 0.22041248016922263, + "grad_norm": 11.712491035461426, + "learning_rate": 9.994890416017946e-06, + "loss": 836.1512, + "step": 521 + }, + { + "epoch": 0.2208355367530407, + "grad_norm": 13.708894729614258, + "learning_rate": 9.994828304698206e-06, + "loss": 792.5018, + "step": 522 + }, + { + "epoch": 0.2212585933368588, + "grad_norm": 13.083122253417969, + "learning_rate": 9.99476581834313e-06, + "loss": 791.6917, + "step": 523 + }, + { + "epoch": 0.2216816499206769, + "grad_norm": 14.049800872802734, + "learning_rate": 9.994702956957412e-06, + "loss": 749.1741, + "step": 524 + }, + { + "epoch": 0.222104706504495, + "grad_norm": 14.696182250976562, + "learning_rate": 9.994639720545767e-06, + "loss": 730.9039, + "step": 525 + }, + { + "epoch": 0.22252776308831307, + "grad_norm": 13.706002235412598, + "learning_rate": 9.994576109112948e-06, + "loss": 773.9236, + "step": 526 + }, + { + "epoch": 0.22295081967213115, + "grad_norm": 15.339780807495117, + "learning_rate": 9.994512122663729e-06, + "loss": 729.2384, + "step": 527 + }, + { + "epoch": 0.22337387625594923, + "grad_norm": 14.556045532226562, + "learning_rate": 9.994447761202915e-06, + "loss": 749.0478, + "step": 528 + }, + { + "epoch": 0.22379693283976732, + "grad_norm": 13.034219741821289, + "learning_rate": 9.99438302473534e-06, + "loss": 789.9837, + "step": 529 + }, + { + "epoch": 0.2242199894235854, + "grad_norm": 14.384037017822266, + "learning_rate": 9.99431791326586e-06, + "loss": 753.3604, + "step": 530 + }, + { + "epoch": 0.22464304600740348, + "grad_norm": 15.585625648498535, + "learning_rate": 9.994252426799367e-06, + "loss": 712.1857, + "step": 531 + }, + { + "epoch": 0.2250661025912216, + "grad_norm": 17.219160079956055, + "learning_rate": 9.99418656534078e-06, + "loss": 667.2874, + "step": 532 + }, + { + "epoch": 0.22548915917503967, + "grad_norm": 14.34976577758789, + "learning_rate": 9.994120328895041e-06, + "loss": 750.2067, + "step": 533 + }, + { + "epoch": 0.22591221575885775, + "grad_norm": 14.736771583557129, + "learning_rate": 9.994053717467126e-06, + "loss": 727.4701, + "step": 534 + }, + { + "epoch": 0.22633527234267584, + "grad_norm": 12.407829284667969, + "learning_rate": 9.993986731062034e-06, + "loss": 796.4521, + "step": 535 + }, + { + "epoch": 0.22675832892649392, + "grad_norm": 14.592554092407227, + "learning_rate": 9.993919369684795e-06, + "loss": 730.9424, + "step": 536 + }, + { + "epoch": 0.227181385510312, + "grad_norm": 16.63705062866211, + "learning_rate": 9.993851633340469e-06, + "loss": 688.5394, + "step": 537 + }, + { + "epoch": 0.22760444209413008, + "grad_norm": 11.82974624633789, + "learning_rate": 9.99378352203414e-06, + "loss": 811.3176, + "step": 538 + }, + { + "epoch": 0.22802749867794816, + "grad_norm": 13.167518615722656, + "learning_rate": 9.993715035770923e-06, + "loss": 768.8174, + "step": 539 + }, + { + "epoch": 0.22845055526176627, + "grad_norm": 12.229063034057617, + "learning_rate": 9.993646174555958e-06, + "loss": 786.4929, + "step": 540 + }, + { + "epoch": 0.22887361184558436, + "grad_norm": 16.408519744873047, + "learning_rate": 9.993576938394419e-06, + "loss": 691.8701, + "step": 541 + }, + { + "epoch": 0.22929666842940244, + "grad_norm": 11.608378410339355, + "learning_rate": 9.993507327291502e-06, + "loss": 833.9163, + "step": 542 + }, + { + "epoch": 0.22971972501322052, + "grad_norm": 16.58883285522461, + "learning_rate": 9.993437341252435e-06, + "loss": 707.496, + "step": 543 + }, + { + "epoch": 0.2301427815970386, + "grad_norm": 13.47687816619873, + "learning_rate": 9.993366980282474e-06, + "loss": 769.6858, + "step": 544 + }, + { + "epoch": 0.23056583818085669, + "grad_norm": 14.78942584991455, + "learning_rate": 9.993296244386898e-06, + "loss": 729.0137, + "step": 545 + }, + { + "epoch": 0.23098889476467477, + "grad_norm": 14.9539213180542, + "learning_rate": 9.993225133571022e-06, + "loss": 728.0913, + "step": 546 + }, + { + "epoch": 0.23141195134849285, + "grad_norm": 18.348339080810547, + "learning_rate": 9.993153647840184e-06, + "loss": 626.0314, + "step": 547 + }, + { + "epoch": 0.23183500793231096, + "grad_norm": 12.608426094055176, + "learning_rate": 9.993081787199752e-06, + "loss": 792.7026, + "step": 548 + }, + { + "epoch": 0.23225806451612904, + "grad_norm": 15.126500129699707, + "learning_rate": 9.99300955165512e-06, + "loss": 709.1066, + "step": 549 + }, + { + "epoch": 0.23268112109994712, + "grad_norm": 14.170491218566895, + "learning_rate": 9.992936941211712e-06, + "loss": 772.4585, + "step": 550 + }, + { + "epoch": 0.2331041776837652, + "grad_norm": 16.050321578979492, + "learning_rate": 9.992863955874983e-06, + "loss": 687.3251, + "step": 551 + }, + { + "epoch": 0.2335272342675833, + "grad_norm": 12.890348434448242, + "learning_rate": 9.99279059565041e-06, + "loss": 810.8807, + "step": 552 + }, + { + "epoch": 0.23395029085140137, + "grad_norm": 14.457061767578125, + "learning_rate": 9.992716860543503e-06, + "loss": 769.5216, + "step": 553 + }, + { + "epoch": 0.23437334743521945, + "grad_norm": 15.015445709228516, + "learning_rate": 9.992642750559797e-06, + "loss": 725.1898, + "step": 554 + }, + { + "epoch": 0.23479640401903754, + "grad_norm": 16.252107620239258, + "learning_rate": 9.992568265704856e-06, + "loss": 709.1776, + "step": 555 + }, + { + "epoch": 0.23521946060285565, + "grad_norm": 16.278573989868164, + "learning_rate": 9.992493405984276e-06, + "loss": 707.6428, + "step": 556 + }, + { + "epoch": 0.23564251718667373, + "grad_norm": 15.46164321899414, + "learning_rate": 9.992418171403675e-06, + "loss": 748.1681, + "step": 557 + }, + { + "epoch": 0.2360655737704918, + "grad_norm": 14.662430763244629, + "learning_rate": 9.992342561968702e-06, + "loss": 729.328, + "step": 558 + }, + { + "epoch": 0.2364886303543099, + "grad_norm": 17.697969436645508, + "learning_rate": 9.992266577685035e-06, + "loss": 685.244, + "step": 559 + }, + { + "epoch": 0.23691168693812797, + "grad_norm": 17.050352096557617, + "learning_rate": 9.992190218558378e-06, + "loss": 707.8534, + "step": 560 + }, + { + "epoch": 0.23733474352194606, + "grad_norm": 15.034307479858398, + "learning_rate": 9.992113484594465e-06, + "loss": 726.4353, + "step": 561 + }, + { + "epoch": 0.23775780010576414, + "grad_norm": 16.008089065551758, + "learning_rate": 9.992036375799061e-06, + "loss": 730.9178, + "step": 562 + }, + { + "epoch": 0.23818085668958222, + "grad_norm": 12.4567289352417, + "learning_rate": 9.99195889217795e-06, + "loss": 794.0131, + "step": 563 + }, + { + "epoch": 0.23860391327340033, + "grad_norm": 12.710521697998047, + "learning_rate": 9.991881033736953e-06, + "loss": 792.0283, + "step": 564 + }, + { + "epoch": 0.2390269698572184, + "grad_norm": 14.488910675048828, + "learning_rate": 9.991802800481916e-06, + "loss": 748.8219, + "step": 565 + }, + { + "epoch": 0.2394500264410365, + "grad_norm": 14.382262229919434, + "learning_rate": 9.991724192418711e-06, + "loss": 749.2411, + "step": 566 + }, + { + "epoch": 0.23987308302485458, + "grad_norm": 14.753145217895508, + "learning_rate": 9.991645209553243e-06, + "loss": 769.9041, + "step": 567 + }, + { + "epoch": 0.24029613960867266, + "grad_norm": 13.785974502563477, + "learning_rate": 9.99156585189144e-06, + "loss": 772.7175, + "step": 568 + }, + { + "epoch": 0.24071919619249074, + "grad_norm": 12.003847122192383, + "learning_rate": 9.991486119439265e-06, + "loss": 810.1777, + "step": 569 + }, + { + "epoch": 0.24114225277630882, + "grad_norm": 15.738054275512695, + "learning_rate": 9.991406012202697e-06, + "loss": 711.4714, + "step": 570 + }, + { + "epoch": 0.2415653093601269, + "grad_norm": 15.65022087097168, + "learning_rate": 9.991325530187758e-06, + "loss": 752.2881, + "step": 571 + }, + { + "epoch": 0.24198836594394502, + "grad_norm": 17.29945945739746, + "learning_rate": 9.991244673400489e-06, + "loss": 708.3313, + "step": 572 + }, + { + "epoch": 0.2424114225277631, + "grad_norm": 12.896024703979492, + "learning_rate": 9.991163441846958e-06, + "loss": 767.7472, + "step": 573 + }, + { + "epoch": 0.24283447911158118, + "grad_norm": 14.034971237182617, + "learning_rate": 9.991081835533268e-06, + "loss": 746.7238, + "step": 574 + }, + { + "epoch": 0.24325753569539926, + "grad_norm": 14.408370018005371, + "learning_rate": 9.990999854465545e-06, + "loss": 770.64, + "step": 575 + }, + { + "epoch": 0.24368059227921735, + "grad_norm": 15.3308687210083, + "learning_rate": 9.990917498649944e-06, + "loss": 729.8007, + "step": 576 + }, + { + "epoch": 0.24410364886303543, + "grad_norm": 13.564072608947754, + "learning_rate": 9.99083476809265e-06, + "loss": 771.3879, + "step": 577 + }, + { + "epoch": 0.2445267054468535, + "grad_norm": 13.404775619506836, + "learning_rate": 9.990751662799873e-06, + "loss": 772.5026, + "step": 578 + }, + { + "epoch": 0.2449497620306716, + "grad_norm": 12.079168319702148, + "learning_rate": 9.990668182777855e-06, + "loss": 809.0527, + "step": 579 + }, + { + "epoch": 0.2453728186144897, + "grad_norm": 13.129827499389648, + "learning_rate": 9.990584328032862e-06, + "loss": 771.359, + "step": 580 + }, + { + "epoch": 0.24579587519830778, + "grad_norm": 17.244400024414062, + "learning_rate": 9.990500098571191e-06, + "loss": 687.9229, + "step": 581 + }, + { + "epoch": 0.24621893178212587, + "grad_norm": 13.733057975769043, + "learning_rate": 9.990415494399167e-06, + "loss": 768.8884, + "step": 582 + }, + { + "epoch": 0.24664198836594395, + "grad_norm": 13.617788314819336, + "learning_rate": 9.990330515523143e-06, + "loss": 775.1088, + "step": 583 + }, + { + "epoch": 0.24706504494976203, + "grad_norm": 16.17061996459961, + "learning_rate": 9.990245161949497e-06, + "loss": 709.2199, + "step": 584 + }, + { + "epoch": 0.2474881015335801, + "grad_norm": 15.41438102722168, + "learning_rate": 9.99015943368464e-06, + "loss": 730.3363, + "step": 585 + }, + { + "epoch": 0.2479111581173982, + "grad_norm": 14.853214263916016, + "learning_rate": 9.990073330735008e-06, + "loss": 725.8908, + "step": 586 + }, + { + "epoch": 0.24833421470121628, + "grad_norm": 13.830111503601074, + "learning_rate": 9.989986853107067e-06, + "loss": 769.7603, + "step": 587 + }, + { + "epoch": 0.2487572712850344, + "grad_norm": 12.029333114624023, + "learning_rate": 9.98990000080731e-06, + "loss": 810.1149, + "step": 588 + }, + { + "epoch": 0.24918032786885247, + "grad_norm": 15.225948333740234, + "learning_rate": 9.989812773842257e-06, + "loss": 747.5815, + "step": 589 + }, + { + "epoch": 0.24960338445267055, + "grad_norm": 14.332786560058594, + "learning_rate": 9.989725172218458e-06, + "loss": 749.8716, + "step": 590 + }, + { + "epoch": 0.25002644103648863, + "grad_norm": 12.483567237854004, + "learning_rate": 9.989637195942491e-06, + "loss": 791.1083, + "step": 591 + }, + { + "epoch": 0.2504494976203067, + "grad_norm": 15.3134183883667, + "learning_rate": 9.98954884502096e-06, + "loss": 768.5284, + "step": 592 + }, + { + "epoch": 0.2508725542041248, + "grad_norm": 16.63307762145996, + "learning_rate": 9.989460119460503e-06, + "loss": 706.2826, + "step": 593 + }, + { + "epoch": 0.2512956107879429, + "grad_norm": 13.873503684997559, + "learning_rate": 9.989371019267777e-06, + "loss": 772.9481, + "step": 594 + }, + { + "epoch": 0.25171866737176096, + "grad_norm": 14.440890312194824, + "learning_rate": 9.989281544449474e-06, + "loss": 751.4976, + "step": 595 + }, + { + "epoch": 0.25214172395557904, + "grad_norm": 14.63353157043457, + "learning_rate": 9.989191695012313e-06, + "loss": 752.3766, + "step": 596 + }, + { + "epoch": 0.2525647805393971, + "grad_norm": 14.112288475036621, + "learning_rate": 9.989101470963041e-06, + "loss": 749.3422, + "step": 597 + }, + { + "epoch": 0.2529878371232152, + "grad_norm": 12.606511116027832, + "learning_rate": 9.989010872308428e-06, + "loss": 815.8671, + "step": 598 + }, + { + "epoch": 0.2534108937070333, + "grad_norm": 13.304884910583496, + "learning_rate": 9.988919899055282e-06, + "loss": 789.1403, + "step": 599 + }, + { + "epoch": 0.25383395029085143, + "grad_norm": 15.555290222167969, + "learning_rate": 9.988828551210433e-06, + "loss": 728.9872, + "step": 600 + }, + { + "epoch": 0.2542570068746695, + "grad_norm": 17.186012268066406, + "learning_rate": 9.988736828780736e-06, + "loss": 688.9184, + "step": 601 + }, + { + "epoch": 0.2546800634584876, + "grad_norm": 14.821638107299805, + "learning_rate": 9.98864473177308e-06, + "loss": 746.8917, + "step": 602 + }, + { + "epoch": 0.2551031200423057, + "grad_norm": 14.28255844116211, + "learning_rate": 9.98855226019438e-06, + "loss": 752.9296, + "step": 603 + }, + { + "epoch": 0.25552617662612376, + "grad_norm": 17.622272491455078, + "learning_rate": 9.98845941405158e-06, + "loss": 683.5189, + "step": 604 + }, + { + "epoch": 0.25594923320994184, + "grad_norm": 14.68231201171875, + "learning_rate": 9.988366193351651e-06, + "loss": 750.4978, + "step": 605 + }, + { + "epoch": 0.2563722897937599, + "grad_norm": 16.95355224609375, + "learning_rate": 9.988272598101593e-06, + "loss": 704.9797, + "step": 606 + }, + { + "epoch": 0.256795346377578, + "grad_norm": 17.456499099731445, + "learning_rate": 9.988178628308432e-06, + "loss": 708.2197, + "step": 607 + }, + { + "epoch": 0.2572184029613961, + "grad_norm": 15.862042427062988, + "learning_rate": 9.988084283979225e-06, + "loss": 708.2691, + "step": 608 + }, + { + "epoch": 0.25764145954521417, + "grad_norm": 15.338232040405273, + "learning_rate": 9.987989565121055e-06, + "loss": 731.2156, + "step": 609 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 14.211383819580078, + "learning_rate": 9.987894471741032e-06, + "loss": 768.1458, + "step": 610 + }, + { + "epoch": 0.25848757271285033, + "grad_norm": 12.98444938659668, + "learning_rate": 9.9877990038463e-06, + "loss": 766.7051, + "step": 611 + }, + { + "epoch": 0.2589106292966684, + "grad_norm": 11.82456111907959, + "learning_rate": 9.987703161444028e-06, + "loss": 789.0675, + "step": 612 + }, + { + "epoch": 0.2593336858804865, + "grad_norm": 11.237346649169922, + "learning_rate": 9.987606944541409e-06, + "loss": 833.3247, + "step": 613 + }, + { + "epoch": 0.2597567424643046, + "grad_norm": 19.746479034423828, + "learning_rate": 9.987510353145667e-06, + "loss": 645.8649, + "step": 614 + }, + { + "epoch": 0.26017979904812266, + "grad_norm": 12.854883193969727, + "learning_rate": 9.987413387264056e-06, + "loss": 789.0947, + "step": 615 + }, + { + "epoch": 0.2606028556319408, + "grad_norm": 12.521780014038086, + "learning_rate": 9.987316046903855e-06, + "loss": 833.2208, + "step": 616 + }, + { + "epoch": 0.2610259122157589, + "grad_norm": 17.422582626342773, + "learning_rate": 9.987218332072376e-06, + "loss": 727.9456, + "step": 617 + }, + { + "epoch": 0.26144896879957696, + "grad_norm": 14.974026679992676, + "learning_rate": 9.987120242776954e-06, + "loss": 748.3118, + "step": 618 + }, + { + "epoch": 0.26187202538339505, + "grad_norm": 15.921882629394531, + "learning_rate": 9.987021779024953e-06, + "loss": 788.6419, + "step": 619 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 18.31747055053711, + "learning_rate": 9.986922940823768e-06, + "loss": 704.2823, + "step": 620 + }, + { + "epoch": 0.2627181385510312, + "grad_norm": 18.767438888549805, + "learning_rate": 9.98682372818082e-06, + "loss": 666.2666, + "step": 621 + }, + { + "epoch": 0.2631411951348493, + "grad_norm": 17.320356369018555, + "learning_rate": 9.986724141103557e-06, + "loss": 728.8633, + "step": 622 + }, + { + "epoch": 0.2635642517186674, + "grad_norm": 13.273025512695312, + "learning_rate": 9.986624179599458e-06, + "loss": 812.9435, + "step": 623 + }, + { + "epoch": 0.26398730830248546, + "grad_norm": 14.60000991821289, + "learning_rate": 9.986523843676028e-06, + "loss": 750.3506, + "step": 624 + }, + { + "epoch": 0.26441036488630354, + "grad_norm": 17.19547462463379, + "learning_rate": 9.9864231333408e-06, + "loss": 728.8174, + "step": 625 + }, + { + "epoch": 0.2648334214701216, + "grad_norm": 16.140121459960938, + "learning_rate": 9.986322048601337e-06, + "loss": 769.0399, + "step": 626 + }, + { + "epoch": 0.2652564780539397, + "grad_norm": 13.107047080993652, + "learning_rate": 9.986220589465228e-06, + "loss": 768.9719, + "step": 627 + }, + { + "epoch": 0.2656795346377578, + "grad_norm": 15.769641876220703, + "learning_rate": 9.98611875594009e-06, + "loss": 729.5747, + "step": 628 + }, + { + "epoch": 0.26610259122157587, + "grad_norm": 16.121686935424805, + "learning_rate": 9.986016548033572e-06, + "loss": 810.8019, + "step": 629 + }, + { + "epoch": 0.26652564780539395, + "grad_norm": 19.363224029541016, + "learning_rate": 9.985913965753347e-06, + "loss": 644.7514, + "step": 630 + }, + { + "epoch": 0.26694870438921203, + "grad_norm": 14.275716781616211, + "learning_rate": 9.985811009107116e-06, + "loss": 768.4279, + "step": 631 + }, + { + "epoch": 0.26737176097303017, + "grad_norm": 15.12416934967041, + "learning_rate": 9.985707678102611e-06, + "loss": 749.2072, + "step": 632 + }, + { + "epoch": 0.26779481755684825, + "grad_norm": 15.428362846374512, + "learning_rate": 9.985603972747591e-06, + "loss": 748.6761, + "step": 633 + }, + { + "epoch": 0.26821787414066633, + "grad_norm": 13.372490882873535, + "learning_rate": 9.985499893049842e-06, + "loss": 771.5308, + "step": 634 + }, + { + "epoch": 0.2686409307244844, + "grad_norm": 14.641702651977539, + "learning_rate": 9.985395439017177e-06, + "loss": 768.8315, + "step": 635 + }, + { + "epoch": 0.2690639873083025, + "grad_norm": 11.935099601745605, + "learning_rate": 9.985290610657441e-06, + "loss": 811.3998, + "step": 636 + }, + { + "epoch": 0.2694870438921206, + "grad_norm": 11.329277992248535, + "learning_rate": 9.985185407978504e-06, + "loss": 832.1404, + "step": 637 + }, + { + "epoch": 0.26991010047593866, + "grad_norm": 14.82882308959961, + "learning_rate": 9.985079830988267e-06, + "loss": 746.3147, + "step": 638 + }, + { + "epoch": 0.27033315705975675, + "grad_norm": 13.474793434143066, + "learning_rate": 9.984973879694656e-06, + "loss": 771.2763, + "step": 639 + }, + { + "epoch": 0.27075621364357483, + "grad_norm": 13.33484935760498, + "learning_rate": 9.984867554105626e-06, + "loss": 768.8647, + "step": 640 + }, + { + "epoch": 0.2711792702273929, + "grad_norm": 12.85734748840332, + "learning_rate": 9.984760854229162e-06, + "loss": 793.0065, + "step": 641 + }, + { + "epoch": 0.271602326811211, + "grad_norm": 16.762540817260742, + "learning_rate": 9.98465378007327e-06, + "loss": 687.4204, + "step": 642 + }, + { + "epoch": 0.2720253833950291, + "grad_norm": 16.515939712524414, + "learning_rate": 9.984546331645996e-06, + "loss": 684.6247, + "step": 643 + }, + { + "epoch": 0.27244843997884716, + "grad_norm": 14.211777687072754, + "learning_rate": 9.984438508955406e-06, + "loss": 746.4603, + "step": 644 + }, + { + "epoch": 0.27287149656266524, + "grad_norm": 13.69175910949707, + "learning_rate": 9.984330312009596e-06, + "loss": 766.9557, + "step": 645 + }, + { + "epoch": 0.2732945531464833, + "grad_norm": 16.35338020324707, + "learning_rate": 9.984221740816689e-06, + "loss": 708.2935, + "step": 646 + }, + { + "epoch": 0.2737176097303014, + "grad_norm": 16.178255081176758, + "learning_rate": 9.984112795384836e-06, + "loss": 686.0378, + "step": 647 + }, + { + "epoch": 0.27414066631411954, + "grad_norm": 13.948187828063965, + "learning_rate": 9.984003475722219e-06, + "loss": 793.3854, + "step": 648 + }, + { + "epoch": 0.2745637228979376, + "grad_norm": 15.159440994262695, + "learning_rate": 9.983893781837047e-06, + "loss": 749.8276, + "step": 649 + }, + { + "epoch": 0.2749867794817557, + "grad_norm": 14.716519355773926, + "learning_rate": 9.983783713737554e-06, + "loss": 747.2683, + "step": 650 + }, + { + "epoch": 0.2754098360655738, + "grad_norm": 15.593849182128906, + "learning_rate": 9.983673271432006e-06, + "loss": 687.7885, + "step": 651 + }, + { + "epoch": 0.27583289264939187, + "grad_norm": 14.301959991455078, + "learning_rate": 9.983562454928695e-06, + "loss": 748.8584, + "step": 652 + }, + { + "epoch": 0.27625594923320995, + "grad_norm": 16.454837799072266, + "learning_rate": 9.98345126423594e-06, + "loss": 727.8329, + "step": 653 + }, + { + "epoch": 0.27667900581702803, + "grad_norm": 13.26797866821289, + "learning_rate": 9.983339699362094e-06, + "loss": 768.2882, + "step": 654 + }, + { + "epoch": 0.2771020624008461, + "grad_norm": 15.876700401306152, + "learning_rate": 9.983227760315529e-06, + "loss": 729.4928, + "step": 655 + }, + { + "epoch": 0.2775251189846642, + "grad_norm": 15.482695579528809, + "learning_rate": 9.983115447104653e-06, + "loss": 726.2488, + "step": 656 + }, + { + "epoch": 0.2779481755684823, + "grad_norm": 12.379313468933105, + "learning_rate": 9.983002759737899e-06, + "loss": 810.8096, + "step": 657 + }, + { + "epoch": 0.27837123215230036, + "grad_norm": 13.29471206665039, + "learning_rate": 9.982889698223726e-06, + "loss": 769.3227, + "step": 658 + }, + { + "epoch": 0.27879428873611845, + "grad_norm": 16.50892448425293, + "learning_rate": 9.982776262570623e-06, + "loss": 726.9524, + "step": 659 + }, + { + "epoch": 0.2792173453199365, + "grad_norm": 15.921353340148926, + "learning_rate": 9.982662452787111e-06, + "loss": 745.3869, + "step": 660 + }, + { + "epoch": 0.2796404019037546, + "grad_norm": 15.960860252380371, + "learning_rate": 9.982548268881733e-06, + "loss": 748.3965, + "step": 661 + }, + { + "epoch": 0.2800634584875727, + "grad_norm": 17.83744239807129, + "learning_rate": 9.982433710863061e-06, + "loss": 663.9916, + "step": 662 + }, + { + "epoch": 0.2804865150713908, + "grad_norm": 17.06768035888672, + "learning_rate": 9.982318778739701e-06, + "loss": 684.3965, + "step": 663 + }, + { + "epoch": 0.2809095716552089, + "grad_norm": 12.358064651489258, + "learning_rate": 9.982203472520278e-06, + "loss": 831.6656, + "step": 664 + }, + { + "epoch": 0.281332628239027, + "grad_norm": 14.897374153137207, + "learning_rate": 9.982087792213451e-06, + "loss": 770.0961, + "step": 665 + }, + { + "epoch": 0.2817556848228451, + "grad_norm": 14.351284980773926, + "learning_rate": 9.981971737827908e-06, + "loss": 768.9137, + "step": 666 + }, + { + "epoch": 0.28217874140666316, + "grad_norm": 14.44252872467041, + "learning_rate": 9.981855309372359e-06, + "loss": 748.5928, + "step": 667 + }, + { + "epoch": 0.28260179799048124, + "grad_norm": 13.11347770690918, + "learning_rate": 9.98173850685555e-06, + "loss": 771.0664, + "step": 668 + }, + { + "epoch": 0.2830248545742993, + "grad_norm": 14.9547758102417, + "learning_rate": 9.98162133028625e-06, + "loss": 725.9767, + "step": 669 + }, + { + "epoch": 0.2834479111581174, + "grad_norm": 14.77941608428955, + "learning_rate": 9.981503779673253e-06, + "loss": 723.051, + "step": 670 + }, + { + "epoch": 0.2838709677419355, + "grad_norm": 17.78607749938965, + "learning_rate": 9.98138585502539e-06, + "loss": 705.7529, + "step": 671 + }, + { + "epoch": 0.28429402432575357, + "grad_norm": 16.573898315429688, + "learning_rate": 9.981267556351515e-06, + "loss": 664.9444, + "step": 672 + }, + { + "epoch": 0.28471708090957165, + "grad_norm": 14.600595474243164, + "learning_rate": 9.98114888366051e-06, + "loss": 748.9236, + "step": 673 + }, + { + "epoch": 0.28514013749338973, + "grad_norm": 17.41822052001953, + "learning_rate": 9.981029836961284e-06, + "loss": 704.6552, + "step": 674 + }, + { + "epoch": 0.2855631940772078, + "grad_norm": 11.234994888305664, + "learning_rate": 9.980910416262777e-06, + "loss": 830.0034, + "step": 675 + }, + { + "epoch": 0.2859862506610259, + "grad_norm": 11.624262809753418, + "learning_rate": 9.980790621573955e-06, + "loss": 812.3172, + "step": 676 + }, + { + "epoch": 0.286409307244844, + "grad_norm": 13.202802658081055, + "learning_rate": 9.980670452903813e-06, + "loss": 790.864, + "step": 677 + }, + { + "epoch": 0.28683236382866206, + "grad_norm": 15.065196990966797, + "learning_rate": 9.980549910261374e-06, + "loss": 748.8851, + "step": 678 + }, + { + "epoch": 0.28725542041248014, + "grad_norm": 14.498136520385742, + "learning_rate": 9.980428993655689e-06, + "loss": 746.1309, + "step": 679 + }, + { + "epoch": 0.2876784769962983, + "grad_norm": 11.739739418029785, + "learning_rate": 9.980307703095838e-06, + "loss": 826.5045, + "step": 680 + }, + { + "epoch": 0.28810153358011636, + "grad_norm": 12.653199195861816, + "learning_rate": 9.980186038590924e-06, + "loss": 830.9347, + "step": 681 + }, + { + "epoch": 0.28852459016393445, + "grad_norm": 17.304716110229492, + "learning_rate": 9.980064000150087e-06, + "loss": 729.7948, + "step": 682 + }, + { + "epoch": 0.28894764674775253, + "grad_norm": 17.47504997253418, + "learning_rate": 9.979941587782487e-06, + "loss": 729.4938, + "step": 683 + }, + { + "epoch": 0.2893707033315706, + "grad_norm": 19.709930419921875, + "learning_rate": 9.979818801497318e-06, + "loss": 663.3539, + "step": 684 + }, + { + "epoch": 0.2897937599153887, + "grad_norm": 16.992671966552734, + "learning_rate": 9.979695641303798e-06, + "loss": 707.4894, + "step": 685 + }, + { + "epoch": 0.2902168164992068, + "grad_norm": 15.146056175231934, + "learning_rate": 9.979572107211171e-06, + "loss": 727.4884, + "step": 686 + }, + { + "epoch": 0.29063987308302486, + "grad_norm": 14.489683151245117, + "learning_rate": 9.979448199228718e-06, + "loss": 726.7111, + "step": 687 + }, + { + "epoch": 0.29106292966684294, + "grad_norm": 14.657001495361328, + "learning_rate": 9.979323917365743e-06, + "loss": 767.5029, + "step": 688 + }, + { + "epoch": 0.291485986250661, + "grad_norm": 18.797361373901367, + "learning_rate": 9.979199261631572e-06, + "loss": 730.319, + "step": 689 + }, + { + "epoch": 0.2919090428344791, + "grad_norm": 11.352380752563477, + "learning_rate": 9.979074232035566e-06, + "loss": 829.4482, + "step": 690 + }, + { + "epoch": 0.2923320994182972, + "grad_norm": 12.66897201538086, + "learning_rate": 9.97894882858712e-06, + "loss": 813.0274, + "step": 691 + }, + { + "epoch": 0.29275515600211527, + "grad_norm": 13.36633014678955, + "learning_rate": 9.97882305129564e-06, + "loss": 791.9064, + "step": 692 + }, + { + "epoch": 0.29317821258593335, + "grad_norm": 12.376299858093262, + "learning_rate": 9.978696900170577e-06, + "loss": 788.5964, + "step": 693 + }, + { + "epoch": 0.29360126916975143, + "grad_norm": 13.944463729858398, + "learning_rate": 9.978570375221399e-06, + "loss": 764.0544, + "step": 694 + }, + { + "epoch": 0.2940243257535695, + "grad_norm": 12.949091911315918, + "learning_rate": 9.978443476457608e-06, + "loss": 790.4714, + "step": 695 + }, + { + "epoch": 0.29444738233738765, + "grad_norm": 14.893821716308594, + "learning_rate": 9.978316203888733e-06, + "loss": 728.5012, + "step": 696 + }, + { + "epoch": 0.29487043892120574, + "grad_norm": 17.195363998413086, + "learning_rate": 9.978188557524327e-06, + "loss": 707.6553, + "step": 697 + }, + { + "epoch": 0.2952934955050238, + "grad_norm": 14.242202758789062, + "learning_rate": 9.978060537373977e-06, + "loss": 745.0168, + "step": 698 + }, + { + "epoch": 0.2957165520888419, + "grad_norm": 13.631537437438965, + "learning_rate": 9.977932143447297e-06, + "loss": 771.7348, + "step": 699 + }, + { + "epoch": 0.29613960867266, + "grad_norm": 14.845404624938965, + "learning_rate": 9.977803375753923e-06, + "loss": 727.8301, + "step": 700 + }, + { + "epoch": 0.29656266525647806, + "grad_norm": 14.348610877990723, + "learning_rate": 9.977674234303525e-06, + "loss": 747.4587, + "step": 701 + }, + { + "epoch": 0.29698572184029615, + "grad_norm": 11.928888320922852, + "learning_rate": 9.977544719105801e-06, + "loss": 810.5916, + "step": 702 + }, + { + "epoch": 0.29740877842411423, + "grad_norm": 12.88424301147461, + "learning_rate": 9.977414830170475e-06, + "loss": 771.665, + "step": 703 + }, + { + "epoch": 0.2978318350079323, + "grad_norm": 12.052179336547852, + "learning_rate": 9.977284567507299e-06, + "loss": 809.858, + "step": 704 + }, + { + "epoch": 0.2982548915917504, + "grad_norm": 13.15569019317627, + "learning_rate": 9.977153931126053e-06, + "loss": 768.3575, + "step": 705 + }, + { + "epoch": 0.2986779481755685, + "grad_norm": 12.444567680358887, + "learning_rate": 9.977022921036549e-06, + "loss": 790.3693, + "step": 706 + }, + { + "epoch": 0.29910100475938656, + "grad_norm": 15.728294372558594, + "learning_rate": 9.97689153724862e-06, + "loss": 706.674, + "step": 707 + }, + { + "epoch": 0.29952406134320464, + "grad_norm": 14.735518455505371, + "learning_rate": 9.976759779772133e-06, + "loss": 710.1975, + "step": 708 + }, + { + "epoch": 0.2999471179270227, + "grad_norm": 14.229260444641113, + "learning_rate": 9.976627648616983e-06, + "loss": 728.4937, + "step": 709 + }, + { + "epoch": 0.3003701745108408, + "grad_norm": 15.571407318115234, + "learning_rate": 9.976495143793085e-06, + "loss": 749.7518, + "step": 710 + }, + { + "epoch": 0.3007932310946589, + "grad_norm": 12.068614959716797, + "learning_rate": 9.976362265310395e-06, + "loss": 811.6953, + "step": 711 + }, + { + "epoch": 0.301216287678477, + "grad_norm": 11.820147514343262, + "learning_rate": 9.976229013178886e-06, + "loss": 809.6445, + "step": 712 + }, + { + "epoch": 0.3016393442622951, + "grad_norm": 12.058935165405273, + "learning_rate": 9.976095387408564e-06, + "loss": 792.3411, + "step": 713 + }, + { + "epoch": 0.3020624008461132, + "grad_norm": 11.992490768432617, + "learning_rate": 9.975961388009461e-06, + "loss": 812.6433, + "step": 714 + }, + { + "epoch": 0.30248545742993127, + "grad_norm": 13.410305976867676, + "learning_rate": 9.975827014991642e-06, + "loss": 766.4595, + "step": 715 + }, + { + "epoch": 0.30290851401374935, + "grad_norm": 14.730376243591309, + "learning_rate": 9.975692268365194e-06, + "loss": 747.7289, + "step": 716 + }, + { + "epoch": 0.30333157059756743, + "grad_norm": 17.647377014160156, + "learning_rate": 9.975557148140234e-06, + "loss": 688.0602, + "step": 717 + }, + { + "epoch": 0.3037546271813855, + "grad_norm": 13.575157165527344, + "learning_rate": 9.975421654326908e-06, + "loss": 769.1161, + "step": 718 + }, + { + "epoch": 0.3041776837652036, + "grad_norm": 13.677549362182617, + "learning_rate": 9.975285786935387e-06, + "loss": 813.3157, + "step": 719 + }, + { + "epoch": 0.3046007403490217, + "grad_norm": 15.736393928527832, + "learning_rate": 9.975149545975877e-06, + "loss": 723.9918, + "step": 720 + }, + { + "epoch": 0.30502379693283976, + "grad_norm": 15.964823722839355, + "learning_rate": 9.975012931458606e-06, + "loss": 708.2536, + "step": 721 + }, + { + "epoch": 0.30544685351665785, + "grad_norm": 15.846915245056152, + "learning_rate": 9.974875943393831e-06, + "loss": 744.0417, + "step": 722 + }, + { + "epoch": 0.30586991010047593, + "grad_norm": 17.00204086303711, + "learning_rate": 9.974738581791839e-06, + "loss": 705.119, + "step": 723 + }, + { + "epoch": 0.306292966684294, + "grad_norm": 14.577068328857422, + "learning_rate": 9.97460084666294e-06, + "loss": 768.5793, + "step": 724 + }, + { + "epoch": 0.3067160232681121, + "grad_norm": 15.303778648376465, + "learning_rate": 9.974462738017481e-06, + "loss": 724.4928, + "step": 725 + }, + { + "epoch": 0.3071390798519302, + "grad_norm": 12.448958396911621, + "learning_rate": 9.97432425586583e-06, + "loss": 807.6232, + "step": 726 + }, + { + "epoch": 0.30756213643574826, + "grad_norm": 13.887706756591797, + "learning_rate": 9.974185400218384e-06, + "loss": 771.7232, + "step": 727 + }, + { + "epoch": 0.3079851930195664, + "grad_norm": 12.825654029846191, + "learning_rate": 9.974046171085567e-06, + "loss": 769.6501, + "step": 728 + }, + { + "epoch": 0.3084082496033845, + "grad_norm": 13.430620193481445, + "learning_rate": 9.973906568477839e-06, + "loss": 789.9462, + "step": 729 + }, + { + "epoch": 0.30883130618720256, + "grad_norm": 13.169463157653809, + "learning_rate": 9.973766592405673e-06, + "loss": 788.3062, + "step": 730 + }, + { + "epoch": 0.30925436277102064, + "grad_norm": 16.521507263183594, + "learning_rate": 9.973626242879588e-06, + "loss": 683.33, + "step": 731 + }, + { + "epoch": 0.3096774193548387, + "grad_norm": 16.34935760498047, + "learning_rate": 9.973485519910118e-06, + "loss": 679.803, + "step": 732 + }, + { + "epoch": 0.3101004759386568, + "grad_norm": 15.908613204956055, + "learning_rate": 9.97334442350783e-06, + "loss": 710.7457, + "step": 733 + }, + { + "epoch": 0.3105235325224749, + "grad_norm": 13.530574798583984, + "learning_rate": 9.973202953683318e-06, + "loss": 769.7982, + "step": 734 + }, + { + "epoch": 0.31094658910629297, + "grad_norm": 14.88730239868164, + "learning_rate": 9.973061110447204e-06, + "loss": 723.2163, + "step": 735 + }, + { + "epoch": 0.31136964569011105, + "grad_norm": 16.565988540649414, + "learning_rate": 9.972918893810137e-06, + "loss": 687.1375, + "step": 736 + }, + { + "epoch": 0.31179270227392913, + "grad_norm": 14.909332275390625, + "learning_rate": 9.972776303782796e-06, + "loss": 726.5847, + "step": 737 + }, + { + "epoch": 0.3122157588577472, + "grad_norm": 13.755976676940918, + "learning_rate": 9.97263334037589e-06, + "loss": 748.9023, + "step": 738 + }, + { + "epoch": 0.3126388154415653, + "grad_norm": 13.64484977722168, + "learning_rate": 9.97249000360015e-06, + "loss": 749.0909, + "step": 739 + }, + { + "epoch": 0.3130618720253834, + "grad_norm": 16.38260269165039, + "learning_rate": 9.97234629346634e-06, + "loss": 688.2885, + "step": 740 + }, + { + "epoch": 0.31348492860920146, + "grad_norm": 17.04517364501953, + "learning_rate": 9.97220220998525e-06, + "loss": 665.9562, + "step": 741 + }, + { + "epoch": 0.31390798519301955, + "grad_norm": 15.058135032653809, + "learning_rate": 9.972057753167698e-06, + "loss": 728.7301, + "step": 742 + }, + { + "epoch": 0.3143310417768376, + "grad_norm": 15.888201713562012, + "learning_rate": 9.971912923024533e-06, + "loss": 704.1436, + "step": 743 + }, + { + "epoch": 0.31475409836065577, + "grad_norm": 14.609177589416504, + "learning_rate": 9.971767719566628e-06, + "loss": 728.2115, + "step": 744 + }, + { + "epoch": 0.31517715494447385, + "grad_norm": 14.917192459106445, + "learning_rate": 9.971622142804882e-06, + "loss": 725.403, + "step": 745 + }, + { + "epoch": 0.31560021152829193, + "grad_norm": 15.162091255187988, + "learning_rate": 9.971476192750232e-06, + "loss": 765.1327, + "step": 746 + }, + { + "epoch": 0.31602326811211, + "grad_norm": 16.873836517333984, + "learning_rate": 9.971329869413631e-06, + "loss": 710.0198, + "step": 747 + }, + { + "epoch": 0.3164463246959281, + "grad_norm": 13.50626277923584, + "learning_rate": 9.971183172806069e-06, + "loss": 785.5498, + "step": 748 + }, + { + "epoch": 0.3168693812797462, + "grad_norm": 13.464475631713867, + "learning_rate": 9.971036102938559e-06, + "loss": 747.1633, + "step": 749 + }, + { + "epoch": 0.31729243786356426, + "grad_norm": 16.158958435058594, + "learning_rate": 9.970888659822144e-06, + "loss": 666.0376, + "step": 750 + }, + { + "epoch": 0.31771549444738234, + "grad_norm": 13.643985748291016, + "learning_rate": 9.970740843467896e-06, + "loss": 769.1812, + "step": 751 + }, + { + "epoch": 0.3181385510312004, + "grad_norm": 16.85952377319336, + "learning_rate": 9.970592653886915e-06, + "loss": 682.6871, + "step": 752 + }, + { + "epoch": 0.3185616076150185, + "grad_norm": 12.994452476501465, + "learning_rate": 9.970444091090324e-06, + "loss": 791.3276, + "step": 753 + }, + { + "epoch": 0.3189846641988366, + "grad_norm": 15.354202270507812, + "learning_rate": 9.970295155089278e-06, + "loss": 729.6956, + "step": 754 + }, + { + "epoch": 0.31940772078265467, + "grad_norm": 13.157687187194824, + "learning_rate": 9.970145845894963e-06, + "loss": 768.7365, + "step": 755 + }, + { + "epoch": 0.31983077736647275, + "grad_norm": 14.75980281829834, + "learning_rate": 9.969996163518587e-06, + "loss": 722.5856, + "step": 756 + }, + { + "epoch": 0.32025383395029083, + "grad_norm": 15.745953559875488, + "learning_rate": 9.969846107971391e-06, + "loss": 686.2398, + "step": 757 + }, + { + "epoch": 0.3206768905341089, + "grad_norm": 12.20231819152832, + "learning_rate": 9.96969567926464e-06, + "loss": 809.6797, + "step": 758 + }, + { + "epoch": 0.321099947117927, + "grad_norm": 20.609363555908203, + "learning_rate": 9.969544877409631e-06, + "loss": 643.1382, + "step": 759 + }, + { + "epoch": 0.32152300370174514, + "grad_norm": 16.233619689941406, + "learning_rate": 9.969393702417685e-06, + "loss": 725.8947, + "step": 760 + }, + { + "epoch": 0.3219460602855632, + "grad_norm": 14.343175888061523, + "learning_rate": 9.969242154300152e-06, + "loss": 747.1578, + "step": 761 + }, + { + "epoch": 0.3223691168693813, + "grad_norm": 15.446037292480469, + "learning_rate": 9.969090233068415e-06, + "loss": 745.2869, + "step": 762 + }, + { + "epoch": 0.3227921734531994, + "grad_norm": 13.787464141845703, + "learning_rate": 9.968937938733878e-06, + "loss": 769.9205, + "step": 763 + }, + { + "epoch": 0.32321523003701746, + "grad_norm": 13.29078483581543, + "learning_rate": 9.968785271307976e-06, + "loss": 810.6031, + "step": 764 + }, + { + "epoch": 0.32363828662083555, + "grad_norm": 16.9868106842041, + "learning_rate": 9.968632230802172e-06, + "loss": 705.6881, + "step": 765 + }, + { + "epoch": 0.32406134320465363, + "grad_norm": 14.813126564025879, + "learning_rate": 9.968478817227958e-06, + "loss": 789.6064, + "step": 766 + }, + { + "epoch": 0.3244843997884717, + "grad_norm": 15.376908302307129, + "learning_rate": 9.968325030596852e-06, + "loss": 767.3699, + "step": 767 + }, + { + "epoch": 0.3249074563722898, + "grad_norm": 17.634231567382812, + "learning_rate": 9.968170870920405e-06, + "loss": 666.7907, + "step": 768 + }, + { + "epoch": 0.3253305129561079, + "grad_norm": 15.956480979919434, + "learning_rate": 9.968016338210185e-06, + "loss": 746.0415, + "step": 769 + }, + { + "epoch": 0.32575356953992596, + "grad_norm": 16.895830154418945, + "learning_rate": 9.9678614324778e-06, + "loss": 729.4943, + "step": 770 + }, + { + "epoch": 0.32617662612374404, + "grad_norm": 13.781426429748535, + "learning_rate": 9.967706153734877e-06, + "loss": 788.0038, + "step": 771 + }, + { + "epoch": 0.3265996827075621, + "grad_norm": 13.748750686645508, + "learning_rate": 9.967550501993081e-06, + "loss": 767.8076, + "step": 772 + }, + { + "epoch": 0.3270227392913802, + "grad_norm": 15.415308952331543, + "learning_rate": 9.967394477264095e-06, + "loss": 766.8181, + "step": 773 + }, + { + "epoch": 0.3274457958751983, + "grad_norm": 14.477202415466309, + "learning_rate": 9.967238079559634e-06, + "loss": 745.8593, + "step": 774 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 16.91889190673828, + "learning_rate": 9.967081308891441e-06, + "loss": 682.687, + "step": 775 + }, + { + "epoch": 0.3282919090428345, + "grad_norm": 15.293749809265137, + "learning_rate": 9.966924165271292e-06, + "loss": 767.1032, + "step": 776 + }, + { + "epoch": 0.3287149656266526, + "grad_norm": 19.225868225097656, + "learning_rate": 9.96676664871098e-06, + "loss": 682.2003, + "step": 777 + }, + { + "epoch": 0.32913802221047067, + "grad_norm": 12.294820785522461, + "learning_rate": 9.966608759222333e-06, + "loss": 807.7837, + "step": 778 + }, + { + "epoch": 0.32956107879428875, + "grad_norm": 17.117204666137695, + "learning_rate": 9.966450496817207e-06, + "loss": 683.5738, + "step": 779 + }, + { + "epoch": 0.32998413537810684, + "grad_norm": 15.332831382751465, + "learning_rate": 9.966291861507489e-06, + "loss": 764.4813, + "step": 780 + }, + { + "epoch": 0.3304071919619249, + "grad_norm": 18.145721435546875, + "learning_rate": 9.966132853305084e-06, + "loss": 684.7432, + "step": 781 + }, + { + "epoch": 0.330830248545743, + "grad_norm": 12.829322814941406, + "learning_rate": 9.965973472221934e-06, + "loss": 785.7034, + "step": 782 + }, + { + "epoch": 0.3312533051295611, + "grad_norm": 15.518482208251953, + "learning_rate": 9.965813718270005e-06, + "loss": 746.9774, + "step": 783 + }, + { + "epoch": 0.33167636171337916, + "grad_norm": 14.075335502624512, + "learning_rate": 9.965653591461294e-06, + "loss": 726.8811, + "step": 784 + }, + { + "epoch": 0.33209941829719725, + "grad_norm": 16.456472396850586, + "learning_rate": 9.965493091807821e-06, + "loss": 664.638, + "step": 785 + }, + { + "epoch": 0.33252247488101533, + "grad_norm": 13.475645065307617, + "learning_rate": 9.965332219321641e-06, + "loss": 790.7752, + "step": 786 + }, + { + "epoch": 0.3329455314648334, + "grad_norm": 13.281405448913574, + "learning_rate": 9.965170974014831e-06, + "loss": 768.4264, + "step": 787 + }, + { + "epoch": 0.3333685880486515, + "grad_norm": 14.678174018859863, + "learning_rate": 9.965009355899497e-06, + "loss": 726.0158, + "step": 788 + }, + { + "epoch": 0.3337916446324696, + "grad_norm": 15.359972953796387, + "learning_rate": 9.964847364987776e-06, + "loss": 726.4828, + "step": 789 + }, + { + "epoch": 0.33421470121628766, + "grad_norm": 13.446939468383789, + "learning_rate": 9.964685001291827e-06, + "loss": 790.6974, + "step": 790 + }, + { + "epoch": 0.33463775780010574, + "grad_norm": 13.072418212890625, + "learning_rate": 9.964522264823847e-06, + "loss": 807.7113, + "step": 791 + }, + { + "epoch": 0.3350608143839239, + "grad_norm": 14.912205696105957, + "learning_rate": 9.964359155596053e-06, + "loss": 746.7633, + "step": 792 + }, + { + "epoch": 0.33548387096774196, + "grad_norm": 14.55552864074707, + "learning_rate": 9.96419567362069e-06, + "loss": 728.5201, + "step": 793 + }, + { + "epoch": 0.33590692755156004, + "grad_norm": 15.325204849243164, + "learning_rate": 9.964031818910033e-06, + "loss": 706.5521, + "step": 794 + }, + { + "epoch": 0.3363299841353781, + "grad_norm": 17.423236846923828, + "learning_rate": 9.963867591476388e-06, + "loss": 665.6261, + "step": 795 + }, + { + "epoch": 0.3367530407191962, + "grad_norm": 14.941301345825195, + "learning_rate": 9.963702991332082e-06, + "loss": 764.7434, + "step": 796 + }, + { + "epoch": 0.3371760973030143, + "grad_norm": 14.04619026184082, + "learning_rate": 9.96353801848948e-06, + "loss": 745.6765, + "step": 797 + }, + { + "epoch": 0.33759915388683237, + "grad_norm": 11.939712524414062, + "learning_rate": 9.963372672960962e-06, + "loss": 812.2851, + "step": 798 + }, + { + "epoch": 0.33802221047065045, + "grad_norm": 11.071653366088867, + "learning_rate": 9.963206954758946e-06, + "loss": 826.8423, + "step": 799 + }, + { + "epoch": 0.33844526705446853, + "grad_norm": 11.33122730255127, + "learning_rate": 9.963040863895876e-06, + "loss": 830.8718, + "step": 800 + }, + { + "epoch": 0.3388683236382866, + "grad_norm": 14.59567642211914, + "learning_rate": 9.962874400384222e-06, + "loss": 767.8294, + "step": 801 + }, + { + "epoch": 0.3392913802221047, + "grad_norm": 17.75092887878418, + "learning_rate": 9.96270756423648e-06, + "loss": 705.1477, + "step": 802 + }, + { + "epoch": 0.3397144368059228, + "grad_norm": 13.636887550354004, + "learning_rate": 9.962540355465182e-06, + "loss": 791.8401, + "step": 803 + }, + { + "epoch": 0.34013749338974086, + "grad_norm": 14.336786270141602, + "learning_rate": 9.96237277408288e-06, + "loss": 767.8309, + "step": 804 + }, + { + "epoch": 0.34056054997355895, + "grad_norm": 12.121651649475098, + "learning_rate": 9.962204820102157e-06, + "loss": 811.7954, + "step": 805 + }, + { + "epoch": 0.34098360655737703, + "grad_norm": 13.289406776428223, + "learning_rate": 9.962036493535624e-06, + "loss": 792.4717, + "step": 806 + }, + { + "epoch": 0.3414066631411951, + "grad_norm": 14.09830093383789, + "learning_rate": 9.961867794395921e-06, + "loss": 771.1855, + "step": 807 + }, + { + "epoch": 0.34182971972501325, + "grad_norm": 13.547648429870605, + "learning_rate": 9.961698722695711e-06, + "loss": 766.1014, + "step": 808 + }, + { + "epoch": 0.34225277630883133, + "grad_norm": 14.228902816772461, + "learning_rate": 9.961529278447692e-06, + "loss": 788.2466, + "step": 809 + }, + { + "epoch": 0.3426758328926494, + "grad_norm": 16.29969024658203, + "learning_rate": 9.961359461664587e-06, + "loss": 707.0734, + "step": 810 + }, + { + "epoch": 0.3430988894764675, + "grad_norm": 18.4735107421875, + "learning_rate": 9.961189272359143e-06, + "loss": 705.8962, + "step": 811 + }, + { + "epoch": 0.3435219460602856, + "grad_norm": 15.910839080810547, + "learning_rate": 9.961018710544143e-06, + "loss": 724.2695, + "step": 812 + }, + { + "epoch": 0.34394500264410366, + "grad_norm": 14.71861457824707, + "learning_rate": 9.96084777623239e-06, + "loss": 749.3469, + "step": 813 + }, + { + "epoch": 0.34436805922792174, + "grad_norm": 14.374828338623047, + "learning_rate": 9.960676469436722e-06, + "loss": 766.8179, + "step": 814 + }, + { + "epoch": 0.3447911158117398, + "grad_norm": 13.825345993041992, + "learning_rate": 9.960504790169997e-06, + "loss": 764.8573, + "step": 815 + }, + { + "epoch": 0.3452141723955579, + "grad_norm": 10.702971458435059, + "learning_rate": 9.960332738445112e-06, + "loss": 855.4296, + "step": 816 + }, + { + "epoch": 0.345637228979376, + "grad_norm": 13.637968063354492, + "learning_rate": 9.960160314274979e-06, + "loss": 785.2141, + "step": 817 + }, + { + "epoch": 0.34606028556319407, + "grad_norm": 14.425210952758789, + "learning_rate": 9.959987517672548e-06, + "loss": 767.8815, + "step": 818 + }, + { + "epoch": 0.34648334214701215, + "grad_norm": 13.19023323059082, + "learning_rate": 9.959814348650791e-06, + "loss": 789.8585, + "step": 819 + }, + { + "epoch": 0.34690639873083023, + "grad_norm": 17.313051223754883, + "learning_rate": 9.959640807222715e-06, + "loss": 727.3242, + "step": 820 + }, + { + "epoch": 0.3473294553146483, + "grad_norm": 14.85757064819336, + "learning_rate": 9.959466893401344e-06, + "loss": 746.2883, + "step": 821 + }, + { + "epoch": 0.3477525118984664, + "grad_norm": 13.38452434539795, + "learning_rate": 9.95929260719974e-06, + "loss": 790.2625, + "step": 822 + }, + { + "epoch": 0.3481755684822845, + "grad_norm": 18.052213668823242, + "learning_rate": 9.959117948630989e-06, + "loss": 705.8742, + "step": 823 + }, + { + "epoch": 0.3485986250661026, + "grad_norm": 13.775823593139648, + "learning_rate": 9.958942917708206e-06, + "loss": 768.5142, + "step": 824 + }, + { + "epoch": 0.3490216816499207, + "grad_norm": 15.38662338256836, + "learning_rate": 9.95876751444453e-06, + "loss": 746.0074, + "step": 825 + }, + { + "epoch": 0.3494447382337388, + "grad_norm": 14.068460464477539, + "learning_rate": 9.958591738853135e-06, + "loss": 769.8628, + "step": 826 + }, + { + "epoch": 0.34986779481755687, + "grad_norm": 13.198057174682617, + "learning_rate": 9.958415590947215e-06, + "loss": 787.2469, + "step": 827 + }, + { + "epoch": 0.35029085140137495, + "grad_norm": 13.602401733398438, + "learning_rate": 9.95823907074e-06, + "loss": 789.0796, + "step": 828 + }, + { + "epoch": 0.35071390798519303, + "grad_norm": 14.47413158416748, + "learning_rate": 9.958062178244741e-06, + "loss": 726.0978, + "step": 829 + }, + { + "epoch": 0.3511369645690111, + "grad_norm": 14.187417984008789, + "learning_rate": 9.957884913474723e-06, + "loss": 724.3409, + "step": 830 + }, + { + "epoch": 0.3515600211528292, + "grad_norm": 15.574676513671875, + "learning_rate": 9.957707276443252e-06, + "loss": 725.4321, + "step": 831 + }, + { + "epoch": 0.3519830777366473, + "grad_norm": 16.04277801513672, + "learning_rate": 9.957529267163668e-06, + "loss": 683.8291, + "step": 832 + }, + { + "epoch": 0.35240613432046536, + "grad_norm": 14.896516799926758, + "learning_rate": 9.957350885649337e-06, + "loss": 748.1541, + "step": 833 + }, + { + "epoch": 0.35282919090428344, + "grad_norm": 14.5438871383667, + "learning_rate": 9.957172131913652e-06, + "loss": 725.5833, + "step": 834 + }, + { + "epoch": 0.3532522474881015, + "grad_norm": 13.79955768585205, + "learning_rate": 9.956993005970034e-06, + "loss": 766.6707, + "step": 835 + }, + { + "epoch": 0.3536753040719196, + "grad_norm": 16.686288833618164, + "learning_rate": 9.956813507831936e-06, + "loss": 684.2942, + "step": 836 + }, + { + "epoch": 0.3540983606557377, + "grad_norm": 12.69182300567627, + "learning_rate": 9.956633637512831e-06, + "loss": 790.2018, + "step": 837 + }, + { + "epoch": 0.35452141723955577, + "grad_norm": 16.54207420349121, + "learning_rate": 9.956453395026226e-06, + "loss": 705.4152, + "step": 838 + }, + { + "epoch": 0.35494447382337385, + "grad_norm": 16.809913635253906, + "learning_rate": 9.956272780385657e-06, + "loss": 662.7619, + "step": 839 + }, + { + "epoch": 0.355367530407192, + "grad_norm": 12.587762832641602, + "learning_rate": 9.956091793604684e-06, + "loss": 810.446, + "step": 840 + }, + { + "epoch": 0.35579058699101007, + "grad_norm": 16.587175369262695, + "learning_rate": 9.955910434696893e-06, + "loss": 725.9471, + "step": 841 + }, + { + "epoch": 0.35621364357482815, + "grad_norm": 15.673657417297363, + "learning_rate": 9.955728703675906e-06, + "loss": 726.1071, + "step": 842 + }, + { + "epoch": 0.35663670015864624, + "grad_norm": 11.896963119506836, + "learning_rate": 9.955546600555368e-06, + "loss": 873.4116, + "step": 843 + }, + { + "epoch": 0.3570597567424643, + "grad_norm": 17.120180130004883, + "learning_rate": 9.955364125348948e-06, + "loss": 747.328, + "step": 844 + }, + { + "epoch": 0.3574828133262824, + "grad_norm": 15.844847679138184, + "learning_rate": 9.955181278070351e-06, + "loss": 726.6568, + "step": 845 + }, + { + "epoch": 0.3579058699101005, + "grad_norm": 19.132349014282227, + "learning_rate": 9.954998058733303e-06, + "loss": 704.7498, + "step": 846 + }, + { + "epoch": 0.35832892649391856, + "grad_norm": 16.72077751159668, + "learning_rate": 9.954814467351564e-06, + "loss": 703.4618, + "step": 847 + }, + { + "epoch": 0.35875198307773665, + "grad_norm": 13.52892017364502, + "learning_rate": 9.954630503938918e-06, + "loss": 790.9484, + "step": 848 + }, + { + "epoch": 0.35917503966155473, + "grad_norm": 14.155750274658203, + "learning_rate": 9.954446168509175e-06, + "loss": 764.459, + "step": 849 + }, + { + "epoch": 0.3595980962453728, + "grad_norm": 14.8522367477417, + "learning_rate": 9.95426146107618e-06, + "loss": 746.5029, + "step": 850 + }, + { + "epoch": 0.3600211528291909, + "grad_norm": 19.290327072143555, + "learning_rate": 9.954076381653801e-06, + "loss": 642.1521, + "step": 851 + }, + { + "epoch": 0.360444209413009, + "grad_norm": 13.182841300964355, + "learning_rate": 9.953890930255932e-06, + "loss": 831.376, + "step": 852 + }, + { + "epoch": 0.36086726599682706, + "grad_norm": 15.464792251586914, + "learning_rate": 9.953705106896498e-06, + "loss": 743.3821, + "step": 853 + }, + { + "epoch": 0.36129032258064514, + "grad_norm": 15.273982048034668, + "learning_rate": 9.953518911589455e-06, + "loss": 703.1139, + "step": 854 + }, + { + "epoch": 0.3617133791644632, + "grad_norm": 14.607475280761719, + "learning_rate": 9.95333234434878e-06, + "loss": 764.5971, + "step": 855 + }, + { + "epoch": 0.36213643574828136, + "grad_norm": 13.527697563171387, + "learning_rate": 9.953145405188483e-06, + "loss": 789.266, + "step": 856 + }, + { + "epoch": 0.36255949233209944, + "grad_norm": 14.011011123657227, + "learning_rate": 9.952958094122597e-06, + "loss": 763.5895, + "step": 857 + }, + { + "epoch": 0.3629825489159175, + "grad_norm": 21.10732078552246, + "learning_rate": 9.952770411165192e-06, + "loss": 664.128, + "step": 858 + }, + { + "epoch": 0.3634056054997356, + "grad_norm": 14.247906684875488, + "learning_rate": 9.952582356330357e-06, + "loss": 768.0233, + "step": 859 + }, + { + "epoch": 0.3638286620835537, + "grad_norm": 13.041983604431152, + "learning_rate": 9.95239392963221e-06, + "loss": 808.5127, + "step": 860 + }, + { + "epoch": 0.36425171866737177, + "grad_norm": 16.724746704101562, + "learning_rate": 9.9522051310849e-06, + "loss": 725.9777, + "step": 861 + }, + { + "epoch": 0.36467477525118985, + "grad_norm": 13.34963321685791, + "learning_rate": 9.952015960702605e-06, + "loss": 786.709, + "step": 862 + }, + { + "epoch": 0.36509783183500794, + "grad_norm": 16.76328468322754, + "learning_rate": 9.95182641849953e-06, + "loss": 723.3813, + "step": 863 + }, + { + "epoch": 0.365520888418826, + "grad_norm": 15.545631408691406, + "learning_rate": 9.951636504489903e-06, + "loss": 744.4617, + "step": 864 + }, + { + "epoch": 0.3659439450026441, + "grad_norm": 15.11990737915039, + "learning_rate": 9.951446218687983e-06, + "loss": 766.6077, + "step": 865 + }, + { + "epoch": 0.3663670015864622, + "grad_norm": 14.010467529296875, + "learning_rate": 9.951255561108063e-06, + "loss": 768.2708, + "step": 866 + }, + { + "epoch": 0.36679005817028026, + "grad_norm": 16.335216522216797, + "learning_rate": 9.951064531764452e-06, + "loss": 704.551, + "step": 867 + }, + { + "epoch": 0.36721311475409835, + "grad_norm": 14.769498825073242, + "learning_rate": 9.950873130671498e-06, + "loss": 744.1234, + "step": 868 + }, + { + "epoch": 0.36763617133791643, + "grad_norm": 13.790244102478027, + "learning_rate": 9.95068135784357e-06, + "loss": 785.8151, + "step": 869 + }, + { + "epoch": 0.3680592279217345, + "grad_norm": 13.23391056060791, + "learning_rate": 9.950489213295069e-06, + "loss": 768.8296, + "step": 870 + }, + { + "epoch": 0.3684822845055526, + "grad_norm": 15.420374870300293, + "learning_rate": 9.95029669704042e-06, + "loss": 703.0149, + "step": 871 + }, + { + "epoch": 0.36890534108937073, + "grad_norm": 15.079566955566406, + "learning_rate": 9.95010380909408e-06, + "loss": 725.311, + "step": 872 + }, + { + "epoch": 0.3693283976731888, + "grad_norm": 15.301424026489258, + "learning_rate": 9.94991054947053e-06, + "loss": 747.8666, + "step": 873 + }, + { + "epoch": 0.3697514542570069, + "grad_norm": 14.321982383728027, + "learning_rate": 9.949716918184282e-06, + "loss": 745.6357, + "step": 874 + }, + { + "epoch": 0.370174510840825, + "grad_norm": 13.928252220153809, + "learning_rate": 9.949522915249876e-06, + "loss": 744.9454, + "step": 875 + }, + { + "epoch": 0.37059756742464306, + "grad_norm": 17.16299819946289, + "learning_rate": 9.949328540681877e-06, + "loss": 660.2034, + "step": 876 + }, + { + "epoch": 0.37102062400846114, + "grad_norm": 14.533899307250977, + "learning_rate": 9.949133794494878e-06, + "loss": 767.9462, + "step": 877 + }, + { + "epoch": 0.3714436805922792, + "grad_norm": 15.50848388671875, + "learning_rate": 9.948938676703506e-06, + "loss": 722.5176, + "step": 878 + }, + { + "epoch": 0.3718667371760973, + "grad_norm": 14.660758018493652, + "learning_rate": 9.948743187322406e-06, + "loss": 746.0592, + "step": 879 + }, + { + "epoch": 0.3722897937599154, + "grad_norm": 15.362968444824219, + "learning_rate": 9.948547326366262e-06, + "loss": 742.8708, + "step": 880 + }, + { + "epoch": 0.37271285034373347, + "grad_norm": 13.897639274597168, + "learning_rate": 9.948351093849775e-06, + "loss": 764.3601, + "step": 881 + }, + { + "epoch": 0.37313590692755155, + "grad_norm": 14.836454391479492, + "learning_rate": 9.948154489787682e-06, + "loss": 749.0524, + "step": 882 + }, + { + "epoch": 0.37355896351136963, + "grad_norm": 19.555627822875977, + "learning_rate": 9.947957514194743e-06, + "loss": 702.2396, + "step": 883 + }, + { + "epoch": 0.3739820200951877, + "grad_norm": 17.474937438964844, + "learning_rate": 9.947760167085752e-06, + "loss": 686.6125, + "step": 884 + }, + { + "epoch": 0.3744050766790058, + "grad_norm": 16.26493263244629, + "learning_rate": 9.947562448475521e-06, + "loss": 724.1489, + "step": 885 + }, + { + "epoch": 0.3748281332628239, + "grad_norm": 17.004648208618164, + "learning_rate": 9.947364358378899e-06, + "loss": 723.2219, + "step": 886 + }, + { + "epoch": 0.37525118984664196, + "grad_norm": 14.418662071228027, + "learning_rate": 9.94716589681076e-06, + "loss": 766.613, + "step": 887 + }, + { + "epoch": 0.3756742464304601, + "grad_norm": 12.2735595703125, + "learning_rate": 9.946967063786002e-06, + "loss": 807.4565, + "step": 888 + }, + { + "epoch": 0.3760973030142782, + "grad_norm": 17.792415618896484, + "learning_rate": 9.946767859319559e-06, + "loss": 705.079, + "step": 889 + }, + { + "epoch": 0.37652035959809627, + "grad_norm": 15.249762535095215, + "learning_rate": 9.946568283426385e-06, + "loss": 743.6244, + "step": 890 + }, + { + "epoch": 0.37694341618191435, + "grad_norm": 13.490350723266602, + "learning_rate": 9.946368336121467e-06, + "loss": 787.7301, + "step": 891 + }, + { + "epoch": 0.37736647276573243, + "grad_norm": 15.768430709838867, + "learning_rate": 9.946168017419816e-06, + "loss": 725.1906, + "step": 892 + }, + { + "epoch": 0.3777895293495505, + "grad_norm": 17.953472137451172, + "learning_rate": 9.945967327336474e-06, + "loss": 703.5588, + "step": 893 + }, + { + "epoch": 0.3782125859333686, + "grad_norm": 15.837947845458984, + "learning_rate": 9.94576626588651e-06, + "loss": 703.4486, + "step": 894 + }, + { + "epoch": 0.3786356425171867, + "grad_norm": 13.303399085998535, + "learning_rate": 9.94556483308502e-06, + "loss": 805.4293, + "step": 895 + }, + { + "epoch": 0.37905869910100476, + "grad_norm": 18.557615280151367, + "learning_rate": 9.945363028947131e-06, + "loss": 704.175, + "step": 896 + }, + { + "epoch": 0.37948175568482284, + "grad_norm": 14.103608131408691, + "learning_rate": 9.94516085348799e-06, + "loss": 768.2723, + "step": 897 + }, + { + "epoch": 0.3799048122686409, + "grad_norm": 15.866509437561035, + "learning_rate": 9.944958306722782e-06, + "loss": 745.3622, + "step": 898 + }, + { + "epoch": 0.380327868852459, + "grad_norm": 12.063283920288086, + "learning_rate": 9.944755388666714e-06, + "loss": 870.9514, + "step": 899 + }, + { + "epoch": 0.3807509254362771, + "grad_norm": 15.486409187316895, + "learning_rate": 9.944552099335021e-06, + "loss": 747.4865, + "step": 900 + }, + { + "epoch": 0.38117398202009517, + "grad_norm": 15.407876968383789, + "learning_rate": 9.944348438742969e-06, + "loss": 769.3948, + "step": 901 + }, + { + "epoch": 0.38159703860391325, + "grad_norm": 18.817684173583984, + "learning_rate": 9.944144406905846e-06, + "loss": 660.5693, + "step": 902 + }, + { + "epoch": 0.38202009518773133, + "grad_norm": 16.42972755432129, + "learning_rate": 9.943940003838976e-06, + "loss": 744.201, + "step": 903 + }, + { + "epoch": 0.38244315177154947, + "grad_norm": 16.097463607788086, + "learning_rate": 9.943735229557704e-06, + "loss": 743.6407, + "step": 904 + }, + { + "epoch": 0.38286620835536755, + "grad_norm": 13.440646171569824, + "learning_rate": 9.943530084077406e-06, + "loss": 786.5857, + "step": 905 + }, + { + "epoch": 0.38328926493918564, + "grad_norm": 16.08328628540039, + "learning_rate": 9.943324567413488e-06, + "loss": 726.0998, + "step": 906 + }, + { + "epoch": 0.3837123215230037, + "grad_norm": 16.431415557861328, + "learning_rate": 9.943118679581377e-06, + "loss": 725.0726, + "step": 907 + }, + { + "epoch": 0.3841353781068218, + "grad_norm": 15.262025833129883, + "learning_rate": 9.942912420596531e-06, + "loss": 723.947, + "step": 908 + }, + { + "epoch": 0.3845584346906399, + "grad_norm": 13.220877647399902, + "learning_rate": 9.942705790474442e-06, + "loss": 787.9496, + "step": 909 + }, + { + "epoch": 0.38498149127445797, + "grad_norm": 13.276567459106445, + "learning_rate": 9.94249878923062e-06, + "loss": 785.3256, + "step": 910 + }, + { + "epoch": 0.38540454785827605, + "grad_norm": 13.16378116607666, + "learning_rate": 9.942291416880614e-06, + "loss": 787.9699, + "step": 911 + }, + { + "epoch": 0.38582760444209413, + "grad_norm": 13.327720642089844, + "learning_rate": 9.942083673439988e-06, + "loss": 785.1938, + "step": 912 + }, + { + "epoch": 0.3862506610259122, + "grad_norm": 15.457996368408203, + "learning_rate": 9.941875558924343e-06, + "loss": 766.6582, + "step": 913 + }, + { + "epoch": 0.3866737176097303, + "grad_norm": 14.214059829711914, + "learning_rate": 9.941667073349306e-06, + "loss": 786.7939, + "step": 914 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 14.309141159057617, + "learning_rate": 9.941458216730528e-06, + "loss": 763.2081, + "step": 915 + }, + { + "epoch": 0.38751983077736646, + "grad_norm": 15.659618377685547, + "learning_rate": 9.941248989083693e-06, + "loss": 744.1142, + "step": 916 + }, + { + "epoch": 0.38794288736118454, + "grad_norm": 15.47734546661377, + "learning_rate": 9.941039390424514e-06, + "loss": 723.0295, + "step": 917 + }, + { + "epoch": 0.3883659439450026, + "grad_norm": 14.27219009399414, + "learning_rate": 9.940829420768723e-06, + "loss": 789.1136, + "step": 918 + }, + { + "epoch": 0.3887890005288207, + "grad_norm": 18.321895599365234, + "learning_rate": 9.940619080132088e-06, + "loss": 682.9147, + "step": 919 + }, + { + "epoch": 0.38921205711263884, + "grad_norm": 19.78194808959961, + "learning_rate": 9.940408368530402e-06, + "loss": 681.4796, + "step": 920 + }, + { + "epoch": 0.3896351136964569, + "grad_norm": 15.735118865966797, + "learning_rate": 9.940197285979489e-06, + "loss": 727.5448, + "step": 921 + }, + { + "epoch": 0.390058170280275, + "grad_norm": 13.77587890625, + "learning_rate": 9.939985832495194e-06, + "loss": 787.6647, + "step": 922 + }, + { + "epoch": 0.3904812268640931, + "grad_norm": 12.55642318725586, + "learning_rate": 9.939774008093396e-06, + "loss": 808.2413, + "step": 923 + }, + { + "epoch": 0.39090428344791117, + "grad_norm": 17.33633041381836, + "learning_rate": 9.93956181279e-06, + "loss": 724.7108, + "step": 924 + }, + { + "epoch": 0.39132734003172925, + "grad_norm": 19.03684425354004, + "learning_rate": 9.939349246600936e-06, + "loss": 642.1163, + "step": 925 + }, + { + "epoch": 0.39175039661554734, + "grad_norm": 16.287567138671875, + "learning_rate": 9.939136309542168e-06, + "loss": 706.8674, + "step": 926 + }, + { + "epoch": 0.3921734531993654, + "grad_norm": 16.01658058166504, + "learning_rate": 9.938923001629681e-06, + "loss": 747.9971, + "step": 927 + }, + { + "epoch": 0.3925965097831835, + "grad_norm": 15.2535400390625, + "learning_rate": 9.938709322879495e-06, + "loss": 723.6833, + "step": 928 + }, + { + "epoch": 0.3930195663670016, + "grad_norm": 12.192900657653809, + "learning_rate": 9.938495273307652e-06, + "loss": 809.7858, + "step": 929 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 15.316184997558594, + "learning_rate": 9.938280852930222e-06, + "loss": 767.2147, + "step": 930 + }, + { + "epoch": 0.39386567953463775, + "grad_norm": 13.718640327453613, + "learning_rate": 9.938066061763309e-06, + "loss": 771.2937, + "step": 931 + }, + { + "epoch": 0.39428873611845583, + "grad_norm": 16.496862411499023, + "learning_rate": 9.937850899823036e-06, + "loss": 746.9215, + "step": 932 + }, + { + "epoch": 0.3947117927022739, + "grad_norm": 15.864405632019043, + "learning_rate": 9.93763536712556e-06, + "loss": 767.9708, + "step": 933 + }, + { + "epoch": 0.395134849286092, + "grad_norm": 12.841909408569336, + "learning_rate": 9.937419463687066e-06, + "loss": 788.3328, + "step": 934 + }, + { + "epoch": 0.3955579058699101, + "grad_norm": 19.36853790283203, + "learning_rate": 9.937203189523764e-06, + "loss": 684.1501, + "step": 935 + }, + { + "epoch": 0.3959809624537282, + "grad_norm": 15.809462547302246, + "learning_rate": 9.936986544651892e-06, + "loss": 725.9304, + "step": 936 + }, + { + "epoch": 0.3964040190375463, + "grad_norm": 13.599227905273438, + "learning_rate": 9.936769529087717e-06, + "loss": 788.2534, + "step": 937 + }, + { + "epoch": 0.3968270756213644, + "grad_norm": 17.580150604248047, + "learning_rate": 9.936552142847535e-06, + "loss": 702.753, + "step": 938 + }, + { + "epoch": 0.39725013220518246, + "grad_norm": 14.446245193481445, + "learning_rate": 9.936334385947667e-06, + "loss": 746.0005, + "step": 939 + }, + { + "epoch": 0.39767318878900054, + "grad_norm": 14.475452423095703, + "learning_rate": 9.936116258404462e-06, + "loss": 767.3042, + "step": 940 + }, + { + "epoch": 0.3980962453728186, + "grad_norm": 12.733047485351562, + "learning_rate": 9.9358977602343e-06, + "loss": 811.7549, + "step": 941 + }, + { + "epoch": 0.3985193019566367, + "grad_norm": 13.809123992919922, + "learning_rate": 9.935678891453588e-06, + "loss": 763.3655, + "step": 942 + }, + { + "epoch": 0.3989423585404548, + "grad_norm": 15.705228805541992, + "learning_rate": 9.935459652078758e-06, + "loss": 724.3462, + "step": 943 + }, + { + "epoch": 0.39936541512427287, + "grad_norm": 14.631417274475098, + "learning_rate": 9.935240042126271e-06, + "loss": 763.9325, + "step": 944 + }, + { + "epoch": 0.39978847170809095, + "grad_norm": 14.297161102294922, + "learning_rate": 9.935020061612616e-06, + "loss": 767.5776, + "step": 945 + }, + { + "epoch": 0.40021152829190904, + "grad_norm": 16.344633102416992, + "learning_rate": 9.934799710554312e-06, + "loss": 703.5031, + "step": 946 + }, + { + "epoch": 0.4006345848757271, + "grad_norm": 14.165742874145508, + "learning_rate": 9.934578988967903e-06, + "loss": 744.6021, + "step": 947 + }, + { + "epoch": 0.4010576414595452, + "grad_norm": 14.12894344329834, + "learning_rate": 9.934357896869962e-06, + "loss": 768.608, + "step": 948 + }, + { + "epoch": 0.4014806980433633, + "grad_norm": 14.430864334106445, + "learning_rate": 9.93413643427709e-06, + "loss": 748.3682, + "step": 949 + }, + { + "epoch": 0.40190375462718136, + "grad_norm": 15.452367782592773, + "learning_rate": 9.933914601205914e-06, + "loss": 742.0602, + "step": 950 + }, + { + "epoch": 0.40232681121099945, + "grad_norm": 13.238718032836914, + "learning_rate": 9.933692397673093e-06, + "loss": 787.0675, + "step": 951 + }, + { + "epoch": 0.4027498677948176, + "grad_norm": 21.159046173095703, + "learning_rate": 9.933469823695308e-06, + "loss": 641.8973, + "step": 952 + }, + { + "epoch": 0.40317292437863567, + "grad_norm": 13.029828071594238, + "learning_rate": 9.933246879289274e-06, + "loss": 789.2331, + "step": 953 + }, + { + "epoch": 0.40359598096245375, + "grad_norm": 15.350179672241211, + "learning_rate": 9.933023564471727e-06, + "loss": 728.2852, + "step": 954 + }, + { + "epoch": 0.40401903754627183, + "grad_norm": 13.186515808105469, + "learning_rate": 9.932799879259439e-06, + "loss": 784.6826, + "step": 955 + }, + { + "epoch": 0.4044420941300899, + "grad_norm": 14.198470115661621, + "learning_rate": 9.932575823669199e-06, + "loss": 746.9941, + "step": 956 + }, + { + "epoch": 0.404865150713908, + "grad_norm": 14.67611026763916, + "learning_rate": 9.932351397717838e-06, + "loss": 746.9797, + "step": 957 + }, + { + "epoch": 0.4052882072977261, + "grad_norm": 15.76727294921875, + "learning_rate": 9.932126601422202e-06, + "loss": 723.5588, + "step": 958 + }, + { + "epoch": 0.40571126388154416, + "grad_norm": 17.499101638793945, + "learning_rate": 9.931901434799171e-06, + "loss": 683.4672, + "step": 959 + }, + { + "epoch": 0.40613432046536224, + "grad_norm": 15.142650604248047, + "learning_rate": 9.931675897865651e-06, + "loss": 724.0934, + "step": 960 + }, + { + "epoch": 0.4065573770491803, + "grad_norm": 13.28437614440918, + "learning_rate": 9.931449990638575e-06, + "loss": 783.9309, + "step": 961 + }, + { + "epoch": 0.4069804336329984, + "grad_norm": 15.221598625183105, + "learning_rate": 9.93122371313491e-06, + "loss": 726.2026, + "step": 962 + }, + { + "epoch": 0.4074034902168165, + "grad_norm": 16.53617286682129, + "learning_rate": 9.930997065371642e-06, + "loss": 706.3323, + "step": 963 + }, + { + "epoch": 0.40782654680063457, + "grad_norm": 13.350725173950195, + "learning_rate": 9.93077004736579e-06, + "loss": 787.1077, + "step": 964 + }, + { + "epoch": 0.40824960338445265, + "grad_norm": 17.828699111938477, + "learning_rate": 9.930542659134398e-06, + "loss": 662.3491, + "step": 965 + }, + { + "epoch": 0.40867265996827074, + "grad_norm": 16.882736206054688, + "learning_rate": 9.930314900694541e-06, + "loss": 684.0532, + "step": 966 + }, + { + "epoch": 0.4090957165520888, + "grad_norm": 16.133956909179688, + "learning_rate": 9.930086772063322e-06, + "loss": 702.7836, + "step": 967 + }, + { + "epoch": 0.40951877313590695, + "grad_norm": 16.80211639404297, + "learning_rate": 9.929858273257865e-06, + "loss": 702.614, + "step": 968 + }, + { + "epoch": 0.40994182971972504, + "grad_norm": 12.624656677246094, + "learning_rate": 9.929629404295331e-06, + "loss": 805.7792, + "step": 969 + }, + { + "epoch": 0.4103648863035431, + "grad_norm": 13.367966651916504, + "learning_rate": 9.929400165192904e-06, + "loss": 788.4878, + "step": 970 + }, + { + "epoch": 0.4107879428873612, + "grad_norm": 14.450268745422363, + "learning_rate": 9.929170555967794e-06, + "loss": 770.9141, + "step": 971 + }, + { + "epoch": 0.4112109994711793, + "grad_norm": 11.616170883178711, + "learning_rate": 9.928940576637244e-06, + "loss": 829.3068, + "step": 972 + }, + { + "epoch": 0.41163405605499737, + "grad_norm": 17.193885803222656, + "learning_rate": 9.92871022721852e-06, + "loss": 701.3455, + "step": 973 + }, + { + "epoch": 0.41205711263881545, + "grad_norm": 14.111948013305664, + "learning_rate": 9.928479507728919e-06, + "loss": 764.9476, + "step": 974 + }, + { + "epoch": 0.41248016922263353, + "grad_norm": 14.842401504516602, + "learning_rate": 9.928248418185764e-06, + "loss": 744.1023, + "step": 975 + }, + { + "epoch": 0.4129032258064516, + "grad_norm": 16.706933975219727, + "learning_rate": 9.928016958606405e-06, + "loss": 703.8126, + "step": 976 + }, + { + "epoch": 0.4133262823902697, + "grad_norm": 11.82533073425293, + "learning_rate": 9.927785129008223e-06, + "loss": 830.506, + "step": 977 + }, + { + "epoch": 0.4137493389740878, + "grad_norm": 21.792964935302734, + "learning_rate": 9.927552929408624e-06, + "loss": 640.7302, + "step": 978 + }, + { + "epoch": 0.41417239555790586, + "grad_norm": 15.429586410522461, + "learning_rate": 9.927320359825043e-06, + "loss": 726.817, + "step": 979 + }, + { + "epoch": 0.41459545214172394, + "grad_norm": 16.78628921508789, + "learning_rate": 9.927087420274941e-06, + "loss": 742.936, + "step": 980 + }, + { + "epoch": 0.415018508725542, + "grad_norm": 13.133566856384277, + "learning_rate": 9.92685411077581e-06, + "loss": 809.3224, + "step": 981 + }, + { + "epoch": 0.4154415653093601, + "grad_norm": 14.860319137573242, + "learning_rate": 9.92662043134517e-06, + "loss": 765.9435, + "step": 982 + }, + { + "epoch": 0.4158646218931782, + "grad_norm": 15.220569610595703, + "learning_rate": 9.92638638200056e-06, + "loss": 786.1871, + "step": 983 + }, + { + "epoch": 0.4162876784769963, + "grad_norm": 17.559186935424805, + "learning_rate": 9.92615196275956e-06, + "loss": 725.755, + "step": 984 + }, + { + "epoch": 0.4167107350608144, + "grad_norm": 13.01585865020752, + "learning_rate": 9.925917173639768e-06, + "loss": 808.0776, + "step": 985 + }, + { + "epoch": 0.4171337916446325, + "grad_norm": 16.278385162353516, + "learning_rate": 9.925682014658814e-06, + "loss": 762.0364, + "step": 986 + }, + { + "epoch": 0.4175568482284506, + "grad_norm": 14.703958511352539, + "learning_rate": 9.925446485834355e-06, + "loss": 743.9379, + "step": 987 + }, + { + "epoch": 0.41797990481226865, + "grad_norm": 14.600900650024414, + "learning_rate": 9.925210587184078e-06, + "loss": 762.9129, + "step": 988 + }, + { + "epoch": 0.41840296139608674, + "grad_norm": 14.124653816223145, + "learning_rate": 9.92497431872569e-06, + "loss": 785.7627, + "step": 989 + }, + { + "epoch": 0.4188260179799048, + "grad_norm": 17.254732131958008, + "learning_rate": 9.924737680476935e-06, + "loss": 703.9064, + "step": 990 + }, + { + "epoch": 0.4192490745637229, + "grad_norm": 16.771974563598633, + "learning_rate": 9.924500672455579e-06, + "loss": 703.8248, + "step": 991 + }, + { + "epoch": 0.419672131147541, + "grad_norm": 12.984028816223145, + "learning_rate": 9.92426329467942e-06, + "loss": 828.0646, + "step": 992 + }, + { + "epoch": 0.42009518773135907, + "grad_norm": 15.710400581359863, + "learning_rate": 9.92402554716628e-06, + "loss": 748.0686, + "step": 993 + }, + { + "epoch": 0.42051824431517715, + "grad_norm": 14.155405044555664, + "learning_rate": 9.923787429934011e-06, + "loss": 770.9438, + "step": 994 + }, + { + "epoch": 0.42094130089899523, + "grad_norm": 12.304079055786133, + "learning_rate": 9.92354894300049e-06, + "loss": 810.0375, + "step": 995 + }, + { + "epoch": 0.4213643574828133, + "grad_norm": 17.770231246948242, + "learning_rate": 9.923310086383624e-06, + "loss": 684.3562, + "step": 996 + }, + { + "epoch": 0.4217874140666314, + "grad_norm": 17.25157356262207, + "learning_rate": 9.92307086010135e-06, + "loss": 663.3404, + "step": 997 + }, + { + "epoch": 0.4222104706504495, + "grad_norm": 15.345598220825195, + "learning_rate": 9.922831264171628e-06, + "loss": 766.6577, + "step": 998 + }, + { + "epoch": 0.42263352723426756, + "grad_norm": 12.5167818069458, + "learning_rate": 9.92259129861245e-06, + "loss": 827.9806, + "step": 999 + }, + { + "epoch": 0.4230565838180857, + "grad_norm": 16.270212173461914, + "learning_rate": 9.922350963441832e-06, + "loss": 725.7877, + "step": 1000 + } + ], + "logging_steps": 1.0, + "max_steps": 11820, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.460534261470829e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}