| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 10.0, |
| "eval_steps": 500, |
| "global_step": 6660, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015015015015015015, |
| "grad_norm": 19.707826614379883, |
| "learning_rate": 2.702702702702703e-06, |
| "loss": 1.331, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03003003003003003, |
| "grad_norm": 2.359984874725342, |
| "learning_rate": 5.705705705705706e-06, |
| "loss": 0.7347, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04504504504504504, |
| "grad_norm": 2.2898712158203125, |
| "learning_rate": 8.708708708708708e-06, |
| "loss": 0.3623, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06006006006006006, |
| "grad_norm": 1.4282516241073608, |
| "learning_rate": 1.1711711711711713e-05, |
| "loss": 0.3051, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.07507507507507508, |
| "grad_norm": 1.88104248046875, |
| "learning_rate": 1.4714714714714713e-05, |
| "loss": 0.2592, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.09009009009009009, |
| "grad_norm": 1.2314252853393555, |
| "learning_rate": 1.771771771771772e-05, |
| "loss": 0.2382, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10510510510510511, |
| "grad_norm": 1.5696786642074585, |
| "learning_rate": 2.0720720720720722e-05, |
| "loss": 0.2467, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12012012012012012, |
| "grad_norm": 1.4571477174758911, |
| "learning_rate": 2.3723723723723725e-05, |
| "loss": 0.2299, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.13513513513513514, |
| "grad_norm": 1.6013493537902832, |
| "learning_rate": 2.672672672672673e-05, |
| "loss": 0.1952, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.15015015015015015, |
| "grad_norm": 1.280948519706726, |
| "learning_rate": 2.9729729729729733e-05, |
| "loss": 0.1551, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.16516516516516516, |
| "grad_norm": 1.1729398965835571, |
| "learning_rate": 3.2732732732732736e-05, |
| "loss": 0.1309, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.18018018018018017, |
| "grad_norm": 1.0188239812850952, |
| "learning_rate": 3.573573573573574e-05, |
| "loss": 0.1111, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.19519519519519518, |
| "grad_norm": 1.4927932024002075, |
| "learning_rate": 3.873873873873874e-05, |
| "loss": 0.0884, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.21021021021021022, |
| "grad_norm": 1.5110639333724976, |
| "learning_rate": 4.1741741741741744e-05, |
| "loss": 0.0734, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.22522522522522523, |
| "grad_norm": 0.9263542294502258, |
| "learning_rate": 4.4744744744744746e-05, |
| "loss": 0.0606, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.24024024024024024, |
| "grad_norm": 1.537213921546936, |
| "learning_rate": 4.774774774774775e-05, |
| "loss": 0.0549, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.2552552552552553, |
| "grad_norm": 0.7502244114875793, |
| "learning_rate": 5.075075075075075e-05, |
| "loss": 0.0488, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.2702702702702703, |
| "grad_norm": 0.6621520519256592, |
| "learning_rate": 5.3753753753753754e-05, |
| "loss": 0.0525, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.2852852852852853, |
| "grad_norm": 1.4219032526016235, |
| "learning_rate": 5.6756756756756757e-05, |
| "loss": 0.0596, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.3003003003003003, |
| "grad_norm": 1.681037425994873, |
| "learning_rate": 5.975975975975976e-05, |
| "loss": 0.0517, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3153153153153153, |
| "grad_norm": 1.0897976160049438, |
| "learning_rate": 6.276276276276276e-05, |
| "loss": 0.0433, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.3303303303303303, |
| "grad_norm": 0.9383857846260071, |
| "learning_rate": 6.576576576576577e-05, |
| "loss": 0.0408, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.34534534534534533, |
| "grad_norm": 0.775547206401825, |
| "learning_rate": 6.876876876876878e-05, |
| "loss": 0.0371, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.36036036036036034, |
| "grad_norm": 0.7310791015625, |
| "learning_rate": 7.177177177177178e-05, |
| "loss": 0.0431, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.37537537537537535, |
| "grad_norm": 1.3099724054336548, |
| "learning_rate": 7.477477477477479e-05, |
| "loss": 0.0399, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.39039039039039036, |
| "grad_norm": 0.9201484322547913, |
| "learning_rate": 7.777777777777778e-05, |
| "loss": 0.0415, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.40540540540540543, |
| "grad_norm": 0.9633703827857971, |
| "learning_rate": 8.078078078078079e-05, |
| "loss": 0.044, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.42042042042042044, |
| "grad_norm": 1.0578511953353882, |
| "learning_rate": 8.378378378378379e-05, |
| "loss": 0.0421, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.43543543543543545, |
| "grad_norm": 0.7760908007621765, |
| "learning_rate": 8.678678678678678e-05, |
| "loss": 0.0468, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.45045045045045046, |
| "grad_norm": 0.7139829993247986, |
| "learning_rate": 8.978978978978979e-05, |
| "loss": 0.0405, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.46546546546546547, |
| "grad_norm": 0.9839809536933899, |
| "learning_rate": 9.279279279279279e-05, |
| "loss": 0.0446, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.4804804804804805, |
| "grad_norm": 1.5201447010040283, |
| "learning_rate": 9.57957957957958e-05, |
| "loss": 0.0448, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.4954954954954955, |
| "grad_norm": 1.0367432832717896, |
| "learning_rate": 9.87987987987988e-05, |
| "loss": 0.0476, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5105105105105106, |
| "grad_norm": 1.0149145126342773, |
| "learning_rate": 9.99997781056395e-05, |
| "loss": 0.0448, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5255255255255256, |
| "grad_norm": 0.746631920337677, |
| "learning_rate": 9.99984220916798e-05, |
| "loss": 0.0453, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5405405405405406, |
| "grad_norm": 0.6642729043960571, |
| "learning_rate": 9.999583337179697e-05, |
| "loss": 0.0393, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5555555555555556, |
| "grad_norm": 0.7677448391914368, |
| "learning_rate": 9.999201200981566e-05, |
| "loss": 0.0344, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.5705705705705706, |
| "grad_norm": 1.4783852100372314, |
| "learning_rate": 9.998695809995112e-05, |
| "loss": 0.0366, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.5855855855855856, |
| "grad_norm": 0.9320693016052246, |
| "learning_rate": 9.9980671766807e-05, |
| "loss": 0.0429, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.6006006006006006, |
| "grad_norm": 0.7152029871940613, |
| "learning_rate": 9.997315316537218e-05, |
| "loss": 0.0483, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6156156156156156, |
| "grad_norm": 0.5516471266746521, |
| "learning_rate": 9.9964402481017e-05, |
| "loss": 0.0358, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6306306306306306, |
| "grad_norm": 1.1393887996673584, |
| "learning_rate": 9.99544199294887e-05, |
| "loss": 0.0393, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6456456456456456, |
| "grad_norm": 0.7147128582000732, |
| "learning_rate": 9.994320575690602e-05, |
| "loss": 0.0358, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6606606606606606, |
| "grad_norm": 0.9208776354789734, |
| "learning_rate": 9.993076023975326e-05, |
| "loss": 0.0468, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.6756756756756757, |
| "grad_norm": 0.6893422603607178, |
| "learning_rate": 9.991708368487332e-05, |
| "loss": 0.04, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.6906906906906907, |
| "grad_norm": 0.8497427105903625, |
| "learning_rate": 9.990217642946027e-05, |
| "loss": 0.0446, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.7057057057057057, |
| "grad_norm": 0.715634286403656, |
| "learning_rate": 9.988603884105095e-05, |
| "loss": 0.0375, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.7207207207207207, |
| "grad_norm": 0.9225009679794312, |
| "learning_rate": 9.986867131751594e-05, |
| "loss": 0.0376, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7357357357357357, |
| "grad_norm": 0.852979838848114, |
| "learning_rate": 9.98500742870497e-05, |
| "loss": 0.0365, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7507507507507507, |
| "grad_norm": 0.8639568090438843, |
| "learning_rate": 9.983024820816012e-05, |
| "loss": 0.0333, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7657657657657657, |
| "grad_norm": 0.5087606906890869, |
| "learning_rate": 9.980919356965712e-05, |
| "loss": 0.0311, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.7807807807807807, |
| "grad_norm": 0.7664278149604797, |
| "learning_rate": 9.978691089064062e-05, |
| "loss": 0.0328, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.7957957957957958, |
| "grad_norm": 0.8648868799209595, |
| "learning_rate": 9.97634007204878e-05, |
| "loss": 0.0358, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8108108108108109, |
| "grad_norm": 0.5567862391471863, |
| "learning_rate": 9.973866363883945e-05, |
| "loss": 0.03, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.8258258258258259, |
| "grad_norm": 0.6945303678512573, |
| "learning_rate": 9.971270025558575e-05, |
| "loss": 0.0316, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8408408408408409, |
| "grad_norm": 0.9207854270935059, |
| "learning_rate": 9.968551121085123e-05, |
| "loss": 0.0391, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8558558558558559, |
| "grad_norm": 0.8578768968582153, |
| "learning_rate": 9.965709717497897e-05, |
| "loss": 0.0356, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.8708708708708709, |
| "grad_norm": 0.6991203427314758, |
| "learning_rate": 9.962745884851412e-05, |
| "loss": 0.0311, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.8858858858858859, |
| "grad_norm": 0.6747560501098633, |
| "learning_rate": 9.959659696218651e-05, |
| "loss": 0.0343, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9009009009009009, |
| "grad_norm": 0.4892956614494324, |
| "learning_rate": 9.956451227689278e-05, |
| "loss": 0.0273, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9159159159159159, |
| "grad_norm": 0.5258888602256775, |
| "learning_rate": 9.95312055836775e-05, |
| "loss": 0.026, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9309309309309309, |
| "grad_norm": 0.4343319237232208, |
| "learning_rate": 9.949667770371378e-05, |
| "loss": 0.0291, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.9459459459459459, |
| "grad_norm": 0.38234201073646545, |
| "learning_rate": 9.946092948828289e-05, |
| "loss": 0.0249, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.960960960960961, |
| "grad_norm": 0.9518115520477295, |
| "learning_rate": 9.942396181875341e-05, |
| "loss": 0.0247, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.975975975975976, |
| "grad_norm": 0.825566828250885, |
| "learning_rate": 9.938577560655942e-05, |
| "loss": 0.0287, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.990990990990991, |
| "grad_norm": 0.6939975023269653, |
| "learning_rate": 9.934637179317798e-05, |
| "loss": 0.0291, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.006006006006006, |
| "grad_norm": 0.7724920511245728, |
| "learning_rate": 9.930575135010609e-05, |
| "loss": 0.0272, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.021021021021021, |
| "grad_norm": 0.7219953536987305, |
| "learning_rate": 9.926391527883652e-05, |
| "loss": 0.0342, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.0360360360360361, |
| "grad_norm": 0.7625498175621033, |
| "learning_rate": 9.92208646108333e-05, |
| "loss": 0.0329, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0510510510510511, |
| "grad_norm": 0.6605225205421448, |
| "learning_rate": 9.91766004075062e-05, |
| "loss": 0.0388, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.0660660660660661, |
| "grad_norm": 0.7050349712371826, |
| "learning_rate": 9.913112376018457e-05, |
| "loss": 0.0298, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.0810810810810811, |
| "grad_norm": 0.6416040062904358, |
| "learning_rate": 9.908443579009043e-05, |
| "loss": 0.0268, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.0960960960960962, |
| "grad_norm": 0.6904945969581604, |
| "learning_rate": 9.903653764831087e-05, |
| "loss": 0.0353, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.1111111111111112, |
| "grad_norm": 1.0151249170303345, |
| "learning_rate": 9.898743051576962e-05, |
| "loss": 0.0387, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.1261261261261262, |
| "grad_norm": 0.8894471526145935, |
| "learning_rate": 9.893711560319794e-05, |
| "loss": 0.0364, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.1411411411411412, |
| "grad_norm": 0.4880468249320984, |
| "learning_rate": 9.88855941511048e-05, |
| "loss": 0.0297, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.1561561561561562, |
| "grad_norm": 0.5687229037284851, |
| "learning_rate": 9.883286742974624e-05, |
| "loss": 0.0325, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.1711711711711712, |
| "grad_norm": 0.6782395243644714, |
| "learning_rate": 9.877893673909418e-05, |
| "loss": 0.0345, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.1861861861861862, |
| "grad_norm": 0.7363622188568115, |
| "learning_rate": 9.872380340880416e-05, |
| "loss": 0.031, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.2012012012012012, |
| "grad_norm": 0.5948150753974915, |
| "learning_rate": 9.866746879818278e-05, |
| "loss": 0.0305, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.2162162162162162, |
| "grad_norm": 0.6406951546669006, |
| "learning_rate": 9.8609934296154e-05, |
| "loss": 0.0286, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.2312312312312312, |
| "grad_norm": 0.7044562101364136, |
| "learning_rate": 9.855120132122503e-05, |
| "loss": 0.0327, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.2462462462462462, |
| "grad_norm": 0.7193213701248169, |
| "learning_rate": 9.849127132145128e-05, |
| "loss": 0.0311, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.2612612612612613, |
| "grad_norm": 0.6026361584663391, |
| "learning_rate": 9.84301457744007e-05, |
| "loss": 0.0338, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.2762762762762763, |
| "grad_norm": 0.6601231694221497, |
| "learning_rate": 9.836782618711735e-05, |
| "loss": 0.033, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.2912912912912913, |
| "grad_norm": 0.9034827947616577, |
| "learning_rate": 9.830431409608416e-05, |
| "loss": 0.0318, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.3063063063063063, |
| "grad_norm": 0.6307055354118347, |
| "learning_rate": 9.82396110671852e-05, |
| "loss": 0.0287, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.3213213213213213, |
| "grad_norm": 0.6825271844863892, |
| "learning_rate": 9.817371869566694e-05, |
| "loss": 0.0244, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.3363363363363363, |
| "grad_norm": 0.40105974674224854, |
| "learning_rate": 9.810663860609896e-05, |
| "loss": 0.025, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.3513513513513513, |
| "grad_norm": 0.7262183427810669, |
| "learning_rate": 9.803837245233399e-05, |
| "loss": 0.0243, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.3663663663663663, |
| "grad_norm": 0.6035653948783875, |
| "learning_rate": 9.79689219174669e-05, |
| "loss": 0.0242, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.3813813813813813, |
| "grad_norm": 0.634743332862854, |
| "learning_rate": 9.789828871379349e-05, |
| "loss": 0.0283, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.3963963963963963, |
| "grad_norm": 0.6286770701408386, |
| "learning_rate": 9.78264745827681e-05, |
| "loss": 0.0309, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.4114114114114114, |
| "grad_norm": 0.5936300754547119, |
| "learning_rate": 9.77534812949607e-05, |
| "loss": 0.0277, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.4264264264264264, |
| "grad_norm": 0.6561086773872375, |
| "learning_rate": 9.76793106500132e-05, |
| "loss": 0.026, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.4414414414414414, |
| "grad_norm": 0.6812066435813904, |
| "learning_rate": 9.760396447659521e-05, |
| "loss": 0.0263, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.4564564564564564, |
| "grad_norm": 0.5788238048553467, |
| "learning_rate": 9.752744463235886e-05, |
| "loss": 0.0283, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.4714714714714714, |
| "grad_norm": 0.5635336637496948, |
| "learning_rate": 9.744975300389294e-05, |
| "loss": 0.0296, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.4864864864864864, |
| "grad_norm": 0.2735409438610077, |
| "learning_rate": 9.737089150667656e-05, |
| "loss": 0.0265, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.5015015015015014, |
| "grad_norm": 0.4760662317276001, |
| "learning_rate": 9.729086208503174e-05, |
| "loss": 0.0271, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.5165165165165164, |
| "grad_norm": 0.9864751696586609, |
| "learning_rate": 9.720966671207561e-05, |
| "loss": 0.0305, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.5315315315315314, |
| "grad_norm": 0.8087036609649658, |
| "learning_rate": 9.712730738967168e-05, |
| "loss": 0.0301, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.5465465465465464, |
| "grad_norm": 0.5092441439628601, |
| "learning_rate": 9.704378614838052e-05, |
| "loss": 0.0268, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.5615615615615615, |
| "grad_norm": 0.5732120275497437, |
| "learning_rate": 9.69591050474097e-05, |
| "loss": 0.0239, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.5765765765765765, |
| "grad_norm": 0.6305236220359802, |
| "learning_rate": 9.687326617456299e-05, |
| "loss": 0.0273, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.5915915915915915, |
| "grad_norm": 0.585681676864624, |
| "learning_rate": 9.678627164618892e-05, |
| "loss": 0.025, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.6066066066066065, |
| "grad_norm": 0.49683454632759094, |
| "learning_rate": 9.66981236071286e-05, |
| "loss": 0.0213, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.6216216216216215, |
| "grad_norm": 0.45419085025787354, |
| "learning_rate": 9.66088242306628e-05, |
| "loss": 0.0278, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.6366366366366365, |
| "grad_norm": 0.43894627690315247, |
| "learning_rate": 9.65183757184584e-05, |
| "loss": 0.0239, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.6516516516516515, |
| "grad_norm": 0.4908210039138794, |
| "learning_rate": 9.642678030051413e-05, |
| "loss": 0.0294, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.6795796751976013, |
| "learning_rate": 9.633404023510553e-05, |
| "loss": 0.0256, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.6816816816816815, |
| "grad_norm": 0.720375120639801, |
| "learning_rate": 9.624015780872931e-05, |
| "loss": 0.0273, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.6966966966966965, |
| "grad_norm": 0.5921428799629211, |
| "learning_rate": 9.614513533604698e-05, |
| "loss": 0.0295, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.7117117117117115, |
| "grad_norm": 0.6450079679489136, |
| "learning_rate": 9.604897515982779e-05, |
| "loss": 0.0278, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.7267267267267268, |
| "grad_norm": 0.4198131561279297, |
| "learning_rate": 9.595167965089091e-05, |
| "loss": 0.0257, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.7417417417417418, |
| "grad_norm": 0.5609850287437439, |
| "learning_rate": 9.585325120804707e-05, |
| "loss": 0.0273, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.7567567567567568, |
| "grad_norm": 0.6113925576210022, |
| "learning_rate": 9.57536922580393e-05, |
| "loss": 0.0325, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.7717717717717718, |
| "grad_norm": 0.45319005846977234, |
| "learning_rate": 9.565300525548326e-05, |
| "loss": 0.0266, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.7867867867867868, |
| "grad_norm": 0.7308943867683411, |
| "learning_rate": 9.555119268280655e-05, |
| "loss": 0.0247, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.8018018018018018, |
| "grad_norm": 0.42079082131385803, |
| "learning_rate": 9.544825705018764e-05, |
| "loss": 0.0244, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.8168168168168168, |
| "grad_norm": 0.3191315233707428, |
| "learning_rate": 9.534420089549387e-05, |
| "loss": 0.0254, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.8318318318318318, |
| "grad_norm": 0.40956762433052063, |
| "learning_rate": 9.523902678421898e-05, |
| "loss": 0.0297, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.8468468468468469, |
| "grad_norm": 0.533236563205719, |
| "learning_rate": 9.51327373094198e-05, |
| "loss": 0.023, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.8618618618618619, |
| "grad_norm": 0.48515594005584717, |
| "learning_rate": 9.502533509165235e-05, |
| "loss": 0.0275, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.8768768768768769, |
| "grad_norm": 0.6893606185913086, |
| "learning_rate": 9.491682277890716e-05, |
| "loss": 0.0271, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.8918918918918919, |
| "grad_norm": 0.7433900237083435, |
| "learning_rate": 9.480720304654411e-05, |
| "loss": 0.0277, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.906906906906907, |
| "grad_norm": 0.708577036857605, |
| "learning_rate": 9.469647859722635e-05, |
| "loss": 0.0273, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.921921921921922, |
| "grad_norm": 0.5717357397079468, |
| "learning_rate": 9.458465216085368e-05, |
| "loss": 0.0226, |
| "step": 1280 |
| }, |
| { |
| "epoch": 1.936936936936937, |
| "grad_norm": 0.7664889097213745, |
| "learning_rate": 9.447172649449538e-05, |
| "loss": 0.0284, |
| "step": 1290 |
| }, |
| { |
| "epoch": 1.951951951951952, |
| "grad_norm": 1.0811998844146729, |
| "learning_rate": 9.435770438232202e-05, |
| "loss": 0.0321, |
| "step": 1300 |
| }, |
| { |
| "epoch": 1.966966966966967, |
| "grad_norm": 0.6489273905754089, |
| "learning_rate": 9.424258863553701e-05, |
| "loss": 0.021, |
| "step": 1310 |
| }, |
| { |
| "epoch": 1.981981981981982, |
| "grad_norm": 0.6143089532852173, |
| "learning_rate": 9.412638209230715e-05, |
| "loss": 0.0244, |
| "step": 1320 |
| }, |
| { |
| "epoch": 1.996996996996997, |
| "grad_norm": 0.4508838951587677, |
| "learning_rate": 9.400908761769276e-05, |
| "loss": 0.0291, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.012012012012012, |
| "grad_norm": 0.4465492367744446, |
| "learning_rate": 9.389070810357696e-05, |
| "loss": 0.0283, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.027027027027027, |
| "grad_norm": 0.4916835129261017, |
| "learning_rate": 9.377124646859443e-05, |
| "loss": 0.0224, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.042042042042042, |
| "grad_norm": 0.7511496543884277, |
| "learning_rate": 9.365070565805941e-05, |
| "loss": 0.0244, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.057057057057057, |
| "grad_norm": 0.4286222457885742, |
| "learning_rate": 9.352908864389312e-05, |
| "loss": 0.0262, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.0720720720720722, |
| "grad_norm": 0.3976287543773651, |
| "learning_rate": 9.340639842455045e-05, |
| "loss": 0.0214, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.0870870870870872, |
| "grad_norm": 0.5718784332275391, |
| "learning_rate": 9.328263802494609e-05, |
| "loss": 0.0254, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.1021021021021022, |
| "grad_norm": 0.7074950337409973, |
| "learning_rate": 9.315781049637987e-05, |
| "loss": 0.0281, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.1171171171171173, |
| "grad_norm": 0.5626684427261353, |
| "learning_rate": 9.303191891646156e-05, |
| "loss": 0.0261, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.1321321321321323, |
| "grad_norm": 0.7081624269485474, |
| "learning_rate": 9.290496638903507e-05, |
| "loss": 0.0242, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.1471471471471473, |
| "grad_norm": 0.540306568145752, |
| "learning_rate": 9.277695604410182e-05, |
| "loss": 0.0272, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.1621621621621623, |
| "grad_norm": 0.4797918200492859, |
| "learning_rate": 9.264789103774363e-05, |
| "loss": 0.0201, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.1771771771771773, |
| "grad_norm": 0.612108051776886, |
| "learning_rate": 9.251777455204484e-05, |
| "loss": 0.0205, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.1921921921921923, |
| "grad_norm": 0.5374974608421326, |
| "learning_rate": 9.2386609795014e-05, |
| "loss": 0.0273, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.2072072072072073, |
| "grad_norm": 0.6897376179695129, |
| "learning_rate": 9.225440000050456e-05, |
| "loss": 0.024, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.2222222222222223, |
| "grad_norm": 0.5138564705848694, |
| "learning_rate": 9.212114842813538e-05, |
| "loss": 0.0233, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.2372372372372373, |
| "grad_norm": 0.354375422000885, |
| "learning_rate": 9.198685836321012e-05, |
| "loss": 0.0254, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.2522522522522523, |
| "grad_norm": 0.6212460398674011, |
| "learning_rate": 9.185153311663651e-05, |
| "loss": 0.0236, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.2672672672672673, |
| "grad_norm": 0.4737728238105774, |
| "learning_rate": 9.171517602484447e-05, |
| "loss": 0.0265, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.2822822822822824, |
| "grad_norm": 0.5129017233848572, |
| "learning_rate": 9.157779044970396e-05, |
| "loss": 0.0239, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.2972972972972974, |
| "grad_norm": 0.5135466456413269, |
| "learning_rate": 9.143937977844216e-05, |
| "loss": 0.0225, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.3123123123123124, |
| "grad_norm": 0.3853330612182617, |
| "learning_rate": 9.129994742355983e-05, |
| "loss": 0.0253, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.3273273273273274, |
| "grad_norm": 0.39255353808403015, |
| "learning_rate": 9.115949682274728e-05, |
| "loss": 0.0215, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.3423423423423424, |
| "grad_norm": 0.43988943099975586, |
| "learning_rate": 9.101803143879948e-05, |
| "loss": 0.0215, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.3573573573573574, |
| "grad_norm": 0.4806894063949585, |
| "learning_rate": 9.087555475953089e-05, |
| "loss": 0.0242, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.3723723723723724, |
| "grad_norm": 0.5244617462158203, |
| "learning_rate": 9.073207029768926e-05, |
| "loss": 0.0214, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.3873873873873874, |
| "grad_norm": 0.6372321248054504, |
| "learning_rate": 9.058758159086918e-05, |
| "loss": 0.018, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.4024024024024024, |
| "grad_norm": 0.6845021843910217, |
| "learning_rate": 9.044209220142472e-05, |
| "loss": 0.0224, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.4174174174174174, |
| "grad_norm": 0.6199453473091125, |
| "learning_rate": 9.029560571638178e-05, |
| "loss": 0.0283, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.4324324324324325, |
| "grad_norm": 0.7201853394508362, |
| "learning_rate": 9.014812574734945e-05, |
| "loss": 0.0246, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.4474474474474475, |
| "grad_norm": 0.6509067416191101, |
| "learning_rate": 8.999965593043111e-05, |
| "loss": 0.0207, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.4624624624624625, |
| "grad_norm": 0.544884979724884, |
| "learning_rate": 8.985019992613477e-05, |
| "loss": 0.0246, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.4774774774774775, |
| "grad_norm": 0.5736904740333557, |
| "learning_rate": 8.96997614192827e-05, |
| "loss": 0.0194, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.4924924924924925, |
| "grad_norm": 0.6851534843444824, |
| "learning_rate": 8.954834411892074e-05, |
| "loss": 0.0228, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.5075075075075075, |
| "grad_norm": 0.47555166482925415, |
| "learning_rate": 8.939595175822678e-05, |
| "loss": 0.0281, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.5225225225225225, |
| "grad_norm": 0.519774317741394, |
| "learning_rate": 8.924258809441867e-05, |
| "loss": 0.0252, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.5375375375375375, |
| "grad_norm": 0.8348826169967651, |
| "learning_rate": 8.908825690866171e-05, |
| "loss": 0.0233, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.5525525525525525, |
| "grad_norm": 0.868290901184082, |
| "learning_rate": 8.89329620059753e-05, |
| "loss": 0.025, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.5675675675675675, |
| "grad_norm": 0.5431799292564392, |
| "learning_rate": 8.877670721513918e-05, |
| "loss": 0.0207, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.5825825825825826, |
| "grad_norm": 0.43196889758110046, |
| "learning_rate": 8.861949638859908e-05, |
| "loss": 0.0211, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.5975975975975976, |
| "grad_norm": 0.3355920612812042, |
| "learning_rate": 8.846133340237162e-05, |
| "loss": 0.0182, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.6126126126126126, |
| "grad_norm": 0.6166477799415588, |
| "learning_rate": 8.83022221559489e-05, |
| "loss": 0.0317, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.6276276276276276, |
| "grad_norm": 0.318366676568985, |
| "learning_rate": 8.814216657220222e-05, |
| "loss": 0.0287, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.6426426426426426, |
| "grad_norm": 0.43470993638038635, |
| "learning_rate": 8.798117059728544e-05, |
| "loss": 0.0231, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.6576576576576576, |
| "grad_norm": 0.49722573161125183, |
| "learning_rate": 8.781923820053766e-05, |
| "loss": 0.0242, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.6726726726726726, |
| "grad_norm": 0.4650242030620575, |
| "learning_rate": 8.765637337438535e-05, |
| "loss": 0.0263, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.6876876876876876, |
| "grad_norm": 0.4883041977882385, |
| "learning_rate": 8.749258013424397e-05, |
| "loss": 0.0228, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.7027027027027026, |
| "grad_norm": 0.5440917611122131, |
| "learning_rate": 8.732786251841891e-05, |
| "loss": 0.018, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.7177177177177176, |
| "grad_norm": 0.506921648979187, |
| "learning_rate": 8.716222458800591e-05, |
| "loss": 0.0221, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.7327327327327327, |
| "grad_norm": 0.5868305563926697, |
| "learning_rate": 8.699567042679104e-05, |
| "loss": 0.0214, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.7477477477477477, |
| "grad_norm": 0.5890645384788513, |
| "learning_rate": 8.682820414114991e-05, |
| "loss": 0.0173, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.7627627627627627, |
| "grad_norm": 0.43213966488838196, |
| "learning_rate": 8.665982985994645e-05, |
| "loss": 0.02, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.7777777777777777, |
| "grad_norm": 0.4124511182308197, |
| "learning_rate": 8.649055173443113e-05, |
| "loss": 0.0212, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.7927927927927927, |
| "grad_norm": 0.566199004650116, |
| "learning_rate": 8.632037393813863e-05, |
| "loss": 0.0196, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.8078078078078077, |
| "grad_norm": 0.4955221712589264, |
| "learning_rate": 8.614930066678493e-05, |
| "loss": 0.0215, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.8228228228228227, |
| "grad_norm": 0.6144834756851196, |
| "learning_rate": 8.597733613816379e-05, |
| "loss": 0.0207, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.8378378378378377, |
| "grad_norm": 0.37374141812324524, |
| "learning_rate": 8.58044845920429e-05, |
| "loss": 0.0205, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.8528528528528527, |
| "grad_norm": 0.5047124624252319, |
| "learning_rate": 8.563075029005925e-05, |
| "loss": 0.0301, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.8678678678678677, |
| "grad_norm": 0.4270637631416321, |
| "learning_rate": 8.545613751561407e-05, |
| "loss": 0.0215, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.8828828828828827, |
| "grad_norm": 0.5714237093925476, |
| "learning_rate": 8.528065057376723e-05, |
| "loss": 0.0221, |
| "step": 1920 |
| }, |
| { |
| "epoch": 2.8978978978978978, |
| "grad_norm": 0.46633490920066833, |
| "learning_rate": 8.510429379113114e-05, |
| "loss": 0.0216, |
| "step": 1930 |
| }, |
| { |
| "epoch": 2.9129129129129128, |
| "grad_norm": 0.289782851934433, |
| "learning_rate": 8.4927071515764e-05, |
| "loss": 0.0221, |
| "step": 1940 |
| }, |
| { |
| "epoch": 2.9279279279279278, |
| "grad_norm": 0.4321281909942627, |
| "learning_rate": 8.474898811706272e-05, |
| "loss": 0.0199, |
| "step": 1950 |
| }, |
| { |
| "epoch": 2.942942942942943, |
| "grad_norm": 0.5510643720626831, |
| "learning_rate": 8.457004798565504e-05, |
| "loss": 0.0214, |
| "step": 1960 |
| }, |
| { |
| "epoch": 2.957957957957958, |
| "grad_norm": 0.5263959765434265, |
| "learning_rate": 8.439025553329137e-05, |
| "loss": 0.0231, |
| "step": 1970 |
| }, |
| { |
| "epoch": 2.972972972972973, |
| "grad_norm": 0.293148934841156, |
| "learning_rate": 8.420961519273605e-05, |
| "loss": 0.023, |
| "step": 1980 |
| }, |
| { |
| "epoch": 2.987987987987988, |
| "grad_norm": 0.45720815658569336, |
| "learning_rate": 8.402813141765795e-05, |
| "loss": 0.0211, |
| "step": 1990 |
| }, |
| { |
| "epoch": 3.003003003003003, |
| "grad_norm": 0.32505160570144653, |
| "learning_rate": 8.38458086825208e-05, |
| "loss": 0.0279, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.018018018018018, |
| "grad_norm": 0.5061551928520203, |
| "learning_rate": 8.366265148247275e-05, |
| "loss": 0.0239, |
| "step": 2010 |
| }, |
| { |
| "epoch": 3.033033033033033, |
| "grad_norm": 0.5039113759994507, |
| "learning_rate": 8.347866433323562e-05, |
| "loss": 0.0223, |
| "step": 2020 |
| }, |
| { |
| "epoch": 3.048048048048048, |
| "grad_norm": 0.5448925495147705, |
| "learning_rate": 8.329385177099353e-05, |
| "loss": 0.0233, |
| "step": 2030 |
| }, |
| { |
| "epoch": 3.063063063063063, |
| "grad_norm": 0.4620119333267212, |
| "learning_rate": 8.310821835228108e-05, |
| "loss": 0.0243, |
| "step": 2040 |
| }, |
| { |
| "epoch": 3.078078078078078, |
| "grad_norm": 0.4423605501651764, |
| "learning_rate": 8.292176865387102e-05, |
| "loss": 0.0272, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.093093093093093, |
| "grad_norm": 0.5479226112365723, |
| "learning_rate": 8.273450727266135e-05, |
| "loss": 0.0237, |
| "step": 2060 |
| }, |
| { |
| "epoch": 3.108108108108108, |
| "grad_norm": 0.5344254970550537, |
| "learning_rate": 8.254643882556206e-05, |
| "loss": 0.0209, |
| "step": 2070 |
| }, |
| { |
| "epoch": 3.123123123123123, |
| "grad_norm": 0.2996688783168793, |
| "learning_rate": 8.235756794938123e-05, |
| "loss": 0.021, |
| "step": 2080 |
| }, |
| { |
| "epoch": 3.138138138138138, |
| "grad_norm": 0.593069851398468, |
| "learning_rate": 8.216789930071078e-05, |
| "loss": 0.0303, |
| "step": 2090 |
| }, |
| { |
| "epoch": 3.153153153153153, |
| "grad_norm": 0.48896265029907227, |
| "learning_rate": 8.197743755581164e-05, |
| "loss": 0.0214, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.1681681681681684, |
| "grad_norm": 0.4235636591911316, |
| "learning_rate": 8.178618741049842e-05, |
| "loss": 0.0232, |
| "step": 2110 |
| }, |
| { |
| "epoch": 3.1831831831831834, |
| "grad_norm": 0.2720106244087219, |
| "learning_rate": 8.159415358002361e-05, |
| "loss": 0.0163, |
| "step": 2120 |
| }, |
| { |
| "epoch": 3.1981981981981984, |
| "grad_norm": 0.4087418019771576, |
| "learning_rate": 8.14013407989615e-05, |
| "loss": 0.0253, |
| "step": 2130 |
| }, |
| { |
| "epoch": 3.2132132132132134, |
| "grad_norm": 0.5570055842399597, |
| "learning_rate": 8.12077538210912e-05, |
| "loss": 0.0208, |
| "step": 2140 |
| }, |
| { |
| "epoch": 3.2282282282282284, |
| "grad_norm": 0.46996933221817017, |
| "learning_rate": 8.101339741927965e-05, |
| "loss": 0.0224, |
| "step": 2150 |
| }, |
| { |
| "epoch": 3.2432432432432434, |
| "grad_norm": 0.34068506956100464, |
| "learning_rate": 8.081827638536381e-05, |
| "loss": 0.0148, |
| "step": 2160 |
| }, |
| { |
| "epoch": 3.2582582582582584, |
| "grad_norm": 0.45338696241378784, |
| "learning_rate": 8.06223955300326e-05, |
| "loss": 0.0181, |
| "step": 2170 |
| }, |
| { |
| "epoch": 3.2732732732732734, |
| "grad_norm": 0.39829152822494507, |
| "learning_rate": 8.042575968270821e-05, |
| "loss": 0.0198, |
| "step": 2180 |
| }, |
| { |
| "epoch": 3.2882882882882885, |
| "grad_norm": 0.523808479309082, |
| "learning_rate": 8.022837369142715e-05, |
| "loss": 0.02, |
| "step": 2190 |
| }, |
| { |
| "epoch": 3.3033033033033035, |
| "grad_norm": 0.42372068762779236, |
| "learning_rate": 8.003024242272057e-05, |
| "loss": 0.0219, |
| "step": 2200 |
| }, |
| { |
| "epoch": 3.3183183183183185, |
| "grad_norm": 0.4960072934627533, |
| "learning_rate": 7.983137076149442e-05, |
| "loss": 0.0243, |
| "step": 2210 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 0.4654483497142792, |
| "learning_rate": 7.963176361090894e-05, |
| "loss": 0.021, |
| "step": 2220 |
| }, |
| { |
| "epoch": 3.3483483483483485, |
| "grad_norm": 0.5904752612113953, |
| "learning_rate": 7.943142589225776e-05, |
| "loss": 0.0259, |
| "step": 2230 |
| }, |
| { |
| "epoch": 3.3633633633633635, |
| "grad_norm": 0.3706412613391876, |
| "learning_rate": 7.923036254484659e-05, |
| "loss": 0.0215, |
| "step": 2240 |
| }, |
| { |
| "epoch": 3.3783783783783785, |
| "grad_norm": 0.6387991309165955, |
| "learning_rate": 7.902857852587147e-05, |
| "loss": 0.0203, |
| "step": 2250 |
| }, |
| { |
| "epoch": 3.3933933933933935, |
| "grad_norm": 0.5002526640892029, |
| "learning_rate": 7.882607881029651e-05, |
| "loss": 0.0218, |
| "step": 2260 |
| }, |
| { |
| "epoch": 3.4084084084084085, |
| "grad_norm": 0.5591747760772705, |
| "learning_rate": 7.862286839073122e-05, |
| "loss": 0.0208, |
| "step": 2270 |
| }, |
| { |
| "epoch": 3.4234234234234235, |
| "grad_norm": 0.5342072248458862, |
| "learning_rate": 7.841895227730751e-05, |
| "loss": 0.0223, |
| "step": 2280 |
| }, |
| { |
| "epoch": 3.4384384384384385, |
| "grad_norm": 0.2828880250453949, |
| "learning_rate": 7.821433549755604e-05, |
| "loss": 0.0252, |
| "step": 2290 |
| }, |
| { |
| "epoch": 3.4534534534534536, |
| "grad_norm": 0.32669833302497864, |
| "learning_rate": 7.800902309628232e-05, |
| "loss": 0.0215, |
| "step": 2300 |
| }, |
| { |
| "epoch": 3.4684684684684686, |
| "grad_norm": 0.5081203579902649, |
| "learning_rate": 7.780302013544239e-05, |
| "loss": 0.0204, |
| "step": 2310 |
| }, |
| { |
| "epoch": 3.4834834834834836, |
| "grad_norm": 0.6576879024505615, |
| "learning_rate": 7.75963316940179e-05, |
| "loss": 0.0202, |
| "step": 2320 |
| }, |
| { |
| "epoch": 3.4984984984984986, |
| "grad_norm": 0.4684658944606781, |
| "learning_rate": 7.738896286789102e-05, |
| "loss": 0.0217, |
| "step": 2330 |
| }, |
| { |
| "epoch": 3.5135135135135136, |
| "grad_norm": 0.5036672353744507, |
| "learning_rate": 7.718091876971867e-05, |
| "loss": 0.0205, |
| "step": 2340 |
| }, |
| { |
| "epoch": 3.5285285285285286, |
| "grad_norm": 0.44505366683006287, |
| "learning_rate": 7.69722045288066e-05, |
| "loss": 0.0189, |
| "step": 2350 |
| }, |
| { |
| "epoch": 3.5435435435435436, |
| "grad_norm": 0.40767377614974976, |
| "learning_rate": 7.676282529098279e-05, |
| "loss": 0.0214, |
| "step": 2360 |
| }, |
| { |
| "epoch": 3.5585585585585586, |
| "grad_norm": 0.43969622254371643, |
| "learning_rate": 7.655278621847071e-05, |
| "loss": 0.0241, |
| "step": 2370 |
| }, |
| { |
| "epoch": 3.5735735735735736, |
| "grad_norm": 0.627383291721344, |
| "learning_rate": 7.634209248976192e-05, |
| "loss": 0.0232, |
| "step": 2380 |
| }, |
| { |
| "epoch": 3.5885885885885886, |
| "grad_norm": 0.31650131940841675, |
| "learning_rate": 7.613074929948856e-05, |
| "loss": 0.0204, |
| "step": 2390 |
| }, |
| { |
| "epoch": 3.6036036036036037, |
| "grad_norm": 0.4799094498157501, |
| "learning_rate": 7.591876185829505e-05, |
| "loss": 0.0224, |
| "step": 2400 |
| }, |
| { |
| "epoch": 3.6186186186186187, |
| "grad_norm": 0.5130876898765564, |
| "learning_rate": 7.570613539270986e-05, |
| "loss": 0.0245, |
| "step": 2410 |
| }, |
| { |
| "epoch": 3.6336336336336337, |
| "grad_norm": 0.355915367603302, |
| "learning_rate": 7.549287514501649e-05, |
| "loss": 0.0203, |
| "step": 2420 |
| }, |
| { |
| "epoch": 3.6486486486486487, |
| "grad_norm": 0.24736683070659637, |
| "learning_rate": 7.527898637312427e-05, |
| "loss": 0.0239, |
| "step": 2430 |
| }, |
| { |
| "epoch": 3.6636636636636637, |
| "grad_norm": 0.4904641807079315, |
| "learning_rate": 7.506447435043876e-05, |
| "loss": 0.0165, |
| "step": 2440 |
| }, |
| { |
| "epoch": 3.6786786786786787, |
| "grad_norm": 0.37809261679649353, |
| "learning_rate": 7.484934436573165e-05, |
| "loss": 0.0218, |
| "step": 2450 |
| }, |
| { |
| "epoch": 3.6936936936936937, |
| "grad_norm": 0.7537095546722412, |
| "learning_rate": 7.463360172301048e-05, |
| "loss": 0.0229, |
| "step": 2460 |
| }, |
| { |
| "epoch": 3.7087087087087087, |
| "grad_norm": 0.6429843902587891, |
| "learning_rate": 7.441725174138779e-05, |
| "loss": 0.0211, |
| "step": 2470 |
| }, |
| { |
| "epoch": 3.7237237237237237, |
| "grad_norm": 0.8371061086654663, |
| "learning_rate": 7.420029975494995e-05, |
| "loss": 0.02, |
| "step": 2480 |
| }, |
| { |
| "epoch": 3.7387387387387387, |
| "grad_norm": 0.5082511901855469, |
| "learning_rate": 7.398275111262577e-05, |
| "loss": 0.0194, |
| "step": 2490 |
| }, |
| { |
| "epoch": 3.7537537537537538, |
| "grad_norm": 0.430393248796463, |
| "learning_rate": 7.37646111780545e-05, |
| "loss": 0.0241, |
| "step": 2500 |
| }, |
| { |
| "epoch": 3.7687687687687688, |
| "grad_norm": 0.4554280638694763, |
| "learning_rate": 7.354588532945367e-05, |
| "loss": 0.0208, |
| "step": 2510 |
| }, |
| { |
| "epoch": 3.7837837837837838, |
| "grad_norm": 0.521698534488678, |
| "learning_rate": 7.332657895948644e-05, |
| "loss": 0.0184, |
| "step": 2520 |
| }, |
| { |
| "epoch": 3.798798798798799, |
| "grad_norm": 0.8456292748451233, |
| "learning_rate": 7.310669747512865e-05, |
| "loss": 0.0248, |
| "step": 2530 |
| }, |
| { |
| "epoch": 3.813813813813814, |
| "grad_norm": 0.34558990597724915, |
| "learning_rate": 7.288624629753558e-05, |
| "loss": 0.0183, |
| "step": 2540 |
| }, |
| { |
| "epoch": 3.828828828828829, |
| "grad_norm": 0.5863817930221558, |
| "learning_rate": 7.266523086190816e-05, |
| "loss": 0.0254, |
| "step": 2550 |
| }, |
| { |
| "epoch": 3.843843843843844, |
| "grad_norm": 0.607603132724762, |
| "learning_rate": 7.244365661735913e-05, |
| "loss": 0.0206, |
| "step": 2560 |
| }, |
| { |
| "epoch": 3.858858858858859, |
| "grad_norm": 0.3888079822063446, |
| "learning_rate": 7.222152902677853e-05, |
| "loss": 0.02, |
| "step": 2570 |
| }, |
| { |
| "epoch": 3.873873873873874, |
| "grad_norm": 0.7730490565299988, |
| "learning_rate": 7.199885356669913e-05, |
| "loss": 0.0243, |
| "step": 2580 |
| }, |
| { |
| "epoch": 3.888888888888889, |
| "grad_norm": 0.3885418772697449, |
| "learning_rate": 7.177563572716137e-05, |
| "loss": 0.0182, |
| "step": 2590 |
| }, |
| { |
| "epoch": 3.903903903903904, |
| "grad_norm": 0.546582043170929, |
| "learning_rate": 7.155188101157799e-05, |
| "loss": 0.0228, |
| "step": 2600 |
| }, |
| { |
| "epoch": 3.918918918918919, |
| "grad_norm": 0.6339150667190552, |
| "learning_rate": 7.132759493659828e-05, |
| "loss": 0.0229, |
| "step": 2610 |
| }, |
| { |
| "epoch": 3.933933933933934, |
| "grad_norm": 0.5374885201454163, |
| "learning_rate": 7.110278303197232e-05, |
| "loss": 0.0232, |
| "step": 2620 |
| }, |
| { |
| "epoch": 3.948948948948949, |
| "grad_norm": 0.7847733497619629, |
| "learning_rate": 7.087745084041428e-05, |
| "loss": 0.0177, |
| "step": 2630 |
| }, |
| { |
| "epoch": 3.963963963963964, |
| "grad_norm": 0.31849485635757446, |
| "learning_rate": 7.065160391746606e-05, |
| "loss": 0.025, |
| "step": 2640 |
| }, |
| { |
| "epoch": 3.978978978978979, |
| "grad_norm": 0.3679928481578827, |
| "learning_rate": 7.042524783136015e-05, |
| "loss": 0.0187, |
| "step": 2650 |
| }, |
| { |
| "epoch": 3.993993993993994, |
| "grad_norm": 0.5686633586883545, |
| "learning_rate": 7.019838816288246e-05, |
| "loss": 0.0246, |
| "step": 2660 |
| }, |
| { |
| "epoch": 4.009009009009009, |
| "grad_norm": 0.5385087132453918, |
| "learning_rate": 6.997103050523461e-05, |
| "loss": 0.0196, |
| "step": 2670 |
| }, |
| { |
| "epoch": 4.024024024024024, |
| "grad_norm": 0.4066893458366394, |
| "learning_rate": 6.974318046389616e-05, |
| "loss": 0.0203, |
| "step": 2680 |
| }, |
| { |
| "epoch": 4.039039039039039, |
| "grad_norm": 0.2717365622520447, |
| "learning_rate": 6.951484365648628e-05, |
| "loss": 0.0195, |
| "step": 2690 |
| }, |
| { |
| "epoch": 4.054054054054054, |
| "grad_norm": 0.5939636826515198, |
| "learning_rate": 6.928602571262532e-05, |
| "loss": 0.0213, |
| "step": 2700 |
| }, |
| { |
| "epoch": 4.069069069069069, |
| "grad_norm": 0.38194969296455383, |
| "learning_rate": 6.905673227379605e-05, |
| "loss": 0.0233, |
| "step": 2710 |
| }, |
| { |
| "epoch": 4.084084084084084, |
| "grad_norm": 0.3717132806777954, |
| "learning_rate": 6.882696899320445e-05, |
| "loss": 0.0168, |
| "step": 2720 |
| }, |
| { |
| "epoch": 4.099099099099099, |
| "grad_norm": 0.31925511360168457, |
| "learning_rate": 6.859674153564042e-05, |
| "loss": 0.017, |
| "step": 2730 |
| }, |
| { |
| "epoch": 4.114114114114114, |
| "grad_norm": 0.4138813018798828, |
| "learning_rate": 6.836605557733809e-05, |
| "loss": 0.0222, |
| "step": 2740 |
| }, |
| { |
| "epoch": 4.129129129129129, |
| "grad_norm": 0.3355839252471924, |
| "learning_rate": 6.813491680583594e-05, |
| "loss": 0.0173, |
| "step": 2750 |
| }, |
| { |
| "epoch": 4.1441441441441444, |
| "grad_norm": 0.4220035970211029, |
| "learning_rate": 6.790333091983645e-05, |
| "loss": 0.0189, |
| "step": 2760 |
| }, |
| { |
| "epoch": 4.1591591591591595, |
| "grad_norm": 0.31900477409362793, |
| "learning_rate": 6.767130362906565e-05, |
| "loss": 0.0191, |
| "step": 2770 |
| }, |
| { |
| "epoch": 4.1741741741741745, |
| "grad_norm": 0.3446926474571228, |
| "learning_rate": 6.743884065413239e-05, |
| "loss": 0.0194, |
| "step": 2780 |
| }, |
| { |
| "epoch": 4.1891891891891895, |
| "grad_norm": 0.42431914806365967, |
| "learning_rate": 6.72059477263873e-05, |
| "loss": 0.0201, |
| "step": 2790 |
| }, |
| { |
| "epoch": 4.2042042042042045, |
| "grad_norm": 0.28469523787498474, |
| "learning_rate": 6.697263058778138e-05, |
| "loss": 0.0154, |
| "step": 2800 |
| }, |
| { |
| "epoch": 4.2192192192192195, |
| "grad_norm": 0.5302097201347351, |
| "learning_rate": 6.673889499072454e-05, |
| "loss": 0.0197, |
| "step": 2810 |
| }, |
| { |
| "epoch": 4.2342342342342345, |
| "grad_norm": 0.4186718463897705, |
| "learning_rate": 6.650474669794376e-05, |
| "loss": 0.0181, |
| "step": 2820 |
| }, |
| { |
| "epoch": 4.2492492492492495, |
| "grad_norm": 0.16540727019309998, |
| "learning_rate": 6.627019148234094e-05, |
| "loss": 0.015, |
| "step": 2830 |
| }, |
| { |
| "epoch": 4.2642642642642645, |
| "grad_norm": 0.4453204870223999, |
| "learning_rate": 6.60352351268507e-05, |
| "loss": 0.0169, |
| "step": 2840 |
| }, |
| { |
| "epoch": 4.2792792792792795, |
| "grad_norm": 0.3529559373855591, |
| "learning_rate": 6.579988342429763e-05, |
| "loss": 0.0157, |
| "step": 2850 |
| }, |
| { |
| "epoch": 4.2942942942942945, |
| "grad_norm": 0.4159342646598816, |
| "learning_rate": 6.556414217725362e-05, |
| "loss": 0.0172, |
| "step": 2860 |
| }, |
| { |
| "epoch": 4.3093093093093096, |
| "grad_norm": 0.48912474513053894, |
| "learning_rate": 6.532801719789471e-05, |
| "loss": 0.02, |
| "step": 2870 |
| }, |
| { |
| "epoch": 4.324324324324325, |
| "grad_norm": 0.463779091835022, |
| "learning_rate": 6.509151430785785e-05, |
| "loss": 0.025, |
| "step": 2880 |
| }, |
| { |
| "epoch": 4.33933933933934, |
| "grad_norm": 0.4753241539001465, |
| "learning_rate": 6.48546393380973e-05, |
| "loss": 0.0206, |
| "step": 2890 |
| }, |
| { |
| "epoch": 4.354354354354355, |
| "grad_norm": 0.33267465233802795, |
| "learning_rate": 6.461739812874096e-05, |
| "loss": 0.0185, |
| "step": 2900 |
| }, |
| { |
| "epoch": 4.36936936936937, |
| "grad_norm": 0.33029690384864807, |
| "learning_rate": 6.437979652894626e-05, |
| "loss": 0.0189, |
| "step": 2910 |
| }, |
| { |
| "epoch": 4.384384384384385, |
| "grad_norm": 0.46888062357902527, |
| "learning_rate": 6.414184039675609e-05, |
| "loss": 0.0166, |
| "step": 2920 |
| }, |
| { |
| "epoch": 4.3993993993994, |
| "grad_norm": 0.5883775353431702, |
| "learning_rate": 6.390353559895425e-05, |
| "loss": 0.0202, |
| "step": 2930 |
| }, |
| { |
| "epoch": 4.414414414414415, |
| "grad_norm": 0.49353477358818054, |
| "learning_rate": 6.366488801092089e-05, |
| "loss": 0.0219, |
| "step": 2940 |
| }, |
| { |
| "epoch": 4.42942942942943, |
| "grad_norm": 0.47024640440940857, |
| "learning_rate": 6.342590351648758e-05, |
| "loss": 0.0264, |
| "step": 2950 |
| }, |
| { |
| "epoch": 4.444444444444445, |
| "grad_norm": 0.4648733139038086, |
| "learning_rate": 6.31865880077923e-05, |
| "loss": 0.0191, |
| "step": 2960 |
| }, |
| { |
| "epoch": 4.45945945945946, |
| "grad_norm": 0.39835986495018005, |
| "learning_rate": 6.294694738513417e-05, |
| "loss": 0.0165, |
| "step": 2970 |
| }, |
| { |
| "epoch": 4.474474474474475, |
| "grad_norm": 0.46081939339637756, |
| "learning_rate": 6.270698755682792e-05, |
| "loss": 0.0149, |
| "step": 2980 |
| }, |
| { |
| "epoch": 4.48948948948949, |
| "grad_norm": 0.4409334659576416, |
| "learning_rate": 6.246671443905832e-05, |
| "loss": 0.0173, |
| "step": 2990 |
| }, |
| { |
| "epoch": 4.504504504504505, |
| "grad_norm": 0.49880316853523254, |
| "learning_rate": 6.222613395573421e-05, |
| "loss": 0.0197, |
| "step": 3000 |
| }, |
| { |
| "epoch": 4.51951951951952, |
| "grad_norm": 0.5618528723716736, |
| "learning_rate": 6.19852520383425e-05, |
| "loss": 0.0181, |
| "step": 3010 |
| }, |
| { |
| "epoch": 4.534534534534535, |
| "grad_norm": 0.5146006941795349, |
| "learning_rate": 6.174407462580197e-05, |
| "loss": 0.0175, |
| "step": 3020 |
| }, |
| { |
| "epoch": 4.54954954954955, |
| "grad_norm": 0.2845626175403595, |
| "learning_rate": 6.150260766431673e-05, |
| "loss": 0.0173, |
| "step": 3030 |
| }, |
| { |
| "epoch": 4.564564564564565, |
| "grad_norm": 0.5690332055091858, |
| "learning_rate": 6.126085710722975e-05, |
| "loss": 0.0209, |
| "step": 3040 |
| }, |
| { |
| "epoch": 4.57957957957958, |
| "grad_norm": 0.5911976099014282, |
| "learning_rate": 6.1018828914876e-05, |
| "loss": 0.0252, |
| "step": 3050 |
| }, |
| { |
| "epoch": 4.594594594594595, |
| "grad_norm": 0.3847181499004364, |
| "learning_rate": 6.077652905443554e-05, |
| "loss": 0.0214, |
| "step": 3060 |
| }, |
| { |
| "epoch": 4.60960960960961, |
| "grad_norm": 0.47133636474609375, |
| "learning_rate": 6.0533963499786314e-05, |
| "loss": 0.0223, |
| "step": 3070 |
| }, |
| { |
| "epoch": 4.624624624624625, |
| "grad_norm": 0.5316842794418335, |
| "learning_rate": 6.0291138231357014e-05, |
| "loss": 0.017, |
| "step": 3080 |
| }, |
| { |
| "epoch": 4.63963963963964, |
| "grad_norm": 0.29324743151664734, |
| "learning_rate": 6.004805923597948e-05, |
| "loss": 0.0211, |
| "step": 3090 |
| }, |
| { |
| "epoch": 4.654654654654655, |
| "grad_norm": 0.5077915787696838, |
| "learning_rate": 5.980473250674119e-05, |
| "loss": 0.0148, |
| "step": 3100 |
| }, |
| { |
| "epoch": 4.66966966966967, |
| "grad_norm": 0.2967468202114105, |
| "learning_rate": 5.9561164042837495e-05, |
| "loss": 0.017, |
| "step": 3110 |
| }, |
| { |
| "epoch": 4.684684684684685, |
| "grad_norm": 0.4611144959926605, |
| "learning_rate": 5.931735984942367e-05, |
| "loss": 0.0216, |
| "step": 3120 |
| }, |
| { |
| "epoch": 4.6996996996997, |
| "grad_norm": 0.34769314527511597, |
| "learning_rate": 5.907332593746684e-05, |
| "loss": 0.0164, |
| "step": 3130 |
| }, |
| { |
| "epoch": 4.714714714714715, |
| "grad_norm": 0.2664547264575958, |
| "learning_rate": 5.8829068323597894e-05, |
| "loss": 0.0177, |
| "step": 3140 |
| }, |
| { |
| "epoch": 4.72972972972973, |
| "grad_norm": 0.27916815876960754, |
| "learning_rate": 5.858459302996301e-05, |
| "loss": 0.0133, |
| "step": 3150 |
| }, |
| { |
| "epoch": 4.744744744744745, |
| "grad_norm": 0.45642387866973877, |
| "learning_rate": 5.833990608407525e-05, |
| "loss": 0.0214, |
| "step": 3160 |
| }, |
| { |
| "epoch": 4.75975975975976, |
| "grad_norm": 0.31843769550323486, |
| "learning_rate": 5.8095013518665955e-05, |
| "loss": 0.0211, |
| "step": 3170 |
| }, |
| { |
| "epoch": 4.774774774774775, |
| "grad_norm": 0.28955334424972534, |
| "learning_rate": 5.784992137153597e-05, |
| "loss": 0.0162, |
| "step": 3180 |
| }, |
| { |
| "epoch": 4.78978978978979, |
| "grad_norm": 0.2762531340122223, |
| "learning_rate": 5.7604635685406874e-05, |
| "loss": 0.0191, |
| "step": 3190 |
| }, |
| { |
| "epoch": 4.804804804804805, |
| "grad_norm": 0.4223916828632355, |
| "learning_rate": 5.7359162507771804e-05, |
| "loss": 0.0192, |
| "step": 3200 |
| }, |
| { |
| "epoch": 4.81981981981982, |
| "grad_norm": 0.3013556897640228, |
| "learning_rate": 5.71135078907466e-05, |
| "loss": 0.0176, |
| "step": 3210 |
| }, |
| { |
| "epoch": 4.834834834834835, |
| "grad_norm": 0.5810731649398804, |
| "learning_rate": 5.686767789092041e-05, |
| "loss": 0.0203, |
| "step": 3220 |
| }, |
| { |
| "epoch": 4.84984984984985, |
| "grad_norm": 0.23505333065986633, |
| "learning_rate": 5.662167856920646e-05, |
| "loss": 0.017, |
| "step": 3230 |
| }, |
| { |
| "epoch": 4.864864864864865, |
| "grad_norm": 0.4303601086139679, |
| "learning_rate": 5.637551599069255e-05, |
| "loss": 0.0186, |
| "step": 3240 |
| }, |
| { |
| "epoch": 4.87987987987988, |
| "grad_norm": 0.3556419014930725, |
| "learning_rate": 5.6129196224491595e-05, |
| "loss": 0.0197, |
| "step": 3250 |
| }, |
| { |
| "epoch": 4.894894894894895, |
| "grad_norm": 0.4908639192581177, |
| "learning_rate": 5.588272534359192e-05, |
| "loss": 0.0204, |
| "step": 3260 |
| }, |
| { |
| "epoch": 4.90990990990991, |
| "grad_norm": 0.46801817417144775, |
| "learning_rate": 5.563610942470765e-05, |
| "loss": 0.0172, |
| "step": 3270 |
| }, |
| { |
| "epoch": 4.924924924924925, |
| "grad_norm": 0.5240958333015442, |
| "learning_rate": 5.538935454812868e-05, |
| "loss": 0.0212, |
| "step": 3280 |
| }, |
| { |
| "epoch": 4.93993993993994, |
| "grad_norm": 0.4292217493057251, |
| "learning_rate": 5.514246679757099e-05, |
| "loss": 0.0173, |
| "step": 3290 |
| }, |
| { |
| "epoch": 4.954954954954955, |
| "grad_norm": 0.4419410228729248, |
| "learning_rate": 5.4895452260026516e-05, |
| "loss": 0.0173, |
| "step": 3300 |
| }, |
| { |
| "epoch": 4.96996996996997, |
| "grad_norm": 0.4300529956817627, |
| "learning_rate": 5.464831702561313e-05, |
| "loss": 0.017, |
| "step": 3310 |
| }, |
| { |
| "epoch": 4.984984984984985, |
| "grad_norm": 0.3810187876224518, |
| "learning_rate": 5.440106718742445e-05, |
| "loss": 0.0166, |
| "step": 3320 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.4891253411769867, |
| "learning_rate": 5.415370884137966e-05, |
| "loss": 0.0212, |
| "step": 3330 |
| }, |
| { |
| "epoch": 5.015015015015015, |
| "grad_norm": 0.6054010391235352, |
| "learning_rate": 5.3906248086073205e-05, |
| "loss": 0.021, |
| "step": 3340 |
| }, |
| { |
| "epoch": 5.03003003003003, |
| "grad_norm": 0.5947943925857544, |
| "learning_rate": 5.365869102262436e-05, |
| "loss": 0.0227, |
| "step": 3350 |
| }, |
| { |
| "epoch": 5.045045045045045, |
| "grad_norm": 0.2658171057701111, |
| "learning_rate": 5.341104375452696e-05, |
| "loss": 0.0165, |
| "step": 3360 |
| }, |
| { |
| "epoch": 5.06006006006006, |
| "grad_norm": 0.4237753748893738, |
| "learning_rate": 5.3163312387498735e-05, |
| "loss": 0.0207, |
| "step": 3370 |
| }, |
| { |
| "epoch": 5.075075075075075, |
| "grad_norm": 0.47378355264663696, |
| "learning_rate": 5.2915503029330915e-05, |
| "loss": 0.0167, |
| "step": 3380 |
| }, |
| { |
| "epoch": 5.09009009009009, |
| "grad_norm": 0.2642626166343689, |
| "learning_rate": 5.266762178973759e-05, |
| "loss": 0.0187, |
| "step": 3390 |
| }, |
| { |
| "epoch": 5.105105105105105, |
| "grad_norm": 0.5872535109519958, |
| "learning_rate": 5.2419674780205074e-05, |
| "loss": 0.02, |
| "step": 3400 |
| }, |
| { |
| "epoch": 5.12012012012012, |
| "grad_norm": 0.3369291424751282, |
| "learning_rate": 5.217166811384121e-05, |
| "loss": 0.0193, |
| "step": 3410 |
| }, |
| { |
| "epoch": 5.135135135135135, |
| "grad_norm": 0.3701128363609314, |
| "learning_rate": 5.1923607905224725e-05, |
| "loss": 0.0195, |
| "step": 3420 |
| }, |
| { |
| "epoch": 5.15015015015015, |
| "grad_norm": 0.5345423817634583, |
| "learning_rate": 5.167550027025438e-05, |
| "loss": 0.0153, |
| "step": 3430 |
| }, |
| { |
| "epoch": 5.165165165165165, |
| "grad_norm": 0.47684115171432495, |
| "learning_rate": 5.142735132599824e-05, |
| "loss": 0.019, |
| "step": 3440 |
| }, |
| { |
| "epoch": 5.18018018018018, |
| "grad_norm": 0.3032578229904175, |
| "learning_rate": 5.1179167190542856e-05, |
| "loss": 0.0177, |
| "step": 3450 |
| }, |
| { |
| "epoch": 5.195195195195195, |
| "grad_norm": 0.414538711309433, |
| "learning_rate": 5.093095398284239e-05, |
| "loss": 0.0166, |
| "step": 3460 |
| }, |
| { |
| "epoch": 5.21021021021021, |
| "grad_norm": 0.5710539221763611, |
| "learning_rate": 5.068271782256782e-05, |
| "loss": 0.017, |
| "step": 3470 |
| }, |
| { |
| "epoch": 5.225225225225225, |
| "grad_norm": 0.42450839281082153, |
| "learning_rate": 5.0434464829955965e-05, |
| "loss": 0.0167, |
| "step": 3480 |
| }, |
| { |
| "epoch": 5.24024024024024, |
| "grad_norm": 0.3924393057823181, |
| "learning_rate": 5.018620112565871e-05, |
| "loss": 0.0171, |
| "step": 3490 |
| }, |
| { |
| "epoch": 5.255255255255255, |
| "grad_norm": 0.4721011817455292, |
| "learning_rate": 4.993793283059193e-05, |
| "loss": 0.0181, |
| "step": 3500 |
| }, |
| { |
| "epoch": 5.27027027027027, |
| "grad_norm": 0.3422966003417969, |
| "learning_rate": 4.968966606578482e-05, |
| "loss": 0.02, |
| "step": 3510 |
| }, |
| { |
| "epoch": 5.285285285285285, |
| "grad_norm": 0.4170960485935211, |
| "learning_rate": 4.9441406952228734e-05, |
| "loss": 0.0142, |
| "step": 3520 |
| }, |
| { |
| "epoch": 5.3003003003003, |
| "grad_norm": 0.42160433530807495, |
| "learning_rate": 4.919316161072646e-05, |
| "loss": 0.0169, |
| "step": 3530 |
| }, |
| { |
| "epoch": 5.315315315315315, |
| "grad_norm": 0.47999799251556396, |
| "learning_rate": 4.894493616174117e-05, |
| "loss": 0.0158, |
| "step": 3540 |
| }, |
| { |
| "epoch": 5.33033033033033, |
| "grad_norm": 0.39213597774505615, |
| "learning_rate": 4.8696736725245624e-05, |
| "loss": 0.0152, |
| "step": 3550 |
| }, |
| { |
| "epoch": 5.345345345345345, |
| "grad_norm": 0.36117327213287354, |
| "learning_rate": 4.8448569420571277e-05, |
| "loss": 0.0162, |
| "step": 3560 |
| }, |
| { |
| "epoch": 5.36036036036036, |
| "grad_norm": 0.2900219261646271, |
| "learning_rate": 4.820044036625734e-05, |
| "loss": 0.0148, |
| "step": 3570 |
| }, |
| { |
| "epoch": 5.375375375375375, |
| "grad_norm": 0.34812745451927185, |
| "learning_rate": 4.7952355679899966e-05, |
| "loss": 0.0132, |
| "step": 3580 |
| }, |
| { |
| "epoch": 5.39039039039039, |
| "grad_norm": 0.3243030905723572, |
| "learning_rate": 4.7704321478001415e-05, |
| "loss": 0.0146, |
| "step": 3590 |
| }, |
| { |
| "epoch": 5.405405405405405, |
| "grad_norm": 0.6267113089561462, |
| "learning_rate": 4.7456343875819293e-05, |
| "loss": 0.0138, |
| "step": 3600 |
| }, |
| { |
| "epoch": 5.42042042042042, |
| "grad_norm": 0.5994648337364197, |
| "learning_rate": 4.720842898721569e-05, |
| "loss": 0.0142, |
| "step": 3610 |
| }, |
| { |
| "epoch": 5.435435435435435, |
| "grad_norm": 0.2001846730709076, |
| "learning_rate": 4.6960582924506555e-05, |
| "loss": 0.0181, |
| "step": 3620 |
| }, |
| { |
| "epoch": 5.45045045045045, |
| "grad_norm": 0.27956146001815796, |
| "learning_rate": 4.671281179831092e-05, |
| "loss": 0.0139, |
| "step": 3630 |
| }, |
| { |
| "epoch": 5.465465465465465, |
| "grad_norm": 0.1786326915025711, |
| "learning_rate": 4.6465121717400275e-05, |
| "loss": 0.0141, |
| "step": 3640 |
| }, |
| { |
| "epoch": 5.48048048048048, |
| "grad_norm": 0.28647634387016296, |
| "learning_rate": 4.62175187885479e-05, |
| "loss": 0.016, |
| "step": 3650 |
| }, |
| { |
| "epoch": 5.495495495495495, |
| "grad_norm": 0.4608069956302643, |
| "learning_rate": 4.5970009116378424e-05, |
| "loss": 0.0177, |
| "step": 3660 |
| }, |
| { |
| "epoch": 5.51051051051051, |
| "grad_norm": 0.4186050295829773, |
| "learning_rate": 4.5722598803217135e-05, |
| "loss": 0.0226, |
| "step": 3670 |
| }, |
| { |
| "epoch": 5.525525525525525, |
| "grad_norm": 0.3347811698913574, |
| "learning_rate": 4.5475293948939795e-05, |
| "loss": 0.0168, |
| "step": 3680 |
| }, |
| { |
| "epoch": 5.54054054054054, |
| "grad_norm": 0.35979321599006653, |
| "learning_rate": 4.522810065082194e-05, |
| "loss": 0.0183, |
| "step": 3690 |
| }, |
| { |
| "epoch": 5.555555555555555, |
| "grad_norm": 0.5392782688140869, |
| "learning_rate": 4.4981025003388786e-05, |
| "loss": 0.0138, |
| "step": 3700 |
| }, |
| { |
| "epoch": 5.57057057057057, |
| "grad_norm": 0.43829211592674255, |
| "learning_rate": 4.4734073098264816e-05, |
| "loss": 0.0161, |
| "step": 3710 |
| }, |
| { |
| "epoch": 5.585585585585585, |
| "grad_norm": 0.3537711501121521, |
| "learning_rate": 4.448725102402373e-05, |
| "loss": 0.019, |
| "step": 3720 |
| }, |
| { |
| "epoch": 5.6006006006006, |
| "grad_norm": 0.2743152379989624, |
| "learning_rate": 4.4240564866038204e-05, |
| "loss": 0.0151, |
| "step": 3730 |
| }, |
| { |
| "epoch": 5.615615615615615, |
| "grad_norm": 0.7068215012550354, |
| "learning_rate": 4.3994020706329974e-05, |
| "loss": 0.0193, |
| "step": 3740 |
| }, |
| { |
| "epoch": 5.63063063063063, |
| "grad_norm": 0.4887137711048126, |
| "learning_rate": 4.374762462341975e-05, |
| "loss": 0.018, |
| "step": 3750 |
| }, |
| { |
| "epoch": 5.645645645645645, |
| "grad_norm": 0.45829343795776367, |
| "learning_rate": 4.350138269217748e-05, |
| "loss": 0.0149, |
| "step": 3760 |
| }, |
| { |
| "epoch": 5.66066066066066, |
| "grad_norm": 0.34931594133377075, |
| "learning_rate": 4.325530098367251e-05, |
| "loss": 0.0167, |
| "step": 3770 |
| }, |
| { |
| "epoch": 5.675675675675675, |
| "grad_norm": 0.3233775198459625, |
| "learning_rate": 4.3009385565023865e-05, |
| "loss": 0.0162, |
| "step": 3780 |
| }, |
| { |
| "epoch": 5.6906906906906904, |
| "grad_norm": 0.32048696279525757, |
| "learning_rate": 4.276364249925076e-05, |
| "loss": 0.0148, |
| "step": 3790 |
| }, |
| { |
| "epoch": 5.7057057057057055, |
| "grad_norm": 0.16194048523902893, |
| "learning_rate": 4.251807784512306e-05, |
| "loss": 0.015, |
| "step": 3800 |
| }, |
| { |
| "epoch": 5.7207207207207205, |
| "grad_norm": 0.4419267475605011, |
| "learning_rate": 4.2272697657011925e-05, |
| "loss": 0.0173, |
| "step": 3810 |
| }, |
| { |
| "epoch": 5.7357357357357355, |
| "grad_norm": 0.35065025091171265, |
| "learning_rate": 4.202750798474046e-05, |
| "loss": 0.0169, |
| "step": 3820 |
| }, |
| { |
| "epoch": 5.7507507507507505, |
| "grad_norm": 0.2966877818107605, |
| "learning_rate": 4.178251487343471e-05, |
| "loss": 0.0208, |
| "step": 3830 |
| }, |
| { |
| "epoch": 5.7657657657657655, |
| "grad_norm": 0.3845357894897461, |
| "learning_rate": 4.153772436337443e-05, |
| "loss": 0.0172, |
| "step": 3840 |
| }, |
| { |
| "epoch": 5.7807807807807805, |
| "grad_norm": 0.21244952082633972, |
| "learning_rate": 4.129314248984435e-05, |
| "loss": 0.0178, |
| "step": 3850 |
| }, |
| { |
| "epoch": 5.7957957957957955, |
| "grad_norm": 0.3762425482273102, |
| "learning_rate": 4.104877528298523e-05, |
| "loss": 0.0164, |
| "step": 3860 |
| }, |
| { |
| "epoch": 5.8108108108108105, |
| "grad_norm": 0.43916618824005127, |
| "learning_rate": 4.080462876764531e-05, |
| "loss": 0.0183, |
| "step": 3870 |
| }, |
| { |
| "epoch": 5.8258258258258255, |
| "grad_norm": 0.4342308044433594, |
| "learning_rate": 4.056070896323163e-05, |
| "loss": 0.0175, |
| "step": 3880 |
| }, |
| { |
| "epoch": 5.8408408408408405, |
| "grad_norm": 0.3472575545310974, |
| "learning_rate": 4.0317021883561736e-05, |
| "loss": 0.0135, |
| "step": 3890 |
| }, |
| { |
| "epoch": 5.8558558558558556, |
| "grad_norm": 0.24840188026428223, |
| "learning_rate": 4.0073573536715374e-05, |
| "loss": 0.0115, |
| "step": 3900 |
| }, |
| { |
| "epoch": 5.870870870870871, |
| "grad_norm": 0.3296308219432831, |
| "learning_rate": 3.9830369924886326e-05, |
| "loss": 0.0159, |
| "step": 3910 |
| }, |
| { |
| "epoch": 5.885885885885886, |
| "grad_norm": 0.46094295382499695, |
| "learning_rate": 3.958741704423447e-05, |
| "loss": 0.0162, |
| "step": 3920 |
| }, |
| { |
| "epoch": 5.900900900900901, |
| "grad_norm": 0.5159643888473511, |
| "learning_rate": 3.9344720884737966e-05, |
| "loss": 0.0139, |
| "step": 3930 |
| }, |
| { |
| "epoch": 5.915915915915916, |
| "grad_norm": 0.4802702069282532, |
| "learning_rate": 3.9102287430045517e-05, |
| "loss": 0.0133, |
| "step": 3940 |
| }, |
| { |
| "epoch": 5.930930930930931, |
| "grad_norm": 0.44241219758987427, |
| "learning_rate": 3.886012265732883e-05, |
| "loss": 0.0151, |
| "step": 3950 |
| }, |
| { |
| "epoch": 5.945945945945946, |
| "grad_norm": 0.40663307905197144, |
| "learning_rate": 3.861823253713535e-05, |
| "loss": 0.0142, |
| "step": 3960 |
| }, |
| { |
| "epoch": 5.960960960960961, |
| "grad_norm": 0.3772605359554291, |
| "learning_rate": 3.837662303324093e-05, |
| "loss": 0.0149, |
| "step": 3970 |
| }, |
| { |
| "epoch": 5.975975975975976, |
| "grad_norm": 0.36921435594558716, |
| "learning_rate": 3.81353001025029e-05, |
| "loss": 0.0146, |
| "step": 3980 |
| }, |
| { |
| "epoch": 5.990990990990991, |
| "grad_norm": 0.29234158992767334, |
| "learning_rate": 3.7894269694713156e-05, |
| "loss": 0.0107, |
| "step": 3990 |
| }, |
| { |
| "epoch": 6.006006006006006, |
| "grad_norm": 0.36290058493614197, |
| "learning_rate": 3.7653537752451475e-05, |
| "loss": 0.0186, |
| "step": 4000 |
| }, |
| { |
| "epoch": 6.021021021021021, |
| "grad_norm": 0.5493783950805664, |
| "learning_rate": 3.741311021093895e-05, |
| "loss": 0.0157, |
| "step": 4010 |
| }, |
| { |
| "epoch": 6.036036036036036, |
| "grad_norm": 0.35067814588546753, |
| "learning_rate": 3.7172992997891754e-05, |
| "loss": 0.0117, |
| "step": 4020 |
| }, |
| { |
| "epoch": 6.051051051051051, |
| "grad_norm": 0.34015220403671265, |
| "learning_rate": 3.693319203337494e-05, |
| "loss": 0.0184, |
| "step": 4030 |
| }, |
| { |
| "epoch": 6.066066066066066, |
| "grad_norm": 0.40151917934417725, |
| "learning_rate": 3.669371322965644e-05, |
| "loss": 0.0185, |
| "step": 4040 |
| }, |
| { |
| "epoch": 6.081081081081081, |
| "grad_norm": 0.3989444077014923, |
| "learning_rate": 3.645456249106142e-05, |
| "loss": 0.0185, |
| "step": 4050 |
| }, |
| { |
| "epoch": 6.096096096096096, |
| "grad_norm": 0.2835889756679535, |
| "learning_rate": 3.6215745713826584e-05, |
| "loss": 0.0144, |
| "step": 4060 |
| }, |
| { |
| "epoch": 6.111111111111111, |
| "grad_norm": 0.4811089038848877, |
| "learning_rate": 3.597726878595486e-05, |
| "loss": 0.0151, |
| "step": 4070 |
| }, |
| { |
| "epoch": 6.126126126126126, |
| "grad_norm": 0.3830724060535431, |
| "learning_rate": 3.5739137587070214e-05, |
| "loss": 0.0131, |
| "step": 4080 |
| }, |
| { |
| "epoch": 6.141141141141141, |
| "grad_norm": 0.5573437213897705, |
| "learning_rate": 3.5501357988272745e-05, |
| "loss": 0.0165, |
| "step": 4090 |
| }, |
| { |
| "epoch": 6.156156156156156, |
| "grad_norm": 0.25924408435821533, |
| "learning_rate": 3.52639358519938e-05, |
| "loss": 0.014, |
| "step": 4100 |
| }, |
| { |
| "epoch": 6.171171171171171, |
| "grad_norm": 0.28238144516944885, |
| "learning_rate": 3.502687703185164e-05, |
| "loss": 0.0164, |
| "step": 4110 |
| }, |
| { |
| "epoch": 6.186186186186186, |
| "grad_norm": 0.2565016448497772, |
| "learning_rate": 3.479018737250694e-05, |
| "loss": 0.012, |
| "step": 4120 |
| }, |
| { |
| "epoch": 6.201201201201201, |
| "grad_norm": 0.539279043674469, |
| "learning_rate": 3.455387270951874e-05, |
| "loss": 0.012, |
| "step": 4130 |
| }, |
| { |
| "epoch": 6.216216216216216, |
| "grad_norm": 0.21729649603366852, |
| "learning_rate": 3.431793886920061e-05, |
| "loss": 0.0147, |
| "step": 4140 |
| }, |
| { |
| "epoch": 6.231231231231231, |
| "grad_norm": 0.5940044522285461, |
| "learning_rate": 3.408239166847696e-05, |
| "loss": 0.0105, |
| "step": 4150 |
| }, |
| { |
| "epoch": 6.246246246246246, |
| "grad_norm": 0.3688409924507141, |
| "learning_rate": 3.3847236914739654e-05, |
| "loss": 0.0141, |
| "step": 4160 |
| }, |
| { |
| "epoch": 6.261261261261261, |
| "grad_norm": 0.48852694034576416, |
| "learning_rate": 3.361248040570484e-05, |
| "loss": 0.0154, |
| "step": 4170 |
| }, |
| { |
| "epoch": 6.276276276276276, |
| "grad_norm": 0.6311971545219421, |
| "learning_rate": 3.337812792926992e-05, |
| "loss": 0.0191, |
| "step": 4180 |
| }, |
| { |
| "epoch": 6.291291291291291, |
| "grad_norm": 0.34159210324287415, |
| "learning_rate": 3.3144185263370976e-05, |
| "loss": 0.015, |
| "step": 4190 |
| }, |
| { |
| "epoch": 6.306306306306306, |
| "grad_norm": 0.48673540353775024, |
| "learning_rate": 3.2910658175840226e-05, |
| "loss": 0.0121, |
| "step": 4200 |
| }, |
| { |
| "epoch": 6.321321321321321, |
| "grad_norm": 0.34678933024406433, |
| "learning_rate": 3.267755242426384e-05, |
| "loss": 0.0134, |
| "step": 4210 |
| }, |
| { |
| "epoch": 6.336336336336337, |
| "grad_norm": 0.27392464876174927, |
| "learning_rate": 3.2444873755839975e-05, |
| "loss": 0.0104, |
| "step": 4220 |
| }, |
| { |
| "epoch": 6.351351351351352, |
| "grad_norm": 0.3353574872016907, |
| "learning_rate": 3.221262790723719e-05, |
| "loss": 0.0143, |
| "step": 4230 |
| }, |
| { |
| "epoch": 6.366366366366367, |
| "grad_norm": 0.20981314778327942, |
| "learning_rate": 3.198082060445281e-05, |
| "loss": 0.0107, |
| "step": 4240 |
| }, |
| { |
| "epoch": 6.381381381381382, |
| "grad_norm": 0.3146778643131256, |
| "learning_rate": 3.1749457562671905e-05, |
| "loss": 0.0116, |
| "step": 4250 |
| }, |
| { |
| "epoch": 6.396396396396397, |
| "grad_norm": 0.3559088408946991, |
| "learning_rate": 3.151854448612633e-05, |
| "loss": 0.0148, |
| "step": 4260 |
| }, |
| { |
| "epoch": 6.411411411411412, |
| "grad_norm": 0.48589858412742615, |
| "learning_rate": 3.128808706795409e-05, |
| "loss": 0.0185, |
| "step": 4270 |
| }, |
| { |
| "epoch": 6.426426426426427, |
| "grad_norm": 0.350479394197464, |
| "learning_rate": 3.105809099005897e-05, |
| "loss": 0.0129, |
| "step": 4280 |
| }, |
| { |
| "epoch": 6.441441441441442, |
| "grad_norm": 0.40389105677604675, |
| "learning_rate": 3.082856192297051e-05, |
| "loss": 0.0146, |
| "step": 4290 |
| }, |
| { |
| "epoch": 6.456456456456457, |
| "grad_norm": 0.4889410734176636, |
| "learning_rate": 3.05995055257041e-05, |
| "loss": 0.0214, |
| "step": 4300 |
| }, |
| { |
| "epoch": 6.471471471471472, |
| "grad_norm": 0.3033137023448944, |
| "learning_rate": 3.0370927445621477e-05, |
| "loss": 0.0106, |
| "step": 4310 |
| }, |
| { |
| "epoch": 6.486486486486487, |
| "grad_norm": 0.4891495704650879, |
| "learning_rate": 3.0142833318291592e-05, |
| "loss": 0.012, |
| "step": 4320 |
| }, |
| { |
| "epoch": 6.501501501501502, |
| "grad_norm": 0.2705667018890381, |
| "learning_rate": 2.991522876735154e-05, |
| "loss": 0.0136, |
| "step": 4330 |
| }, |
| { |
| "epoch": 6.516516516516517, |
| "grad_norm": 0.2685934603214264, |
| "learning_rate": 2.9688119404367966e-05, |
| "loss": 0.0142, |
| "step": 4340 |
| }, |
| { |
| "epoch": 6.531531531531532, |
| "grad_norm": 0.18210811913013458, |
| "learning_rate": 2.9461510828698702e-05, |
| "loss": 0.0125, |
| "step": 4350 |
| }, |
| { |
| "epoch": 6.546546546546547, |
| "grad_norm": 0.24782325327396393, |
| "learning_rate": 2.9235408627354766e-05, |
| "loss": 0.0134, |
| "step": 4360 |
| }, |
| { |
| "epoch": 6.561561561561562, |
| "grad_norm": 0.2617192566394806, |
| "learning_rate": 2.900981837486253e-05, |
| "loss": 0.0146, |
| "step": 4370 |
| }, |
| { |
| "epoch": 6.576576576576577, |
| "grad_norm": 0.43912070989608765, |
| "learning_rate": 2.878474563312632e-05, |
| "loss": 0.0139, |
| "step": 4380 |
| }, |
| { |
| "epoch": 6.591591591591592, |
| "grad_norm": 0.24311336874961853, |
| "learning_rate": 2.856019595129127e-05, |
| "loss": 0.0124, |
| "step": 4390 |
| }, |
| { |
| "epoch": 6.606606606606607, |
| "grad_norm": 0.3276002109050751, |
| "learning_rate": 2.8336174865606584e-05, |
| "loss": 0.0147, |
| "step": 4400 |
| }, |
| { |
| "epoch": 6.621621621621622, |
| "grad_norm": 0.2517334520816803, |
| "learning_rate": 2.8112687899288924e-05, |
| "loss": 0.0146, |
| "step": 4410 |
| }, |
| { |
| "epoch": 6.636636636636637, |
| "grad_norm": 0.3244163691997528, |
| "learning_rate": 2.7889740562386358e-05, |
| "loss": 0.012, |
| "step": 4420 |
| }, |
| { |
| "epoch": 6.651651651651652, |
| "grad_norm": 0.3804556131362915, |
| "learning_rate": 2.766733835164236e-05, |
| "loss": 0.0114, |
| "step": 4430 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.6479921936988831, |
| "learning_rate": 2.7445486750360462e-05, |
| "loss": 0.0135, |
| "step": 4440 |
| }, |
| { |
| "epoch": 6.681681681681682, |
| "grad_norm": 0.2410162389278412, |
| "learning_rate": 2.7224191228268913e-05, |
| "loss": 0.0145, |
| "step": 4450 |
| }, |
| { |
| "epoch": 6.696696696696697, |
| "grad_norm": 0.2628104090690613, |
| "learning_rate": 2.700345724138591e-05, |
| "loss": 0.0122, |
| "step": 4460 |
| }, |
| { |
| "epoch": 6.711711711711712, |
| "grad_norm": 0.2546948492527008, |
| "learning_rate": 2.6783290231885072e-05, |
| "loss": 0.0086, |
| "step": 4470 |
| }, |
| { |
| "epoch": 6.726726726726727, |
| "grad_norm": 0.3053324222564697, |
| "learning_rate": 2.656369562796125e-05, |
| "loss": 0.0126, |
| "step": 4480 |
| }, |
| { |
| "epoch": 6.741741741741742, |
| "grad_norm": 0.6292986869812012, |
| "learning_rate": 2.634467884369667e-05, |
| "loss": 0.0178, |
| "step": 4490 |
| }, |
| { |
| "epoch": 6.756756756756757, |
| "grad_norm": 0.3189369738101959, |
| "learning_rate": 2.6126245278927452e-05, |
| "loss": 0.0173, |
| "step": 4500 |
| }, |
| { |
| "epoch": 6.771771771771772, |
| "grad_norm": 0.295524537563324, |
| "learning_rate": 2.5908400319110587e-05, |
| "loss": 0.0128, |
| "step": 4510 |
| }, |
| { |
| "epoch": 6.786786786786787, |
| "grad_norm": 0.3470920920372009, |
| "learning_rate": 2.569114933519099e-05, |
| "loss": 0.0134, |
| "step": 4520 |
| }, |
| { |
| "epoch": 6.801801801801802, |
| "grad_norm": 0.27639520168304443, |
| "learning_rate": 2.5474497683469177e-05, |
| "loss": 0.0121, |
| "step": 4530 |
| }, |
| { |
| "epoch": 6.816816816816817, |
| "grad_norm": 0.34324508905410767, |
| "learning_rate": 2.5258450705469217e-05, |
| "loss": 0.0107, |
| "step": 4540 |
| }, |
| { |
| "epoch": 6.831831831831832, |
| "grad_norm": 0.3578449785709381, |
| "learning_rate": 2.504301372780702e-05, |
| "loss": 0.0154, |
| "step": 4550 |
| }, |
| { |
| "epoch": 6.846846846846847, |
| "grad_norm": 0.29269954562187195, |
| "learning_rate": 2.482819206205896e-05, |
| "loss": 0.0132, |
| "step": 4560 |
| }, |
| { |
| "epoch": 6.861861861861862, |
| "grad_norm": 0.2745559513568878, |
| "learning_rate": 2.461399100463095e-05, |
| "loss": 0.0142, |
| "step": 4570 |
| }, |
| { |
| "epoch": 6.876876876876877, |
| "grad_norm": 0.4530208706855774, |
| "learning_rate": 2.440041583662794e-05, |
| "loss": 0.0119, |
| "step": 4580 |
| }, |
| { |
| "epoch": 6.891891891891892, |
| "grad_norm": 0.2549501657485962, |
| "learning_rate": 2.4187471823723555e-05, |
| "loss": 0.0106, |
| "step": 4590 |
| }, |
| { |
| "epoch": 6.906906906906907, |
| "grad_norm": 0.34863579273223877, |
| "learning_rate": 2.3975164216030456e-05, |
| "loss": 0.0115, |
| "step": 4600 |
| }, |
| { |
| "epoch": 6.921921921921922, |
| "grad_norm": 0.17822623252868652, |
| "learning_rate": 2.376349824797068e-05, |
| "loss": 0.0187, |
| "step": 4610 |
| }, |
| { |
| "epoch": 6.936936936936937, |
| "grad_norm": 0.1522684544324875, |
| "learning_rate": 2.3552479138146816e-05, |
| "loss": 0.015, |
| "step": 4620 |
| }, |
| { |
| "epoch": 6.951951951951952, |
| "grad_norm": 0.39343202114105225, |
| "learning_rate": 2.3342112089213163e-05, |
| "loss": 0.0124, |
| "step": 4630 |
| }, |
| { |
| "epoch": 6.966966966966967, |
| "grad_norm": 0.4499475061893463, |
| "learning_rate": 2.3132402287747546e-05, |
| "loss": 0.012, |
| "step": 4640 |
| }, |
| { |
| "epoch": 6.981981981981982, |
| "grad_norm": 0.5692602396011353, |
| "learning_rate": 2.292335490412338e-05, |
| "loss": 0.0146, |
| "step": 4650 |
| }, |
| { |
| "epoch": 6.996996996996997, |
| "grad_norm": 0.3447282016277313, |
| "learning_rate": 2.271497509238235e-05, |
| "loss": 0.0137, |
| "step": 4660 |
| }, |
| { |
| "epoch": 7.012012012012012, |
| "grad_norm": 0.38220450282096863, |
| "learning_rate": 2.2507267990107122e-05, |
| "loss": 0.0156, |
| "step": 4670 |
| }, |
| { |
| "epoch": 7.027027027027027, |
| "grad_norm": 0.3092460334300995, |
| "learning_rate": 2.2300238718294803e-05, |
| "loss": 0.0128, |
| "step": 4680 |
| }, |
| { |
| "epoch": 7.042042042042042, |
| "grad_norm": 0.3238361179828644, |
| "learning_rate": 2.2093892381230658e-05, |
| "loss": 0.0135, |
| "step": 4690 |
| }, |
| { |
| "epoch": 7.057057057057057, |
| "grad_norm": 0.44639965891838074, |
| "learning_rate": 2.1888234066362302e-05, |
| "loss": 0.0157, |
| "step": 4700 |
| }, |
| { |
| "epoch": 7.072072072072072, |
| "grad_norm": 0.2918453812599182, |
| "learning_rate": 2.1683268844174183e-05, |
| "loss": 0.0091, |
| "step": 4710 |
| }, |
| { |
| "epoch": 7.087087087087087, |
| "grad_norm": 0.28555265069007874, |
| "learning_rate": 2.1479001768062668e-05, |
| "loss": 0.0129, |
| "step": 4720 |
| }, |
| { |
| "epoch": 7.102102102102102, |
| "grad_norm": 0.19259797036647797, |
| "learning_rate": 2.1275437874211355e-05, |
| "loss": 0.0081, |
| "step": 4730 |
| }, |
| { |
| "epoch": 7.117117117117117, |
| "grad_norm": 0.15251432359218597, |
| "learning_rate": 2.1072582181467017e-05, |
| "loss": 0.0105, |
| "step": 4740 |
| }, |
| { |
| "epoch": 7.132132132132132, |
| "grad_norm": 0.3018776476383209, |
| "learning_rate": 2.0870439691215747e-05, |
| "loss": 0.0132, |
| "step": 4750 |
| }, |
| { |
| "epoch": 7.147147147147147, |
| "grad_norm": 0.40798524022102356, |
| "learning_rate": 2.0669015387259722e-05, |
| "loss": 0.0111, |
| "step": 4760 |
| }, |
| { |
| "epoch": 7.162162162162162, |
| "grad_norm": 0.2255299836397171, |
| "learning_rate": 2.046831423569433e-05, |
| "loss": 0.0125, |
| "step": 4770 |
| }, |
| { |
| "epoch": 7.177177177177177, |
| "grad_norm": 0.1696217954158783, |
| "learning_rate": 2.026834118478567e-05, |
| "loss": 0.0145, |
| "step": 4780 |
| }, |
| { |
| "epoch": 7.192192192192192, |
| "grad_norm": 0.2747775614261627, |
| "learning_rate": 2.006910116484865e-05, |
| "loss": 0.0117, |
| "step": 4790 |
| }, |
| { |
| "epoch": 7.207207207207207, |
| "grad_norm": 0.42405006289482117, |
| "learning_rate": 1.9870599088125324e-05, |
| "loss": 0.0126, |
| "step": 4800 |
| }, |
| { |
| "epoch": 7.222222222222222, |
| "grad_norm": 0.2677384614944458, |
| "learning_rate": 1.967283984866387e-05, |
| "loss": 0.0087, |
| "step": 4810 |
| }, |
| { |
| "epoch": 7.237237237237237, |
| "grad_norm": 0.36573195457458496, |
| "learning_rate": 1.947582832219787e-05, |
| "loss": 0.0114, |
| "step": 4820 |
| }, |
| { |
| "epoch": 7.252252252252252, |
| "grad_norm": 0.2867136597633362, |
| "learning_rate": 1.92795693660261e-05, |
| "loss": 0.0151, |
| "step": 4830 |
| }, |
| { |
| "epoch": 7.267267267267267, |
| "grad_norm": 0.22826363146305084, |
| "learning_rate": 1.9084067818892838e-05, |
| "loss": 0.0107, |
| "step": 4840 |
| }, |
| { |
| "epoch": 7.282282282282282, |
| "grad_norm": 0.6654722094535828, |
| "learning_rate": 1.8889328500868514e-05, |
| "loss": 0.0135, |
| "step": 4850 |
| }, |
| { |
| "epoch": 7.297297297297297, |
| "grad_norm": 0.3256247043609619, |
| "learning_rate": 1.869535621323084e-05, |
| "loss": 0.014, |
| "step": 4860 |
| }, |
| { |
| "epoch": 7.312312312312312, |
| "grad_norm": 0.3317245841026306, |
| "learning_rate": 1.8502155738346488e-05, |
| "loss": 0.0097, |
| "step": 4870 |
| }, |
| { |
| "epoch": 7.327327327327327, |
| "grad_norm": 0.33596867322921753, |
| "learning_rate": 1.8309731839553213e-05, |
| "loss": 0.0129, |
| "step": 4880 |
| }, |
| { |
| "epoch": 7.342342342342342, |
| "grad_norm": 0.3030179738998413, |
| "learning_rate": 1.81180892610423e-05, |
| "loss": 0.0121, |
| "step": 4890 |
| }, |
| { |
| "epoch": 7.357357357357357, |
| "grad_norm": 0.42004838585853577, |
| "learning_rate": 1.7927232727741676e-05, |
| "loss": 0.0132, |
| "step": 4900 |
| }, |
| { |
| "epoch": 7.372372372372372, |
| "grad_norm": 0.31752675771713257, |
| "learning_rate": 1.7737166945199452e-05, |
| "loss": 0.0089, |
| "step": 4910 |
| }, |
| { |
| "epoch": 7.387387387387387, |
| "grad_norm": 0.4903360903263092, |
| "learning_rate": 1.7547896599467827e-05, |
| "loss": 0.0122, |
| "step": 4920 |
| }, |
| { |
| "epoch": 7.402402402402402, |
| "grad_norm": 0.29328206181526184, |
| "learning_rate": 1.7359426356987592e-05, |
| "loss": 0.012, |
| "step": 4930 |
| }, |
| { |
| "epoch": 7.4174174174174174, |
| "grad_norm": 0.37560057640075684, |
| "learning_rate": 1.717176086447306e-05, |
| "loss": 0.0082, |
| "step": 4940 |
| }, |
| { |
| "epoch": 7.4324324324324325, |
| "grad_norm": 0.38988494873046875, |
| "learning_rate": 1.698490474879752e-05, |
| "loss": 0.012, |
| "step": 4950 |
| }, |
| { |
| "epoch": 7.4474474474474475, |
| "grad_norm": 0.39179790019989014, |
| "learning_rate": 1.6798862616879186e-05, |
| "loss": 0.0129, |
| "step": 4960 |
| }, |
| { |
| "epoch": 7.4624624624624625, |
| "grad_norm": 0.2623908519744873, |
| "learning_rate": 1.661363905556758e-05, |
| "loss": 0.0131, |
| "step": 4970 |
| }, |
| { |
| "epoch": 7.4774774774774775, |
| "grad_norm": 0.3803854286670685, |
| "learning_rate": 1.642923863153043e-05, |
| "loss": 0.0104, |
| "step": 4980 |
| }, |
| { |
| "epoch": 7.4924924924924925, |
| "grad_norm": 0.38123732805252075, |
| "learning_rate": 1.6245665891141094e-05, |
| "loss": 0.0117, |
| "step": 4990 |
| }, |
| { |
| "epoch": 7.5075075075075075, |
| "grad_norm": 0.3050144910812378, |
| "learning_rate": 1.6062925360366522e-05, |
| "loss": 0.013, |
| "step": 5000 |
| }, |
| { |
| "epoch": 7.5225225225225225, |
| "grad_norm": 0.39487695693969727, |
| "learning_rate": 1.588102154465557e-05, |
| "loss": 0.0142, |
| "step": 5010 |
| }, |
| { |
| "epoch": 7.5375375375375375, |
| "grad_norm": 0.21728385984897614, |
| "learning_rate": 1.5699958928827985e-05, |
| "loss": 0.011, |
| "step": 5020 |
| }, |
| { |
| "epoch": 7.5525525525525525, |
| "grad_norm": 0.2818934917449951, |
| "learning_rate": 1.5519741976963843e-05, |
| "loss": 0.0111, |
| "step": 5030 |
| }, |
| { |
| "epoch": 7.5675675675675675, |
| "grad_norm": 0.2728251814842224, |
| "learning_rate": 1.5340375132293446e-05, |
| "loss": 0.013, |
| "step": 5040 |
| }, |
| { |
| "epoch": 7.5825825825825826, |
| "grad_norm": 0.15858575701713562, |
| "learning_rate": 1.516186281708778e-05, |
| "loss": 0.0111, |
| "step": 5050 |
| }, |
| { |
| "epoch": 7.597597597597598, |
| "grad_norm": 0.36679723858833313, |
| "learning_rate": 1.4984209432549495e-05, |
| "loss": 0.0108, |
| "step": 5060 |
| }, |
| { |
| "epoch": 7.612612612612613, |
| "grad_norm": 0.10907045751810074, |
| "learning_rate": 1.4807419358704433e-05, |
| "loss": 0.0096, |
| "step": 5070 |
| }, |
| { |
| "epoch": 7.627627627627628, |
| "grad_norm": 0.21727362275123596, |
| "learning_rate": 1.4631496954293539e-05, |
| "loss": 0.0094, |
| "step": 5080 |
| }, |
| { |
| "epoch": 7.642642642642643, |
| "grad_norm": 0.3287053406238556, |
| "learning_rate": 1.4456446556665515e-05, |
| "loss": 0.0125, |
| "step": 5090 |
| }, |
| { |
| "epoch": 7.657657657657658, |
| "grad_norm": 0.15486322343349457, |
| "learning_rate": 1.4282272481669784e-05, |
| "loss": 0.01, |
| "step": 5100 |
| }, |
| { |
| "epoch": 7.672672672672673, |
| "grad_norm": 0.2929460108280182, |
| "learning_rate": 1.4108979023550161e-05, |
| "loss": 0.0165, |
| "step": 5110 |
| }, |
| { |
| "epoch": 7.687687687687688, |
| "grad_norm": 0.25819772481918335, |
| "learning_rate": 1.3936570454838916e-05, |
| "loss": 0.0154, |
| "step": 5120 |
| }, |
| { |
| "epoch": 7.702702702702703, |
| "grad_norm": 0.41107359528541565, |
| "learning_rate": 1.3765051026251452e-05, |
| "loss": 0.0126, |
| "step": 5130 |
| }, |
| { |
| "epoch": 7.717717717717718, |
| "grad_norm": 0.3458564579486847, |
| "learning_rate": 1.3594424966581553e-05, |
| "loss": 0.0095, |
| "step": 5140 |
| }, |
| { |
| "epoch": 7.732732732732733, |
| "grad_norm": 0.37176772952079773, |
| "learning_rate": 1.3424696482597072e-05, |
| "loss": 0.0104, |
| "step": 5150 |
| }, |
| { |
| "epoch": 7.747747747747748, |
| "grad_norm": 0.31696900725364685, |
| "learning_rate": 1.3255869758936213e-05, |
| "loss": 0.0115, |
| "step": 5160 |
| }, |
| { |
| "epoch": 7.762762762762763, |
| "grad_norm": 0.2980751097202301, |
| "learning_rate": 1.3087948958004353e-05, |
| "loss": 0.0104, |
| "step": 5170 |
| }, |
| { |
| "epoch": 7.777777777777778, |
| "grad_norm": 0.33132874965667725, |
| "learning_rate": 1.2920938219871504e-05, |
| "loss": 0.0111, |
| "step": 5180 |
| }, |
| { |
| "epoch": 7.792792792792793, |
| "grad_norm": 0.5029041171073914, |
| "learning_rate": 1.2754841662170115e-05, |
| "loss": 0.0119, |
| "step": 5190 |
| }, |
| { |
| "epoch": 7.807807807807808, |
| "grad_norm": 0.307763010263443, |
| "learning_rate": 1.2589663379993627e-05, |
| "loss": 0.0168, |
| "step": 5200 |
| }, |
| { |
| "epoch": 7.822822822822823, |
| "grad_norm": 0.295971155166626, |
| "learning_rate": 1.2425407445795467e-05, |
| "loss": 0.0136, |
| "step": 5210 |
| }, |
| { |
| "epoch": 7.837837837837838, |
| "grad_norm": 0.3134170174598694, |
| "learning_rate": 1.2262077909288755e-05, |
| "loss": 0.0137, |
| "step": 5220 |
| }, |
| { |
| "epoch": 7.852852852852853, |
| "grad_norm": 0.25852659344673157, |
| "learning_rate": 1.2099678797346281e-05, |
| "loss": 0.014, |
| "step": 5230 |
| }, |
| { |
| "epoch": 7.867867867867868, |
| "grad_norm": 0.5606762766838074, |
| "learning_rate": 1.1938214113901352e-05, |
| "loss": 0.0136, |
| "step": 5240 |
| }, |
| { |
| "epoch": 7.882882882882883, |
| "grad_norm": 0.17156191170215607, |
| "learning_rate": 1.1777687839849017e-05, |
| "loss": 0.0113, |
| "step": 5250 |
| }, |
| { |
| "epoch": 7.897897897897898, |
| "grad_norm": 0.20190756022930145, |
| "learning_rate": 1.1618103932947976e-05, |
| "loss": 0.01, |
| "step": 5260 |
| }, |
| { |
| "epoch": 7.912912912912913, |
| "grad_norm": 0.5913976430892944, |
| "learning_rate": 1.1459466327722912e-05, |
| "loss": 0.011, |
| "step": 5270 |
| }, |
| { |
| "epoch": 7.927927927927928, |
| "grad_norm": 0.41337406635284424, |
| "learning_rate": 1.1301778935367596e-05, |
| "loss": 0.0097, |
| "step": 5280 |
| }, |
| { |
| "epoch": 7.942942942942943, |
| "grad_norm": 0.2560533881187439, |
| "learning_rate": 1.1145045643648345e-05, |
| "loss": 0.0088, |
| "step": 5290 |
| }, |
| { |
| "epoch": 7.957957957957958, |
| "grad_norm": 0.4912932217121124, |
| "learning_rate": 1.0989270316808276e-05, |
| "loss": 0.0098, |
| "step": 5300 |
| }, |
| { |
| "epoch": 7.972972972972973, |
| "grad_norm": 0.2890324294567108, |
| "learning_rate": 1.0834456795471937e-05, |
| "loss": 0.0095, |
| "step": 5310 |
| }, |
| { |
| "epoch": 7.987987987987988, |
| "grad_norm": 0.7409487962722778, |
| "learning_rate": 1.0680608896550659e-05, |
| "loss": 0.0101, |
| "step": 5320 |
| }, |
| { |
| "epoch": 8.003003003003004, |
| "grad_norm": 0.18136847019195557, |
| "learning_rate": 1.0527730413148495e-05, |
| "loss": 0.0124, |
| "step": 5330 |
| }, |
| { |
| "epoch": 8.018018018018019, |
| "grad_norm": 0.22960734367370605, |
| "learning_rate": 1.0375825114468646e-05, |
| "loss": 0.0097, |
| "step": 5340 |
| }, |
| { |
| "epoch": 8.033033033033034, |
| "grad_norm": 0.21436496078968048, |
| "learning_rate": 1.0224896745720514e-05, |
| "loss": 0.0114, |
| "step": 5350 |
| }, |
| { |
| "epoch": 8.048048048048049, |
| "grad_norm": 0.12885528802871704, |
| "learning_rate": 1.0074949028027397e-05, |
| "loss": 0.0098, |
| "step": 5360 |
| }, |
| { |
| "epoch": 8.063063063063064, |
| "grad_norm": 0.2210664600133896, |
| "learning_rate": 9.925985658334796e-06, |
| "loss": 0.0096, |
| "step": 5370 |
| }, |
| { |
| "epoch": 8.078078078078079, |
| "grad_norm": 0.12804459035396576, |
| "learning_rate": 9.778010309319152e-06, |
| "loss": 0.012, |
| "step": 5380 |
| }, |
| { |
| "epoch": 8.093093093093094, |
| "grad_norm": 0.18077941238880157, |
| "learning_rate": 9.63102662929738e-06, |
| "loss": 0.0097, |
| "step": 5390 |
| }, |
| { |
| "epoch": 8.108108108108109, |
| "grad_norm": 0.16283175349235535, |
| "learning_rate": 9.485038242136913e-06, |
| "loss": 0.0084, |
| "step": 5400 |
| }, |
| { |
| "epoch": 8.123123123123124, |
| "grad_norm": 0.25900721549987793, |
| "learning_rate": 9.340048747166341e-06, |
| "loss": 0.0152, |
| "step": 5410 |
| }, |
| { |
| "epoch": 8.138138138138139, |
| "grad_norm": 0.24129988253116608, |
| "learning_rate": 9.196061719086645e-06, |
| "loss": 0.01, |
| "step": 5420 |
| }, |
| { |
| "epoch": 8.153153153153154, |
| "grad_norm": 0.19243118166923523, |
| "learning_rate": 9.053080707883093e-06, |
| "loss": 0.0089, |
| "step": 5430 |
| }, |
| { |
| "epoch": 8.168168168168169, |
| "grad_norm": 0.2193710207939148, |
| "learning_rate": 8.911109238737747e-06, |
| "loss": 0.0123, |
| "step": 5440 |
| }, |
| { |
| "epoch": 8.183183183183184, |
| "grad_norm": 0.25949785113334656, |
| "learning_rate": 8.770150811942462e-06, |
| "loss": 0.0099, |
| "step": 5450 |
| }, |
| { |
| "epoch": 8.198198198198199, |
| "grad_norm": 0.13739214837551117, |
| "learning_rate": 8.630208902812697e-06, |
| "loss": 0.0091, |
| "step": 5460 |
| }, |
| { |
| "epoch": 8.213213213213214, |
| "grad_norm": 0.1208202913403511, |
| "learning_rate": 8.491286961601702e-06, |
| "loss": 0.0092, |
| "step": 5470 |
| }, |
| { |
| "epoch": 8.228228228228229, |
| "grad_norm": 0.29662320017814636, |
| "learning_rate": 8.353388413415603e-06, |
| "loss": 0.0106, |
| "step": 5480 |
| }, |
| { |
| "epoch": 8.243243243243244, |
| "grad_norm": 0.14624422788619995, |
| "learning_rate": 8.216516658128815e-06, |
| "loss": 0.01, |
| "step": 5490 |
| }, |
| { |
| "epoch": 8.258258258258259, |
| "grad_norm": 0.25974395871162415, |
| "learning_rate": 8.080675070300303e-06, |
| "loss": 0.008, |
| "step": 5500 |
| }, |
| { |
| "epoch": 8.273273273273274, |
| "grad_norm": 0.12416278570890427, |
| "learning_rate": 7.945866999090335e-06, |
| "loss": 0.008, |
| "step": 5510 |
| }, |
| { |
| "epoch": 8.288288288288289, |
| "grad_norm": 0.18399783968925476, |
| "learning_rate": 7.812095768178007e-06, |
| "loss": 0.0089, |
| "step": 5520 |
| }, |
| { |
| "epoch": 8.303303303303304, |
| "grad_norm": 0.3326607048511505, |
| "learning_rate": 7.679364675679152e-06, |
| "loss": 0.0083, |
| "step": 5530 |
| }, |
| { |
| "epoch": 8.318318318318319, |
| "grad_norm": 0.26662108302116394, |
| "learning_rate": 7.547676994065117e-06, |
| "loss": 0.0084, |
| "step": 5540 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.3286709189414978, |
| "learning_rate": 7.417035970082059e-06, |
| "loss": 0.0136, |
| "step": 5550 |
| }, |
| { |
| "epoch": 8.348348348348349, |
| "grad_norm": 0.2363322228193283, |
| "learning_rate": 7.2874448246709245e-06, |
| "loss": 0.0086, |
| "step": 5560 |
| }, |
| { |
| "epoch": 8.363363363363364, |
| "grad_norm": 0.2606948912143707, |
| "learning_rate": 7.158906752887979e-06, |
| "loss": 0.011, |
| "step": 5570 |
| }, |
| { |
| "epoch": 8.378378378378379, |
| "grad_norm": 0.19196771085262299, |
| "learning_rate": 7.031424923826097e-06, |
| "loss": 0.0081, |
| "step": 5580 |
| }, |
| { |
| "epoch": 8.393393393393394, |
| "grad_norm": 0.2436392903327942, |
| "learning_rate": 6.905002480536565e-06, |
| "loss": 0.0098, |
| "step": 5590 |
| }, |
| { |
| "epoch": 8.408408408408409, |
| "grad_norm": 0.2692658603191376, |
| "learning_rate": 6.779642539951664e-06, |
| "loss": 0.0077, |
| "step": 5600 |
| }, |
| { |
| "epoch": 8.423423423423424, |
| "grad_norm": 0.2532481551170349, |
| "learning_rate": 6.6553481928077365e-06, |
| "loss": 0.009, |
| "step": 5610 |
| }, |
| { |
| "epoch": 8.438438438438439, |
| "grad_norm": 0.3922106921672821, |
| "learning_rate": 6.532122503569055e-06, |
| "loss": 0.0108, |
| "step": 5620 |
| }, |
| { |
| "epoch": 8.453453453453454, |
| "grad_norm": 0.21005350351333618, |
| "learning_rate": 6.409968510352244e-06, |
| "loss": 0.0111, |
| "step": 5630 |
| }, |
| { |
| "epoch": 8.468468468468469, |
| "grad_norm": 0.218822181224823, |
| "learning_rate": 6.288889224851358e-06, |
| "loss": 0.0092, |
| "step": 5640 |
| }, |
| { |
| "epoch": 8.483483483483484, |
| "grad_norm": 0.3226770758628845, |
| "learning_rate": 6.168887632263659e-06, |
| "loss": 0.0089, |
| "step": 5650 |
| }, |
| { |
| "epoch": 8.498498498498499, |
| "grad_norm": 0.26689550280570984, |
| "learning_rate": 6.049966691215986e-06, |
| "loss": 0.0091, |
| "step": 5660 |
| }, |
| { |
| "epoch": 8.513513513513514, |
| "grad_norm": 0.31187760829925537, |
| "learning_rate": 5.932129333691855e-06, |
| "loss": 0.0084, |
| "step": 5670 |
| }, |
| { |
| "epoch": 8.528528528528529, |
| "grad_norm": 0.5038598775863647, |
| "learning_rate": 5.815378464959109e-06, |
| "loss": 0.0125, |
| "step": 5680 |
| }, |
| { |
| "epoch": 8.543543543543544, |
| "grad_norm": 0.38266250491142273, |
| "learning_rate": 5.699716963498325e-06, |
| "loss": 0.0092, |
| "step": 5690 |
| }, |
| { |
| "epoch": 8.558558558558559, |
| "grad_norm": 0.17743314802646637, |
| "learning_rate": 5.585147680931863e-06, |
| "loss": 0.0073, |
| "step": 5700 |
| }, |
| { |
| "epoch": 8.573573573573574, |
| "grad_norm": 0.32482296228408813, |
| "learning_rate": 5.471673441953545e-06, |
| "loss": 0.0105, |
| "step": 5710 |
| }, |
| { |
| "epoch": 8.588588588588589, |
| "grad_norm": 0.32539770007133484, |
| "learning_rate": 5.3592970442589765e-06, |
| "loss": 0.0111, |
| "step": 5720 |
| }, |
| { |
| "epoch": 8.603603603603604, |
| "grad_norm": 0.3283443748950958, |
| "learning_rate": 5.2480212584766045e-06, |
| "loss": 0.0112, |
| "step": 5730 |
| }, |
| { |
| "epoch": 8.618618618618619, |
| "grad_norm": 0.3915981948375702, |
| "learning_rate": 5.137848828099434e-06, |
| "loss": 0.0107, |
| "step": 5740 |
| }, |
| { |
| "epoch": 8.633633633633634, |
| "grad_norm": 0.34933730959892273, |
| "learning_rate": 5.028782469417331e-06, |
| "loss": 0.0108, |
| "step": 5750 |
| }, |
| { |
| "epoch": 8.64864864864865, |
| "grad_norm": 0.13394686579704285, |
| "learning_rate": 4.920824871450069e-06, |
| "loss": 0.0107, |
| "step": 5760 |
| }, |
| { |
| "epoch": 8.663663663663664, |
| "grad_norm": 0.18080875277519226, |
| "learning_rate": 4.81397869588106e-06, |
| "loss": 0.0099, |
| "step": 5770 |
| }, |
| { |
| "epoch": 8.67867867867868, |
| "grad_norm": 0.15751664340496063, |
| "learning_rate": 4.708246576991726e-06, |
| "loss": 0.0094, |
| "step": 5780 |
| }, |
| { |
| "epoch": 8.693693693693694, |
| "grad_norm": 0.27100566029548645, |
| "learning_rate": 4.603631121596503e-06, |
| "loss": 0.0119, |
| "step": 5790 |
| }, |
| { |
| "epoch": 8.70870870870871, |
| "grad_norm": 0.3021513819694519, |
| "learning_rate": 4.500134908978621e-06, |
| "loss": 0.0077, |
| "step": 5800 |
| }, |
| { |
| "epoch": 8.723723723723724, |
| "grad_norm": 0.23953105509281158, |
| "learning_rate": 4.397760490826481e-06, |
| "loss": 0.0087, |
| "step": 5810 |
| }, |
| { |
| "epoch": 8.73873873873874, |
| "grad_norm": 0.23057974874973297, |
| "learning_rate": 4.2965103911707775e-06, |
| "loss": 0.0104, |
| "step": 5820 |
| }, |
| { |
| "epoch": 8.753753753753754, |
| "grad_norm": 0.44746333360671997, |
| "learning_rate": 4.196387106322253e-06, |
| "loss": 0.0088, |
| "step": 5830 |
| }, |
| { |
| "epoch": 8.76876876876877, |
| "grad_norm": 0.3946534991264343, |
| "learning_rate": 4.097393104810126e-06, |
| "loss": 0.0085, |
| "step": 5840 |
| }, |
| { |
| "epoch": 8.783783783783784, |
| "grad_norm": 0.21715781092643738, |
| "learning_rate": 3.999530827321251e-06, |
| "loss": 0.0102, |
| "step": 5850 |
| }, |
| { |
| "epoch": 8.7987987987988, |
| "grad_norm": 0.214228555560112, |
| "learning_rate": 3.902802686639967e-06, |
| "loss": 0.009, |
| "step": 5860 |
| }, |
| { |
| "epoch": 8.813813813813814, |
| "grad_norm": 0.2998834550380707, |
| "learning_rate": 3.807211067588573e-06, |
| "loss": 0.0108, |
| "step": 5870 |
| }, |
| { |
| "epoch": 8.82882882882883, |
| "grad_norm": 0.26151779294013977, |
| "learning_rate": 3.7127583269685297e-06, |
| "loss": 0.0109, |
| "step": 5880 |
| }, |
| { |
| "epoch": 8.843843843843844, |
| "grad_norm": 0.41609615087509155, |
| "learning_rate": 3.619446793502396e-06, |
| "loss": 0.0096, |
| "step": 5890 |
| }, |
| { |
| "epoch": 8.85885885885886, |
| "grad_norm": 0.5297430753707886, |
| "learning_rate": 3.5272787677763818e-06, |
| "loss": 0.0086, |
| "step": 5900 |
| }, |
| { |
| "epoch": 8.873873873873874, |
| "grad_norm": 0.22240638732910156, |
| "learning_rate": 3.436256522183612e-06, |
| "loss": 0.0069, |
| "step": 5910 |
| }, |
| { |
| "epoch": 8.88888888888889, |
| "grad_norm": 0.405985951423645, |
| "learning_rate": 3.346382300868134e-06, |
| "loss": 0.0105, |
| "step": 5920 |
| }, |
| { |
| "epoch": 8.903903903903904, |
| "grad_norm": 0.16240379214286804, |
| "learning_rate": 3.257658319669582e-06, |
| "loss": 0.0065, |
| "step": 5930 |
| }, |
| { |
| "epoch": 8.91891891891892, |
| "grad_norm": 0.24528710544109344, |
| "learning_rate": 3.1700867660685195e-06, |
| "loss": 0.0081, |
| "step": 5940 |
| }, |
| { |
| "epoch": 8.933933933933934, |
| "grad_norm": 0.3795296251773834, |
| "learning_rate": 3.0836697991325548e-06, |
| "loss": 0.0105, |
| "step": 5950 |
| }, |
| { |
| "epoch": 8.94894894894895, |
| "grad_norm": 0.1127309650182724, |
| "learning_rate": 2.998409549463055e-06, |
| "loss": 0.0093, |
| "step": 5960 |
| }, |
| { |
| "epoch": 8.963963963963964, |
| "grad_norm": 0.23780913650989532, |
| "learning_rate": 2.9143081191426557e-06, |
| "loss": 0.0073, |
| "step": 5970 |
| }, |
| { |
| "epoch": 8.97897897897898, |
| "grad_norm": 0.23503072559833527, |
| "learning_rate": 2.8313675816834253e-06, |
| "loss": 0.0104, |
| "step": 5980 |
| }, |
| { |
| "epoch": 8.993993993993994, |
| "grad_norm": 0.18971222639083862, |
| "learning_rate": 2.74958998197572e-06, |
| "loss": 0.0084, |
| "step": 5990 |
| }, |
| { |
| "epoch": 9.00900900900901, |
| "grad_norm": 0.23308244347572327, |
| "learning_rate": 2.6689773362378046e-06, |
| "loss": 0.0116, |
| "step": 6000 |
| }, |
| { |
| "epoch": 9.024024024024024, |
| "grad_norm": 0.17444804310798645, |
| "learning_rate": 2.5895316319661245e-06, |
| "loss": 0.0086, |
| "step": 6010 |
| }, |
| { |
| "epoch": 9.03903903903904, |
| "grad_norm": 0.29627522826194763, |
| "learning_rate": 2.5112548278862948e-06, |
| "loss": 0.0105, |
| "step": 6020 |
| }, |
| { |
| "epoch": 9.054054054054054, |
| "grad_norm": 0.12174025923013687, |
| "learning_rate": 2.4341488539048105e-06, |
| "loss": 0.0098, |
| "step": 6030 |
| }, |
| { |
| "epoch": 9.06906906906907, |
| "grad_norm": 0.20848111808300018, |
| "learning_rate": 2.3582156110614985e-06, |
| "loss": 0.0096, |
| "step": 6040 |
| }, |
| { |
| "epoch": 9.084084084084084, |
| "grad_norm": 0.3482631742954254, |
| "learning_rate": 2.283456971482595e-06, |
| "loss": 0.0114, |
| "step": 6050 |
| }, |
| { |
| "epoch": 9.0990990990991, |
| "grad_norm": 0.2726683020591736, |
| "learning_rate": 2.2098747783346195e-06, |
| "loss": 0.0093, |
| "step": 6060 |
| }, |
| { |
| "epoch": 9.114114114114114, |
| "grad_norm": 0.25941407680511475, |
| "learning_rate": 2.1374708457789326e-06, |
| "loss": 0.0076, |
| "step": 6070 |
| }, |
| { |
| "epoch": 9.12912912912913, |
| "grad_norm": 0.4473312199115753, |
| "learning_rate": 2.066246958927004e-06, |
| "loss": 0.0103, |
| "step": 6080 |
| }, |
| { |
| "epoch": 9.144144144144144, |
| "grad_norm": 0.116793192923069, |
| "learning_rate": 1.9962048737963925e-06, |
| "loss": 0.011, |
| "step": 6090 |
| }, |
| { |
| "epoch": 9.15915915915916, |
| "grad_norm": 0.5143449306488037, |
| "learning_rate": 1.927346317267453e-06, |
| "loss": 0.0105, |
| "step": 6100 |
| }, |
| { |
| "epoch": 9.174174174174174, |
| "grad_norm": 0.0962725579738617, |
| "learning_rate": 1.8596729870407837e-06, |
| "loss": 0.0093, |
| "step": 6110 |
| }, |
| { |
| "epoch": 9.18918918918919, |
| "grad_norm": 0.4194902181625366, |
| "learning_rate": 1.7931865515953327e-06, |
| "loss": 0.0086, |
| "step": 6120 |
| }, |
| { |
| "epoch": 9.204204204204204, |
| "grad_norm": 0.34342023730278015, |
| "learning_rate": 1.72788865014728e-06, |
| "loss": 0.0112, |
| "step": 6130 |
| }, |
| { |
| "epoch": 9.21921921921922, |
| "grad_norm": 0.27022379636764526, |
| "learning_rate": 1.6637808926096289e-06, |
| "loss": 0.0107, |
| "step": 6140 |
| }, |
| { |
| "epoch": 9.234234234234235, |
| "grad_norm": 0.2723633646965027, |
| "learning_rate": 1.6008648595525254e-06, |
| "loss": 0.0083, |
| "step": 6150 |
| }, |
| { |
| "epoch": 9.24924924924925, |
| "grad_norm": 0.2724420428276062, |
| "learning_rate": 1.5391421021642427e-06, |
| "loss": 0.0108, |
| "step": 6160 |
| }, |
| { |
| "epoch": 9.264264264264265, |
| "grad_norm": 0.26074615120887756, |
| "learning_rate": 1.4786141422129752e-06, |
| "loss": 0.0076, |
| "step": 6170 |
| }, |
| { |
| "epoch": 9.27927927927928, |
| "grad_norm": 0.6156070828437805, |
| "learning_rate": 1.419282472009309e-06, |
| "loss": 0.012, |
| "step": 6180 |
| }, |
| { |
| "epoch": 9.294294294294295, |
| "grad_norm": 0.23706451058387756, |
| "learning_rate": 1.3611485543694458e-06, |
| "loss": 0.0102, |
| "step": 6190 |
| }, |
| { |
| "epoch": 9.30930930930931, |
| "grad_norm": 0.2707103490829468, |
| "learning_rate": 1.3042138225791034e-06, |
| "loss": 0.0077, |
| "step": 6200 |
| }, |
| { |
| "epoch": 9.324324324324325, |
| "grad_norm": 0.2296745330095291, |
| "learning_rate": 1.2484796803582e-06, |
| "loss": 0.0098, |
| "step": 6210 |
| }, |
| { |
| "epoch": 9.33933933933934, |
| "grad_norm": 0.39453765749931335, |
| "learning_rate": 1.193947501826248e-06, |
| "loss": 0.015, |
| "step": 6220 |
| }, |
| { |
| "epoch": 9.354354354354355, |
| "grad_norm": 0.30170953273773193, |
| "learning_rate": 1.1406186314684652e-06, |
| "loss": 0.0092, |
| "step": 6230 |
| }, |
| { |
| "epoch": 9.36936936936937, |
| "grad_norm": 0.3405857980251312, |
| "learning_rate": 1.0884943841026396e-06, |
| "loss": 0.0124, |
| "step": 6240 |
| }, |
| { |
| "epoch": 9.384384384384385, |
| "grad_norm": 0.40069127082824707, |
| "learning_rate": 1.037576044846683e-06, |
| "loss": 0.0111, |
| "step": 6250 |
| }, |
| { |
| "epoch": 9.3993993993994, |
| "grad_norm": 0.3969111144542694, |
| "learning_rate": 9.878648690869852e-07, |
| "loss": 0.0086, |
| "step": 6260 |
| }, |
| { |
| "epoch": 9.414414414414415, |
| "grad_norm": 0.1136733889579773, |
| "learning_rate": 9.393620824474481e-07, |
| "loss": 0.0082, |
| "step": 6270 |
| }, |
| { |
| "epoch": 9.42942942942943, |
| "grad_norm": 0.17583416402339935, |
| "learning_rate": 8.920688807592559e-07, |
| "loss": 0.0115, |
| "step": 6280 |
| }, |
| { |
| "epoch": 9.444444444444445, |
| "grad_norm": 0.34847551584243774, |
| "learning_rate": 8.459864300313925e-07, |
| "loss": 0.0124, |
| "step": 6290 |
| }, |
| { |
| "epoch": 9.45945945945946, |
| "grad_norm": 0.24079295992851257, |
| "learning_rate": 8.011158664219253e-07, |
| "loss": 0.0099, |
| "step": 6300 |
| }, |
| { |
| "epoch": 9.474474474474475, |
| "grad_norm": 0.2628285884857178, |
| "learning_rate": 7.574582962099507e-07, |
| "loss": 0.0087, |
| "step": 6310 |
| }, |
| { |
| "epoch": 9.48948948948949, |
| "grad_norm": 0.3399566113948822, |
| "learning_rate": 7.150147957683429e-07, |
| "loss": 0.0098, |
| "step": 6320 |
| }, |
| { |
| "epoch": 9.504504504504505, |
| "grad_norm": 0.40738004446029663, |
| "learning_rate": 6.737864115372139e-07, |
| "loss": 0.0094, |
| "step": 6330 |
| }, |
| { |
| "epoch": 9.51951951951952, |
| "grad_norm": 0.2358725666999817, |
| "learning_rate": 6.337741599981184e-07, |
| "loss": 0.0068, |
| "step": 6340 |
| }, |
| { |
| "epoch": 9.534534534534535, |
| "grad_norm": 0.15583853423595428, |
| "learning_rate": 5.949790276489731e-07, |
| "loss": 0.0107, |
| "step": 6350 |
| }, |
| { |
| "epoch": 9.54954954954955, |
| "grad_norm": 0.3986859619617462, |
| "learning_rate": 5.574019709797429e-07, |
| "loss": 0.0091, |
| "step": 6360 |
| }, |
| { |
| "epoch": 9.564564564564565, |
| "grad_norm": 0.38407695293426514, |
| "learning_rate": 5.210439164488823e-07, |
| "loss": 0.0088, |
| "step": 6370 |
| }, |
| { |
| "epoch": 9.57957957957958, |
| "grad_norm": 0.2681930959224701, |
| "learning_rate": 4.859057604604644e-07, |
| "loss": 0.009, |
| "step": 6380 |
| }, |
| { |
| "epoch": 9.594594594594595, |
| "grad_norm": 0.3744381368160248, |
| "learning_rate": 4.519883693420879e-07, |
| "loss": 0.0094, |
| "step": 6390 |
| }, |
| { |
| "epoch": 9.60960960960961, |
| "grad_norm": 0.2845977246761322, |
| "learning_rate": 4.1929257932351585e-07, |
| "loss": 0.0081, |
| "step": 6400 |
| }, |
| { |
| "epoch": 9.624624624624625, |
| "grad_norm": 0.24698816239833832, |
| "learning_rate": 3.878191965160705e-07, |
| "loss": 0.0066, |
| "step": 6410 |
| }, |
| { |
| "epoch": 9.63963963963964, |
| "grad_norm": 0.26620975136756897, |
| "learning_rate": 3.5756899689275425e-07, |
| "loss": 0.0078, |
| "step": 6420 |
| }, |
| { |
| "epoch": 9.654654654654655, |
| "grad_norm": 0.4993063807487488, |
| "learning_rate": 3.285427262690932e-07, |
| "loss": 0.012, |
| "step": 6430 |
| }, |
| { |
| "epoch": 9.66966966966967, |
| "grad_norm": 0.46098390221595764, |
| "learning_rate": 3.0074110028480154e-07, |
| "loss": 0.0133, |
| "step": 6440 |
| }, |
| { |
| "epoch": 9.684684684684685, |
| "grad_norm": 0.2004263997077942, |
| "learning_rate": 2.7416480438609003e-07, |
| "loss": 0.0083, |
| "step": 6450 |
| }, |
| { |
| "epoch": 9.6996996996997, |
| "grad_norm": 0.3100525736808777, |
| "learning_rate": 2.4881449380878e-07, |
| "loss": 0.0101, |
| "step": 6460 |
| }, |
| { |
| "epoch": 9.714714714714715, |
| "grad_norm": 0.33245450258255005, |
| "learning_rate": 2.246907935621545e-07, |
| "loss": 0.0104, |
| "step": 6470 |
| }, |
| { |
| "epoch": 9.72972972972973, |
| "grad_norm": 0.42327046394348145, |
| "learning_rate": 2.0179429841354346e-07, |
| "loss": 0.0122, |
| "step": 6480 |
| }, |
| { |
| "epoch": 9.744744744744745, |
| "grad_norm": 0.11561805009841919, |
| "learning_rate": 1.8012557287367392e-07, |
| "loss": 0.0064, |
| "step": 6490 |
| }, |
| { |
| "epoch": 9.75975975975976, |
| "grad_norm": 0.2033301591873169, |
| "learning_rate": 1.596851511827313e-07, |
| "loss": 0.0111, |
| "step": 6500 |
| }, |
| { |
| "epoch": 9.774774774774775, |
| "grad_norm": 0.07641670107841492, |
| "learning_rate": 1.4047353729719216e-07, |
| "loss": 0.0095, |
| "step": 6510 |
| }, |
| { |
| "epoch": 9.78978978978979, |
| "grad_norm": 0.12249872833490372, |
| "learning_rate": 1.2249120487740629e-07, |
| "loss": 0.0082, |
| "step": 6520 |
| }, |
| { |
| "epoch": 9.804804804804805, |
| "grad_norm": 0.33827194571495056, |
| "learning_rate": 1.0573859727592838e-07, |
| "loss": 0.0093, |
| "step": 6530 |
| }, |
| { |
| "epoch": 9.81981981981982, |
| "grad_norm": 0.40104958415031433, |
| "learning_rate": 9.021612752656006e-08, |
| "loss": 0.0121, |
| "step": 6540 |
| }, |
| { |
| "epoch": 9.834834834834835, |
| "grad_norm": 0.2793465554714203, |
| "learning_rate": 7.59241783341913e-08, |
| "loss": 0.0101, |
| "step": 6550 |
| }, |
| { |
| "epoch": 9.84984984984985, |
| "grad_norm": 0.37869784235954285, |
| "learning_rate": 6.28631020653414e-08, |
| "loss": 0.0088, |
| "step": 6560 |
| }, |
| { |
| "epoch": 9.864864864864865, |
| "grad_norm": 0.13755618035793304, |
| "learning_rate": 5.103322073949923e-08, |
| "loss": 0.0117, |
| "step": 6570 |
| }, |
| { |
| "epoch": 9.87987987987988, |
| "grad_norm": 0.32356616854667664, |
| "learning_rate": 4.0434826021168434e-08, |
| "loss": 0.01, |
| "step": 6580 |
| }, |
| { |
| "epoch": 9.894894894894895, |
| "grad_norm": 0.2811928689479828, |
| "learning_rate": 3.106817921266769e-08, |
| "loss": 0.0093, |
| "step": 6590 |
| }, |
| { |
| "epoch": 9.90990990990991, |
| "grad_norm": 0.2781268358230591, |
| "learning_rate": 2.293351124771359e-08, |
| "loss": 0.0082, |
| "step": 6600 |
| }, |
| { |
| "epoch": 9.924924924924925, |
| "grad_norm": 0.22087417542934418, |
| "learning_rate": 1.6031022685691898e-08, |
| "loss": 0.0093, |
| "step": 6610 |
| }, |
| { |
| "epoch": 9.93993993993994, |
| "grad_norm": 0.2339886873960495, |
| "learning_rate": 1.0360883706750368e-08, |
| "loss": 0.0073, |
| "step": 6620 |
| }, |
| { |
| "epoch": 9.954954954954955, |
| "grad_norm": 0.3688123822212219, |
| "learning_rate": 5.923234107579889e-09, |
| "loss": 0.0098, |
| "step": 6630 |
| }, |
| { |
| "epoch": 9.96996996996997, |
| "grad_norm": 0.16975067555904388, |
| "learning_rate": 2.718183297967247e-09, |
| "loss": 0.0072, |
| "step": 6640 |
| }, |
| { |
| "epoch": 9.984984984984985, |
| "grad_norm": 0.41199740767478943, |
| "learning_rate": 7.458102981139359e-10, |
| "loss": 0.0094, |
| "step": 6650 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.8860284090042114, |
| "learning_rate": 6.16373667106096e-12, |
| "loss": 0.0097, |
| "step": 6660 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 6660, |
| "total_flos": 0.0, |
| "train_loss": 0.025241373628154172, |
| "train_runtime": 6896.3696, |
| "train_samples_per_second": 47.265, |
| "train_steps_per_second": 0.966 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 6660, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 49, |
| "trial_name": null, |
| "trial_params": null |
| } |