| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 4218, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.007112375533428165, |
| "grad_norm": 45.94488445688017, |
| "learning_rate": 2.132701421800948e-07, |
| "loss": 4.9235, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.01422475106685633, |
| "grad_norm": 56.45310643883483, |
| "learning_rate": 4.502369668246446e-07, |
| "loss": 4.7616, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.021337126600284494, |
| "grad_norm": 47.57072569736425, |
| "learning_rate": 6.872037914691944e-07, |
| "loss": 4.0518, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.02844950213371266, |
| "grad_norm": 14.9615219454182, |
| "learning_rate": 9.241706161137441e-07, |
| "loss": 3.1168, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.03556187766714083, |
| "grad_norm": 9.209691028948875, |
| "learning_rate": 1.161137440758294e-06, |
| "loss": 2.408, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.04267425320056899, |
| "grad_norm": 3.738856271681981, |
| "learning_rate": 1.3981042654028437e-06, |
| "loss": 2.0996, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.049786628733997154, |
| "grad_norm": 4.299210545328982, |
| "learning_rate": 1.6350710900473934e-06, |
| "loss": 1.961, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.05689900426742532, |
| "grad_norm": 4.288339811445908, |
| "learning_rate": 1.8720379146919433e-06, |
| "loss": 1.8454, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.06401137980085349, |
| "grad_norm": 4.487588443815648, |
| "learning_rate": 2.109004739336493e-06, |
| "loss": 1.8, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.07112375533428165, |
| "grad_norm": 5.903658522691362, |
| "learning_rate": 2.345971563981043e-06, |
| "loss": 1.7189, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07823613086770982, |
| "grad_norm": 6.538803049950138, |
| "learning_rate": 2.5829383886255925e-06, |
| "loss": 1.6861, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.08534850640113797, |
| "grad_norm": 6.420212036240461, |
| "learning_rate": 2.8199052132701426e-06, |
| "loss": 1.6933, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.09246088193456614, |
| "grad_norm": 6.08601994925446, |
| "learning_rate": 3.0568720379146923e-06, |
| "loss": 1.6477, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.09957325746799431, |
| "grad_norm": 6.641158507404538, |
| "learning_rate": 3.293838862559242e-06, |
| "loss": 1.5837, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.10668563300142248, |
| "grad_norm": 5.6671416215198445, |
| "learning_rate": 3.5308056872037916e-06, |
| "loss": 1.553, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.11379800853485064, |
| "grad_norm": 4.895576620125158, |
| "learning_rate": 3.7677725118483417e-06, |
| "loss": 1.601, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.12091038406827881, |
| "grad_norm": 5.00629870941093, |
| "learning_rate": 4.004739336492891e-06, |
| "loss": 1.4953, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.12802275960170698, |
| "grad_norm": 4.098161366916081, |
| "learning_rate": 4.2417061611374415e-06, |
| "loss": 1.4986, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.13513513513513514, |
| "grad_norm": 4.279942094132115, |
| "learning_rate": 4.478672985781991e-06, |
| "loss": 1.4177, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.1422475106685633, |
| "grad_norm": 2.9957264584301506, |
| "learning_rate": 4.715639810426541e-06, |
| "loss": 1.4234, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.14935988620199148, |
| "grad_norm": 2.960846105003115, |
| "learning_rate": 4.952606635071091e-06, |
| "loss": 1.4034, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.15647226173541964, |
| "grad_norm": 2.8142395090714207, |
| "learning_rate": 5.18957345971564e-06, |
| "loss": 1.4172, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.16358463726884778, |
| "grad_norm": 3.1953820486960938, |
| "learning_rate": 5.42654028436019e-06, |
| "loss": 1.3695, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.17069701280227595, |
| "grad_norm": 3.0329786581569813, |
| "learning_rate": 5.66350710900474e-06, |
| "loss": 1.3815, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.17780938833570412, |
| "grad_norm": 3.023917167954777, |
| "learning_rate": 5.90047393364929e-06, |
| "loss": 1.3494, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.18492176386913228, |
| "grad_norm": 3.3061969408501186, |
| "learning_rate": 6.137440758293839e-06, |
| "loss": 1.351, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.19203413940256045, |
| "grad_norm": 3.0703691360984116, |
| "learning_rate": 6.374407582938389e-06, |
| "loss": 1.3007, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.19914651493598862, |
| "grad_norm": 2.6510030082143072, |
| "learning_rate": 6.611374407582939e-06, |
| "loss": 1.2318, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.20625889046941678, |
| "grad_norm": 2.781634197302321, |
| "learning_rate": 6.848341232227489e-06, |
| "loss": 1.2452, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.21337126600284495, |
| "grad_norm": 2.889926592158047, |
| "learning_rate": 7.085308056872039e-06, |
| "loss": 1.2299, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.22048364153627312, |
| "grad_norm": 2.9460513709926546, |
| "learning_rate": 7.322274881516588e-06, |
| "loss": 1.2481, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.22759601706970128, |
| "grad_norm": 3.117250263470296, |
| "learning_rate": 7.559241706161138e-06, |
| "loss": 1.1874, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.23470839260312945, |
| "grad_norm": 3.1068660585891443, |
| "learning_rate": 7.796208530805689e-06, |
| "loss": 1.242, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.24182076813655762, |
| "grad_norm": 3.2303235755610458, |
| "learning_rate": 8.033175355450237e-06, |
| "loss": 1.1656, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.24893314366998578, |
| "grad_norm": 3.380471682074544, |
| "learning_rate": 8.270142180094787e-06, |
| "loss": 1.1626, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.25604551920341395, |
| "grad_norm": 3.0003799025494455, |
| "learning_rate": 8.507109004739337e-06, |
| "loss": 1.1136, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.2631578947368421, |
| "grad_norm": 3.3507131315688037, |
| "learning_rate": 8.744075829383887e-06, |
| "loss": 1.109, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.2702702702702703, |
| "grad_norm": 3.286430938699654, |
| "learning_rate": 8.981042654028437e-06, |
| "loss": 1.0926, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.2773826458036984, |
| "grad_norm": 3.543025306575121, |
| "learning_rate": 9.218009478672988e-06, |
| "loss": 0.9856, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.2844950213371266, |
| "grad_norm": 2.9641151250477, |
| "learning_rate": 9.454976303317538e-06, |
| "loss": 1.0438, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.29160739687055476, |
| "grad_norm": 3.0879210891464175, |
| "learning_rate": 9.691943127962086e-06, |
| "loss": 0.9834, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.29871977240398295, |
| "grad_norm": 3.5828764512704274, |
| "learning_rate": 9.928909952606636e-06, |
| "loss": 1.0355, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.3058321479374111, |
| "grad_norm": 3.0432346994349944, |
| "learning_rate": 9.99991609608766e-06, |
| "loss": 0.9973, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.3129445234708393, |
| "grad_norm": 3.6852442122283384, |
| "learning_rate": 9.999505144928566e-06, |
| "loss": 1.0118, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.3200568990042674, |
| "grad_norm": 3.4571934113589893, |
| "learning_rate": 9.998751763712045e-06, |
| "loss": 0.915, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.32716927453769556, |
| "grad_norm": 3.3733896978659215, |
| "learning_rate": 9.997656004039284e-06, |
| "loss": 0.8872, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.33428165007112376, |
| "grad_norm": 3.1986482463279344, |
| "learning_rate": 9.99621794096192e-06, |
| "loss": 0.9233, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.3413940256045519, |
| "grad_norm": 3.3781480125146217, |
| "learning_rate": 9.994437672976904e-06, |
| "loss": 0.8156, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.3485064011379801, |
| "grad_norm": 3.6561286544224516, |
| "learning_rate": 9.99231532201976e-06, |
| "loss": 0.8749, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.35561877667140823, |
| "grad_norm": 4.142627644307138, |
| "learning_rate": 9.989851033456224e-06, |
| "loss": 0.8598, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.3627311522048364, |
| "grad_norm": 3.7494771233239828, |
| "learning_rate": 9.987044976072298e-06, |
| "loss": 0.8118, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.36984352773826457, |
| "grad_norm": 3.6547956812812123, |
| "learning_rate": 9.983897342062681e-06, |
| "loss": 0.8227, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.37695590327169276, |
| "grad_norm": 3.679890083139226, |
| "learning_rate": 9.98040834701761e-06, |
| "loss": 0.8132, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.3840682788051209, |
| "grad_norm": 3.252191257909053, |
| "learning_rate": 9.97657822990809e-06, |
| "loss": 0.7806, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.3911806543385491, |
| "grad_norm": 3.614922960561001, |
| "learning_rate": 9.972407253069527e-06, |
| "loss": 0.8095, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.39829302987197723, |
| "grad_norm": 3.793537378483368, |
| "learning_rate": 9.967895702183767e-06, |
| "loss": 0.7911, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.40540540540540543, |
| "grad_norm": 3.65980827340659, |
| "learning_rate": 9.963043886259518e-06, |
| "loss": 0.7712, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.41251778093883357, |
| "grad_norm": 3.5164539759645037, |
| "learning_rate": 9.957852137611187e-06, |
| "loss": 0.7634, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.41963015647226176, |
| "grad_norm": 3.3236842648189633, |
| "learning_rate": 9.952320811836129e-06, |
| "loss": 0.6903, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.4267425320056899, |
| "grad_norm": 3.294343434220933, |
| "learning_rate": 9.94645028779028e-06, |
| "loss": 0.7238, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.43385490753911804, |
| "grad_norm": 3.4974393759929208, |
| "learning_rate": 9.94024096756221e-06, |
| "loss": 0.694, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.44096728307254623, |
| "grad_norm": 4.433758888856019, |
| "learning_rate": 9.933693276445588e-06, |
| "loss": 0.7057, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.4480796586059744, |
| "grad_norm": 3.3896425434092503, |
| "learning_rate": 9.92680766291005e-06, |
| "loss": 0.7001, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.45519203413940257, |
| "grad_norm": 3.2995707993625834, |
| "learning_rate": 9.91958459857048e-06, |
| "loss": 0.6451, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.4623044096728307, |
| "grad_norm": 3.5589453987217805, |
| "learning_rate": 9.912024578154706e-06, |
| "loss": 0.6539, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.4694167852062589, |
| "grad_norm": 3.457156793924661, |
| "learning_rate": 9.904128119469625e-06, |
| "loss": 0.6383, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.47652916073968704, |
| "grad_norm": 3.791061357289613, |
| "learning_rate": 9.895895763365722e-06, |
| "loss": 0.6319, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.48364153627311524, |
| "grad_norm": 3.7253719001786307, |
| "learning_rate": 9.88732807370004e-06, |
| "loss": 0.589, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.4907539118065434, |
| "grad_norm": 3.8753257386340167, |
| "learning_rate": 9.878425637297549e-06, |
| "loss": 0.5236, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.49786628733997157, |
| "grad_norm": 3.810036186400155, |
| "learning_rate": 9.869189063910959e-06, |
| "loss": 0.524, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.5049786628733998, |
| "grad_norm": 4.2180281642967365, |
| "learning_rate": 9.859618986178953e-06, |
| "loss": 0.5336, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.5120910384068279, |
| "grad_norm": 3.938273345051735, |
| "learning_rate": 9.84971605958286e-06, |
| "loss": 0.5202, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.519203413940256, |
| "grad_norm": 3.5712127017141397, |
| "learning_rate": 9.839480962401753e-06, |
| "loss": 0.4938, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 3.383580945232286, |
| "learning_rate": 9.828914395665996e-06, |
| "loss": 0.4503, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.5334281650071123, |
| "grad_norm": 3.850151538007975, |
| "learning_rate": 9.818017083109233e-06, |
| "loss": 0.5067, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.5405405405405406, |
| "grad_norm": 3.579242735091459, |
| "learning_rate": 9.8067897711188e-06, |
| "loss": 0.4296, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.5476529160739687, |
| "grad_norm": 3.33637898169204, |
| "learning_rate": 9.795233228684631e-06, |
| "loss": 0.422, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.5547652916073968, |
| "grad_norm": 3.3180173487560998, |
| "learning_rate": 9.783348247346558e-06, |
| "loss": 0.4352, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.561877667140825, |
| "grad_norm": 3.3074859328364172, |
| "learning_rate": 9.771135641140117e-06, |
| "loss": 0.3788, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.5689900426742532, |
| "grad_norm": 3.935128904527344, |
| "learning_rate": 9.758596246540782e-06, |
| "loss": 0.4512, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.5761024182076814, |
| "grad_norm": 3.130800872692149, |
| "learning_rate": 9.74573092240668e-06, |
| "loss": 0.4286, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.5832147937411095, |
| "grad_norm": 3.4818017716980076, |
| "learning_rate": 9.732540549919758e-06, |
| "loss": 0.3976, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.5903271692745377, |
| "grad_norm": 3.7176422056718708, |
| "learning_rate": 9.719026032525432e-06, |
| "loss": 0.3845, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.5974395448079659, |
| "grad_norm": 4.0428367587373115, |
| "learning_rate": 9.70518829587071e-06, |
| "loss": 0.3761, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.604551920341394, |
| "grad_norm": 3.32333703731893, |
| "learning_rate": 9.691028287740783e-06, |
| "loss": 0.3663, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.6116642958748222, |
| "grad_norm": 4.055447477108677, |
| "learning_rate": 9.67654697799412e-06, |
| "loss": 0.3683, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.6187766714082503, |
| "grad_norm": 2.801736293850873, |
| "learning_rate": 9.661745358496033e-06, |
| "loss": 0.3302, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.6258890469416786, |
| "grad_norm": 2.9454979478833576, |
| "learning_rate": 9.64662444305074e-06, |
| "loss": 0.3714, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.6330014224751067, |
| "grad_norm": 3.933969741535959, |
| "learning_rate": 9.631185267331937e-06, |
| "loss": 0.3214, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.6401137980085349, |
| "grad_norm": 3.0707180797561398, |
| "learning_rate": 9.615428888811842e-06, |
| "loss": 0.3151, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.647226173541963, |
| "grad_norm": 3.6006782352295095, |
| "learning_rate": 9.59935638668879e-06, |
| "loss": 0.3134, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.6543385490753911, |
| "grad_norm": 4.528381319074012, |
| "learning_rate": 9.582968861813295e-06, |
| "loss": 0.2826, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.6614509246088194, |
| "grad_norm": 3.084970600037643, |
| "learning_rate": 9.566267436612662e-06, |
| "loss": 0.3272, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.6685633001422475, |
| "grad_norm": 3.1926454881670008, |
| "learning_rate": 9.549253255014105e-06, |
| "loss": 0.2838, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.6756756756756757, |
| "grad_norm": 3.3232334022391083, |
| "learning_rate": 9.531927482366398e-06, |
| "loss": 0.2676, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.6827880512091038, |
| "grad_norm": 3.373450413027547, |
| "learning_rate": 9.514291305360053e-06, |
| "loss": 0.2615, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.689900426742532, |
| "grad_norm": 3.298511219641843, |
| "learning_rate": 9.496345931946039e-06, |
| "loss": 0.2232, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.6970128022759602, |
| "grad_norm": 2.8709213001564726, |
| "learning_rate": 9.47809259125306e-06, |
| "loss": 0.2628, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.7041251778093883, |
| "grad_norm": 3.0027633203506, |
| "learning_rate": 9.459532533503347e-06, |
| "loss": 0.2404, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.7112375533428165, |
| "grad_norm": 3.0886670354052823, |
| "learning_rate": 9.440667029927043e-06, |
| "loss": 0.2259, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.7183499288762447, |
| "grad_norm": 3.413560155663082, |
| "learning_rate": 9.421497372675133e-06, |
| "loss": 0.208, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.7254623044096729, |
| "grad_norm": 2.26900305381711, |
| "learning_rate": 9.402024874730928e-06, |
| "loss": 0.2277, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.732574679943101, |
| "grad_norm": 3.5894430284698315, |
| "learning_rate": 9.382250869820146e-06, |
| "loss": 0.1926, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.7396870554765291, |
| "grad_norm": 3.267737905170995, |
| "learning_rate": 9.36217671231956e-06, |
| "loss": 0.2299, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.7467994310099573, |
| "grad_norm": 2.7538943048992737, |
| "learning_rate": 9.341803777164228e-06, |
| "loss": 0.1708, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.7539118065433855, |
| "grad_norm": 3.867540040555883, |
| "learning_rate": 9.321133459753322e-06, |
| "loss": 0.2072, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.7610241820768137, |
| "grad_norm": 2.3384449104832226, |
| "learning_rate": 9.300167175854564e-06, |
| "loss": 0.1875, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.7681365576102418, |
| "grad_norm": 3.6436777076779348, |
| "learning_rate": 9.278906361507238e-06, |
| "loss": 0.173, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.7752489331436699, |
| "grad_norm": 2.623342004246653, |
| "learning_rate": 9.257352472923842e-06, |
| "loss": 0.1489, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.7823613086770982, |
| "grad_norm": 2.9293688128652606, |
| "learning_rate": 9.235506986390346e-06, |
| "loss": 0.1423, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 3.1229986788680653, |
| "learning_rate": 9.213371398165077e-06, |
| "loss": 0.1564, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.7965860597439545, |
| "grad_norm": 3.5638406658438826, |
| "learning_rate": 9.190947224376238e-06, |
| "loss": 0.1872, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.8036984352773826, |
| "grad_norm": 3.754826640146973, |
| "learning_rate": 9.168236000918063e-06, |
| "loss": 0.1483, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.8108108108108109, |
| "grad_norm": 2.494125324383473, |
| "learning_rate": 9.145239283345618e-06, |
| "loss": 0.1272, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.817923186344239, |
| "grad_norm": 2.1750463421723003, |
| "learning_rate": 9.121958646768251e-06, |
| "loss": 0.1361, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.8250355618776671, |
| "grad_norm": 2.6835693031385035, |
| "learning_rate": 9.09839568574173e-06, |
| "loss": 0.1001, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.8321479374110953, |
| "grad_norm": 2.520530993255376, |
| "learning_rate": 9.074552014158994e-06, |
| "loss": 0.1193, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.8392603129445235, |
| "grad_norm": 2.583475505711053, |
| "learning_rate": 9.050429265139647e-06, |
| "loss": 0.1122, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.8463726884779517, |
| "grad_norm": 3.0551608668064736, |
| "learning_rate": 9.026029090918076e-06, |
| "loss": 0.1345, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.8534850640113798, |
| "grad_norm": 2.7079152732306917, |
| "learning_rate": 9.001353162730297e-06, |
| "loss": 0.1134, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8605974395448079, |
| "grad_norm": 2.2611924634890075, |
| "learning_rate": 8.976403170699486e-06, |
| "loss": 0.1026, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.8677098150782361, |
| "grad_norm": 1.987002883566529, |
| "learning_rate": 8.951180823720212e-06, |
| "loss": 0.0967, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.8748221906116643, |
| "grad_norm": 2.7967317585114615, |
| "learning_rate": 8.925687849341398e-06, |
| "loss": 0.0819, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.8819345661450925, |
| "grad_norm": 2.65859268119004, |
| "learning_rate": 8.899925993647994e-06, |
| "loss": 0.0931, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.8890469416785206, |
| "grad_norm": 2.5541801040927226, |
| "learning_rate": 8.873897021141378e-06, |
| "loss": 0.0888, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.8961593172119487, |
| "grad_norm": 2.6513786896328413, |
| "learning_rate": 8.847602714618504e-06, |
| "loss": 0.0839, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.903271692745377, |
| "grad_norm": 2.3497391818693587, |
| "learning_rate": 8.821044875049796e-06, |
| "loss": 0.0878, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.9103840682788051, |
| "grad_norm": 2.067880100094928, |
| "learning_rate": 8.794225321455788e-06, |
| "loss": 0.0866, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.9174964438122333, |
| "grad_norm": 2.466600341108382, |
| "learning_rate": 8.767145890782542e-06, |
| "loss": 0.0849, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.9246088193456614, |
| "grad_norm": 2.694537159823399, |
| "learning_rate": 8.739808437775825e-06, |
| "loss": 0.0773, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.9317211948790897, |
| "grad_norm": 3.051119356918663, |
| "learning_rate": 8.71221483485407e-06, |
| "loss": 0.0887, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.9388335704125178, |
| "grad_norm": 2.091226963672429, |
| "learning_rate": 8.684366971980139e-06, |
| "loss": 0.0739, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.9459459459459459, |
| "grad_norm": 2.6573993659558885, |
| "learning_rate": 8.656266756531857e-06, |
| "loss": 0.0757, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.9530583214793741, |
| "grad_norm": 2.5135440840845593, |
| "learning_rate": 8.627916113171396e-06, |
| "loss": 0.0695, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.9601706970128022, |
| "grad_norm": 1.8647689285533582, |
| "learning_rate": 8.599316983713419e-06, |
| "loss": 0.0703, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.9672830725462305, |
| "grad_norm": 2.1656321527764444, |
| "learning_rate": 8.570471326992105e-06, |
| "loss": 0.062, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.9743954480796586, |
| "grad_norm": 2.705238359384965, |
| "learning_rate": 8.54138111872697e-06, |
| "loss": 0.0755, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.9815078236130867, |
| "grad_norm": 1.4926114349562027, |
| "learning_rate": 8.512048351387551e-06, |
| "loss": 0.0656, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.9886201991465149, |
| "grad_norm": 2.193183643997932, |
| "learning_rate": 8.482475034056927e-06, |
| "loss": 0.0659, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.9957325746799431, |
| "grad_norm": 2.0527279052017264, |
| "learning_rate": 8.452663192294121e-06, |
| "loss": 0.0576, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.0028449502133712, |
| "grad_norm": 2.043379604895136, |
| "learning_rate": 8.42261486799536e-06, |
| "loss": 0.0518, |
| "step": 1410 |
| }, |
| { |
| "epoch": 1.0099573257467995, |
| "grad_norm": 1.7935460456418109, |
| "learning_rate": 8.392332119254214e-06, |
| "loss": 0.0363, |
| "step": 1420 |
| }, |
| { |
| "epoch": 1.0170697012802277, |
| "grad_norm": 1.9591421706180754, |
| "learning_rate": 8.361817020220647e-06, |
| "loss": 0.0345, |
| "step": 1430 |
| }, |
| { |
| "epoch": 1.0241820768136558, |
| "grad_norm": 1.904127146547918, |
| "learning_rate": 8.331071660958936e-06, |
| "loss": 0.039, |
| "step": 1440 |
| }, |
| { |
| "epoch": 1.031294452347084, |
| "grad_norm": 1.8927150070468237, |
| "learning_rate": 8.300098147304523e-06, |
| "loss": 0.0365, |
| "step": 1450 |
| }, |
| { |
| "epoch": 1.038406827880512, |
| "grad_norm": 1.9578224146696355, |
| "learning_rate": 8.268898600719785e-06, |
| "loss": 0.0431, |
| "step": 1460 |
| }, |
| { |
| "epoch": 1.0455192034139402, |
| "grad_norm": 2.119890142949488, |
| "learning_rate": 8.237475158148724e-06, |
| "loss": 0.0429, |
| "step": 1470 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 1.9482483964200852, |
| "learning_rate": 8.205829971870602e-06, |
| "loss": 0.0397, |
| "step": 1480 |
| }, |
| { |
| "epoch": 1.0597439544807965, |
| "grad_norm": 1.7329874393672655, |
| "learning_rate": 8.173965209352524e-06, |
| "loss": 0.0344, |
| "step": 1490 |
| }, |
| { |
| "epoch": 1.0668563300142249, |
| "grad_norm": 1.8911139378477928, |
| "learning_rate": 8.14188305310099e-06, |
| "loss": 0.0464, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.073968705547653, |
| "grad_norm": 2.450233012383526, |
| "learning_rate": 8.109585700512395e-06, |
| "loss": 0.0375, |
| "step": 1510 |
| }, |
| { |
| "epoch": 1.0810810810810811, |
| "grad_norm": 2.0138094788301166, |
| "learning_rate": 8.077075363722542e-06, |
| "loss": 0.0389, |
| "step": 1520 |
| }, |
| { |
| "epoch": 1.0881934566145093, |
| "grad_norm": 2.076572644222088, |
| "learning_rate": 8.044354269455109e-06, |
| "loss": 0.0436, |
| "step": 1530 |
| }, |
| { |
| "epoch": 1.0953058321479374, |
| "grad_norm": 1.9101229450735917, |
| "learning_rate": 8.011424658869142e-06, |
| "loss": 0.0357, |
| "step": 1540 |
| }, |
| { |
| "epoch": 1.1024182076813656, |
| "grad_norm": 1.130649417703215, |
| "learning_rate": 7.978288787405556e-06, |
| "loss": 0.0362, |
| "step": 1550 |
| }, |
| { |
| "epoch": 1.1095305832147937, |
| "grad_norm": 1.1581533245467266, |
| "learning_rate": 7.944948924632643e-06, |
| "loss": 0.0345, |
| "step": 1560 |
| }, |
| { |
| "epoch": 1.1166429587482218, |
| "grad_norm": 1.6643524677849526, |
| "learning_rate": 7.911407354090634e-06, |
| "loss": 0.0354, |
| "step": 1570 |
| }, |
| { |
| "epoch": 1.12375533428165, |
| "grad_norm": 1.9726198917599644, |
| "learning_rate": 7.877666373135287e-06, |
| "loss": 0.0346, |
| "step": 1580 |
| }, |
| { |
| "epoch": 1.1308677098150781, |
| "grad_norm": 1.6692436200631287, |
| "learning_rate": 7.84372829278053e-06, |
| "loss": 0.038, |
| "step": 1590 |
| }, |
| { |
| "epoch": 1.1379800853485065, |
| "grad_norm": 1.7045565380565189, |
| "learning_rate": 7.809595437540189e-06, |
| "loss": 0.0327, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.1450924608819346, |
| "grad_norm": 1.9976160352568044, |
| "learning_rate": 7.775270145268755e-06, |
| "loss": 0.0256, |
| "step": 1610 |
| }, |
| { |
| "epoch": 1.1522048364153628, |
| "grad_norm": 1.3781171703418404, |
| "learning_rate": 7.740754767001278e-06, |
| "loss": 0.039, |
| "step": 1620 |
| }, |
| { |
| "epoch": 1.159317211948791, |
| "grad_norm": 1.675366937408603, |
| "learning_rate": 7.706051666792318e-06, |
| "loss": 0.0353, |
| "step": 1630 |
| }, |
| { |
| "epoch": 1.166429587482219, |
| "grad_norm": 1.5507760610752672, |
| "learning_rate": 7.671163221554043e-06, |
| "loss": 0.0353, |
| "step": 1640 |
| }, |
| { |
| "epoch": 1.1735419630156472, |
| "grad_norm": 1.5578057994726024, |
| "learning_rate": 7.636091820893417e-06, |
| "loss": 0.0374, |
| "step": 1650 |
| }, |
| { |
| "epoch": 1.1806543385490753, |
| "grad_norm": 1.9536673456849045, |
| "learning_rate": 7.600839866948528e-06, |
| "loss": 0.0363, |
| "step": 1660 |
| }, |
| { |
| "epoch": 1.1877667140825037, |
| "grad_norm": 1.4180294508669007, |
| "learning_rate": 7.565409774224066e-06, |
| "loss": 0.0349, |
| "step": 1670 |
| }, |
| { |
| "epoch": 1.1948790896159318, |
| "grad_norm": 1.6616296432221909, |
| "learning_rate": 7.529803969425941e-06, |
| "loss": 0.0307, |
| "step": 1680 |
| }, |
| { |
| "epoch": 1.20199146514936, |
| "grad_norm": 1.7138246686303804, |
| "learning_rate": 7.494024891295075e-06, |
| "loss": 0.0322, |
| "step": 1690 |
| }, |
| { |
| "epoch": 1.209103840682788, |
| "grad_norm": 1.3613855884690513, |
| "learning_rate": 7.458074990440363e-06, |
| "loss": 0.0293, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.2162162162162162, |
| "grad_norm": 2.4114521805394205, |
| "learning_rate": 7.421956729170823e-06, |
| "loss": 0.0344, |
| "step": 1710 |
| }, |
| { |
| "epoch": 1.2233285917496444, |
| "grad_norm": 1.9233612034450194, |
| "learning_rate": 7.385672581326954e-06, |
| "loss": 0.0351, |
| "step": 1720 |
| }, |
| { |
| "epoch": 1.2304409672830725, |
| "grad_norm": 1.7307194070590812, |
| "learning_rate": 7.34922503211128e-06, |
| "loss": 0.0353, |
| "step": 1730 |
| }, |
| { |
| "epoch": 1.2375533428165006, |
| "grad_norm": 1.468735660134803, |
| "learning_rate": 7.312616577918149e-06, |
| "loss": 0.03, |
| "step": 1740 |
| }, |
| { |
| "epoch": 1.2446657183499288, |
| "grad_norm": 0.9815553395553774, |
| "learning_rate": 7.2758497261627345e-06, |
| "loss": 0.0267, |
| "step": 1750 |
| }, |
| { |
| "epoch": 1.251778093883357, |
| "grad_norm": 1.4851270984075178, |
| "learning_rate": 7.238926995109306e-06, |
| "loss": 0.0288, |
| "step": 1760 |
| }, |
| { |
| "epoch": 1.2588904694167853, |
| "grad_norm": 2.2537032746619183, |
| "learning_rate": 7.201850913698736e-06, |
| "loss": 0.0364, |
| "step": 1770 |
| }, |
| { |
| "epoch": 1.2660028449502134, |
| "grad_norm": 1.454211009387941, |
| "learning_rate": 7.164624021375294e-06, |
| "loss": 0.0252, |
| "step": 1780 |
| }, |
| { |
| "epoch": 1.2731152204836416, |
| "grad_norm": 1.4034123768391151, |
| "learning_rate": 7.12724886791271e-06, |
| "loss": 0.0266, |
| "step": 1790 |
| }, |
| { |
| "epoch": 1.2802275960170697, |
| "grad_norm": 1.546526107411268, |
| "learning_rate": 7.08972801323953e-06, |
| "loss": 0.03, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.2873399715504978, |
| "grad_norm": 1.6929689381873503, |
| "learning_rate": 7.052064027263785e-06, |
| "loss": 0.0235, |
| "step": 1810 |
| }, |
| { |
| "epoch": 1.294452347083926, |
| "grad_norm": 1.5130921744879449, |
| "learning_rate": 7.014259489696968e-06, |
| "loss": 0.0243, |
| "step": 1820 |
| }, |
| { |
| "epoch": 1.3015647226173541, |
| "grad_norm": 1.9572718096346318, |
| "learning_rate": 6.976316989877343e-06, |
| "loss": 0.0249, |
| "step": 1830 |
| }, |
| { |
| "epoch": 1.3086770981507825, |
| "grad_norm": 1.2611303057850376, |
| "learning_rate": 6.938239126592592e-06, |
| "loss": 0.0263, |
| "step": 1840 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 1.2902816153314383, |
| "learning_rate": 6.90002850790182e-06, |
| "loss": 0.0298, |
| "step": 1850 |
| }, |
| { |
| "epoch": 1.3229018492176388, |
| "grad_norm": 0.9719782814773048, |
| "learning_rate": 6.861687750956922e-06, |
| "loss": 0.027, |
| "step": 1860 |
| }, |
| { |
| "epoch": 1.330014224751067, |
| "grad_norm": 1.1718631838309244, |
| "learning_rate": 6.823219481823318e-06, |
| "loss": 0.0245, |
| "step": 1870 |
| }, |
| { |
| "epoch": 1.337126600284495, |
| "grad_norm": 1.3461970346065844, |
| "learning_rate": 6.784626335300102e-06, |
| "loss": 0.0198, |
| "step": 1880 |
| }, |
| { |
| "epoch": 1.3442389758179232, |
| "grad_norm": 1.1445639186428003, |
| "learning_rate": 6.745910954739563e-06, |
| "loss": 0.0274, |
| "step": 1890 |
| }, |
| { |
| "epoch": 1.3513513513513513, |
| "grad_norm": 1.9649035858601103, |
| "learning_rate": 6.707075991866143e-06, |
| "loss": 0.0268, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.3584637268847795, |
| "grad_norm": 1.3779682004442027, |
| "learning_rate": 6.668124106594813e-06, |
| "loss": 0.0274, |
| "step": 1910 |
| }, |
| { |
| "epoch": 1.3655761024182076, |
| "grad_norm": 0.9339287727084011, |
| "learning_rate": 6.629057966848879e-06, |
| "loss": 0.0244, |
| "step": 1920 |
| }, |
| { |
| "epoch": 1.3726884779516357, |
| "grad_norm": 1.3418194746364869, |
| "learning_rate": 6.589880248377258e-06, |
| "loss": 0.023, |
| "step": 1930 |
| }, |
| { |
| "epoch": 1.379800853485064, |
| "grad_norm": 1.6101698103903805, |
| "learning_rate": 6.550593634571205e-06, |
| "loss": 0.018, |
| "step": 1940 |
| }, |
| { |
| "epoch": 1.3869132290184922, |
| "grad_norm": 1.7415141112043047, |
| "learning_rate": 6.511200816280523e-06, |
| "loss": 0.021, |
| "step": 1950 |
| }, |
| { |
| "epoch": 1.3940256045519204, |
| "grad_norm": 1.2100486434644262, |
| "learning_rate": 6.471704491629251e-06, |
| "loss": 0.0285, |
| "step": 1960 |
| }, |
| { |
| "epoch": 1.4011379800853485, |
| "grad_norm": 1.301261422264456, |
| "learning_rate": 6.432107365830872e-06, |
| "loss": 0.0198, |
| "step": 1970 |
| }, |
| { |
| "epoch": 1.4082503556187767, |
| "grad_norm": 1.3543714484816034, |
| "learning_rate": 6.392412151003019e-06, |
| "loss": 0.0244, |
| "step": 1980 |
| }, |
| { |
| "epoch": 1.4153627311522048, |
| "grad_norm": 1.4893305665999936, |
| "learning_rate": 6.3526215659817156e-06, |
| "loss": 0.0226, |
| "step": 1990 |
| }, |
| { |
| "epoch": 1.422475106685633, |
| "grad_norm": 1.1217736569772296, |
| "learning_rate": 6.312738336135159e-06, |
| "loss": 0.019, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.4295874822190613, |
| "grad_norm": 1.530506526795571, |
| "learning_rate": 6.272765193177044e-06, |
| "loss": 0.0196, |
| "step": 2010 |
| }, |
| { |
| "epoch": 1.4366998577524894, |
| "grad_norm": 1.1830746085813704, |
| "learning_rate": 6.23270487497947e-06, |
| "loss": 0.0189, |
| "step": 2020 |
| }, |
| { |
| "epoch": 1.4438122332859176, |
| "grad_norm": 1.3714016439826322, |
| "learning_rate": 6.192560125385412e-06, |
| "loss": 0.025, |
| "step": 2030 |
| }, |
| { |
| "epoch": 1.4509246088193457, |
| "grad_norm": 1.1129988250796872, |
| "learning_rate": 6.152333694020781e-06, |
| "loss": 0.0184, |
| "step": 2040 |
| }, |
| { |
| "epoch": 1.4580369843527738, |
| "grad_norm": 2.0430785612059346, |
| "learning_rate": 6.112028336106108e-06, |
| "loss": 0.023, |
| "step": 2050 |
| }, |
| { |
| "epoch": 1.465149359886202, |
| "grad_norm": 1.4200748013522733, |
| "learning_rate": 6.071646812267817e-06, |
| "loss": 0.0167, |
| "step": 2060 |
| }, |
| { |
| "epoch": 1.4722617354196301, |
| "grad_norm": 1.8027434372189237, |
| "learning_rate": 6.031191888349155e-06, |
| "loss": 0.0202, |
| "step": 2070 |
| }, |
| { |
| "epoch": 1.4793741109530583, |
| "grad_norm": 1.1171787456661884, |
| "learning_rate": 5.990666335220738e-06, |
| "loss": 0.0178, |
| "step": 2080 |
| }, |
| { |
| "epoch": 1.4864864864864864, |
| "grad_norm": 1.6452874612147976, |
| "learning_rate": 5.950072928590781e-06, |
| "loss": 0.018, |
| "step": 2090 |
| }, |
| { |
| "epoch": 1.4935988620199145, |
| "grad_norm": 0.9884439749765455, |
| "learning_rate": 5.909414448814971e-06, |
| "loss": 0.0209, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.5007112375533427, |
| "grad_norm": 1.554996157376441, |
| "learning_rate": 5.8686936807060335e-06, |
| "loss": 0.0192, |
| "step": 2110 |
| }, |
| { |
| "epoch": 1.5078236130867708, |
| "grad_norm": 1.0929475144672365, |
| "learning_rate": 5.827913413343003e-06, |
| "loss": 0.018, |
| "step": 2120 |
| }, |
| { |
| "epoch": 1.5149359886201992, |
| "grad_norm": 1.0492081159201816, |
| "learning_rate": 5.787076439880177e-06, |
| "loss": 0.0179, |
| "step": 2130 |
| }, |
| { |
| "epoch": 1.5220483641536273, |
| "grad_norm": 1.2333928332291602, |
| "learning_rate": 5.746185557355814e-06, |
| "loss": 0.0211, |
| "step": 2140 |
| }, |
| { |
| "epoch": 1.5291607396870555, |
| "grad_norm": 0.8940904857757537, |
| "learning_rate": 5.70524356650056e-06, |
| "loss": 0.0168, |
| "step": 2150 |
| }, |
| { |
| "epoch": 1.5362731152204836, |
| "grad_norm": 0.9594678027850269, |
| "learning_rate": 5.664253271545603e-06, |
| "loss": 0.0172, |
| "step": 2160 |
| }, |
| { |
| "epoch": 1.543385490753912, |
| "grad_norm": 1.133529225026687, |
| "learning_rate": 5.623217480030622e-06, |
| "loss": 0.0178, |
| "step": 2170 |
| }, |
| { |
| "epoch": 1.55049786628734, |
| "grad_norm": 1.0245366404113008, |
| "learning_rate": 5.58213900261148e-06, |
| "loss": 0.0135, |
| "step": 2180 |
| }, |
| { |
| "epoch": 1.5576102418207682, |
| "grad_norm": 0.7068889699880522, |
| "learning_rate": 5.541020652867713e-06, |
| "loss": 0.0153, |
| "step": 2190 |
| }, |
| { |
| "epoch": 1.5647226173541964, |
| "grad_norm": 1.2084727884034199, |
| "learning_rate": 5.49986524710983e-06, |
| "loss": 0.0143, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.5718349928876245, |
| "grad_norm": 1.5054621892964164, |
| "learning_rate": 5.4586756041864065e-06, |
| "loss": 0.016, |
| "step": 2210 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 1.4176580158063212, |
| "learning_rate": 5.417454545291017e-06, |
| "loss": 0.0168, |
| "step": 2220 |
| }, |
| { |
| "epoch": 1.5860597439544808, |
| "grad_norm": 1.1824924291702557, |
| "learning_rate": 5.376204893769e-06, |
| "loss": 0.0198, |
| "step": 2230 |
| }, |
| { |
| "epoch": 1.593172119487909, |
| "grad_norm": 1.7631808589665254, |
| "learning_rate": 5.334929474924093e-06, |
| "loss": 0.0155, |
| "step": 2240 |
| }, |
| { |
| "epoch": 1.600284495021337, |
| "grad_norm": 1.215149372258629, |
| "learning_rate": 5.293631115824897e-06, |
| "loss": 0.0138, |
| "step": 2250 |
| }, |
| { |
| "epoch": 1.6073968705547652, |
| "grad_norm": 1.718329335563461, |
| "learning_rate": 5.252312645111266e-06, |
| "loss": 0.0173, |
| "step": 2260 |
| }, |
| { |
| "epoch": 1.6145092460881934, |
| "grad_norm": 1.0751615799620988, |
| "learning_rate": 5.2109768928005454e-06, |
| "loss": 0.0142, |
| "step": 2270 |
| }, |
| { |
| "epoch": 1.6216216216216215, |
| "grad_norm": 0.8027120709435296, |
| "learning_rate": 5.169626690093751e-06, |
| "loss": 0.014, |
| "step": 2280 |
| }, |
| { |
| "epoch": 1.6287339971550496, |
| "grad_norm": 1.6699231722730825, |
| "learning_rate": 5.128264869181646e-06, |
| "loss": 0.0127, |
| "step": 2290 |
| }, |
| { |
| "epoch": 1.635846372688478, |
| "grad_norm": 1.2559995566307685, |
| "learning_rate": 5.086894263050755e-06, |
| "loss": 0.011, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.6429587482219061, |
| "grad_norm": 1.349960059022035, |
| "learning_rate": 5.045517705289328e-06, |
| "loss": 0.0111, |
| "step": 2310 |
| }, |
| { |
| "epoch": 1.6500711237553343, |
| "grad_norm": 0.8142603267011976, |
| "learning_rate": 5.004138029893257e-06, |
| "loss": 0.0138, |
| "step": 2320 |
| }, |
| { |
| "epoch": 1.6571834992887624, |
| "grad_norm": 1.0621437820203163, |
| "learning_rate": 4.9627580710719734e-06, |
| "loss": 0.0128, |
| "step": 2330 |
| }, |
| { |
| "epoch": 1.6642958748221908, |
| "grad_norm": 1.7262184368035551, |
| "learning_rate": 4.921380663054318e-06, |
| "loss": 0.0128, |
| "step": 2340 |
| }, |
| { |
| "epoch": 1.671408250355619, |
| "grad_norm": 1.2695847947859624, |
| "learning_rate": 4.880008639894421e-06, |
| "loss": 0.014, |
| "step": 2350 |
| }, |
| { |
| "epoch": 1.678520625889047, |
| "grad_norm": 0.9261536386806662, |
| "learning_rate": 4.838644835277585e-06, |
| "loss": 0.0144, |
| "step": 2360 |
| }, |
| { |
| "epoch": 1.6856330014224752, |
| "grad_norm": 0.6867762051400554, |
| "learning_rate": 4.79729208232621e-06, |
| "loss": 0.0109, |
| "step": 2370 |
| }, |
| { |
| "epoch": 1.6927453769559033, |
| "grad_norm": 0.6232870542134327, |
| "learning_rate": 4.75595321340573e-06, |
| "loss": 0.0122, |
| "step": 2380 |
| }, |
| { |
| "epoch": 1.6998577524893315, |
| "grad_norm": 0.970176828182309, |
| "learning_rate": 4.714631059930622e-06, |
| "loss": 0.012, |
| "step": 2390 |
| }, |
| { |
| "epoch": 1.7069701280227596, |
| "grad_norm": 1.6173382913062293, |
| "learning_rate": 4.6733284521704816e-06, |
| "loss": 0.0124, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.7140825035561877, |
| "grad_norm": 0.9844171855603, |
| "learning_rate": 4.632048219056159e-06, |
| "loss": 0.012, |
| "step": 2410 |
| }, |
| { |
| "epoch": 1.7211948790896159, |
| "grad_norm": 1.3183824382551952, |
| "learning_rate": 4.590793187986003e-06, |
| "loss": 0.0149, |
| "step": 2420 |
| }, |
| { |
| "epoch": 1.728307254623044, |
| "grad_norm": 0.5730734000902559, |
| "learning_rate": 4.549566184632206e-06, |
| "loss": 0.0117, |
| "step": 2430 |
| }, |
| { |
| "epoch": 1.7354196301564722, |
| "grad_norm": 0.9239894283732394, |
| "learning_rate": 4.508370032747261e-06, |
| "loss": 0.0092, |
| "step": 2440 |
| }, |
| { |
| "epoch": 1.7425320056899003, |
| "grad_norm": 0.9732516534559529, |
| "learning_rate": 4.467207553970564e-06, |
| "loss": 0.012, |
| "step": 2450 |
| }, |
| { |
| "epoch": 1.7496443812233284, |
| "grad_norm": 0.9139268416210883, |
| "learning_rate": 4.426081567635137e-06, |
| "loss": 0.0092, |
| "step": 2460 |
| }, |
| { |
| "epoch": 1.7567567567567568, |
| "grad_norm": 1.2921223854630304, |
| "learning_rate": 4.3849948905745385e-06, |
| "loss": 0.0137, |
| "step": 2470 |
| }, |
| { |
| "epoch": 1.763869132290185, |
| "grad_norm": 0.8703692417885042, |
| "learning_rate": 4.343950336929927e-06, |
| "loss": 0.0095, |
| "step": 2480 |
| }, |
| { |
| "epoch": 1.770981507823613, |
| "grad_norm": 0.9536442700427114, |
| "learning_rate": 4.302950717957304e-06, |
| "loss": 0.0098, |
| "step": 2490 |
| }, |
| { |
| "epoch": 1.7780938833570412, |
| "grad_norm": 0.852536162993322, |
| "learning_rate": 4.261998841834972e-06, |
| "loss": 0.0101, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.7852062588904696, |
| "grad_norm": 1.248725823462744, |
| "learning_rate": 4.221097513471199e-06, |
| "loss": 0.0094, |
| "step": 2510 |
| }, |
| { |
| "epoch": 1.7923186344238977, |
| "grad_norm": 0.487586863686056, |
| "learning_rate": 4.18024953431209e-06, |
| "loss": 0.009, |
| "step": 2520 |
| }, |
| { |
| "epoch": 1.7994310099573259, |
| "grad_norm": 0.6857485925261184, |
| "learning_rate": 4.13945770214971e-06, |
| "loss": 0.0098, |
| "step": 2530 |
| }, |
| { |
| "epoch": 1.806543385490754, |
| "grad_norm": 0.5224101041795471, |
| "learning_rate": 4.098724810930472e-06, |
| "loss": 0.0077, |
| "step": 2540 |
| }, |
| { |
| "epoch": 1.8136557610241821, |
| "grad_norm": 0.3255236838052598, |
| "learning_rate": 4.058053650563747e-06, |
| "loss": 0.0069, |
| "step": 2550 |
| }, |
| { |
| "epoch": 1.8207681365576103, |
| "grad_norm": 0.5535169044707119, |
| "learning_rate": 4.017447006730796e-06, |
| "loss": 0.0084, |
| "step": 2560 |
| }, |
| { |
| "epoch": 1.8278805120910384, |
| "grad_norm": 0.6587680546008802, |
| "learning_rate": 3.976907660693954e-06, |
| "loss": 0.0068, |
| "step": 2570 |
| }, |
| { |
| "epoch": 1.8349928876244666, |
| "grad_norm": 0.7451030339766666, |
| "learning_rate": 3.936438389106154e-06, |
| "loss": 0.0091, |
| "step": 2580 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 0.7854707802079127, |
| "learning_rate": 3.896041963820724e-06, |
| "loss": 0.0105, |
| "step": 2590 |
| }, |
| { |
| "epoch": 1.8492176386913228, |
| "grad_norm": 0.6990927586140553, |
| "learning_rate": 3.855721151701548e-06, |
| "loss": 0.0099, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.856330014224751, |
| "grad_norm": 1.318630670215527, |
| "learning_rate": 3.815478714433559e-06, |
| "loss": 0.0095, |
| "step": 2610 |
| }, |
| { |
| "epoch": 1.863442389758179, |
| "grad_norm": 0.8518153474787149, |
| "learning_rate": 3.775317408333571e-06, |
| "loss": 0.0105, |
| "step": 2620 |
| }, |
| { |
| "epoch": 1.8705547652916072, |
| "grad_norm": 1.0023735620026466, |
| "learning_rate": 3.7352399841614996e-06, |
| "loss": 0.0082, |
| "step": 2630 |
| }, |
| { |
| "epoch": 1.8776671408250356, |
| "grad_norm": 0.9809887806472293, |
| "learning_rate": 3.695249186931954e-06, |
| "loss": 0.0087, |
| "step": 2640 |
| }, |
| { |
| "epoch": 1.8847795163584637, |
| "grad_norm": 0.9540456428445807, |
| "learning_rate": 3.655347755726224e-06, |
| "loss": 0.0076, |
| "step": 2650 |
| }, |
| { |
| "epoch": 1.8918918918918919, |
| "grad_norm": 0.7066159412282622, |
| "learning_rate": 3.6155384235046674e-06, |
| "loss": 0.0086, |
| "step": 2660 |
| }, |
| { |
| "epoch": 1.89900426742532, |
| "grad_norm": 0.5137592216850851, |
| "learning_rate": 3.5758239169195276e-06, |
| "loss": 0.005, |
| "step": 2670 |
| }, |
| { |
| "epoch": 1.9061166429587484, |
| "grad_norm": 0.3439517878091387, |
| "learning_rate": 3.5362069561281764e-06, |
| "loss": 0.0072, |
| "step": 2680 |
| }, |
| { |
| "epoch": 1.9132290184921765, |
| "grad_norm": 0.3970319267325305, |
| "learning_rate": 3.4966902546068016e-06, |
| "loss": 0.0072, |
| "step": 2690 |
| }, |
| { |
| "epoch": 1.9203413940256047, |
| "grad_norm": 0.9810798909167313, |
| "learning_rate": 3.4572765189645516e-06, |
| "loss": 0.0073, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.9274537695590328, |
| "grad_norm": 1.4872117479815739, |
| "learning_rate": 3.4179684487581555e-06, |
| "loss": 0.0067, |
| "step": 2710 |
| }, |
| { |
| "epoch": 1.934566145092461, |
| "grad_norm": 0.17941271447530188, |
| "learning_rate": 3.3787687363070256e-06, |
| "loss": 0.0075, |
| "step": 2720 |
| }, |
| { |
| "epoch": 1.941678520625889, |
| "grad_norm": 0.21377268278340267, |
| "learning_rate": 3.3396800665088435e-06, |
| "loss": 0.0069, |
| "step": 2730 |
| }, |
| { |
| "epoch": 1.9487908961593172, |
| "grad_norm": 0.8027020001474104, |
| "learning_rate": 3.300705116655672e-06, |
| "loss": 0.0058, |
| "step": 2740 |
| }, |
| { |
| "epoch": 1.9559032716927454, |
| "grad_norm": 0.607769605088779, |
| "learning_rate": 3.26184655625058e-06, |
| "loss": 0.0055, |
| "step": 2750 |
| }, |
| { |
| "epoch": 1.9630156472261735, |
| "grad_norm": 0.29396831979764293, |
| "learning_rate": 3.2231070468247954e-06, |
| "loss": 0.0062, |
| "step": 2760 |
| }, |
| { |
| "epoch": 1.9701280227596016, |
| "grad_norm": 0.49083863249583537, |
| "learning_rate": 3.1844892417554102e-06, |
| "loss": 0.0063, |
| "step": 2770 |
| }, |
| { |
| "epoch": 1.9772403982930298, |
| "grad_norm": 0.710753958854101, |
| "learning_rate": 3.1459957860836528e-06, |
| "loss": 0.0065, |
| "step": 2780 |
| }, |
| { |
| "epoch": 1.984352773826458, |
| "grad_norm": 0.27012727932102704, |
| "learning_rate": 3.1076293163337074e-06, |
| "loss": 0.0068, |
| "step": 2790 |
| }, |
| { |
| "epoch": 1.991465149359886, |
| "grad_norm": 0.34603765606499187, |
| "learning_rate": 3.069392460332141e-06, |
| "loss": 0.0057, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.9985775248933144, |
| "grad_norm": 0.3721250969176249, |
| "learning_rate": 3.031287837027911e-06, |
| "loss": 0.0066, |
| "step": 2810 |
| }, |
| { |
| "epoch": 2.0056899004267423, |
| "grad_norm": 0.781768421432185, |
| "learning_rate": 2.9933180563129936e-06, |
| "loss": 0.0041, |
| "step": 2820 |
| }, |
| { |
| "epoch": 2.012802275960171, |
| "grad_norm": 0.24350008390092337, |
| "learning_rate": 2.955485718843616e-06, |
| "loss": 0.0056, |
| "step": 2830 |
| }, |
| { |
| "epoch": 2.019914651493599, |
| "grad_norm": 0.4576741832894929, |
| "learning_rate": 2.917793415862129e-06, |
| "loss": 0.0048, |
| "step": 2840 |
| }, |
| { |
| "epoch": 2.027027027027027, |
| "grad_norm": 0.9890835980780475, |
| "learning_rate": 2.880243729019546e-06, |
| "loss": 0.0038, |
| "step": 2850 |
| }, |
| { |
| "epoch": 2.0341394025604553, |
| "grad_norm": 0.3917033136267895, |
| "learning_rate": 2.842839230198685e-06, |
| "loss": 0.0052, |
| "step": 2860 |
| }, |
| { |
| "epoch": 2.0412517780938835, |
| "grad_norm": 0.12450209954114903, |
| "learning_rate": 2.805582481338044e-06, |
| "loss": 0.0047, |
| "step": 2870 |
| }, |
| { |
| "epoch": 2.0483641536273116, |
| "grad_norm": 0.5486661654701261, |
| "learning_rate": 2.7684760342563045e-06, |
| "loss": 0.0047, |
| "step": 2880 |
| }, |
| { |
| "epoch": 2.0554765291607398, |
| "grad_norm": 0.22758726780410876, |
| "learning_rate": 2.731522430477571e-06, |
| "loss": 0.0056, |
| "step": 2890 |
| }, |
| { |
| "epoch": 2.062588904694168, |
| "grad_norm": 0.2218164583744802, |
| "learning_rate": 2.694724201057273e-06, |
| "loss": 0.0048, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.069701280227596, |
| "grad_norm": 0.45353402328041514, |
| "learning_rate": 2.6580838664088214e-06, |
| "loss": 0.0042, |
| "step": 2910 |
| }, |
| { |
| "epoch": 2.076813655761024, |
| "grad_norm": 0.29165554258590237, |
| "learning_rate": 2.6216039361309753e-06, |
| "loss": 0.0044, |
| "step": 2920 |
| }, |
| { |
| "epoch": 2.0839260312944523, |
| "grad_norm": 0.42787997336579114, |
| "learning_rate": 2.5852869088359495e-06, |
| "loss": 0.0041, |
| "step": 2930 |
| }, |
| { |
| "epoch": 2.0910384068278804, |
| "grad_norm": 0.44323215466285076, |
| "learning_rate": 2.549135271978275e-06, |
| "loss": 0.0032, |
| "step": 2940 |
| }, |
| { |
| "epoch": 2.0981507823613086, |
| "grad_norm": 0.1143123602309504, |
| "learning_rate": 2.5131515016844345e-06, |
| "loss": 0.0046, |
| "step": 2950 |
| }, |
| { |
| "epoch": 2.1052631578947367, |
| "grad_norm": 0.16583828479799412, |
| "learning_rate": 2.4773380625832603e-06, |
| "loss": 0.0047, |
| "step": 2960 |
| }, |
| { |
| "epoch": 2.112375533428165, |
| "grad_norm": 0.15755302830922696, |
| "learning_rate": 2.4416974076371304e-06, |
| "loss": 0.0039, |
| "step": 2970 |
| }, |
| { |
| "epoch": 2.119487908961593, |
| "grad_norm": 0.62834650400931, |
| "learning_rate": 2.406231977973942e-06, |
| "loss": 0.0037, |
| "step": 2980 |
| }, |
| { |
| "epoch": 2.126600284495021, |
| "grad_norm": 0.3425562134173693, |
| "learning_rate": 2.3709442027199387e-06, |
| "loss": 0.0049, |
| "step": 2990 |
| }, |
| { |
| "epoch": 2.1337126600284497, |
| "grad_norm": 0.1176241490475843, |
| "learning_rate": 2.3358364988333066e-06, |
| "loss": 0.0045, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.140825035561878, |
| "grad_norm": 0.21718467446163836, |
| "learning_rate": 2.3009112709386454e-06, |
| "loss": 0.0052, |
| "step": 3010 |
| }, |
| { |
| "epoch": 2.147937411095306, |
| "grad_norm": 0.1447042548468856, |
| "learning_rate": 2.2661709111622666e-06, |
| "loss": 0.0047, |
| "step": 3020 |
| }, |
| { |
| "epoch": 2.155049786628734, |
| "grad_norm": 0.2850367854449551, |
| "learning_rate": 2.2316177989683458e-06, |
| "loss": 0.004, |
| "step": 3030 |
| }, |
| { |
| "epoch": 2.1621621621621623, |
| "grad_norm": 0.33564220562935804, |
| "learning_rate": 2.197254300995953e-06, |
| "loss": 0.0052, |
| "step": 3040 |
| }, |
| { |
| "epoch": 2.1692745376955904, |
| "grad_norm": 0.1545067926251289, |
| "learning_rate": 2.163082770896943e-06, |
| "loss": 0.0043, |
| "step": 3050 |
| }, |
| { |
| "epoch": 2.1763869132290186, |
| "grad_norm": 0.08868335935281069, |
| "learning_rate": 2.1291055491747643e-06, |
| "loss": 0.0034, |
| "step": 3060 |
| }, |
| { |
| "epoch": 2.1834992887624467, |
| "grad_norm": 0.0678499455537346, |
| "learning_rate": 2.095324963024137e-06, |
| "loss": 0.0039, |
| "step": 3070 |
| }, |
| { |
| "epoch": 2.190611664295875, |
| "grad_norm": 0.1962461433328382, |
| "learning_rate": 2.061743326171668e-06, |
| "loss": 0.0038, |
| "step": 3080 |
| }, |
| { |
| "epoch": 2.197724039829303, |
| "grad_norm": 0.07801886707618137, |
| "learning_rate": 2.02836293871736e-06, |
| "loss": 0.0035, |
| "step": 3090 |
| }, |
| { |
| "epoch": 2.204836415362731, |
| "grad_norm": 0.3629078506453925, |
| "learning_rate": 1.9951860869771e-06, |
| "loss": 0.0038, |
| "step": 3100 |
| }, |
| { |
| "epoch": 2.2119487908961593, |
| "grad_norm": 0.8806588814039079, |
| "learning_rate": 1.962215043326029e-06, |
| "loss": 0.004, |
| "step": 3110 |
| }, |
| { |
| "epoch": 2.2190611664295874, |
| "grad_norm": 0.33169199243250613, |
| "learning_rate": 1.9294520660429284e-06, |
| "loss": 0.0036, |
| "step": 3120 |
| }, |
| { |
| "epoch": 2.2261735419630155, |
| "grad_norm": 0.12310821458251077, |
| "learning_rate": 1.8968993991555301e-06, |
| "loss": 0.0045, |
| "step": 3130 |
| }, |
| { |
| "epoch": 2.2332859174964437, |
| "grad_norm": 0.1564234234161847, |
| "learning_rate": 1.8645592722868223e-06, |
| "loss": 0.0041, |
| "step": 3140 |
| }, |
| { |
| "epoch": 2.240398293029872, |
| "grad_norm": 0.1908716606221835, |
| "learning_rate": 1.8324339005023273e-06, |
| "loss": 0.0042, |
| "step": 3150 |
| }, |
| { |
| "epoch": 2.2475106685633, |
| "grad_norm": 0.17491525199519603, |
| "learning_rate": 1.8005254841584035e-06, |
| "loss": 0.0032, |
| "step": 3160 |
| }, |
| { |
| "epoch": 2.2546230440967285, |
| "grad_norm": 0.15681019357467124, |
| "learning_rate": 1.768836208751516e-06, |
| "loss": 0.0039, |
| "step": 3170 |
| }, |
| { |
| "epoch": 2.2617354196301562, |
| "grad_norm": 0.16172138112249296, |
| "learning_rate": 1.7373682447685624e-06, |
| "loss": 0.004, |
| "step": 3180 |
| }, |
| { |
| "epoch": 2.268847795163585, |
| "grad_norm": 0.10575834882863448, |
| "learning_rate": 1.706123747538196e-06, |
| "loss": 0.0035, |
| "step": 3190 |
| }, |
| { |
| "epoch": 2.275960170697013, |
| "grad_norm": 0.18222310954574267, |
| "learning_rate": 1.6751048570832184e-06, |
| "loss": 0.0041, |
| "step": 3200 |
| }, |
| { |
| "epoch": 2.283072546230441, |
| "grad_norm": 0.14875677905536833, |
| "learning_rate": 1.6443136979739855e-06, |
| "loss": 0.003, |
| "step": 3210 |
| }, |
| { |
| "epoch": 2.2901849217638692, |
| "grad_norm": 0.10898246145730768, |
| "learning_rate": 1.6137523791829007e-06, |
| "loss": 0.0034, |
| "step": 3220 |
| }, |
| { |
| "epoch": 2.2972972972972974, |
| "grad_norm": 0.1309461753215428, |
| "learning_rate": 1.5834229939399637e-06, |
| "loss": 0.0034, |
| "step": 3230 |
| }, |
| { |
| "epoch": 2.3044096728307255, |
| "grad_norm": 0.07200423508178247, |
| "learning_rate": 1.5533276195893987e-06, |
| "loss": 0.0037, |
| "step": 3240 |
| }, |
| { |
| "epoch": 2.3115220483641536, |
| "grad_norm": 0.28943328560772674, |
| "learning_rate": 1.5234683174473669e-06, |
| "loss": 0.0039, |
| "step": 3250 |
| }, |
| { |
| "epoch": 2.318634423897582, |
| "grad_norm": 0.5192612699526135, |
| "learning_rate": 1.493847132660789e-06, |
| "loss": 0.0034, |
| "step": 3260 |
| }, |
| { |
| "epoch": 2.32574679943101, |
| "grad_norm": 0.1606295965015448, |
| "learning_rate": 1.4644660940672628e-06, |
| "loss": 0.0044, |
| "step": 3270 |
| }, |
| { |
| "epoch": 2.332859174964438, |
| "grad_norm": 0.37034704670980706, |
| "learning_rate": 1.435327214056103e-06, |
| "loss": 0.0036, |
| "step": 3280 |
| }, |
| { |
| "epoch": 2.339971550497866, |
| "grad_norm": 0.1985714241377405, |
| "learning_rate": 1.406432488430508e-06, |
| "loss": 0.0041, |
| "step": 3290 |
| }, |
| { |
| "epoch": 2.3470839260312943, |
| "grad_norm": 0.13803180507649276, |
| "learning_rate": 1.3777838962708602e-06, |
| "loss": 0.0035, |
| "step": 3300 |
| }, |
| { |
| "epoch": 2.3541963015647225, |
| "grad_norm": 0.16321860803207505, |
| "learning_rate": 1.3493833997991745e-06, |
| "loss": 0.0033, |
| "step": 3310 |
| }, |
| { |
| "epoch": 2.3613086770981506, |
| "grad_norm": 0.2001811539323451, |
| "learning_rate": 1.3212329442446985e-06, |
| "loss": 0.0042, |
| "step": 3320 |
| }, |
| { |
| "epoch": 2.3684210526315788, |
| "grad_norm": 0.1453173744872287, |
| "learning_rate": 1.2933344577106822e-06, |
| "loss": 0.0032, |
| "step": 3330 |
| }, |
| { |
| "epoch": 2.3755334281650073, |
| "grad_norm": 0.10401910511567347, |
| "learning_rate": 1.2656898510423122e-06, |
| "loss": 0.0031, |
| "step": 3340 |
| }, |
| { |
| "epoch": 2.382645803698435, |
| "grad_norm": 0.10582948879092595, |
| "learning_rate": 1.2383010176958372e-06, |
| "loss": 0.0033, |
| "step": 3350 |
| }, |
| { |
| "epoch": 2.3897581792318636, |
| "grad_norm": 0.16511981732406306, |
| "learning_rate": 1.2111698336088717e-06, |
| "loss": 0.0039, |
| "step": 3360 |
| }, |
| { |
| "epoch": 2.3968705547652918, |
| "grad_norm": 0.14041169290258051, |
| "learning_rate": 1.1842981570719237e-06, |
| "loss": 0.0034, |
| "step": 3370 |
| }, |
| { |
| "epoch": 2.40398293029872, |
| "grad_norm": 0.216807318559693, |
| "learning_rate": 1.157687828601094e-06, |
| "loss": 0.0039, |
| "step": 3380 |
| }, |
| { |
| "epoch": 2.411095305832148, |
| "grad_norm": 0.1487410996270859, |
| "learning_rate": 1.1313406708120327e-06, |
| "loss": 0.0033, |
| "step": 3390 |
| }, |
| { |
| "epoch": 2.418207681365576, |
| "grad_norm": 0.17410715559913836, |
| "learning_rate": 1.1052584882950896e-06, |
| "loss": 0.0032, |
| "step": 3400 |
| }, |
| { |
| "epoch": 2.4253200568990043, |
| "grad_norm": 0.14679067077660998, |
| "learning_rate": 1.0794430674917262e-06, |
| "loss": 0.0029, |
| "step": 3410 |
| }, |
| { |
| "epoch": 2.4324324324324325, |
| "grad_norm": 0.11730320262217042, |
| "learning_rate": 1.0538961765721429e-06, |
| "loss": 0.0034, |
| "step": 3420 |
| }, |
| { |
| "epoch": 2.4395448079658606, |
| "grad_norm": 0.15601345944604691, |
| "learning_rate": 1.0286195653141822e-06, |
| "loss": 0.0033, |
| "step": 3430 |
| }, |
| { |
| "epoch": 2.4466571834992887, |
| "grad_norm": 0.15596374680032918, |
| "learning_rate": 1.0036149649834786e-06, |
| "loss": 0.0033, |
| "step": 3440 |
| }, |
| { |
| "epoch": 2.453769559032717, |
| "grad_norm": 0.15341222073346109, |
| "learning_rate": 9.788840882148803e-07, |
| "loss": 0.0032, |
| "step": 3450 |
| }, |
| { |
| "epoch": 2.460881934566145, |
| "grad_norm": 0.18113221503751906, |
| "learning_rate": 9.544286288951393e-07, |
| "loss": 0.0028, |
| "step": 3460 |
| }, |
| { |
| "epoch": 2.467994310099573, |
| "grad_norm": 0.23824252331061962, |
| "learning_rate": 9.302502620469073e-07, |
| "loss": 0.003, |
| "step": 3470 |
| }, |
| { |
| "epoch": 2.4751066856330013, |
| "grad_norm": 0.1804454838531882, |
| "learning_rate": 9.063506437139901e-07, |
| "loss": 0.0033, |
| "step": 3480 |
| }, |
| { |
| "epoch": 2.4822190611664294, |
| "grad_norm": 0.12129461355182411, |
| "learning_rate": 8.827314108479357e-07, |
| "loss": 0.0035, |
| "step": 3490 |
| }, |
| { |
| "epoch": 2.4893314366998576, |
| "grad_norm": 0.2496105490338266, |
| "learning_rate": 8.593941811959078e-07, |
| "loss": 0.0037, |
| "step": 3500 |
| }, |
| { |
| "epoch": 2.496443812233286, |
| "grad_norm": 0.12260976552880777, |
| "learning_rate": 8.363405531898833e-07, |
| "loss": 0.0035, |
| "step": 3510 |
| }, |
| { |
| "epoch": 2.503556187766714, |
| "grad_norm": 0.17068909040005176, |
| "learning_rate": 8.135721058371681e-07, |
| "loss": 0.0038, |
| "step": 3520 |
| }, |
| { |
| "epoch": 2.5106685633001424, |
| "grad_norm": 0.14486041747836928, |
| "learning_rate": 7.910903986122537e-07, |
| "loss": 0.0023, |
| "step": 3530 |
| }, |
| { |
| "epoch": 2.5177809388335706, |
| "grad_norm": 0.16537212820522457, |
| "learning_rate": 7.688969713499983e-07, |
| "loss": 0.0033, |
| "step": 3540 |
| }, |
| { |
| "epoch": 2.5248933143669987, |
| "grad_norm": 0.06547618532234573, |
| "learning_rate": 7.469933441401606e-07, |
| "loss": 0.0036, |
| "step": 3550 |
| }, |
| { |
| "epoch": 2.532005689900427, |
| "grad_norm": 0.09486129847604534, |
| "learning_rate": 7.253810172232867e-07, |
| "loss": 0.0029, |
| "step": 3560 |
| }, |
| { |
| "epoch": 2.539118065433855, |
| "grad_norm": 0.15420596551214288, |
| "learning_rate": 7.040614708879489e-07, |
| "loss": 0.0031, |
| "step": 3570 |
| }, |
| { |
| "epoch": 2.546230440967283, |
| "grad_norm": 0.18795827544823362, |
| "learning_rate": 6.830361653693673e-07, |
| "loss": 0.0031, |
| "step": 3580 |
| }, |
| { |
| "epoch": 2.5533428165007113, |
| "grad_norm": 0.20144541991501458, |
| "learning_rate": 6.623065407493801e-07, |
| "loss": 0.0031, |
| "step": 3590 |
| }, |
| { |
| "epoch": 2.5604551920341394, |
| "grad_norm": 0.11898776472079374, |
| "learning_rate": 6.418740168578208e-07, |
| "loss": 0.0029, |
| "step": 3600 |
| }, |
| { |
| "epoch": 2.5675675675675675, |
| "grad_norm": 0.11704775629045612, |
| "learning_rate": 6.217399931752627e-07, |
| "loss": 0.0031, |
| "step": 3610 |
| }, |
| { |
| "epoch": 2.5746799431009957, |
| "grad_norm": 0.13757018665386925, |
| "learning_rate": 6.019058487371687e-07, |
| "loss": 0.0028, |
| "step": 3620 |
| }, |
| { |
| "epoch": 2.581792318634424, |
| "grad_norm": 0.07705433560973203, |
| "learning_rate": 5.82372942039432e-07, |
| "loss": 0.0037, |
| "step": 3630 |
| }, |
| { |
| "epoch": 2.588904694167852, |
| "grad_norm": 0.12004181043862794, |
| "learning_rate": 5.631426109453364e-07, |
| "loss": 0.003, |
| "step": 3640 |
| }, |
| { |
| "epoch": 2.59601706970128, |
| "grad_norm": 0.11547199526456815, |
| "learning_rate": 5.44216172593916e-07, |
| "loss": 0.0032, |
| "step": 3650 |
| }, |
| { |
| "epoch": 2.6031294452347082, |
| "grad_norm": 0.20275686253937805, |
| "learning_rate": 5.255949233097451e-07, |
| "loss": 0.0035, |
| "step": 3660 |
| }, |
| { |
| "epoch": 2.6102418207681364, |
| "grad_norm": 0.1327960409529542, |
| "learning_rate": 5.072801385141429e-07, |
| "loss": 0.0032, |
| "step": 3670 |
| }, |
| { |
| "epoch": 2.617354196301565, |
| "grad_norm": 0.13522734646826431, |
| "learning_rate": 4.89273072637827e-07, |
| "loss": 0.0027, |
| "step": 3680 |
| }, |
| { |
| "epoch": 2.6244665718349927, |
| "grad_norm": 0.0921535098896707, |
| "learning_rate": 4.7157495903498105e-07, |
| "loss": 0.0029, |
| "step": 3690 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 0.1305724860300583, |
| "learning_rate": 4.541870098987911e-07, |
| "loss": 0.0035, |
| "step": 3700 |
| }, |
| { |
| "epoch": 2.6386913229018494, |
| "grad_norm": 0.1366897855739292, |
| "learning_rate": 4.371104161784073e-07, |
| "loss": 0.0039, |
| "step": 3710 |
| }, |
| { |
| "epoch": 2.6458036984352775, |
| "grad_norm": 0.16675061725996185, |
| "learning_rate": 4.2034634749738623e-07, |
| "loss": 0.003, |
| "step": 3720 |
| }, |
| { |
| "epoch": 2.6529160739687057, |
| "grad_norm": 0.12062320450080749, |
| "learning_rate": 4.038959520735658e-07, |
| "loss": 0.0032, |
| "step": 3730 |
| }, |
| { |
| "epoch": 2.660028449502134, |
| "grad_norm": 0.07277873243358957, |
| "learning_rate": 3.8776035664043033e-07, |
| "loss": 0.0033, |
| "step": 3740 |
| }, |
| { |
| "epoch": 2.667140825035562, |
| "grad_norm": 0.09995970754512991, |
| "learning_rate": 3.719406663699349e-07, |
| "loss": 0.0036, |
| "step": 3750 |
| }, |
| { |
| "epoch": 2.67425320056899, |
| "grad_norm": 0.14356536332083528, |
| "learning_rate": 3.564379647968064e-07, |
| "loss": 0.0034, |
| "step": 3760 |
| }, |
| { |
| "epoch": 2.681365576102418, |
| "grad_norm": 0.1289519043233803, |
| "learning_rate": 3.4125331374433414e-07, |
| "loss": 0.0029, |
| "step": 3770 |
| }, |
| { |
| "epoch": 2.6884779516358464, |
| "grad_norm": 0.10645779562131363, |
| "learning_rate": 3.2638775325163517e-07, |
| "loss": 0.0027, |
| "step": 3780 |
| }, |
| { |
| "epoch": 2.6955903271692745, |
| "grad_norm": 0.10980156190201901, |
| "learning_rate": 3.1184230150243025e-07, |
| "loss": 0.0026, |
| "step": 3790 |
| }, |
| { |
| "epoch": 2.7027027027027026, |
| "grad_norm": 0.1212601092847071, |
| "learning_rate": 2.9761795475529375e-07, |
| "loss": 0.0027, |
| "step": 3800 |
| }, |
| { |
| "epoch": 2.7098150782361308, |
| "grad_norm": 0.10465054324216685, |
| "learning_rate": 2.8371568727542486e-07, |
| "loss": 0.0032, |
| "step": 3810 |
| }, |
| { |
| "epoch": 2.716927453769559, |
| "grad_norm": 0.14087107927522052, |
| "learning_rate": 2.7013645126791446e-07, |
| "loss": 0.0027, |
| "step": 3820 |
| }, |
| { |
| "epoch": 2.724039829302987, |
| "grad_norm": 0.11777162015019617, |
| "learning_rate": 2.5688117681252677e-07, |
| "loss": 0.0031, |
| "step": 3830 |
| }, |
| { |
| "epoch": 2.731152204836415, |
| "grad_norm": 0.12580839073471906, |
| "learning_rate": 2.439507717999945e-07, |
| "loss": 0.0027, |
| "step": 3840 |
| }, |
| { |
| "epoch": 2.7382645803698438, |
| "grad_norm": 0.11019351778666993, |
| "learning_rate": 2.3134612186983817e-07, |
| "loss": 0.0032, |
| "step": 3850 |
| }, |
| { |
| "epoch": 2.7453769559032715, |
| "grad_norm": 0.2540811705778796, |
| "learning_rate": 2.1906809034970057e-07, |
| "loss": 0.0032, |
| "step": 3860 |
| }, |
| { |
| "epoch": 2.7524893314367, |
| "grad_norm": 0.14533749828341638, |
| "learning_rate": 2.0711751819622038e-07, |
| "loss": 0.0028, |
| "step": 3870 |
| }, |
| { |
| "epoch": 2.759601706970128, |
| "grad_norm": 0.17723003777910762, |
| "learning_rate": 1.954952239374286e-07, |
| "loss": 0.0033, |
| "step": 3880 |
| }, |
| { |
| "epoch": 2.7667140825035563, |
| "grad_norm": 0.1714781247080342, |
| "learning_rate": 1.8420200361669137e-07, |
| "loss": 0.0028, |
| "step": 3890 |
| }, |
| { |
| "epoch": 2.7738264580369845, |
| "grad_norm": 0.1442879683659834, |
| "learning_rate": 1.732386307381767e-07, |
| "loss": 0.0028, |
| "step": 3900 |
| }, |
| { |
| "epoch": 2.7809388335704126, |
| "grad_norm": 0.11658671113478708, |
| "learning_rate": 1.6260585621388604e-07, |
| "loss": 0.0032, |
| "step": 3910 |
| }, |
| { |
| "epoch": 2.7880512091038407, |
| "grad_norm": 0.13555304661960596, |
| "learning_rate": 1.523044083122138e-07, |
| "loss": 0.0033, |
| "step": 3920 |
| }, |
| { |
| "epoch": 2.795163584637269, |
| "grad_norm": 0.16068613052421124, |
| "learning_rate": 1.4233499260807194e-07, |
| "loss": 0.0034, |
| "step": 3930 |
| }, |
| { |
| "epoch": 2.802275960170697, |
| "grad_norm": 0.1397672323891182, |
| "learning_rate": 1.326982919345582e-07, |
| "loss": 0.003, |
| "step": 3940 |
| }, |
| { |
| "epoch": 2.809388335704125, |
| "grad_norm": 0.1228326098193467, |
| "learning_rate": 1.2339496633619218e-07, |
| "loss": 0.0026, |
| "step": 3950 |
| }, |
| { |
| "epoch": 2.8165007112375533, |
| "grad_norm": 0.09294084238773208, |
| "learning_rate": 1.1442565302370146e-07, |
| "loss": 0.0026, |
| "step": 3960 |
| }, |
| { |
| "epoch": 2.8236130867709814, |
| "grad_norm": 0.10538827214385106, |
| "learning_rate": 1.0579096633038411e-07, |
| "loss": 0.0033, |
| "step": 3970 |
| }, |
| { |
| "epoch": 2.8307254623044096, |
| "grad_norm": 0.09895208971100541, |
| "learning_rate": 9.749149767002197e-08, |
| "loss": 0.0029, |
| "step": 3980 |
| }, |
| { |
| "epoch": 2.8378378378378377, |
| "grad_norm": 0.17612347880517987, |
| "learning_rate": 8.952781549638412e-08, |
| "loss": 0.0038, |
| "step": 3990 |
| }, |
| { |
| "epoch": 2.844950213371266, |
| "grad_norm": 0.13285843764249902, |
| "learning_rate": 8.190046526428241e-08, |
| "loss": 0.0028, |
| "step": 4000 |
| }, |
| { |
| "epoch": 2.852062588904694, |
| "grad_norm": 0.15853886614347157, |
| "learning_rate": 7.460996939221643e-08, |
| "loss": 0.0032, |
| "step": 4010 |
| }, |
| { |
| "epoch": 2.8591749644381226, |
| "grad_norm": 0.10115826454451997, |
| "learning_rate": 6.765682722659151e-08, |
| "loss": 0.0034, |
| "step": 4020 |
| }, |
| { |
| "epoch": 2.8662873399715503, |
| "grad_norm": 0.16050424912282388, |
| "learning_rate": 6.104151500751609e-08, |
| "loss": 0.0026, |
| "step": 4030 |
| }, |
| { |
| "epoch": 2.873399715504979, |
| "grad_norm": 0.10822054946183253, |
| "learning_rate": 5.476448583618288e-08, |
| "loss": 0.0035, |
| "step": 4040 |
| }, |
| { |
| "epoch": 2.8805120910384066, |
| "grad_norm": 0.1113521110254991, |
| "learning_rate": 4.8826169643832464e-08, |
| "loss": 0.0026, |
| "step": 4050 |
| }, |
| { |
| "epoch": 2.887624466571835, |
| "grad_norm": 0.14081228392187445, |
| "learning_rate": 4.322697316231361e-08, |
| "loss": 0.0032, |
| "step": 4060 |
| }, |
| { |
| "epoch": 2.8947368421052633, |
| "grad_norm": 0.11756191197474342, |
| "learning_rate": 3.796727989621385e-08, |
| "loss": 0.0024, |
| "step": 4070 |
| }, |
| { |
| "epoch": 2.9018492176386914, |
| "grad_norm": 0.14346626654053973, |
| "learning_rate": 3.304745009660326e-08, |
| "loss": 0.003, |
| "step": 4080 |
| }, |
| { |
| "epoch": 2.9089615931721196, |
| "grad_norm": 0.13833583160259022, |
| "learning_rate": 2.8467820736350903e-08, |
| "loss": 0.0028, |
| "step": 4090 |
| }, |
| { |
| "epoch": 2.9160739687055477, |
| "grad_norm": 0.08441703695039304, |
| "learning_rate": 2.422870548705103e-08, |
| "loss": 0.003, |
| "step": 4100 |
| }, |
| { |
| "epoch": 2.923186344238976, |
| "grad_norm": 0.15199272572784162, |
| "learning_rate": 2.0330394697534726e-08, |
| "loss": 0.0032, |
| "step": 4110 |
| }, |
| { |
| "epoch": 2.930298719772404, |
| "grad_norm": 0.09905970954206261, |
| "learning_rate": 1.677315537398583e-08, |
| "loss": 0.0033, |
| "step": 4120 |
| }, |
| { |
| "epoch": 2.937411095305832, |
| "grad_norm": 0.12746964816800027, |
| "learning_rate": 1.355723116165164e-08, |
| "loss": 0.003, |
| "step": 4130 |
| }, |
| { |
| "epoch": 2.9445234708392602, |
| "grad_norm": 0.1730883953102828, |
| "learning_rate": 1.0682842328154086e-08, |
| "loss": 0.003, |
| "step": 4140 |
| }, |
| { |
| "epoch": 2.9516358463726884, |
| "grad_norm": 0.14592570068315344, |
| "learning_rate": 8.150185748405092e-09, |
| "loss": 0.0034, |
| "step": 4150 |
| }, |
| { |
| "epoch": 2.9587482219061165, |
| "grad_norm": 0.16218729377273186, |
| "learning_rate": 5.959434891121274e-09, |
| "loss": 0.0031, |
| "step": 4160 |
| }, |
| { |
| "epoch": 2.9658605974395447, |
| "grad_norm": 0.1534720207270455, |
| "learning_rate": 4.110739806940656e-09, |
| "loss": 0.0028, |
| "step": 4170 |
| }, |
| { |
| "epoch": 2.972972972972973, |
| "grad_norm": 0.1535652411238345, |
| "learning_rate": 2.604227118148117e-09, |
| "loss": 0.0025, |
| "step": 4180 |
| }, |
| { |
| "epoch": 2.9800853485064014, |
| "grad_norm": 0.21854345544372025, |
| "learning_rate": 1.4400000100017741e-09, |
| "loss": 0.0028, |
| "step": 4190 |
| }, |
| { |
| "epoch": 2.987197724039829, |
| "grad_norm": 0.11360018294244285, |
| "learning_rate": 6.181382236641887e-10, |
| "loss": 0.0027, |
| "step": 4200 |
| }, |
| { |
| "epoch": 2.9943100995732577, |
| "grad_norm": 0.13109703302719727, |
| "learning_rate": 1.3869805074284704e-10, |
| "loss": 0.003, |
| "step": 4210 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 4218, |
| "total_flos": 247294279680000.0, |
| "train_loss": 0.26427117863254007, |
| "train_runtime": 27747.0354, |
| "train_samples_per_second": 9.727, |
| "train_steps_per_second": 0.152 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 4218, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 247294279680000.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|