{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 15.748031496062993, "eval_steps": 500, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07874015748031496, "grad_norm": 5.119835376739502, "learning_rate": 1.8e-06, "loss": 0.8008, "step": 10 }, { "epoch": 0.15748031496062992, "grad_norm": 2.7846670150756836, "learning_rate": 3.8e-06, "loss": 0.6753, "step": 20 }, { "epoch": 0.23622047244094488, "grad_norm": 0.9429484605789185, "learning_rate": 5.8e-06, "loss": 0.3854, "step": 30 }, { "epoch": 0.31496062992125984, "grad_norm": 0.4289324879646301, "learning_rate": 7.8e-06, "loss": 0.2374, "step": 40 }, { "epoch": 0.3937007874015748, "grad_norm": 0.7212084531784058, "learning_rate": 9.800000000000001e-06, "loss": 0.1937, "step": 50 }, { "epoch": 0.47244094488188976, "grad_norm": 0.3743739426136017, "learning_rate": 1.18e-05, "loss": 0.169, "step": 60 }, { "epoch": 0.5511811023622047, "grad_norm": 0.33906233310699463, "learning_rate": 1.3800000000000002e-05, "loss": 0.1513, "step": 70 }, { "epoch": 0.6299212598425197, "grad_norm": 0.4938454031944275, "learning_rate": 1.58e-05, "loss": 0.1333, "step": 80 }, { "epoch": 0.7086614173228346, "grad_norm": 0.3791239261627197, "learning_rate": 1.78e-05, "loss": 0.1209, "step": 90 }, { "epoch": 0.7874015748031497, "grad_norm": 0.31115081906318665, "learning_rate": 1.9800000000000004e-05, "loss": 0.108, "step": 100 }, { "epoch": 0.8661417322834646, "grad_norm": 0.3266952633857727, "learning_rate": 2.18e-05, "loss": 0.105, "step": 110 }, { "epoch": 0.9448818897637795, "grad_norm": 0.42559316754341125, "learning_rate": 2.38e-05, "loss": 0.0931, "step": 120 }, { "epoch": 1.0236220472440944, "grad_norm": 0.40235716104507446, "learning_rate": 2.58e-05, "loss": 0.0867, "step": 130 }, { "epoch": 1.1023622047244095, "grad_norm": 0.3455784022808075, "learning_rate": 2.7800000000000005e-05, "loss": 0.0843, "step": 140 }, { "epoch": 1.1811023622047245, "grad_norm": 0.39895790815353394, "learning_rate": 2.98e-05, "loss": 0.0803, "step": 150 }, { "epoch": 1.2598425196850394, "grad_norm": 0.32412493228912354, "learning_rate": 3.18e-05, "loss": 0.075, "step": 160 }, { "epoch": 1.3385826771653544, "grad_norm": 0.3238581717014313, "learning_rate": 3.38e-05, "loss": 0.069, "step": 170 }, { "epoch": 1.4173228346456692, "grad_norm": 0.35713112354278564, "learning_rate": 3.58e-05, "loss": 0.0632, "step": 180 }, { "epoch": 1.4960629921259843, "grad_norm": 0.31078988313674927, "learning_rate": 3.7800000000000004e-05, "loss": 0.0606, "step": 190 }, { "epoch": 1.574803149606299, "grad_norm": 0.23168951272964478, "learning_rate": 3.9800000000000005e-05, "loss": 0.0566, "step": 200 }, { "epoch": 1.6535433070866141, "grad_norm": 0.2528112828731537, "learning_rate": 4.18e-05, "loss": 0.0553, "step": 210 }, { "epoch": 1.7322834645669292, "grad_norm": 0.31132972240448, "learning_rate": 4.38e-05, "loss": 0.0516, "step": 220 }, { "epoch": 1.811023622047244, "grad_norm": 0.3506482243537903, "learning_rate": 4.58e-05, "loss": 0.051, "step": 230 }, { "epoch": 1.889763779527559, "grad_norm": 0.30420321226119995, "learning_rate": 4.78e-05, "loss": 0.0498, "step": 240 }, { "epoch": 1.968503937007874, "grad_norm": 0.27608105540275574, "learning_rate": 4.9800000000000004e-05, "loss": 0.0425, "step": 250 }, { "epoch": 2.047244094488189, "grad_norm": 0.24153359234333038, "learning_rate": 5.1800000000000005e-05, "loss": 0.0425, "step": 260 }, { "epoch": 2.1259842519685037, "grad_norm": 0.2684983015060425, "learning_rate": 5.380000000000001e-05, "loss": 0.0397, "step": 270 }, { "epoch": 2.204724409448819, "grad_norm": 0.2812291979789734, "learning_rate": 5.580000000000001e-05, "loss": 0.0347, "step": 280 }, { "epoch": 2.283464566929134, "grad_norm": 0.272079199552536, "learning_rate": 5.7799999999999995e-05, "loss": 0.0339, "step": 290 }, { "epoch": 2.362204724409449, "grad_norm": 0.30601683259010315, "learning_rate": 5.9800000000000003e-05, "loss": 0.0359, "step": 300 }, { "epoch": 2.440944881889764, "grad_norm": 0.3129172921180725, "learning_rate": 6.18e-05, "loss": 0.0351, "step": 310 }, { "epoch": 2.5196850393700787, "grad_norm": 0.27252131700515747, "learning_rate": 6.38e-05, "loss": 0.0287, "step": 320 }, { "epoch": 2.5984251968503935, "grad_norm": 0.2653070390224457, "learning_rate": 6.58e-05, "loss": 0.0313, "step": 330 }, { "epoch": 2.677165354330709, "grad_norm": 0.35808777809143066, "learning_rate": 6.780000000000001e-05, "loss": 0.0356, "step": 340 }, { "epoch": 2.7559055118110236, "grad_norm": 0.26742085814476013, "learning_rate": 6.98e-05, "loss": 0.0299, "step": 350 }, { "epoch": 2.8346456692913384, "grad_norm": 0.4106348156929016, "learning_rate": 7.18e-05, "loss": 0.0324, "step": 360 }, { "epoch": 2.9133858267716537, "grad_norm": 0.213535338640213, "learning_rate": 7.38e-05, "loss": 0.0273, "step": 370 }, { "epoch": 2.9921259842519685, "grad_norm": 0.26808497309684753, "learning_rate": 7.58e-05, "loss": 0.0254, "step": 380 }, { "epoch": 3.0708661417322833, "grad_norm": 0.18177832663059235, "learning_rate": 7.780000000000001e-05, "loss": 0.0261, "step": 390 }, { "epoch": 3.1496062992125986, "grad_norm": 0.2706851065158844, "learning_rate": 7.98e-05, "loss": 0.0268, "step": 400 }, { "epoch": 3.2283464566929134, "grad_norm": 0.29524528980255127, "learning_rate": 8.18e-05, "loss": 0.0285, "step": 410 }, { "epoch": 3.3070866141732282, "grad_norm": 0.21399272978305817, "learning_rate": 8.38e-05, "loss": 0.0252, "step": 420 }, { "epoch": 3.3858267716535435, "grad_norm": 0.290097177028656, "learning_rate": 8.58e-05, "loss": 0.0279, "step": 430 }, { "epoch": 3.4645669291338583, "grad_norm": 0.2789689004421234, "learning_rate": 8.78e-05, "loss": 0.0236, "step": 440 }, { "epoch": 3.543307086614173, "grad_norm": 0.3307545781135559, "learning_rate": 8.98e-05, "loss": 0.0256, "step": 450 }, { "epoch": 3.622047244094488, "grad_norm": 0.2919306457042694, "learning_rate": 9.180000000000001e-05, "loss": 0.0227, "step": 460 }, { "epoch": 3.7007874015748032, "grad_norm": 0.27534034848213196, "learning_rate": 9.38e-05, "loss": 0.0219, "step": 470 }, { "epoch": 3.779527559055118, "grad_norm": 0.26348116993904114, "learning_rate": 9.58e-05, "loss": 0.0242, "step": 480 }, { "epoch": 3.8582677165354333, "grad_norm": 0.29468125104904175, "learning_rate": 9.78e-05, "loss": 0.0224, "step": 490 }, { "epoch": 3.937007874015748, "grad_norm": 0.20534993708133698, "learning_rate": 9.98e-05, "loss": 0.0235, "step": 500 }, { "epoch": 4.015748031496063, "grad_norm": 0.2911393642425537, "learning_rate": 9.9999778549206e-05, "loss": 0.0206, "step": 510 }, { "epoch": 4.094488188976378, "grad_norm": 0.2478438913822174, "learning_rate": 9.999901304280685e-05, "loss": 0.0219, "step": 520 }, { "epoch": 4.173228346456693, "grad_norm": 0.32605063915252686, "learning_rate": 9.999770075521164e-05, "loss": 0.0232, "step": 530 }, { "epoch": 4.251968503937007, "grad_norm": 0.22585000097751617, "learning_rate": 9.99958417007713e-05, "loss": 0.0228, "step": 540 }, { "epoch": 4.330708661417323, "grad_norm": 0.3227289319038391, "learning_rate": 9.999343589981615e-05, "loss": 0.018, "step": 550 }, { "epoch": 4.409448818897638, "grad_norm": 0.2523372769355774, "learning_rate": 9.999048337865568e-05, "loss": 0.0215, "step": 560 }, { "epoch": 4.488188976377953, "grad_norm": 0.34720173478126526, "learning_rate": 9.998698416957815e-05, "loss": 0.0242, "step": 570 }, { "epoch": 4.566929133858268, "grad_norm": 0.24005654454231262, "learning_rate": 9.998293831085037e-05, "loss": 0.0213, "step": 580 }, { "epoch": 4.645669291338582, "grad_norm": 0.28940242528915405, "learning_rate": 9.997834584671719e-05, "loss": 0.0204, "step": 590 }, { "epoch": 4.724409448818898, "grad_norm": 0.2654191255569458, "learning_rate": 9.997320682740107e-05, "loss": 0.0217, "step": 600 }, { "epoch": 4.803149606299213, "grad_norm": 0.2912241816520691, "learning_rate": 9.996752130910149e-05, "loss": 0.0197, "step": 610 }, { "epoch": 4.881889763779528, "grad_norm": 0.23718924820423126, "learning_rate": 9.99612893539944e-05, "loss": 0.0209, "step": 620 }, { "epoch": 4.960629921259843, "grad_norm": 0.2647818326950073, "learning_rate": 9.995451103023144e-05, "loss": 0.0222, "step": 630 }, { "epoch": 5.039370078740157, "grad_norm": 0.35324886441230774, "learning_rate": 9.994718641193928e-05, "loss": 0.0224, "step": 640 }, { "epoch": 5.118110236220472, "grad_norm": 0.2671961188316345, "learning_rate": 9.993931557921874e-05, "loss": 0.0219, "step": 650 }, { "epoch": 5.196850393700787, "grad_norm": 0.2596529722213745, "learning_rate": 9.993089861814402e-05, "loss": 0.0203, "step": 660 }, { "epoch": 5.275590551181103, "grad_norm": 0.25885483622550964, "learning_rate": 9.992193562076166e-05, "loss": 0.0188, "step": 670 }, { "epoch": 5.354330708661418, "grad_norm": 0.24976016581058502, "learning_rate": 9.991242668508954e-05, "loss": 0.0175, "step": 680 }, { "epoch": 5.433070866141732, "grad_norm": 0.24121227860450745, "learning_rate": 9.990237191511587e-05, "loss": 0.0158, "step": 690 }, { "epoch": 5.511811023622047, "grad_norm": 0.22227917611598969, "learning_rate": 9.989177142079802e-05, "loss": 0.0177, "step": 700 }, { "epoch": 5.590551181102362, "grad_norm": 0.231464222073555, "learning_rate": 9.988062531806126e-05, "loss": 0.0183, "step": 710 }, { "epoch": 5.669291338582677, "grad_norm": 0.16609017550945282, "learning_rate": 9.986893372879762e-05, "loss": 0.018, "step": 720 }, { "epoch": 5.748031496062993, "grad_norm": 0.19624024629592896, "learning_rate": 9.985669678086443e-05, "loss": 0.018, "step": 730 }, { "epoch": 5.826771653543307, "grad_norm": 0.22255055606365204, "learning_rate": 9.984391460808298e-05, "loss": 0.0199, "step": 740 }, { "epoch": 5.905511811023622, "grad_norm": 0.22765639424324036, "learning_rate": 9.983058735023709e-05, "loss": 0.0191, "step": 750 }, { "epoch": 5.984251968503937, "grad_norm": 0.23915418982505798, "learning_rate": 9.98167151530715e-05, "loss": 0.0178, "step": 760 }, { "epoch": 6.062992125984252, "grad_norm": 0.2489311248064041, "learning_rate": 9.980229816829034e-05, "loss": 0.0202, "step": 770 }, { "epoch": 6.141732283464567, "grad_norm": 0.22865547239780426, "learning_rate": 9.978733655355544e-05, "loss": 0.0187, "step": 780 }, { "epoch": 6.2204724409448815, "grad_norm": 0.19393905997276306, "learning_rate": 9.977183047248464e-05, "loss": 0.0168, "step": 790 }, { "epoch": 6.299212598425197, "grad_norm": 0.20525363087654114, "learning_rate": 9.975578009464992e-05, "loss": 0.018, "step": 800 }, { "epoch": 6.377952755905512, "grad_norm": 0.2537108063697815, "learning_rate": 9.97391855955757e-05, "loss": 0.0143, "step": 810 }, { "epoch": 6.456692913385827, "grad_norm": 0.2665018141269684, "learning_rate": 9.972204715673669e-05, "loss": 0.0165, "step": 820 }, { "epoch": 6.535433070866142, "grad_norm": 0.18383699655532837, "learning_rate": 9.970436496555617e-05, "loss": 0.0164, "step": 830 }, { "epoch": 6.6141732283464565, "grad_norm": 0.3430931270122528, "learning_rate": 9.968613921540373e-05, "loss": 0.0176, "step": 840 }, { "epoch": 6.692913385826771, "grad_norm": 0.2601425349712372, "learning_rate": 9.966737010559326e-05, "loss": 0.0175, "step": 850 }, { "epoch": 6.771653543307087, "grad_norm": 0.19988982379436493, "learning_rate": 9.964805784138072e-05, "loss": 0.0172, "step": 860 }, { "epoch": 6.850393700787402, "grad_norm": 0.18660953640937805, "learning_rate": 9.962820263396195e-05, "loss": 0.0158, "step": 870 }, { "epoch": 6.929133858267717, "grad_norm": 0.22756962478160858, "learning_rate": 9.960780470047033e-05, "loss": 0.0185, "step": 880 }, { "epoch": 7.0078740157480315, "grad_norm": 0.14548353850841522, "learning_rate": 9.958686426397437e-05, "loss": 0.0164, "step": 890 }, { "epoch": 7.086614173228346, "grad_norm": 0.20737145841121674, "learning_rate": 9.956538155347534e-05, "loss": 0.0182, "step": 900 }, { "epoch": 7.165354330708661, "grad_norm": 0.20689648389816284, "learning_rate": 9.95433568039047e-05, "loss": 0.0145, "step": 910 }, { "epoch": 7.244094488188976, "grad_norm": 0.26220783591270447, "learning_rate": 9.952079025612162e-05, "loss": 0.0145, "step": 920 }, { "epoch": 7.322834645669292, "grad_norm": 0.23523452877998352, "learning_rate": 9.949768215691022e-05, "loss": 0.0168, "step": 930 }, { "epoch": 7.4015748031496065, "grad_norm": 0.207063227891922, "learning_rate": 9.9474032758977e-05, "loss": 0.0154, "step": 940 }, { "epoch": 7.480314960629921, "grad_norm": 0.2092580646276474, "learning_rate": 9.944984232094794e-05, "loss": 0.0169, "step": 950 }, { "epoch": 7.559055118110236, "grad_norm": 0.1808154582977295, "learning_rate": 9.942511110736584e-05, "loss": 0.0157, "step": 960 }, { "epoch": 7.637795275590551, "grad_norm": 0.2190985083580017, "learning_rate": 9.939983938868726e-05, "loss": 0.0155, "step": 970 }, { "epoch": 7.716535433070866, "grad_norm": 0.1607908308506012, "learning_rate": 9.93740274412797e-05, "loss": 0.0136, "step": 980 }, { "epoch": 7.7952755905511815, "grad_norm": 0.20882774889469147, "learning_rate": 9.934767554741846e-05, "loss": 0.0192, "step": 990 }, { "epoch": 7.874015748031496, "grad_norm": 0.18141894042491913, "learning_rate": 9.932078399528361e-05, "loss": 0.0134, "step": 1000 }, { "epoch": 7.952755905511811, "grad_norm": 0.1842644363641739, "learning_rate": 9.929335307895689e-05, "loss": 0.0145, "step": 1010 }, { "epoch": 8.031496062992126, "grad_norm": 0.19102592766284943, "learning_rate": 9.926538309841839e-05, "loss": 0.0179, "step": 1020 }, { "epoch": 8.11023622047244, "grad_norm": 0.2554001212120056, "learning_rate": 9.923687435954334e-05, "loss": 0.0145, "step": 1030 }, { "epoch": 8.188976377952756, "grad_norm": 0.2188219279050827, "learning_rate": 9.920782717409873e-05, "loss": 0.0133, "step": 1040 }, { "epoch": 8.26771653543307, "grad_norm": 0.19668325781822205, "learning_rate": 9.917824185973994e-05, "loss": 0.013, "step": 1050 }, { "epoch": 8.346456692913385, "grad_norm": 0.19224300980567932, "learning_rate": 9.914811874000723e-05, "loss": 0.012, "step": 1060 }, { "epoch": 8.4251968503937, "grad_norm": 0.2617517113685608, "learning_rate": 9.911745814432218e-05, "loss": 0.0144, "step": 1070 }, { "epoch": 8.503937007874015, "grad_norm": 0.340850293636322, "learning_rate": 9.90862604079842e-05, "loss": 0.0163, "step": 1080 }, { "epoch": 8.582677165354331, "grad_norm": 0.24036389589309692, "learning_rate": 9.90545258721667e-05, "loss": 0.0143, "step": 1090 }, { "epoch": 8.661417322834646, "grad_norm": 0.2523621916770935, "learning_rate": 9.90222548839135e-05, "loss": 0.0137, "step": 1100 }, { "epoch": 8.740157480314961, "grad_norm": 0.25303855538368225, "learning_rate": 9.898944779613495e-05, "loss": 0.0124, "step": 1110 }, { "epoch": 8.818897637795276, "grad_norm": 0.2672367990016937, "learning_rate": 9.89561049676041e-05, "loss": 0.0135, "step": 1120 }, { "epoch": 8.89763779527559, "grad_norm": 0.22292408347129822, "learning_rate": 9.89222267629528e-05, "loss": 0.0155, "step": 1130 }, { "epoch": 8.976377952755906, "grad_norm": 0.2113981992006302, "learning_rate": 9.888781355266763e-05, "loss": 0.0139, "step": 1140 }, { "epoch": 9.05511811023622, "grad_norm": 0.16752807796001434, "learning_rate": 9.885286571308598e-05, "loss": 0.0124, "step": 1150 }, { "epoch": 9.133858267716535, "grad_norm": 0.1773703545331955, "learning_rate": 9.881738362639182e-05, "loss": 0.015, "step": 1160 }, { "epoch": 9.21259842519685, "grad_norm": 0.26974138617515564, "learning_rate": 9.878136768061154e-05, "loss": 0.0162, "step": 1170 }, { "epoch": 9.291338582677165, "grad_norm": 0.2184063196182251, "learning_rate": 9.874481826960979e-05, "loss": 0.0148, "step": 1180 }, { "epoch": 9.37007874015748, "grad_norm": 0.1977306753396988, "learning_rate": 9.870773579308503e-05, "loss": 0.0123, "step": 1190 }, { "epoch": 9.448818897637794, "grad_norm": 0.1981269121170044, "learning_rate": 9.867012065656533e-05, "loss": 0.0152, "step": 1200 }, { "epoch": 9.527559055118111, "grad_norm": 0.17817805707454681, "learning_rate": 9.863197327140376e-05, "loss": 0.0123, "step": 1210 }, { "epoch": 9.606299212598426, "grad_norm": 0.23420843482017517, "learning_rate": 9.859329405477403e-05, "loss": 0.0129, "step": 1220 }, { "epoch": 9.68503937007874, "grad_norm": 0.25216200947761536, "learning_rate": 9.855408342966585e-05, "loss": 0.0138, "step": 1230 }, { "epoch": 9.763779527559056, "grad_norm": 0.1990588754415512, "learning_rate": 9.851434182488033e-05, "loss": 0.0129, "step": 1240 }, { "epoch": 9.84251968503937, "grad_norm": 0.27837619185447693, "learning_rate": 9.84740696750253e-05, "loss": 0.0124, "step": 1250 }, { "epoch": 9.921259842519685, "grad_norm": 0.21090054512023926, "learning_rate": 9.843326742051055e-05, "loss": 0.013, "step": 1260 }, { "epoch": 10.0, "grad_norm": 0.19581645727157593, "learning_rate": 9.839193550754297e-05, "loss": 0.0126, "step": 1270 }, { "epoch": 10.078740157480315, "grad_norm": 0.21251627802848816, "learning_rate": 9.835007438812177e-05, "loss": 0.0148, "step": 1280 }, { "epoch": 10.15748031496063, "grad_norm": 0.18511821329593658, "learning_rate": 9.830768452003341e-05, "loss": 0.0133, "step": 1290 }, { "epoch": 10.236220472440944, "grad_norm": 0.18811464309692383, "learning_rate": 9.826476636684671e-05, "loss": 0.0126, "step": 1300 }, { "epoch": 10.31496062992126, "grad_norm": 0.18782231211662292, "learning_rate": 9.822132039790773e-05, "loss": 0.0117, "step": 1310 }, { "epoch": 10.393700787401574, "grad_norm": 0.16824057698249817, "learning_rate": 9.817734708833461e-05, "loss": 0.0106, "step": 1320 }, { "epoch": 10.472440944881889, "grad_norm": 0.1814710795879364, "learning_rate": 9.813284691901243e-05, "loss": 0.0162, "step": 1330 }, { "epoch": 10.551181102362206, "grad_norm": 0.2217687964439392, "learning_rate": 9.808782037658792e-05, "loss": 0.0155, "step": 1340 }, { "epoch": 10.62992125984252, "grad_norm": 0.19781896471977234, "learning_rate": 9.804226795346411e-05, "loss": 0.0133, "step": 1350 }, { "epoch": 10.708661417322835, "grad_norm": 0.24714171886444092, "learning_rate": 9.799619014779503e-05, "loss": 0.0129, "step": 1360 }, { "epoch": 10.78740157480315, "grad_norm": 0.16805458068847656, "learning_rate": 9.794958746348013e-05, "loss": 0.0125, "step": 1370 }, { "epoch": 10.866141732283465, "grad_norm": 0.18694327771663666, "learning_rate": 9.790246041015896e-05, "loss": 0.0112, "step": 1380 }, { "epoch": 10.94488188976378, "grad_norm": 0.21768535673618317, "learning_rate": 9.785480950320538e-05, "loss": 0.0121, "step": 1390 }, { "epoch": 11.023622047244094, "grad_norm": 0.16912485659122467, "learning_rate": 9.78066352637221e-05, "loss": 0.0109, "step": 1400 }, { "epoch": 11.10236220472441, "grad_norm": 0.15913233160972595, "learning_rate": 9.775793821853488e-05, "loss": 0.0115, "step": 1410 }, { "epoch": 11.181102362204724, "grad_norm": 0.15250848233699799, "learning_rate": 9.77087189001868e-05, "loss": 0.0123, "step": 1420 }, { "epoch": 11.259842519685039, "grad_norm": 0.17317131161689758, "learning_rate": 9.765897784693243e-05, "loss": 0.0117, "step": 1430 }, { "epoch": 11.338582677165354, "grad_norm": 0.23304998874664307, "learning_rate": 9.760871560273197e-05, "loss": 0.0107, "step": 1440 }, { "epoch": 11.417322834645669, "grad_norm": 0.2260117381811142, "learning_rate": 9.755793271724526e-05, "loss": 0.0113, "step": 1450 }, { "epoch": 11.496062992125985, "grad_norm": 0.20854035019874573, "learning_rate": 9.750662974582584e-05, "loss": 0.0156, "step": 1460 }, { "epoch": 11.5748031496063, "grad_norm": 0.18729598820209503, "learning_rate": 9.745480724951473e-05, "loss": 0.0115, "step": 1470 }, { "epoch": 11.653543307086615, "grad_norm": 0.1489574909210205, "learning_rate": 9.740246579503447e-05, "loss": 0.0122, "step": 1480 }, { "epoch": 11.73228346456693, "grad_norm": 0.16865724325180054, "learning_rate": 9.734960595478284e-05, "loss": 0.0121, "step": 1490 }, { "epoch": 11.811023622047244, "grad_norm": 0.1705121397972107, "learning_rate": 9.729622830682657e-05, "loss": 0.0117, "step": 1500 }, { "epoch": 11.88976377952756, "grad_norm": 0.12779462337493896, "learning_rate": 9.724233343489504e-05, "loss": 0.013, "step": 1510 }, { "epoch": 11.968503937007874, "grad_norm": 0.21109400689601898, "learning_rate": 9.718792192837396e-05, "loss": 0.0105, "step": 1520 }, { "epoch": 12.047244094488189, "grad_norm": 0.17350123822689056, "learning_rate": 9.713299438229886e-05, "loss": 0.0129, "step": 1530 }, { "epoch": 12.125984251968504, "grad_norm": 0.19555015861988068, "learning_rate": 9.707755139734855e-05, "loss": 0.0131, "step": 1540 }, { "epoch": 12.204724409448819, "grad_norm": 0.22949132323265076, "learning_rate": 9.702159357983866e-05, "loss": 0.0122, "step": 1550 }, { "epoch": 12.283464566929133, "grad_norm": 0.21299389004707336, "learning_rate": 9.696512154171492e-05, "loss": 0.013, "step": 1560 }, { "epoch": 12.362204724409448, "grad_norm": 0.2029636800289154, "learning_rate": 9.690813590054645e-05, "loss": 0.0127, "step": 1570 }, { "epoch": 12.440944881889763, "grad_norm": 0.2509428858757019, "learning_rate": 9.685063727951914e-05, "loss": 0.0115, "step": 1580 }, { "epoch": 12.519685039370078, "grad_norm": 0.17952832579612732, "learning_rate": 9.679262630742865e-05, "loss": 0.0123, "step": 1590 }, { "epoch": 12.598425196850394, "grad_norm": 0.17356553673744202, "learning_rate": 9.673410361867373e-05, "loss": 0.0133, "step": 1600 }, { "epoch": 12.67716535433071, "grad_norm": 0.2649160921573639, "learning_rate": 9.667506985324909e-05, "loss": 0.0116, "step": 1610 }, { "epoch": 12.755905511811024, "grad_norm": 0.209790900349617, "learning_rate": 9.661552565673855e-05, "loss": 0.0104, "step": 1620 }, { "epoch": 12.834645669291339, "grad_norm": 0.21641805768013, "learning_rate": 9.655547168030789e-05, "loss": 0.0129, "step": 1630 }, { "epoch": 12.913385826771654, "grad_norm": 0.2454116940498352, "learning_rate": 9.649490858069777e-05, "loss": 0.0104, "step": 1640 }, { "epoch": 12.992125984251969, "grad_norm": 0.21532991528511047, "learning_rate": 9.643383702021658e-05, "loss": 0.0111, "step": 1650 }, { "epoch": 13.070866141732283, "grad_norm": 0.22105859220027924, "learning_rate": 9.637225766673307e-05, "loss": 0.0098, "step": 1660 }, { "epoch": 13.149606299212598, "grad_norm": 0.16493800282478333, "learning_rate": 9.631017119366922e-05, "loss": 0.0107, "step": 1670 }, { "epoch": 13.228346456692913, "grad_norm": 0.18128368258476257, "learning_rate": 9.624757827999273e-05, "loss": 0.0117, "step": 1680 }, { "epoch": 13.307086614173228, "grad_norm": 0.2412339746952057, "learning_rate": 9.618447961020971e-05, "loss": 0.015, "step": 1690 }, { "epoch": 13.385826771653543, "grad_norm": 0.23829780519008636, "learning_rate": 9.612087587435707e-05, "loss": 0.0146, "step": 1700 }, { "epoch": 13.464566929133857, "grad_norm": 0.17435379326343536, "learning_rate": 9.605676776799508e-05, "loss": 0.012, "step": 1710 }, { "epoch": 13.543307086614174, "grad_norm": 0.26677370071411133, "learning_rate": 9.599215599219973e-05, "loss": 0.0119, "step": 1720 }, { "epoch": 13.622047244094489, "grad_norm": 0.17352107167243958, "learning_rate": 9.592704125355505e-05, "loss": 0.0119, "step": 1730 }, { "epoch": 13.700787401574804, "grad_norm": 0.1817910224199295, "learning_rate": 9.586142426414538e-05, "loss": 0.011, "step": 1740 }, { "epoch": 13.779527559055119, "grad_norm": 0.26779067516326904, "learning_rate": 9.57953057415476e-05, "loss": 0.0137, "step": 1750 }, { "epoch": 13.858267716535433, "grad_norm": 0.16992807388305664, "learning_rate": 9.572868640882328e-05, "loss": 0.0116, "step": 1760 }, { "epoch": 13.937007874015748, "grad_norm": 0.2475721836090088, "learning_rate": 9.56615669945108e-05, "loss": 0.013, "step": 1770 }, { "epoch": 14.015748031496063, "grad_norm": 0.30210572481155396, "learning_rate": 9.55939482326173e-05, "loss": 0.0124, "step": 1780 }, { "epoch": 14.094488188976378, "grad_norm": 0.19526968896389008, "learning_rate": 9.552583086261069e-05, "loss": 0.0135, "step": 1790 }, { "epoch": 14.173228346456693, "grad_norm": 0.1772489845752716, "learning_rate": 9.545721562941168e-05, "loss": 0.0119, "step": 1800 }, { "epoch": 14.251968503937007, "grad_norm": 0.20985430479049683, "learning_rate": 9.538810328338543e-05, "loss": 0.0106, "step": 1810 }, { "epoch": 14.330708661417322, "grad_norm": 0.2288864552974701, "learning_rate": 9.531849458033349e-05, "loss": 0.0121, "step": 1820 }, { "epoch": 14.409448818897637, "grad_norm": 0.14826878905296326, "learning_rate": 9.524839028148547e-05, "loss": 0.0109, "step": 1830 }, { "epoch": 14.488188976377952, "grad_norm": 0.24729447066783905, "learning_rate": 9.517779115349077e-05, "loss": 0.0122, "step": 1840 }, { "epoch": 14.566929133858268, "grad_norm": 0.23712359368801117, "learning_rate": 9.510669796841014e-05, "loss": 0.012, "step": 1850 }, { "epoch": 14.645669291338583, "grad_norm": 0.24393972754478455, "learning_rate": 9.503511150370727e-05, "loss": 0.014, "step": 1860 }, { "epoch": 14.724409448818898, "grad_norm": 0.16620883345603943, "learning_rate": 9.496303254224024e-05, "loss": 0.0124, "step": 1870 }, { "epoch": 14.803149606299213, "grad_norm": 0.19335606694221497, "learning_rate": 9.489046187225306e-05, "loss": 0.0116, "step": 1880 }, { "epoch": 14.881889763779528, "grad_norm": 0.17978379130363464, "learning_rate": 9.481740028736692e-05, "loss": 0.0127, "step": 1890 }, { "epoch": 14.960629921259843, "grad_norm": 0.15655072033405304, "learning_rate": 9.474384858657164e-05, "loss": 0.0112, "step": 1900 }, { "epoch": 15.039370078740157, "grad_norm": 0.13158245384693146, "learning_rate": 9.466980757421679e-05, "loss": 0.011, "step": 1910 }, { "epoch": 15.118110236220472, "grad_norm": 0.21858836710453033, "learning_rate": 9.459527806000305e-05, "loss": 0.0116, "step": 1920 }, { "epoch": 15.196850393700787, "grad_norm": 0.19522692263126373, "learning_rate": 9.452026085897325e-05, "loss": 0.0118, "step": 1930 }, { "epoch": 15.275590551181102, "grad_norm": 0.20890626311302185, "learning_rate": 9.444475679150348e-05, "loss": 0.0111, "step": 1940 }, { "epoch": 15.354330708661417, "grad_norm": 0.20746995508670807, "learning_rate": 9.436876668329411e-05, "loss": 0.0107, "step": 1950 }, { "epoch": 15.433070866141732, "grad_norm": 0.18878526985645294, "learning_rate": 9.429229136536079e-05, "loss": 0.0105, "step": 1960 }, { "epoch": 15.511811023622048, "grad_norm": 0.18786223232746124, "learning_rate": 9.421533167402534e-05, "loss": 0.0112, "step": 1970 }, { "epoch": 15.590551181102363, "grad_norm": 0.12698164582252502, "learning_rate": 9.413788845090666e-05, "loss": 0.011, "step": 1980 }, { "epoch": 15.669291338582678, "grad_norm": 0.22439169883728027, "learning_rate": 9.405996254291136e-05, "loss": 0.0113, "step": 1990 }, { "epoch": 15.748031496062993, "grad_norm": 0.16835476458072662, "learning_rate": 9.398155480222474e-05, "loss": 0.0111, "step": 2000 } ], "logging_steps": 10, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 79, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }