{ "best_global_step": 59376, "best_metric": 0.44603702425956726, "best_model_checkpoint": "./my_model1/checkpoint-59376", "epoch": 2.0, "eval_steps": 500, "global_step": 59376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033683643222850983, "grad_norm": 6.297862529754639, "learning_rate": 1.1115477460281817e-06, "loss": 4.6374, "step": 100 }, { "epoch": 0.006736728644570197, "grad_norm": 2.914098024368286, "learning_rate": 2.234323247066749e-06, "loss": 4.3734, "step": 200 }, { "epoch": 0.010105092966855295, "grad_norm": 2.6752781867980957, "learning_rate": 3.357098748105317e-06, "loss": 4.1151, "step": 300 }, { "epoch": 0.013473457289140393, "grad_norm": 2.557371139526367, "learning_rate": 4.479874249143884e-06, "loss": 3.8823, "step": 400 }, { "epoch": 0.016841821611425493, "grad_norm": 2.3797903060913086, "learning_rate": 5.602649750182451e-06, "loss": 3.6958, "step": 500 }, { "epoch": 0.02021018593371059, "grad_norm": 2.318178653717041, "learning_rate": 6.725425251221018e-06, "loss": 3.5736, "step": 600 }, { "epoch": 0.02357855025599569, "grad_norm": 2.263061046600342, "learning_rate": 7.848200752259587e-06, "loss": 3.4535, "step": 700 }, { "epoch": 0.026946914578280787, "grad_norm": 2.245070219039917, "learning_rate": 8.970976253298154e-06, "loss": 3.3254, "step": 800 }, { "epoch": 0.030315278900565887, "grad_norm": 2.1786885261535645, "learning_rate": 1.009375175433672e-05, "loss": 3.1968, "step": 900 }, { "epoch": 0.03368364322285099, "grad_norm": 2.1120216846466064, "learning_rate": 1.1216527255375288e-05, "loss": 3.0749, "step": 1000 }, { "epoch": 0.03705200754513608, "grad_norm": 2.225299835205078, "learning_rate": 1.2339302756413855e-05, "loss": 2.956, "step": 1100 }, { "epoch": 0.04042037186742118, "grad_norm": 1.9265443086624146, "learning_rate": 1.3462078257452423e-05, "loss": 2.8324, "step": 1200 }, { "epoch": 0.04378873618970628, "grad_norm": 4.846482276916504, "learning_rate": 1.458485375849099e-05, "loss": 2.7146, "step": 1300 }, { "epoch": 0.04715710051199138, "grad_norm": 8.298853874206543, "learning_rate": 1.5707629259529558e-05, "loss": 2.6336, "step": 1400 }, { "epoch": 0.05052546483427647, "grad_norm": 12.867733001708984, "learning_rate": 1.6830404760568124e-05, "loss": 2.5894, "step": 1500 }, { "epoch": 0.05389382915656157, "grad_norm": 17.92266082763672, "learning_rate": 1.7953180261606693e-05, "loss": 2.5615, "step": 1600 }, { "epoch": 0.057262193478846674, "grad_norm": 13.567904472351074, "learning_rate": 1.907595576264526e-05, "loss": 2.5376, "step": 1700 }, { "epoch": 0.060630557801131774, "grad_norm": 18.349245071411133, "learning_rate": 2.0198731263683825e-05, "loss": 2.5115, "step": 1800 }, { "epoch": 0.06399892212341687, "grad_norm": 18.910877227783203, "learning_rate": 2.1321506764722397e-05, "loss": 2.4836, "step": 1900 }, { "epoch": 0.06736728644570197, "grad_norm": 5.438470363616943, "learning_rate": 2.2444282265760963e-05, "loss": 2.448, "step": 2000 }, { "epoch": 0.07073565076798706, "grad_norm": 1.8990598917007446, "learning_rate": 2.356705776679953e-05, "loss": 2.3836, "step": 2100 }, { "epoch": 0.07410401509027216, "grad_norm": 1.7939313650131226, "learning_rate": 2.46898332678381e-05, "loss": 2.2869, "step": 2200 }, { "epoch": 0.07747237941255726, "grad_norm": 2.6316609382629395, "learning_rate": 2.581260876887666e-05, "loss": 2.1664, "step": 2300 }, { "epoch": 0.08084074373484236, "grad_norm": 3.9971001148223877, "learning_rate": 2.693538426991523e-05, "loss": 2.0635, "step": 2400 }, { "epoch": 0.08420910805712746, "grad_norm": 2.845649242401123, "learning_rate": 2.8058159770953803e-05, "loss": 2.0033, "step": 2500 }, { "epoch": 0.08757747237941256, "grad_norm": 11.22779655456543, "learning_rate": 2.9180935271992365e-05, "loss": 1.944, "step": 2600 }, { "epoch": 0.09094583670169766, "grad_norm": 8.039031982421875, "learning_rate": 3.0303710773030935e-05, "loss": 1.8935, "step": 2700 }, { "epoch": 0.09431420102398276, "grad_norm": 19.868438720703125, "learning_rate": 3.14264862740695e-05, "loss": 1.8509, "step": 2800 }, { "epoch": 0.09768256534626785, "grad_norm": 19.26648712158203, "learning_rate": 3.254926177510807e-05, "loss": 1.808, "step": 2900 }, { "epoch": 0.10105092966855295, "grad_norm": 10.993364334106445, "learning_rate": 3.367203727614663e-05, "loss": 1.7658, "step": 3000 }, { "epoch": 0.10441929399083805, "grad_norm": 12.577337265014648, "learning_rate": 3.47948127771852e-05, "loss": 1.7268, "step": 3100 }, { "epoch": 0.10778765831312315, "grad_norm": 15.279227256774902, "learning_rate": 3.591758827822377e-05, "loss": 1.6961, "step": 3200 }, { "epoch": 0.11115602263540825, "grad_norm": 15.154927253723145, "learning_rate": 3.704036377926234e-05, "loss": 1.6632, "step": 3300 }, { "epoch": 0.11452438695769335, "grad_norm": 5.024831295013428, "learning_rate": 3.816313928030091e-05, "loss": 1.627, "step": 3400 }, { "epoch": 0.11789275127997845, "grad_norm": 7.439777851104736, "learning_rate": 3.928591478133947e-05, "loss": 1.5909, "step": 3500 }, { "epoch": 0.12126111560226355, "grad_norm": 7.653560638427734, "learning_rate": 4.040869028237804e-05, "loss": 1.5621, "step": 3600 }, { "epoch": 0.12462947992454863, "grad_norm": 7.883094310760498, "learning_rate": 4.1531465783416603e-05, "loss": 1.5307, "step": 3700 }, { "epoch": 0.12799784424683375, "grad_norm": 3.2945971488952637, "learning_rate": 4.265424128445518e-05, "loss": 1.5016, "step": 3800 }, { "epoch": 0.13136620856911885, "grad_norm": 5.135283946990967, "learning_rate": 4.377701678549374e-05, "loss": 1.4741, "step": 3900 }, { "epoch": 0.13473457289140395, "grad_norm": 8.129427909851074, "learning_rate": 4.489979228653231e-05, "loss": 1.4423, "step": 4000 }, { "epoch": 0.13810293721368902, "grad_norm": 8.010125160217285, "learning_rate": 4.6022567787570874e-05, "loss": 1.4146, "step": 4100 }, { "epoch": 0.14147130153597412, "grad_norm": 3.1212265491485596, "learning_rate": 4.714534328860944e-05, "loss": 1.3919, "step": 4200 }, { "epoch": 0.14483966585825922, "grad_norm": 3.6468098163604736, "learning_rate": 4.826811878964801e-05, "loss": 1.373, "step": 4300 }, { "epoch": 0.14820803018054432, "grad_norm": 4.597881317138672, "learning_rate": 4.939089429068658e-05, "loss": 1.352, "step": 4400 }, { "epoch": 0.15157639450282942, "grad_norm": 4.9619622230529785, "learning_rate": 5.051366979172515e-05, "loss": 1.3299, "step": 4500 }, { "epoch": 0.15494475882511452, "grad_norm": 4.055070877075195, "learning_rate": 5.163644529276371e-05, "loss": 1.312, "step": 4600 }, { "epoch": 0.15831312314739962, "grad_norm": 4.076910018920898, "learning_rate": 5.2759220793802276e-05, "loss": 1.2963, "step": 4700 }, { "epoch": 0.16168148746968472, "grad_norm": 2.7936923503875732, "learning_rate": 5.388199629484085e-05, "loss": 1.2782, "step": 4800 }, { "epoch": 0.16504985179196982, "grad_norm": 3.8645057678222656, "learning_rate": 5.5004771795879414e-05, "loss": 1.2598, "step": 4900 }, { "epoch": 0.16841821611425492, "grad_norm": 3.8098433017730713, "learning_rate": 5.6127547296917983e-05, "loss": 1.2426, "step": 5000 }, { "epoch": 0.17178658043654002, "grad_norm": 3.690554618835449, "learning_rate": 5.7250322797956546e-05, "loss": 1.2257, "step": 5100 }, { "epoch": 0.17515494475882512, "grad_norm": 3.7821402549743652, "learning_rate": 5.837309829899512e-05, "loss": 1.2138, "step": 5200 }, { "epoch": 0.17852330908111022, "grad_norm": 4.070770263671875, "learning_rate": 5.9495873800033684e-05, "loss": 1.2005, "step": 5300 }, { "epoch": 0.18189167340339532, "grad_norm": 5.843082904815674, "learning_rate": 6.061864930107225e-05, "loss": 1.1795, "step": 5400 }, { "epoch": 0.18526003772568042, "grad_norm": 4.773739337921143, "learning_rate": 6.174142480211082e-05, "loss": 1.1665, "step": 5500 }, { "epoch": 0.18862840204796552, "grad_norm": 3.8879311084747314, "learning_rate": 6.286420030314939e-05, "loss": 1.1529, "step": 5600 }, { "epoch": 0.1919967663702506, "grad_norm": 4.927277088165283, "learning_rate": 6.398697580418795e-05, "loss": 1.1397, "step": 5700 }, { "epoch": 0.1953651306925357, "grad_norm": 3.640209913253784, "learning_rate": 6.510975130522652e-05, "loss": 1.1199, "step": 5800 }, { "epoch": 0.1987334950148208, "grad_norm": 5.0505595207214355, "learning_rate": 6.62325268062651e-05, "loss": 1.1073, "step": 5900 }, { "epoch": 0.2021018593371059, "grad_norm": 3.703660011291504, "learning_rate": 6.735530230730366e-05, "loss": 1.0966, "step": 6000 }, { "epoch": 0.205470223659391, "grad_norm": 3.3192944526672363, "learning_rate": 6.847807780834223e-05, "loss": 1.0823, "step": 6100 }, { "epoch": 0.2088385879816761, "grad_norm": 4.713069915771484, "learning_rate": 6.96008533093808e-05, "loss": 1.0718, "step": 6200 }, { "epoch": 0.2122069523039612, "grad_norm": 4.135160446166992, "learning_rate": 7.072362881041936e-05, "loss": 1.057, "step": 6300 }, { "epoch": 0.2155753166262463, "grad_norm": 4.193116664886475, "learning_rate": 7.184640431145793e-05, "loss": 1.0509, "step": 6400 }, { "epoch": 0.2189436809485314, "grad_norm": 4.028440475463867, "learning_rate": 7.296917981249649e-05, "loss": 1.0365, "step": 6500 }, { "epoch": 0.2223120452708165, "grad_norm": 4.614249229431152, "learning_rate": 7.409195531353507e-05, "loss": 1.0293, "step": 6600 }, { "epoch": 0.2256804095931016, "grad_norm": 4.366164684295654, "learning_rate": 7.521473081457363e-05, "loss": 1.0198, "step": 6700 }, { "epoch": 0.2290487739153867, "grad_norm": 5.207546710968018, "learning_rate": 7.63375063156122e-05, "loss": 1.0059, "step": 6800 }, { "epoch": 0.2324171382376718, "grad_norm": 3.651235342025757, "learning_rate": 7.746028181665077e-05, "loss": 1.0009, "step": 6900 }, { "epoch": 0.2357855025599569, "grad_norm": 4.040618896484375, "learning_rate": 7.858305731768933e-05, "loss": 0.9907, "step": 7000 }, { "epoch": 0.239153866882242, "grad_norm": 3.792742967605591, "learning_rate": 7.97058328187279e-05, "loss": 0.9847, "step": 7100 }, { "epoch": 0.2425222312045271, "grad_norm": 4.362412929534912, "learning_rate": 8.082860831976646e-05, "loss": 0.9738, "step": 7200 }, { "epoch": 0.24589059552681217, "grad_norm": 4.572664737701416, "learning_rate": 8.195138382080504e-05, "loss": 0.9678, "step": 7300 }, { "epoch": 0.24925895984909727, "grad_norm": 4.474113464355469, "learning_rate": 8.30741593218436e-05, "loss": 0.957, "step": 7400 }, { "epoch": 0.2526273241713824, "grad_norm": 4.847846984863281, "learning_rate": 8.419693482288217e-05, "loss": 0.9492, "step": 7500 }, { "epoch": 0.2559956884936675, "grad_norm": 4.326010227203369, "learning_rate": 8.531971032392074e-05, "loss": 0.9444, "step": 7600 }, { "epoch": 0.2593640528159526, "grad_norm": 4.634029388427734, "learning_rate": 8.64424858249593e-05, "loss": 0.9337, "step": 7700 }, { "epoch": 0.2627324171382377, "grad_norm": 3.841517925262451, "learning_rate": 8.756526132599788e-05, "loss": 0.9282, "step": 7800 }, { "epoch": 0.2661007814605228, "grad_norm": 4.89427375793457, "learning_rate": 8.868803682703643e-05, "loss": 0.9164, "step": 7900 }, { "epoch": 0.2694691457828079, "grad_norm": 4.296108245849609, "learning_rate": 8.9810812328075e-05, "loss": 0.9146, "step": 8000 }, { "epoch": 0.27283751010509294, "grad_norm": 4.8395586013793945, "learning_rate": 9.093358782911357e-05, "loss": 0.903, "step": 8100 }, { "epoch": 0.27620587442737804, "grad_norm": 4.250405788421631, "learning_rate": 9.205636333015214e-05, "loss": 0.9013, "step": 8200 }, { "epoch": 0.27957423874966314, "grad_norm": 3.9244723320007324, "learning_rate": 9.317913883119071e-05, "loss": 0.8968, "step": 8300 }, { "epoch": 0.28294260307194824, "grad_norm": 4.492284774780273, "learning_rate": 9.430191433222928e-05, "loss": 0.8924, "step": 8400 }, { "epoch": 0.28631096739423334, "grad_norm": 4.632638454437256, "learning_rate": 9.542468983326785e-05, "loss": 0.8822, "step": 8500 }, { "epoch": 0.28967933171651844, "grad_norm": 3.6097586154937744, "learning_rate": 9.65474653343064e-05, "loss": 0.8774, "step": 8600 }, { "epoch": 0.29304769603880354, "grad_norm": 3.6722657680511475, "learning_rate": 9.767024083534497e-05, "loss": 0.8697, "step": 8700 }, { "epoch": 0.29641606036108864, "grad_norm": 4.693965911865234, "learning_rate": 9.879301633638355e-05, "loss": 0.8583, "step": 8800 }, { "epoch": 0.29978442468337374, "grad_norm": 3.5417885780334473, "learning_rate": 9.991579183742211e-05, "loss": 0.8498, "step": 8900 }, { "epoch": 0.30315278900565884, "grad_norm": 5.091881275177002, "learning_rate": 0.00010103856733846069, "loss": 0.8396, "step": 9000 }, { "epoch": 0.30652115332794394, "grad_norm": 4.218757152557373, "learning_rate": 0.00010216134283949925, "loss": 0.8314, "step": 9100 }, { "epoch": 0.30988951765022904, "grad_norm": 3.600708246231079, "learning_rate": 0.00010328411834053782, "loss": 0.8249, "step": 9200 }, { "epoch": 0.31325788197251414, "grad_norm": 3.8332407474517822, "learning_rate": 0.00010440689384157639, "loss": 0.8187, "step": 9300 }, { "epoch": 0.31662624629479924, "grad_norm": 3.1585068702697754, "learning_rate": 0.00010552966934261494, "loss": 0.8087, "step": 9400 }, { "epoch": 0.31999461061708434, "grad_norm": 3.4112815856933594, "learning_rate": 0.00010665244484365351, "loss": 0.8015, "step": 9500 }, { "epoch": 0.32336297493936944, "grad_norm": 4.372965335845947, "learning_rate": 0.00010777522034469207, "loss": 0.7892, "step": 9600 }, { "epoch": 0.32673133926165454, "grad_norm": 3.7581305503845215, "learning_rate": 0.00010889799584573066, "loss": 0.7814, "step": 9700 }, { "epoch": 0.33009970358393964, "grad_norm": 4.480976581573486, "learning_rate": 0.00011002077134676922, "loss": 0.7625, "step": 9800 }, { "epoch": 0.33346806790622474, "grad_norm": 3.4865591526031494, "learning_rate": 0.00011114354684780779, "loss": 0.7524, "step": 9900 }, { "epoch": 0.33683643222850984, "grad_norm": 3.5094540119171143, "learning_rate": 0.00011226632234884636, "loss": 0.7421, "step": 10000 }, { "epoch": 0.34020479655079494, "grad_norm": 3.0365946292877197, "learning_rate": 0.00011338909784988491, "loss": 0.7354, "step": 10100 }, { "epoch": 0.34357316087308004, "grad_norm": 3.5247597694396973, "learning_rate": 0.00011451187335092348, "loss": 0.7224, "step": 10200 }, { "epoch": 0.34694152519536514, "grad_norm": 3.1095457077026367, "learning_rate": 0.00011563464885196205, "loss": 0.7195, "step": 10300 }, { "epoch": 0.35030988951765024, "grad_norm": 3.8091487884521484, "learning_rate": 0.00011675742435300064, "loss": 0.713, "step": 10400 }, { "epoch": 0.35367825383993534, "grad_norm": 2.9617044925689697, "learning_rate": 0.00011788019985403919, "loss": 0.7067, "step": 10500 }, { "epoch": 0.35704661816222044, "grad_norm": 4.0781331062316895, "learning_rate": 0.00011900297535507776, "loss": 0.7022, "step": 10600 }, { "epoch": 0.36041498248450554, "grad_norm": 2.9260106086730957, "learning_rate": 0.00012012575085611633, "loss": 0.6967, "step": 10700 }, { "epoch": 0.36378334680679064, "grad_norm": 3.00919508934021, "learning_rate": 0.00012124852635715489, "loss": 0.6934, "step": 10800 }, { "epoch": 0.36715171112907574, "grad_norm": 2.74841046333313, "learning_rate": 0.00012237130185819344, "loss": 0.6874, "step": 10900 }, { "epoch": 0.37052007545136084, "grad_norm": 2.3908281326293945, "learning_rate": 0.000123494077359232, "loss": 0.6843, "step": 11000 }, { "epoch": 0.37388843977364594, "grad_norm": 2.5212063789367676, "learning_rate": 0.0001246168528602706, "loss": 0.681, "step": 11100 }, { "epoch": 0.37725680409593104, "grad_norm": 2.342548370361328, "learning_rate": 0.00012573962836130918, "loss": 0.6755, "step": 11200 }, { "epoch": 0.3806251684182161, "grad_norm": 2.2817301750183105, "learning_rate": 0.00012686240386234775, "loss": 0.6762, "step": 11300 }, { "epoch": 0.3839935327405012, "grad_norm": 2.4880239963531494, "learning_rate": 0.0001279851793633863, "loss": 0.6696, "step": 11400 }, { "epoch": 0.3873618970627863, "grad_norm": 2.2513132095336914, "learning_rate": 0.00012910795486442486, "loss": 0.6698, "step": 11500 }, { "epoch": 0.3907302613850714, "grad_norm": 2.4084956645965576, "learning_rate": 0.00013023073036546343, "loss": 0.6669, "step": 11600 }, { "epoch": 0.3940986257073565, "grad_norm": 2.5854873657226562, "learning_rate": 0.000131353505866502, "loss": 0.6629, "step": 11700 }, { "epoch": 0.3974669900296416, "grad_norm": 2.377323627471924, "learning_rate": 0.00013247628136754056, "loss": 0.6607, "step": 11800 }, { "epoch": 0.4008353543519267, "grad_norm": 2.0934255123138428, "learning_rate": 0.00013359905686857913, "loss": 0.6557, "step": 11900 }, { "epoch": 0.4042037186742118, "grad_norm": 2.2876408100128174, "learning_rate": 0.0001347218323696177, "loss": 0.6537, "step": 12000 }, { "epoch": 0.4075720829964969, "grad_norm": 2.856818199157715, "learning_rate": 0.00013584460787065627, "loss": 0.6534, "step": 12100 }, { "epoch": 0.410940447318782, "grad_norm": 2.3577589988708496, "learning_rate": 0.00013696738337169484, "loss": 0.6468, "step": 12200 }, { "epoch": 0.4143088116410671, "grad_norm": 2.1369576454162598, "learning_rate": 0.0001380901588727334, "loss": 0.6466, "step": 12300 }, { "epoch": 0.4176771759633522, "grad_norm": 2.0527994632720947, "learning_rate": 0.00013921293437377195, "loss": 0.6423, "step": 12400 }, { "epoch": 0.4210455402856373, "grad_norm": 2.1849894523620605, "learning_rate": 0.00014033570987481052, "loss": 0.6408, "step": 12500 }, { "epoch": 0.4244139046079224, "grad_norm": 2.403149127960205, "learning_rate": 0.00014145848537584912, "loss": 0.6401, "step": 12600 }, { "epoch": 0.4277822689302075, "grad_norm": 1.983995795249939, "learning_rate": 0.0001425812608768877, "loss": 0.6387, "step": 12700 }, { "epoch": 0.4311506332524926, "grad_norm": 2.141962766647339, "learning_rate": 0.00014370403637792623, "loss": 0.635, "step": 12800 }, { "epoch": 0.4345189975747777, "grad_norm": 1.9785326719284058, "learning_rate": 0.0001448268118789648, "loss": 0.6314, "step": 12900 }, { "epoch": 0.4378873618970628, "grad_norm": 2.0606772899627686, "learning_rate": 0.00014594958738000337, "loss": 0.6285, "step": 13000 }, { "epoch": 0.4412557262193479, "grad_norm": 1.88225519657135, "learning_rate": 0.00014707236288104194, "loss": 0.6296, "step": 13100 }, { "epoch": 0.444624090541633, "grad_norm": 2.204674005508423, "learning_rate": 0.0001481951383820805, "loss": 0.628, "step": 13200 }, { "epoch": 0.4479924548639181, "grad_norm": 1.8650182485580444, "learning_rate": 0.00014931791388311908, "loss": 0.6264, "step": 13300 }, { "epoch": 0.4513608191862032, "grad_norm": 1.7972240447998047, "learning_rate": 0.00015044068938415765, "loss": 0.6211, "step": 13400 }, { "epoch": 0.4547291835084883, "grad_norm": 1.8085206747055054, "learning_rate": 0.00015156346488519621, "loss": 0.6223, "step": 13500 }, { "epoch": 0.4580975478307734, "grad_norm": 1.877871036529541, "learning_rate": 0.00015268624038623478, "loss": 0.624, "step": 13600 }, { "epoch": 0.4614659121530585, "grad_norm": 2.295692205429077, "learning_rate": 0.00015380901588727335, "loss": 0.6198, "step": 13700 }, { "epoch": 0.4648342764753436, "grad_norm": 2.4655864238739014, "learning_rate": 0.0001549317913883119, "loss": 0.6171, "step": 13800 }, { "epoch": 0.4682026407976287, "grad_norm": 1.9931831359863281, "learning_rate": 0.00015605456688935046, "loss": 0.6146, "step": 13900 }, { "epoch": 0.4715710051199138, "grad_norm": 1.7389591932296753, "learning_rate": 0.00015717734239038906, "loss": 0.6141, "step": 14000 }, { "epoch": 0.4749393694421989, "grad_norm": 2.0048677921295166, "learning_rate": 0.00015830011789142763, "loss": 0.613, "step": 14100 }, { "epoch": 0.478307733764484, "grad_norm": 2.0038020610809326, "learning_rate": 0.0001594228933924662, "loss": 0.6116, "step": 14200 }, { "epoch": 0.4816760980867691, "grad_norm": 1.8391730785369873, "learning_rate": 0.00016054566889350474, "loss": 0.6093, "step": 14300 }, { "epoch": 0.4850444624090542, "grad_norm": 1.769494652748108, "learning_rate": 0.0001616684443945433, "loss": 0.6081, "step": 14400 }, { "epoch": 0.4884128267313393, "grad_norm": 1.9740633964538574, "learning_rate": 0.00016279121989558188, "loss": 0.6069, "step": 14500 }, { "epoch": 0.49178119105362433, "grad_norm": 2.1322596073150635, "learning_rate": 0.00016391399539662045, "loss": 0.6067, "step": 14600 }, { "epoch": 0.49514955537590943, "grad_norm": 1.6382005214691162, "learning_rate": 0.00016503677089765902, "loss": 0.604, "step": 14700 }, { "epoch": 0.49851791969819453, "grad_norm": 1.49541175365448, "learning_rate": 0.0001661595463986976, "loss": 0.6027, "step": 14800 }, { "epoch": 0.5018862840204796, "grad_norm": 1.5882339477539062, "learning_rate": 0.00016728232189973616, "loss": 0.6014, "step": 14900 }, { "epoch": 0.5052546483427648, "grad_norm": 1.491133213043213, "learning_rate": 0.00016840509740077473, "loss": 0.5983, "step": 15000 }, { "epoch": 0.5086230126650498, "grad_norm": 1.7467178106307983, "learning_rate": 0.0001695278729018133, "loss": 0.5996, "step": 15100 }, { "epoch": 0.511991376987335, "grad_norm": 1.5445200204849243, "learning_rate": 0.00017065064840285186, "loss": 0.5937, "step": 15200 }, { "epoch": 0.51535974130962, "grad_norm": 1.613213300704956, "learning_rate": 0.0001717734239038904, "loss": 0.5924, "step": 15300 }, { "epoch": 0.5187281056319052, "grad_norm": 1.67715585231781, "learning_rate": 0.00017289619940492898, "loss": 0.594, "step": 15400 }, { "epoch": 0.5220964699541902, "grad_norm": 1.7080377340316772, "learning_rate": 0.00017401897490596757, "loss": 0.5935, "step": 15500 }, { "epoch": 0.5254648342764754, "grad_norm": 1.7722272872924805, "learning_rate": 0.00017514175040700614, "loss": 0.5914, "step": 15600 }, { "epoch": 0.5288331985987604, "grad_norm": 1.7470366954803467, "learning_rate": 0.00017626452590804468, "loss": 0.5883, "step": 15700 }, { "epoch": 0.5322015629210456, "grad_norm": 1.974663496017456, "learning_rate": 0.00017738730140908325, "loss": 0.5908, "step": 15800 }, { "epoch": 0.5355699272433306, "grad_norm": 1.4482321739196777, "learning_rate": 0.00017851007691012182, "loss": 0.5885, "step": 15900 }, { "epoch": 0.5389382915656158, "grad_norm": 1.750618815422058, "learning_rate": 0.0001796328524111604, "loss": 0.5855, "step": 16000 }, { "epoch": 0.5423066558879008, "grad_norm": 1.3821526765823364, "learning_rate": 0.00018075562791219896, "loss": 0.5884, "step": 16100 }, { "epoch": 0.5456750202101859, "grad_norm": 1.4892586469650269, "learning_rate": 0.00018187840341323753, "loss": 0.5838, "step": 16200 }, { "epoch": 0.549043384532471, "grad_norm": 1.5591208934783936, "learning_rate": 0.0001830011789142761, "loss": 0.5834, "step": 16300 }, { "epoch": 0.5524117488547561, "grad_norm": 1.326253056526184, "learning_rate": 0.00018412395441531467, "loss": 0.5828, "step": 16400 }, { "epoch": 0.5557801131770412, "grad_norm": 1.5288639068603516, "learning_rate": 0.00018524672991635324, "loss": 0.5793, "step": 16500 }, { "epoch": 0.5591484774993263, "grad_norm": 1.4673304557800293, "learning_rate": 0.0001863695054173918, "loss": 0.5791, "step": 16600 }, { "epoch": 0.5625168418216114, "grad_norm": 1.6291229724884033, "learning_rate": 0.00018749228091843035, "loss": 0.5792, "step": 16700 }, { "epoch": 0.5658852061438965, "grad_norm": 1.3908525705337524, "learning_rate": 0.00018861505641946892, "loss": 0.5795, "step": 16800 }, { "epoch": 0.5692535704661816, "grad_norm": 1.4598628282546997, "learning_rate": 0.00018973783192050752, "loss": 0.576, "step": 16900 }, { "epoch": 0.5726219347884667, "grad_norm": 1.2881489992141724, "learning_rate": 0.00019086060742154608, "loss": 0.575, "step": 17000 }, { "epoch": 0.5759902991107518, "grad_norm": 1.2719937562942505, "learning_rate": 0.00019198338292258465, "loss": 0.5747, "step": 17100 }, { "epoch": 0.5793586634330369, "grad_norm": 1.2574406862258911, "learning_rate": 0.0001931061584236232, "loss": 0.573, "step": 17200 }, { "epoch": 0.582727027755322, "grad_norm": 1.457133173942566, "learning_rate": 0.00019422893392466177, "loss": 0.5738, "step": 17300 }, { "epoch": 0.5860953920776071, "grad_norm": 1.2623742818832397, "learning_rate": 0.00019535170942570033, "loss": 0.571, "step": 17400 }, { "epoch": 0.5894637563998922, "grad_norm": 1.4135565757751465, "learning_rate": 0.0001964744849267389, "loss": 0.5706, "step": 17500 }, { "epoch": 0.5928321207221773, "grad_norm": 1.502484917640686, "learning_rate": 0.00019759726042777747, "loss": 0.5713, "step": 17600 }, { "epoch": 0.5962004850444624, "grad_norm": 1.3130122423171997, "learning_rate": 0.00019872003592881604, "loss": 0.5683, "step": 17700 }, { "epoch": 0.5995688493667475, "grad_norm": 1.2580504417419434, "learning_rate": 0.0001998428114298546, "loss": 0.5696, "step": 17800 }, { "epoch": 0.6029372136890326, "grad_norm": 1.204026460647583, "learning_rate": 0.00019975859987929996, "loss": 0.5664, "step": 17900 }, { "epoch": 0.6063055780113177, "grad_norm": 1.3051841259002686, "learning_rate": 0.00019947790206453243, "loss": 0.5666, "step": 18000 }, { "epoch": 0.6096739423336028, "grad_norm": 1.1939951181411743, "learning_rate": 0.00019919720424976494, "loss": 0.5634, "step": 18100 }, { "epoch": 0.6130423066558879, "grad_norm": 1.25477135181427, "learning_rate": 0.00019891650643499742, "loss": 0.5628, "step": 18200 }, { "epoch": 0.616410670978173, "grad_norm": 1.1275781393051147, "learning_rate": 0.0001986358086202299, "loss": 0.5624, "step": 18300 }, { "epoch": 0.6197790353004581, "grad_norm": 1.1167781352996826, "learning_rate": 0.00019835511080546237, "loss": 0.5617, "step": 18400 }, { "epoch": 0.6231473996227432, "grad_norm": 1.193454623222351, "learning_rate": 0.00019807441299069488, "loss": 0.5605, "step": 18500 }, { "epoch": 0.6265157639450283, "grad_norm": 1.1406720876693726, "learning_rate": 0.00019779371517592739, "loss": 0.5587, "step": 18600 }, { "epoch": 0.6298841282673134, "grad_norm": 1.2136386632919312, "learning_rate": 0.00019751301736115986, "loss": 0.5573, "step": 18700 }, { "epoch": 0.6332524925895985, "grad_norm": 1.216199278831482, "learning_rate": 0.00019723231954639234, "loss": 0.5563, "step": 18800 }, { "epoch": 0.6366208569118836, "grad_norm": 1.2443403005599976, "learning_rate": 0.00019695162173162482, "loss": 0.5519, "step": 18900 }, { "epoch": 0.6399892212341687, "grad_norm": 1.1415669918060303, "learning_rate": 0.0001966709239168573, "loss": 0.5551, "step": 19000 }, { "epoch": 0.6433575855564538, "grad_norm": 1.2228775024414062, "learning_rate": 0.0001963902261020898, "loss": 0.5547, "step": 19100 }, { "epoch": 0.6467259498787389, "grad_norm": 1.1878366470336914, "learning_rate": 0.0001961095282873223, "loss": 0.5537, "step": 19200 }, { "epoch": 0.650094314201024, "grad_norm": 1.1277652978897095, "learning_rate": 0.0001958288304725548, "loss": 0.5521, "step": 19300 }, { "epoch": 0.6534626785233091, "grad_norm": 1.2011772394180298, "learning_rate": 0.00019554813265778727, "loss": 0.5519, "step": 19400 }, { "epoch": 0.6568310428455941, "grad_norm": 1.1792044639587402, "learning_rate": 0.00019526743484301975, "loss": 0.5493, "step": 19500 }, { "epoch": 0.6601994071678793, "grad_norm": 1.1553574800491333, "learning_rate": 0.00019498673702825225, "loss": 0.5464, "step": 19600 }, { "epoch": 0.6635677714901643, "grad_norm": 1.1871212720870972, "learning_rate": 0.00019470603921348473, "loss": 0.5489, "step": 19700 }, { "epoch": 0.6669361358124495, "grad_norm": 1.0879842042922974, "learning_rate": 0.0001944253413987172, "loss": 0.5476, "step": 19800 }, { "epoch": 0.6703045001347345, "grad_norm": 1.3135937452316284, "learning_rate": 0.0001941446435839497, "loss": 0.5482, "step": 19900 }, { "epoch": 0.6736728644570197, "grad_norm": 1.0638514757156372, "learning_rate": 0.0001938639457691822, "loss": 0.546, "step": 20000 }, { "epoch": 0.6770412287793047, "grad_norm": 1.139218807220459, "learning_rate": 0.0001935832479544147, "loss": 0.5434, "step": 20100 }, { "epoch": 0.6804095931015899, "grad_norm": 1.0563747882843018, "learning_rate": 0.00019330255013964718, "loss": 0.5462, "step": 20200 }, { "epoch": 0.6837779574238749, "grad_norm": 1.0997061729431152, "learning_rate": 0.00019302185232487965, "loss": 0.5401, "step": 20300 }, { "epoch": 0.6871463217461601, "grad_norm": 1.0555341243743896, "learning_rate": 0.00019274115451011213, "loss": 0.5413, "step": 20400 }, { "epoch": 0.6905146860684451, "grad_norm": 1.1296801567077637, "learning_rate": 0.00019246045669534464, "loss": 0.5394, "step": 20500 }, { "epoch": 0.6938830503907303, "grad_norm": 1.1637988090515137, "learning_rate": 0.00019217975888057714, "loss": 0.5405, "step": 20600 }, { "epoch": 0.6972514147130153, "grad_norm": 1.1942201852798462, "learning_rate": 0.00019189906106580962, "loss": 0.5401, "step": 20700 }, { "epoch": 0.7006197790353005, "grad_norm": 1.104561686515808, "learning_rate": 0.0001916183632510421, "loss": 0.5385, "step": 20800 }, { "epoch": 0.7039881433575855, "grad_norm": 1.0518121719360352, "learning_rate": 0.00019133766543627458, "loss": 0.5394, "step": 20900 }, { "epoch": 0.7073565076798707, "grad_norm": 1.0300666093826294, "learning_rate": 0.00019105696762150706, "loss": 0.5361, "step": 21000 }, { "epoch": 0.7107248720021557, "grad_norm": 0.9076865315437317, "learning_rate": 0.00019077626980673956, "loss": 0.5384, "step": 21100 }, { "epoch": 0.7140932363244409, "grad_norm": 1.170762062072754, "learning_rate": 0.00019049557199197204, "loss": 0.5356, "step": 21200 }, { "epoch": 0.7174616006467259, "grad_norm": 1.102295160293579, "learning_rate": 0.00019021487417720455, "loss": 0.5359, "step": 21300 }, { "epoch": 0.7208299649690111, "grad_norm": 1.102849006652832, "learning_rate": 0.00018993417636243703, "loss": 0.535, "step": 21400 }, { "epoch": 0.7241983292912961, "grad_norm": 0.9895302653312683, "learning_rate": 0.0001896534785476695, "loss": 0.533, "step": 21500 }, { "epoch": 0.7275666936135813, "grad_norm": 1.0017067193984985, "learning_rate": 0.000189372780732902, "loss": 0.5328, "step": 21600 }, { "epoch": 0.7309350579358663, "grad_norm": 1.068293809890747, "learning_rate": 0.0001890920829181345, "loss": 0.5355, "step": 21700 }, { "epoch": 0.7343034222581515, "grad_norm": 1.092910647392273, "learning_rate": 0.00018881138510336697, "loss": 0.5322, "step": 21800 }, { "epoch": 0.7376717865804365, "grad_norm": 1.0329002141952515, "learning_rate": 0.00018853068728859947, "loss": 0.5308, "step": 21900 }, { "epoch": 0.7410401509027217, "grad_norm": 1.1431453227996826, "learning_rate": 0.00018824998947383195, "loss": 0.5312, "step": 22000 }, { "epoch": 0.7444085152250067, "grad_norm": 0.9961342811584473, "learning_rate": 0.00018796929165906446, "loss": 0.5316, "step": 22100 }, { "epoch": 0.7477768795472919, "grad_norm": 0.9267546534538269, "learning_rate": 0.00018768859384429693, "loss": 0.5308, "step": 22200 }, { "epoch": 0.7511452438695769, "grad_norm": 1.0788689851760864, "learning_rate": 0.0001874078960295294, "loss": 0.5297, "step": 22300 }, { "epoch": 0.7545136081918621, "grad_norm": 1.0680807828903198, "learning_rate": 0.0001871271982147619, "loss": 0.5283, "step": 22400 }, { "epoch": 0.7578819725141471, "grad_norm": 1.122947096824646, "learning_rate": 0.0001868465003999944, "loss": 0.5268, "step": 22500 }, { "epoch": 0.7612503368364322, "grad_norm": 1.0286208391189575, "learning_rate": 0.0001865658025852269, "loss": 0.5264, "step": 22600 }, { "epoch": 0.7646187011587173, "grad_norm": 1.0122915506362915, "learning_rate": 0.00018628510477045938, "loss": 0.5261, "step": 22700 }, { "epoch": 0.7679870654810024, "grad_norm": 1.0254476070404053, "learning_rate": 0.00018600440695569186, "loss": 0.5253, "step": 22800 }, { "epoch": 0.7713554298032875, "grad_norm": 0.9192175269126892, "learning_rate": 0.00018572370914092434, "loss": 0.5235, "step": 22900 }, { "epoch": 0.7747237941255726, "grad_norm": 1.0937845706939697, "learning_rate": 0.00018544301132615684, "loss": 0.5243, "step": 23000 }, { "epoch": 0.7780921584478577, "grad_norm": 1.0288293361663818, "learning_rate": 0.00018516231351138932, "loss": 0.5221, "step": 23100 }, { "epoch": 0.7814605227701428, "grad_norm": 1.0520168542861938, "learning_rate": 0.0001848816156966218, "loss": 0.5237, "step": 23200 }, { "epoch": 0.7848288870924279, "grad_norm": 0.9760498404502869, "learning_rate": 0.0001846009178818543, "loss": 0.5245, "step": 23300 }, { "epoch": 0.788197251414713, "grad_norm": 1.0123729705810547, "learning_rate": 0.00018432022006708678, "loss": 0.5238, "step": 23400 }, { "epoch": 0.7915656157369981, "grad_norm": 0.9239659905433655, "learning_rate": 0.0001840395222523193, "loss": 0.5228, "step": 23500 }, { "epoch": 0.7949339800592832, "grad_norm": 0.964204728603363, "learning_rate": 0.00018375882443755177, "loss": 0.5202, "step": 23600 }, { "epoch": 0.7983023443815683, "grad_norm": 1.024375081062317, "learning_rate": 0.00018347812662278425, "loss": 0.5214, "step": 23700 }, { "epoch": 0.8016707087038534, "grad_norm": 0.9285891652107239, "learning_rate": 0.00018319742880801672, "loss": 0.5216, "step": 23800 }, { "epoch": 0.8050390730261385, "grad_norm": 0.9374035000801086, "learning_rate": 0.00018291673099324923, "loss": 0.5199, "step": 23900 }, { "epoch": 0.8084074373484236, "grad_norm": 0.9423925280570984, "learning_rate": 0.00018263603317848174, "loss": 0.5182, "step": 24000 }, { "epoch": 0.8117758016707087, "grad_norm": 0.9198417663574219, "learning_rate": 0.00018235533536371421, "loss": 0.5195, "step": 24100 }, { "epoch": 0.8151441659929938, "grad_norm": 0.8950690627098083, "learning_rate": 0.0001820746375489467, "loss": 0.5174, "step": 24200 }, { "epoch": 0.8185125303152789, "grad_norm": 0.9775617718696594, "learning_rate": 0.00018179393973417917, "loss": 0.5163, "step": 24300 }, { "epoch": 0.821880894637564, "grad_norm": 0.961654543876648, "learning_rate": 0.00018151324191941165, "loss": 0.5145, "step": 24400 }, { "epoch": 0.8252492589598491, "grad_norm": 0.884971559047699, "learning_rate": 0.00018123254410464415, "loss": 0.5159, "step": 24500 }, { "epoch": 0.8286176232821342, "grad_norm": 0.9463781118392944, "learning_rate": 0.00018095184628987666, "loss": 0.5147, "step": 24600 }, { "epoch": 0.8319859876044193, "grad_norm": 0.9335620999336243, "learning_rate": 0.00018067114847510914, "loss": 0.5148, "step": 24700 }, { "epoch": 0.8353543519267044, "grad_norm": 1.0065468549728394, "learning_rate": 0.00018039045066034162, "loss": 0.5145, "step": 24800 }, { "epoch": 0.8387227162489895, "grad_norm": 0.9249733686447144, "learning_rate": 0.0001801097528455741, "loss": 0.5144, "step": 24900 }, { "epoch": 0.8420910805712746, "grad_norm": 0.9696065783500671, "learning_rate": 0.0001798290550308066, "loss": 0.5146, "step": 25000 }, { "epoch": 0.8454594448935597, "grad_norm": 0.9490009546279907, "learning_rate": 0.00017954835721603908, "loss": 0.5128, "step": 25100 }, { "epoch": 0.8488278092158448, "grad_norm": 0.9294765591621399, "learning_rate": 0.00017926765940127156, "loss": 0.5128, "step": 25200 }, { "epoch": 0.8521961735381299, "grad_norm": 0.9910796284675598, "learning_rate": 0.00017898696158650406, "loss": 0.5118, "step": 25300 }, { "epoch": 0.855564537860415, "grad_norm": 0.9949105381965637, "learning_rate": 0.00017870626377173654, "loss": 0.511, "step": 25400 }, { "epoch": 0.8589329021827001, "grad_norm": 0.9345620274543762, "learning_rate": 0.00017842556595696905, "loss": 0.5119, "step": 25500 }, { "epoch": 0.8623012665049852, "grad_norm": 0.9553151726722717, "learning_rate": 0.00017814486814220153, "loss": 0.5103, "step": 25600 }, { "epoch": 0.8656696308272703, "grad_norm": 0.878685474395752, "learning_rate": 0.000177864170327434, "loss": 0.5112, "step": 25700 }, { "epoch": 0.8690379951495554, "grad_norm": 0.9728811979293823, "learning_rate": 0.00017758347251266648, "loss": 0.5088, "step": 25800 }, { "epoch": 0.8724063594718404, "grad_norm": 0.9711565375328064, "learning_rate": 0.000177302774697899, "loss": 0.5087, "step": 25900 }, { "epoch": 0.8757747237941256, "grad_norm": 0.9093062281608582, "learning_rate": 0.0001770220768831315, "loss": 0.5086, "step": 26000 }, { "epoch": 0.8791430881164106, "grad_norm": 0.9751853942871094, "learning_rate": 0.00017674137906836397, "loss": 0.5106, "step": 26100 }, { "epoch": 0.8825114524386958, "grad_norm": 0.9044291377067566, "learning_rate": 0.00017646068125359645, "loss": 0.5077, "step": 26200 }, { "epoch": 0.8858798167609808, "grad_norm": 0.9224226474761963, "learning_rate": 0.00017617998343882893, "loss": 0.5075, "step": 26300 }, { "epoch": 0.889248181083266, "grad_norm": 0.9981474876403809, "learning_rate": 0.0001758992856240614, "loss": 0.5048, "step": 26400 }, { "epoch": 0.892616545405551, "grad_norm": 0.8626927733421326, "learning_rate": 0.0001756185878092939, "loss": 0.506, "step": 26500 }, { "epoch": 0.8959849097278362, "grad_norm": 0.8800698518753052, "learning_rate": 0.00017533788999452642, "loss": 0.5067, "step": 26600 }, { "epoch": 0.8993532740501212, "grad_norm": 0.8937718272209167, "learning_rate": 0.0001750571921797589, "loss": 0.5059, "step": 26700 }, { "epoch": 0.9027216383724064, "grad_norm": 0.8680539727210999, "learning_rate": 0.00017477649436499138, "loss": 0.5074, "step": 26800 }, { "epoch": 0.9060900026946914, "grad_norm": 0.8701693415641785, "learning_rate": 0.00017449579655022385, "loss": 0.5048, "step": 26900 }, { "epoch": 0.9094583670169766, "grad_norm": 0.937451958656311, "learning_rate": 0.00017421509873545636, "loss": 0.5036, "step": 27000 }, { "epoch": 0.9128267313392616, "grad_norm": 0.845152735710144, "learning_rate": 0.00017393440092068884, "loss": 0.5015, "step": 27100 }, { "epoch": 0.9161950956615468, "grad_norm": 0.8485780358314514, "learning_rate": 0.00017365370310592132, "loss": 0.5021, "step": 27200 }, { "epoch": 0.9195634599838318, "grad_norm": 0.8812822699546814, "learning_rate": 0.00017337300529115382, "loss": 0.5028, "step": 27300 }, { "epoch": 0.922931824306117, "grad_norm": 0.9817461371421814, "learning_rate": 0.0001730923074763863, "loss": 0.5032, "step": 27400 }, { "epoch": 0.926300188628402, "grad_norm": 0.8648643493652344, "learning_rate": 0.0001728116096616188, "loss": 0.5023, "step": 27500 }, { "epoch": 0.9296685529506872, "grad_norm": 0.8859161734580994, "learning_rate": 0.00017253091184685128, "loss": 0.5022, "step": 27600 }, { "epoch": 0.9330369172729722, "grad_norm": 0.8662147521972656, "learning_rate": 0.00017225021403208376, "loss": 0.5, "step": 27700 }, { "epoch": 0.9364052815952574, "grad_norm": 0.9094113111495972, "learning_rate": 0.00017196951621731624, "loss": 0.5018, "step": 27800 }, { "epoch": 0.9397736459175424, "grad_norm": 0.924689531326294, "learning_rate": 0.00017168881840254875, "loss": 0.5008, "step": 27900 }, { "epoch": 0.9431420102398276, "grad_norm": 0.8770294785499573, "learning_rate": 0.00017140812058778125, "loss": 0.5023, "step": 28000 }, { "epoch": 0.9465103745621126, "grad_norm": 0.8615702390670776, "learning_rate": 0.00017112742277301373, "loss": 0.4988, "step": 28100 }, { "epoch": 0.9498787388843978, "grad_norm": 0.9163374304771423, "learning_rate": 0.0001708467249582462, "loss": 0.5004, "step": 28200 }, { "epoch": 0.9532471032066828, "grad_norm": 0.8876280784606934, "learning_rate": 0.0001705660271434787, "loss": 0.5005, "step": 28300 }, { "epoch": 0.956615467528968, "grad_norm": 0.9345399737358093, "learning_rate": 0.00017028532932871117, "loss": 0.498, "step": 28400 }, { "epoch": 0.959983831851253, "grad_norm": 0.8554583191871643, "learning_rate": 0.00017000463151394367, "loss": 0.4995, "step": 28500 }, { "epoch": 0.9633521961735382, "grad_norm": 0.910744845867157, "learning_rate": 0.00016972393369917615, "loss": 0.499, "step": 28600 }, { "epoch": 0.9667205604958232, "grad_norm": 0.9200494289398193, "learning_rate": 0.00016944323588440866, "loss": 0.497, "step": 28700 }, { "epoch": 0.9700889248181084, "grad_norm": 0.821864902973175, "learning_rate": 0.00016916253806964113, "loss": 0.4976, "step": 28800 }, { "epoch": 0.9734572891403934, "grad_norm": 0.8839085698127747, "learning_rate": 0.0001688818402548736, "loss": 0.4981, "step": 28900 }, { "epoch": 0.9768256534626786, "grad_norm": 0.8938930630683899, "learning_rate": 0.00016860114244010612, "loss": 0.4982, "step": 29000 }, { "epoch": 0.9801940177849636, "grad_norm": 0.8309621810913086, "learning_rate": 0.0001683204446253386, "loss": 0.4971, "step": 29100 }, { "epoch": 0.9835623821072487, "grad_norm": 0.8898798227310181, "learning_rate": 0.00016803974681057107, "loss": 0.4981, "step": 29200 }, { "epoch": 0.9869307464295338, "grad_norm": 0.9762869477272034, "learning_rate": 0.00016775904899580358, "loss": 0.4968, "step": 29300 }, { "epoch": 0.9902991107518189, "grad_norm": 0.8826524615287781, "learning_rate": 0.00016747835118103606, "loss": 0.4983, "step": 29400 }, { "epoch": 0.993667475074104, "grad_norm": 0.8983336687088013, "learning_rate": 0.00016719765336626856, "loss": 0.4964, "step": 29500 }, { "epoch": 0.9970358393963891, "grad_norm": 0.8700274229049683, "learning_rate": 0.00016691695555150104, "loss": 0.496, "step": 29600 }, { "epoch": 1.0, "eval_loss": 0.49384912848472595, "eval_runtime": 9.0835, "eval_samples_per_second": 550.447, "eval_steps_per_second": 8.697, "step": 29688 }, { "epoch": 1.0004042037186742, "grad_norm": 0.9031352996826172, "learning_rate": 0.00016663625773673352, "loss": 0.4948, "step": 29700 }, { "epoch": 1.0037725680409593, "grad_norm": 0.8552715182304382, "learning_rate": 0.000166355559921966, "loss": 0.4954, "step": 29800 }, { "epoch": 1.0071409323632443, "grad_norm": 0.8794796466827393, "learning_rate": 0.0001660748621071985, "loss": 0.4944, "step": 29900 }, { "epoch": 1.0105092966855296, "grad_norm": 0.876146137714386, "learning_rate": 0.000165794164292431, "loss": 0.4954, "step": 30000 }, { "epoch": 1.0138776610078146, "grad_norm": 0.8548246026039124, "learning_rate": 0.0001655134664776635, "loss": 0.4957, "step": 30100 }, { "epoch": 1.0172460253300997, "grad_norm": 0.8883000016212463, "learning_rate": 0.00016523276866289597, "loss": 0.4939, "step": 30200 }, { "epoch": 1.0206143896523847, "grad_norm": 0.8102014064788818, "learning_rate": 0.00016495207084812845, "loss": 0.491, "step": 30300 }, { "epoch": 1.02398275397467, "grad_norm": 0.9280298948287964, "learning_rate": 0.00016467137303336095, "loss": 0.4939, "step": 30400 }, { "epoch": 1.027351118296955, "grad_norm": 0.9322350025177002, "learning_rate": 0.00016439067521859343, "loss": 0.4923, "step": 30500 }, { "epoch": 1.03071948261924, "grad_norm": 0.8731549978256226, "learning_rate": 0.0001641099774038259, "loss": 0.4929, "step": 30600 }, { "epoch": 1.0340878469415251, "grad_norm": 0.8500041365623474, "learning_rate": 0.00016382927958905841, "loss": 0.492, "step": 30700 }, { "epoch": 1.0374562112638104, "grad_norm": 0.8375087976455688, "learning_rate": 0.0001635485817742909, "loss": 0.4917, "step": 30800 }, { "epoch": 1.0408245755860954, "grad_norm": 0.8288936018943787, "learning_rate": 0.0001632678839595234, "loss": 0.4928, "step": 30900 }, { "epoch": 1.0441929399083805, "grad_norm": 0.8341562151908875, "learning_rate": 0.00016298718614475588, "loss": 0.4889, "step": 31000 }, { "epoch": 1.0475613042306655, "grad_norm": 0.8432872891426086, "learning_rate": 0.00016270648832998835, "loss": 0.4915, "step": 31100 }, { "epoch": 1.0509296685529508, "grad_norm": 0.8462439775466919, "learning_rate": 0.00016242579051522083, "loss": 0.4883, "step": 31200 }, { "epoch": 1.0542980328752358, "grad_norm": 0.8429282903671265, "learning_rate": 0.00016214509270045334, "loss": 0.4895, "step": 31300 }, { "epoch": 1.0576663971975209, "grad_norm": 0.8985344767570496, "learning_rate": 0.00016186439488568584, "loss": 0.4906, "step": 31400 }, { "epoch": 1.061034761519806, "grad_norm": 0.9159397482872009, "learning_rate": 0.00016158369707091832, "loss": 0.4891, "step": 31500 }, { "epoch": 1.0644031258420912, "grad_norm": 0.8448222279548645, "learning_rate": 0.0001613029992561508, "loss": 0.4891, "step": 31600 }, { "epoch": 1.0677714901643762, "grad_norm": 0.8303894400596619, "learning_rate": 0.00016102230144138328, "loss": 0.4902, "step": 31700 }, { "epoch": 1.0711398544866613, "grad_norm": 0.8498880863189697, "learning_rate": 0.00016074160362661576, "loss": 0.4871, "step": 31800 }, { "epoch": 1.0745082188089463, "grad_norm": 0.7907134294509888, "learning_rate": 0.00016046090581184826, "loss": 0.4885, "step": 31900 }, { "epoch": 1.0778765831312316, "grad_norm": 0.9202895164489746, "learning_rate": 0.00016018020799708077, "loss": 0.4888, "step": 32000 }, { "epoch": 1.0812449474535166, "grad_norm": 0.8670128583908081, "learning_rate": 0.00015989951018231325, "loss": 0.4859, "step": 32100 }, { "epoch": 1.0846133117758017, "grad_norm": 0.8007021546363831, "learning_rate": 0.00015961881236754573, "loss": 0.4885, "step": 32200 }, { "epoch": 1.0879816760980867, "grad_norm": 0.9113264083862305, "learning_rate": 0.0001593381145527782, "loss": 0.4876, "step": 32300 }, { "epoch": 1.0913500404203718, "grad_norm": 0.8807794451713562, "learning_rate": 0.0001590574167380107, "loss": 0.489, "step": 32400 }, { "epoch": 1.094718404742657, "grad_norm": 0.8606187105178833, "learning_rate": 0.0001587767189232432, "loss": 0.489, "step": 32500 }, { "epoch": 1.098086769064942, "grad_norm": 0.8390567898750305, "learning_rate": 0.00015849602110847567, "loss": 0.4874, "step": 32600 }, { "epoch": 1.1014551333872271, "grad_norm": 0.8143624663352966, "learning_rate": 0.00015821532329370817, "loss": 0.4883, "step": 32700 }, { "epoch": 1.1048234977095122, "grad_norm": 0.9023911356925964, "learning_rate": 0.00015793462547894065, "loss": 0.4884, "step": 32800 }, { "epoch": 1.1081918620317974, "grad_norm": 0.9291363954544067, "learning_rate": 0.00015765392766417316, "loss": 0.4869, "step": 32900 }, { "epoch": 1.1115602263540825, "grad_norm": 0.834904134273529, "learning_rate": 0.00015737322984940563, "loss": 0.4863, "step": 33000 }, { "epoch": 1.1149285906763675, "grad_norm": 0.8896390795707703, "learning_rate": 0.0001570925320346381, "loss": 0.485, "step": 33100 }, { "epoch": 1.1182969549986526, "grad_norm": 0.8215962648391724, "learning_rate": 0.0001568118342198706, "loss": 0.4867, "step": 33200 }, { "epoch": 1.1216653193209378, "grad_norm": 0.8174338936805725, "learning_rate": 0.0001565311364051031, "loss": 0.4865, "step": 33300 }, { "epoch": 1.1250336836432229, "grad_norm": 0.8599314093589783, "learning_rate": 0.0001562504385903356, "loss": 0.4832, "step": 33400 }, { "epoch": 1.128402047965508, "grad_norm": 0.7674278020858765, "learning_rate": 0.00015596974077556808, "loss": 0.4846, "step": 33500 }, { "epoch": 1.131770412287793, "grad_norm": 0.8474441170692444, "learning_rate": 0.00015568904296080056, "loss": 0.4848, "step": 33600 }, { "epoch": 1.1351387766100782, "grad_norm": 0.8045397996902466, "learning_rate": 0.00015540834514603304, "loss": 0.483, "step": 33700 }, { "epoch": 1.1385071409323633, "grad_norm": 0.8756964802742004, "learning_rate": 0.00015512764733126552, "loss": 0.4836, "step": 33800 }, { "epoch": 1.1418755052546483, "grad_norm": 0.8357768654823303, "learning_rate": 0.00015484694951649802, "loss": 0.4852, "step": 33900 }, { "epoch": 1.1452438695769334, "grad_norm": 0.9370204210281372, "learning_rate": 0.0001545662517017305, "loss": 0.4839, "step": 34000 }, { "epoch": 1.1486122338992186, "grad_norm": 0.8853762149810791, "learning_rate": 0.000154285553886963, "loss": 0.4844, "step": 34100 }, { "epoch": 1.1519805982215037, "grad_norm": 0.7827624678611755, "learning_rate": 0.00015400485607219548, "loss": 0.4832, "step": 34200 }, { "epoch": 1.1553489625437887, "grad_norm": 0.8665288090705872, "learning_rate": 0.00015372415825742796, "loss": 0.4838, "step": 34300 }, { "epoch": 1.1587173268660738, "grad_norm": 0.8360339403152466, "learning_rate": 0.00015344346044266047, "loss": 0.4821, "step": 34400 }, { "epoch": 1.162085691188359, "grad_norm": 0.8605954051017761, "learning_rate": 0.00015316276262789295, "loss": 0.4825, "step": 34500 }, { "epoch": 1.165454055510644, "grad_norm": 0.857475221157074, "learning_rate": 0.00015288206481312542, "loss": 0.4827, "step": 34600 }, { "epoch": 1.1688224198329291, "grad_norm": 0.8108141422271729, "learning_rate": 0.00015260136699835793, "loss": 0.4803, "step": 34700 }, { "epoch": 1.1721907841552142, "grad_norm": 0.8359714150428772, "learning_rate": 0.0001523206691835904, "loss": 0.4809, "step": 34800 }, { "epoch": 1.1755591484774994, "grad_norm": 0.8128540515899658, "learning_rate": 0.00015203997136882291, "loss": 0.4823, "step": 34900 }, { "epoch": 1.1789275127997845, "grad_norm": 0.8871669769287109, "learning_rate": 0.0001517592735540554, "loss": 0.4806, "step": 35000 }, { "epoch": 1.1822958771220695, "grad_norm": 0.8477233052253723, "learning_rate": 0.00015147857573928787, "loss": 0.481, "step": 35100 }, { "epoch": 1.1856642414443546, "grad_norm": 0.7827205061912537, "learning_rate": 0.00015119787792452035, "loss": 0.4792, "step": 35200 }, { "epoch": 1.1890326057666396, "grad_norm": 0.8286157250404358, "learning_rate": 0.00015091718010975286, "loss": 0.4782, "step": 35300 }, { "epoch": 1.1924009700889249, "grad_norm": 0.76893150806427, "learning_rate": 0.00015063648229498536, "loss": 0.4805, "step": 35400 }, { "epoch": 1.19576933441121, "grad_norm": 0.8076749444007874, "learning_rate": 0.00015035578448021784, "loss": 0.4813, "step": 35500 }, { "epoch": 1.199137698733495, "grad_norm": 0.8551127910614014, "learning_rate": 0.00015007508666545032, "loss": 0.4797, "step": 35600 }, { "epoch": 1.2025060630557802, "grad_norm": 0.9260111451148987, "learning_rate": 0.0001497943888506828, "loss": 0.4801, "step": 35700 }, { "epoch": 1.2058744273780653, "grad_norm": 0.9091964960098267, "learning_rate": 0.00014951369103591527, "loss": 0.4782, "step": 35800 }, { "epoch": 1.2092427917003503, "grad_norm": 0.8588406443595886, "learning_rate": 0.00014923299322114778, "loss": 0.4806, "step": 35900 }, { "epoch": 1.2126111560226354, "grad_norm": 0.8295513391494751, "learning_rate": 0.00014895229540638026, "loss": 0.479, "step": 36000 }, { "epoch": 1.2159795203449204, "grad_norm": 0.8360409736633301, "learning_rate": 0.00014867159759161276, "loss": 0.4793, "step": 36100 }, { "epoch": 1.2193478846672057, "grad_norm": 0.8704560995101929, "learning_rate": 0.00014839089977684524, "loss": 0.4788, "step": 36200 }, { "epoch": 1.2227162489894907, "grad_norm": 0.8278842568397522, "learning_rate": 0.00014811020196207772, "loss": 0.4796, "step": 36300 }, { "epoch": 1.2260846133117758, "grad_norm": 0.8524438142776489, "learning_rate": 0.00014782950414731023, "loss": 0.4784, "step": 36400 }, { "epoch": 1.2294529776340608, "grad_norm": 0.7825035452842712, "learning_rate": 0.0001475488063325427, "loss": 0.4783, "step": 36500 }, { "epoch": 1.232821341956346, "grad_norm": 0.8001949787139893, "learning_rate": 0.00014726810851777518, "loss": 0.4789, "step": 36600 }, { "epoch": 1.2361897062786311, "grad_norm": 0.7923149466514587, "learning_rate": 0.0001469874107030077, "loss": 0.4788, "step": 36700 }, { "epoch": 1.2395580706009162, "grad_norm": 0.8405751585960388, "learning_rate": 0.0001467067128882402, "loss": 0.4773, "step": 36800 }, { "epoch": 1.2429264349232012, "grad_norm": 0.8324115872383118, "learning_rate": 0.00014642601507347267, "loss": 0.4782, "step": 36900 }, { "epoch": 1.2462947992454865, "grad_norm": 0.8548023700714111, "learning_rate": 0.00014614531725870515, "loss": 0.4798, "step": 37000 }, { "epoch": 1.2496631635677715, "grad_norm": 0.8439319729804993, "learning_rate": 0.00014586461944393763, "loss": 0.4757, "step": 37100 }, { "epoch": 1.2530315278900566, "grad_norm": 0.7825635075569153, "learning_rate": 0.0001455839216291701, "loss": 0.4783, "step": 37200 }, { "epoch": 1.2563998922123416, "grad_norm": 0.8164156675338745, "learning_rate": 0.0001453032238144026, "loss": 0.4779, "step": 37300 }, { "epoch": 1.2597682565346267, "grad_norm": 0.8076338768005371, "learning_rate": 0.00014502252599963512, "loss": 0.4773, "step": 37400 }, { "epoch": 1.263136620856912, "grad_norm": 0.8112064003944397, "learning_rate": 0.0001447418281848676, "loss": 0.4754, "step": 37500 }, { "epoch": 1.266504985179197, "grad_norm": 0.7940359711647034, "learning_rate": 0.00014446113037010008, "loss": 0.4745, "step": 37600 }, { "epoch": 1.269873349501482, "grad_norm": 0.8495946526527405, "learning_rate": 0.00014418043255533255, "loss": 0.4758, "step": 37700 }, { "epoch": 1.2732417138237673, "grad_norm": 0.8374922275543213, "learning_rate": 0.00014389973474056506, "loss": 0.4771, "step": 37800 }, { "epoch": 1.2766100781460523, "grad_norm": 0.8647417426109314, "learning_rate": 0.00014361903692579754, "loss": 0.4771, "step": 37900 }, { "epoch": 1.2799784424683374, "grad_norm": 0.8156632781028748, "learning_rate": 0.00014333833911103002, "loss": 0.4747, "step": 38000 }, { "epoch": 1.2833468067906224, "grad_norm": 0.7802369594573975, "learning_rate": 0.00014305764129626252, "loss": 0.4741, "step": 38100 }, { "epoch": 1.2867151711129075, "grad_norm": 0.7542524337768555, "learning_rate": 0.000142776943481495, "loss": 0.4761, "step": 38200 }, { "epoch": 1.2900835354351927, "grad_norm": 0.8326511383056641, "learning_rate": 0.0001424962456667275, "loss": 0.4734, "step": 38300 }, { "epoch": 1.2934518997574778, "grad_norm": 0.7556424736976624, "learning_rate": 0.00014221554785195998, "loss": 0.4757, "step": 38400 }, { "epoch": 1.2968202640797628, "grad_norm": 0.8151201605796814, "learning_rate": 0.00014193485003719246, "loss": 0.4743, "step": 38500 }, { "epoch": 1.300188628402048, "grad_norm": 0.8914119601249695, "learning_rate": 0.00014165415222242494, "loss": 0.4769, "step": 38600 }, { "epoch": 1.3035569927243331, "grad_norm": 0.8541133999824524, "learning_rate": 0.00014137345440765745, "loss": 0.4744, "step": 38700 }, { "epoch": 1.3069253570466182, "grad_norm": 0.8853744864463806, "learning_rate": 0.00014109275659288995, "loss": 0.474, "step": 38800 }, { "epoch": 1.3102937213689032, "grad_norm": 0.8547524809837341, "learning_rate": 0.00014081205877812243, "loss": 0.474, "step": 38900 }, { "epoch": 1.3136620856911883, "grad_norm": 0.7881298661231995, "learning_rate": 0.0001405313609633549, "loss": 0.4727, "step": 39000 }, { "epoch": 1.3170304500134735, "grad_norm": 0.7588589191436768, "learning_rate": 0.0001402506631485874, "loss": 0.473, "step": 39100 }, { "epoch": 1.3203988143357586, "grad_norm": 0.7980801463127136, "learning_rate": 0.00013996996533381987, "loss": 0.4727, "step": 39200 }, { "epoch": 1.3237671786580436, "grad_norm": 0.8034206628799438, "learning_rate": 0.00013968926751905237, "loss": 0.4737, "step": 39300 }, { "epoch": 1.3271355429803289, "grad_norm": 0.7804720401763916, "learning_rate": 0.00013940856970428485, "loss": 0.4754, "step": 39400 }, { "epoch": 1.330503907302614, "grad_norm": 0.8541818261146545, "learning_rate": 0.00013912787188951736, "loss": 0.4733, "step": 39500 }, { "epoch": 1.333872271624899, "grad_norm": 0.8339990377426147, "learning_rate": 0.00013884717407474983, "loss": 0.4721, "step": 39600 }, { "epoch": 1.337240635947184, "grad_norm": 0.8007979393005371, "learning_rate": 0.0001385664762599823, "loss": 0.4745, "step": 39700 }, { "epoch": 1.340609000269469, "grad_norm": 0.848199188709259, "learning_rate": 0.00013828577844521482, "loss": 0.4725, "step": 39800 }, { "epoch": 1.3439773645917543, "grad_norm": 0.9129810333251953, "learning_rate": 0.0001380050806304473, "loss": 0.4716, "step": 39900 }, { "epoch": 1.3473457289140394, "grad_norm": 0.869888186454773, "learning_rate": 0.00013772438281567978, "loss": 0.4744, "step": 40000 }, { "epoch": 1.3507140932363244, "grad_norm": 0.8916295170783997, "learning_rate": 0.00013744368500091228, "loss": 0.4712, "step": 40100 }, { "epoch": 1.3540824575586097, "grad_norm": 0.8144074082374573, "learning_rate": 0.00013716298718614476, "loss": 0.4734, "step": 40200 }, { "epoch": 1.3574508218808947, "grad_norm": 0.7844826579093933, "learning_rate": 0.00013688228937137726, "loss": 0.473, "step": 40300 }, { "epoch": 1.3608191862031798, "grad_norm": 0.8559306859970093, "learning_rate": 0.00013660159155660974, "loss": 0.4708, "step": 40400 }, { "epoch": 1.3641875505254648, "grad_norm": 0.7995209693908691, "learning_rate": 0.00013632089374184222, "loss": 0.472, "step": 40500 }, { "epoch": 1.3675559148477499, "grad_norm": 0.845758855342865, "learning_rate": 0.0001360401959270747, "loss": 0.4714, "step": 40600 }, { "epoch": 1.3709242791700351, "grad_norm": 0.8122411370277405, "learning_rate": 0.0001357594981123072, "loss": 0.4715, "step": 40700 }, { "epoch": 1.3742926434923202, "grad_norm": 0.7860530614852905, "learning_rate": 0.0001354788002975397, "loss": 0.4718, "step": 40800 }, { "epoch": 1.3776610078146052, "grad_norm": 0.7795781493186951, "learning_rate": 0.0001351981024827722, "loss": 0.4696, "step": 40900 }, { "epoch": 1.3810293721368903, "grad_norm": 0.7595000267028809, "learning_rate": 0.00013491740466800467, "loss": 0.4703, "step": 41000 }, { "epoch": 1.3843977364591753, "grad_norm": 0.8687454462051392, "learning_rate": 0.00013463670685323715, "loss": 0.4698, "step": 41100 }, { "epoch": 1.3877661007814606, "grad_norm": 0.8719391226768494, "learning_rate": 0.00013435600903846962, "loss": 0.4689, "step": 41200 }, { "epoch": 1.3911344651037456, "grad_norm": 0.8451808094978333, "learning_rate": 0.00013407531122370213, "loss": 0.4681, "step": 41300 }, { "epoch": 1.3945028294260307, "grad_norm": 0.8027797341346741, "learning_rate": 0.0001337946134089346, "loss": 0.4717, "step": 41400 }, { "epoch": 1.397871193748316, "grad_norm": 0.7488086819648743, "learning_rate": 0.00013351391559416711, "loss": 0.4694, "step": 41500 }, { "epoch": 1.401239558070601, "grad_norm": 0.8326307535171509, "learning_rate": 0.0001332332177793996, "loss": 0.4693, "step": 41600 }, { "epoch": 1.404607922392886, "grad_norm": 0.8087652325630188, "learning_rate": 0.00013295251996463207, "loss": 0.4684, "step": 41700 }, { "epoch": 1.407976286715171, "grad_norm": 0.7918603420257568, "learning_rate": 0.00013267182214986458, "loss": 0.47, "step": 41800 }, { "epoch": 1.411344651037456, "grad_norm": 0.8231304883956909, "learning_rate": 0.00013239112433509705, "loss": 0.4694, "step": 41900 }, { "epoch": 1.4147130153597414, "grad_norm": 0.7812530994415283, "learning_rate": 0.00013211042652032953, "loss": 0.4695, "step": 42000 }, { "epoch": 1.4180813796820264, "grad_norm": 0.854972779750824, "learning_rate": 0.00013182972870556204, "loss": 0.47, "step": 42100 }, { "epoch": 1.4214497440043115, "grad_norm": 0.8728025555610657, "learning_rate": 0.00013154903089079452, "loss": 0.468, "step": 42200 }, { "epoch": 1.4248181083265967, "grad_norm": 0.8394129276275635, "learning_rate": 0.00013126833307602702, "loss": 0.4698, "step": 42300 }, { "epoch": 1.4281864726488818, "grad_norm": 0.7810468673706055, "learning_rate": 0.0001309876352612595, "loss": 0.4694, "step": 42400 }, { "epoch": 1.4315548369711668, "grad_norm": 0.8251649737358093, "learning_rate": 0.00013070693744649198, "loss": 0.4651, "step": 42500 }, { "epoch": 1.4349232012934519, "grad_norm": 0.8438547253608704, "learning_rate": 0.00013042623963172446, "loss": 0.4685, "step": 42600 }, { "epoch": 1.438291565615737, "grad_norm": 0.7687946557998657, "learning_rate": 0.00013014554181695696, "loss": 0.4657, "step": 42700 }, { "epoch": 1.4416599299380222, "grad_norm": 0.7573995590209961, "learning_rate": 0.00012986484400218947, "loss": 0.4667, "step": 42800 }, { "epoch": 1.4450282942603072, "grad_norm": 0.8200283646583557, "learning_rate": 0.00012958414618742195, "loss": 0.4666, "step": 42900 }, { "epoch": 1.4483966585825923, "grad_norm": 0.8411341905593872, "learning_rate": 0.00012930344837265443, "loss": 0.4679, "step": 43000 }, { "epoch": 1.4517650229048775, "grad_norm": 0.8489885330200195, "learning_rate": 0.0001290227505578869, "loss": 0.4679, "step": 43100 }, { "epoch": 1.4551333872271626, "grad_norm": 0.8161250352859497, "learning_rate": 0.00012874205274311938, "loss": 0.4688, "step": 43200 }, { "epoch": 1.4585017515494476, "grad_norm": 0.7844269871711731, "learning_rate": 0.0001284613549283519, "loss": 0.4666, "step": 43300 }, { "epoch": 1.4618701158717327, "grad_norm": 0.7773265838623047, "learning_rate": 0.00012818065711358437, "loss": 0.4687, "step": 43400 }, { "epoch": 1.4652384801940177, "grad_norm": 0.8081590533256531, "learning_rate": 0.00012789995929881687, "loss": 0.4643, "step": 43500 }, { "epoch": 1.468606844516303, "grad_norm": 0.7888718843460083, "learning_rate": 0.00012761926148404935, "loss": 0.4682, "step": 43600 }, { "epoch": 1.471975208838588, "grad_norm": 0.7907763719558716, "learning_rate": 0.00012733856366928186, "loss": 0.4653, "step": 43700 }, { "epoch": 1.475343573160873, "grad_norm": 0.7945205569267273, "learning_rate": 0.00012705786585451433, "loss": 0.4648, "step": 43800 }, { "epoch": 1.478711937483158, "grad_norm": 0.8834030032157898, "learning_rate": 0.0001267771680397468, "loss": 0.4664, "step": 43900 }, { "epoch": 1.4820803018054431, "grad_norm": 0.7815008759498596, "learning_rate": 0.0001264964702249793, "loss": 0.4662, "step": 44000 }, { "epoch": 1.4854486661277284, "grad_norm": 0.8282730579376221, "learning_rate": 0.0001262157724102118, "loss": 0.4652, "step": 44100 }, { "epoch": 1.4888170304500135, "grad_norm": 0.7864588499069214, "learning_rate": 0.0001259350745954443, "loss": 0.4651, "step": 44200 }, { "epoch": 1.4921853947722985, "grad_norm": 0.7972845435142517, "learning_rate": 0.00012565437678067678, "loss": 0.4656, "step": 44300 }, { "epoch": 1.4955537590945838, "grad_norm": 0.8192013502120972, "learning_rate": 0.00012537367896590926, "loss": 0.4665, "step": 44400 }, { "epoch": 1.4989221234168688, "grad_norm": 0.8526120185852051, "learning_rate": 0.00012509298115114174, "loss": 0.4654, "step": 44500 }, { "epoch": 1.5022904877391539, "grad_norm": 0.8241577744483948, "learning_rate": 0.00012481228333637422, "loss": 0.4648, "step": 44600 }, { "epoch": 1.5056588520614391, "grad_norm": 0.8311729431152344, "learning_rate": 0.00012453158552160672, "loss": 0.4662, "step": 44700 }, { "epoch": 1.509027216383724, "grad_norm": 0.7880195379257202, "learning_rate": 0.00012425088770683923, "loss": 0.4637, "step": 44800 }, { "epoch": 1.5123955807060092, "grad_norm": 0.7668688893318176, "learning_rate": 0.0001239701898920717, "loss": 0.465, "step": 44900 }, { "epoch": 1.5157639450282943, "grad_norm": 0.8149063587188721, "learning_rate": 0.00012368949207730418, "loss": 0.4634, "step": 45000 }, { "epoch": 1.5191323093505793, "grad_norm": 0.7656127214431763, "learning_rate": 0.00012340879426253666, "loss": 0.4635, "step": 45100 }, { "epoch": 1.5225006736728646, "grad_norm": 0.8114592432975769, "learning_rate": 0.00012312809644776917, "loss": 0.4635, "step": 45200 }, { "epoch": 1.5258690379951494, "grad_norm": 0.8734049797058105, "learning_rate": 0.00012284739863300165, "loss": 0.4629, "step": 45300 }, { "epoch": 1.5292374023174347, "grad_norm": 0.806281328201294, "learning_rate": 0.00012256670081823413, "loss": 0.4644, "step": 45400 }, { "epoch": 1.5326057666397197, "grad_norm": 0.8073423504829407, "learning_rate": 0.00012228600300346663, "loss": 0.4626, "step": 45500 }, { "epoch": 1.5359741309620047, "grad_norm": 0.8023707270622253, "learning_rate": 0.00012200530518869911, "loss": 0.4637, "step": 45600 }, { "epoch": 1.53934249528429, "grad_norm": 0.821060299873352, "learning_rate": 0.00012172460737393161, "loss": 0.4624, "step": 45700 }, { "epoch": 1.542710859606575, "grad_norm": 0.7743229866027832, "learning_rate": 0.00012144390955916409, "loss": 0.4631, "step": 45800 }, { "epoch": 1.54607922392886, "grad_norm": 0.8501706719398499, "learning_rate": 0.00012116321174439657, "loss": 0.4646, "step": 45900 }, { "epoch": 1.5494475882511454, "grad_norm": 0.798643946647644, "learning_rate": 0.00012088251392962906, "loss": 0.4618, "step": 46000 }, { "epoch": 1.5528159525734302, "grad_norm": 0.771360456943512, "learning_rate": 0.00012060181611486154, "loss": 0.464, "step": 46100 }, { "epoch": 1.5561843168957155, "grad_norm": 0.7841131687164307, "learning_rate": 0.00012032111830009405, "loss": 0.4618, "step": 46200 }, { "epoch": 1.5595526812180005, "grad_norm": 0.7240998148918152, "learning_rate": 0.00012004042048532653, "loss": 0.4648, "step": 46300 }, { "epoch": 1.5629210455402855, "grad_norm": 0.8445931673049927, "learning_rate": 0.00011975972267055902, "loss": 0.4606, "step": 46400 }, { "epoch": 1.5662894098625708, "grad_norm": 0.8375403881072998, "learning_rate": 0.0001194790248557915, "loss": 0.4633, "step": 46500 }, { "epoch": 1.5696577741848559, "grad_norm": 0.7885960340499878, "learning_rate": 0.00011919832704102399, "loss": 0.4632, "step": 46600 }, { "epoch": 1.573026138507141, "grad_norm": 0.8243712186813354, "learning_rate": 0.0001189176292262565, "loss": 0.4618, "step": 46700 }, { "epoch": 1.5763945028294262, "grad_norm": 0.8182551860809326, "learning_rate": 0.00011863693141148897, "loss": 0.4607, "step": 46800 }, { "epoch": 1.579762867151711, "grad_norm": 0.7784871459007263, "learning_rate": 0.00011835623359672145, "loss": 0.4628, "step": 46900 }, { "epoch": 1.5831312314739963, "grad_norm": 0.8082338571548462, "learning_rate": 0.00011807553578195394, "loss": 0.4621, "step": 47000 }, { "epoch": 1.5864995957962813, "grad_norm": 0.8203257322311401, "learning_rate": 0.00011779483796718642, "loss": 0.461, "step": 47100 }, { "epoch": 1.5898679601185663, "grad_norm": 0.7920771837234497, "learning_rate": 0.00011751414015241893, "loss": 0.4611, "step": 47200 }, { "epoch": 1.5932363244408516, "grad_norm": 0.8124784827232361, "learning_rate": 0.0001172334423376514, "loss": 0.4598, "step": 47300 }, { "epoch": 1.5966046887631367, "grad_norm": 0.8094605803489685, "learning_rate": 0.0001169527445228839, "loss": 0.4605, "step": 47400 }, { "epoch": 1.5999730530854217, "grad_norm": 0.7639499306678772, "learning_rate": 0.00011667204670811638, "loss": 0.46, "step": 47500 }, { "epoch": 1.603341417407707, "grad_norm": 0.8600967526435852, "learning_rate": 0.00011639134889334887, "loss": 0.4623, "step": 47600 }, { "epoch": 1.6067097817299918, "grad_norm": 0.7747792601585388, "learning_rate": 0.00011611065107858137, "loss": 0.463, "step": 47700 }, { "epoch": 1.610078146052277, "grad_norm": 0.8040998578071594, "learning_rate": 0.00011582995326381385, "loss": 0.459, "step": 47800 }, { "epoch": 1.613446510374562, "grad_norm": 0.7648651003837585, "learning_rate": 0.00011554925544904633, "loss": 0.4618, "step": 47900 }, { "epoch": 1.6168148746968471, "grad_norm": 0.789125382900238, "learning_rate": 0.00011526855763427882, "loss": 0.4599, "step": 48000 }, { "epoch": 1.6201832390191324, "grad_norm": 0.8133670687675476, "learning_rate": 0.0001149878598195113, "loss": 0.4594, "step": 48100 }, { "epoch": 1.6235516033414175, "grad_norm": 0.7992141842842102, "learning_rate": 0.0001147071620047438, "loss": 0.4602, "step": 48200 }, { "epoch": 1.6269199676637025, "grad_norm": 0.780681312084198, "learning_rate": 0.00011442646418997628, "loss": 0.4587, "step": 48300 }, { "epoch": 1.6302883319859878, "grad_norm": 0.7979656457901001, "learning_rate": 0.00011414576637520878, "loss": 0.4587, "step": 48400 }, { "epoch": 1.6336566963082726, "grad_norm": 0.8527476787567139, "learning_rate": 0.00011386506856044125, "loss": 0.4586, "step": 48500 }, { "epoch": 1.6370250606305579, "grad_norm": 0.8187114000320435, "learning_rate": 0.00011358437074567375, "loss": 0.4611, "step": 48600 }, { "epoch": 1.640393424952843, "grad_norm": 0.7977433204650879, "learning_rate": 0.00011330367293090625, "loss": 0.4581, "step": 48700 }, { "epoch": 1.643761789275128, "grad_norm": 0.8355839252471924, "learning_rate": 0.00011302297511613873, "loss": 0.46, "step": 48800 }, { "epoch": 1.6471301535974132, "grad_norm": 0.7887241840362549, "learning_rate": 0.00011274227730137121, "loss": 0.4595, "step": 48900 }, { "epoch": 1.650498517919698, "grad_norm": 0.8219642639160156, "learning_rate": 0.0001124615794866037, "loss": 0.4605, "step": 49000 }, { "epoch": 1.6538668822419833, "grad_norm": 0.797517716884613, "learning_rate": 0.00011218088167183618, "loss": 0.46, "step": 49100 }, { "epoch": 1.6572352465642683, "grad_norm": 0.81880784034729, "learning_rate": 0.00011190018385706868, "loss": 0.4602, "step": 49200 }, { "epoch": 1.6606036108865534, "grad_norm": 0.8267971277236938, "learning_rate": 0.00011161948604230116, "loss": 0.4584, "step": 49300 }, { "epoch": 1.6639719752088387, "grad_norm": 0.8257302045822144, "learning_rate": 0.00011133878822753366, "loss": 0.4602, "step": 49400 }, { "epoch": 1.6673403395311237, "grad_norm": 0.7903374433517456, "learning_rate": 0.00011105809041276613, "loss": 0.4558, "step": 49500 }, { "epoch": 1.6707087038534087, "grad_norm": 0.7741321921348572, "learning_rate": 0.00011077739259799863, "loss": 0.4596, "step": 49600 }, { "epoch": 1.674077068175694, "grad_norm": 0.771134078502655, "learning_rate": 0.00011049669478323113, "loss": 0.4568, "step": 49700 }, { "epoch": 1.6774454324979788, "grad_norm": 0.7859461307525635, "learning_rate": 0.00011021599696846361, "loss": 0.4577, "step": 49800 }, { "epoch": 1.680813796820264, "grad_norm": 0.7759444117546082, "learning_rate": 0.00010993529915369609, "loss": 0.457, "step": 49900 }, { "epoch": 1.6841821611425492, "grad_norm": 0.8348528742790222, "learning_rate": 0.00010965460133892858, "loss": 0.4569, "step": 50000 }, { "epoch": 1.6875505254648342, "grad_norm": 0.8011546730995178, "learning_rate": 0.00010937390352416106, "loss": 0.4585, "step": 50100 }, { "epoch": 1.6909188897871195, "grad_norm": 0.790429413318634, "learning_rate": 0.00010909320570939356, "loss": 0.4582, "step": 50200 }, { "epoch": 1.6942872541094045, "grad_norm": 0.8371046781539917, "learning_rate": 0.00010881250789462604, "loss": 0.4591, "step": 50300 }, { "epoch": 1.6976556184316896, "grad_norm": 0.7836015820503235, "learning_rate": 0.00010853181007985853, "loss": 0.4581, "step": 50400 }, { "epoch": 1.7010239827539748, "grad_norm": 0.846708357334137, "learning_rate": 0.00010825111226509101, "loss": 0.4569, "step": 50500 }, { "epoch": 1.7043923470762596, "grad_norm": 0.797223687171936, "learning_rate": 0.00010797041445032352, "loss": 0.4569, "step": 50600 }, { "epoch": 1.707760711398545, "grad_norm": 0.8466051816940308, "learning_rate": 0.00010768971663555601, "loss": 0.4567, "step": 50700 }, { "epoch": 1.71112907572083, "grad_norm": 0.7285684943199158, "learning_rate": 0.00010740901882078849, "loss": 0.456, "step": 50800 }, { "epoch": 1.714497440043115, "grad_norm": 0.8624778985977173, "learning_rate": 0.00010712832100602097, "loss": 0.4588, "step": 50900 }, { "epoch": 1.7178658043654003, "grad_norm": 0.7958481311798096, "learning_rate": 0.00010684762319125346, "loss": 0.4566, "step": 51000 }, { "epoch": 1.7212341686876853, "grad_norm": 0.7974202036857605, "learning_rate": 0.00010656692537648596, "loss": 0.4566, "step": 51100 }, { "epoch": 1.7246025330099704, "grad_norm": 0.8782477378845215, "learning_rate": 0.00010628622756171844, "loss": 0.4577, "step": 51200 }, { "epoch": 1.7279708973322556, "grad_norm": 0.8142967820167542, "learning_rate": 0.00010600552974695092, "loss": 0.4543, "step": 51300 }, { "epoch": 1.7313392616545404, "grad_norm": 0.7704757452011108, "learning_rate": 0.00010572483193218341, "loss": 0.4565, "step": 51400 }, { "epoch": 1.7347076259768257, "grad_norm": 0.8298918604850769, "learning_rate": 0.00010544413411741589, "loss": 0.4564, "step": 51500 }, { "epoch": 1.7380759902991108, "grad_norm": 0.7840197682380676, "learning_rate": 0.0001051634363026484, "loss": 0.457, "step": 51600 }, { "epoch": 1.7414443546213958, "grad_norm": 0.8080000281333923, "learning_rate": 0.00010488273848788088, "loss": 0.4563, "step": 51700 }, { "epoch": 1.744812718943681, "grad_norm": 0.8133041262626648, "learning_rate": 0.00010460204067311337, "loss": 0.4529, "step": 51800 }, { "epoch": 1.7481810832659659, "grad_norm": 0.8792639374732971, "learning_rate": 0.00010432134285834585, "loss": 0.4536, "step": 51900 }, { "epoch": 1.7515494475882512, "grad_norm": 0.8580865263938904, "learning_rate": 0.00010404064504357834, "loss": 0.4542, "step": 52000 }, { "epoch": 1.7549178119105362, "grad_norm": 0.7759612798690796, "learning_rate": 0.00010375994722881084, "loss": 0.4557, "step": 52100 }, { "epoch": 1.7582861762328212, "grad_norm": 0.748423159122467, "learning_rate": 0.00010347924941404332, "loss": 0.454, "step": 52200 }, { "epoch": 1.7616545405551065, "grad_norm": 0.7873731851577759, "learning_rate": 0.0001031985515992758, "loss": 0.4543, "step": 52300 }, { "epoch": 1.7650229048773916, "grad_norm": 0.7736590504646301, "learning_rate": 0.00010291785378450829, "loss": 0.4556, "step": 52400 }, { "epoch": 1.7683912691996766, "grad_norm": 0.7629456520080566, "learning_rate": 0.00010263715596974077, "loss": 0.4545, "step": 52500 }, { "epoch": 1.7717596335219619, "grad_norm": 0.8270254731178284, "learning_rate": 0.00010235645815497328, "loss": 0.4546, "step": 52600 }, { "epoch": 1.7751279978442467, "grad_norm": 0.7610684633255005, "learning_rate": 0.00010207576034020576, "loss": 0.4527, "step": 52700 }, { "epoch": 1.778496362166532, "grad_norm": 0.8228756785392761, "learning_rate": 0.00010179506252543825, "loss": 0.4568, "step": 52800 }, { "epoch": 1.781864726488817, "grad_norm": 0.8317448496818542, "learning_rate": 0.00010151436471067073, "loss": 0.4543, "step": 52900 }, { "epoch": 1.785233090811102, "grad_norm": 0.7914367318153381, "learning_rate": 0.00010123366689590322, "loss": 0.4529, "step": 53000 }, { "epoch": 1.7886014551333873, "grad_norm": 0.8205628395080566, "learning_rate": 0.00010095296908113572, "loss": 0.4537, "step": 53100 }, { "epoch": 1.7919698194556724, "grad_norm": 0.794956386089325, "learning_rate": 0.0001006722712663682, "loss": 0.455, "step": 53200 }, { "epoch": 1.7953381837779574, "grad_norm": 0.8285955786705017, "learning_rate": 0.00010039157345160068, "loss": 0.4535, "step": 53300 }, { "epoch": 1.7987065481002427, "grad_norm": 0.8204521536827087, "learning_rate": 0.00010011087563683317, "loss": 0.4561, "step": 53400 }, { "epoch": 1.8020749124225275, "grad_norm": 0.8407822251319885, "learning_rate": 9.983017782206566e-05, "loss": 0.4563, "step": 53500 }, { "epoch": 1.8054432767448128, "grad_norm": 0.8138654828071594, "learning_rate": 9.954948000729814e-05, "loss": 0.4547, "step": 53600 }, { "epoch": 1.8088116410670978, "grad_norm": 0.8514792323112488, "learning_rate": 9.926878219253063e-05, "loss": 0.453, "step": 53700 }, { "epoch": 1.8121800053893828, "grad_norm": 0.836942195892334, "learning_rate": 9.898808437776313e-05, "loss": 0.4554, "step": 53800 }, { "epoch": 1.815548369711668, "grad_norm": 0.8424620628356934, "learning_rate": 9.87073865629956e-05, "loss": 0.4541, "step": 53900 }, { "epoch": 1.8189167340339532, "grad_norm": 0.7823119163513184, "learning_rate": 9.84266887482281e-05, "loss": 0.4539, "step": 54000 }, { "epoch": 1.8222850983562382, "grad_norm": 0.8232121467590332, "learning_rate": 9.814599093346059e-05, "loss": 0.4518, "step": 54100 }, { "epoch": 1.8256534626785235, "grad_norm": 0.7991457581520081, "learning_rate": 9.786529311869308e-05, "loss": 0.4516, "step": 54200 }, { "epoch": 1.8290218270008083, "grad_norm": 0.7749050855636597, "learning_rate": 9.758459530392556e-05, "loss": 0.4528, "step": 54300 }, { "epoch": 1.8323901913230936, "grad_norm": 0.7452788949012756, "learning_rate": 9.730389748915805e-05, "loss": 0.4555, "step": 54400 }, { "epoch": 1.8357585556453786, "grad_norm": 0.816150963306427, "learning_rate": 9.702319967439054e-05, "loss": 0.4514, "step": 54500 }, { "epoch": 1.8391269199676636, "grad_norm": 0.785351037979126, "learning_rate": 9.674250185962302e-05, "loss": 0.4517, "step": 54600 }, { "epoch": 1.842495284289949, "grad_norm": 0.828187108039856, "learning_rate": 9.646180404485551e-05, "loss": 0.4533, "step": 54700 }, { "epoch": 1.845863648612234, "grad_norm": 0.7950621247291565, "learning_rate": 9.6181106230088e-05, "loss": 0.4523, "step": 54800 }, { "epoch": 1.849232012934519, "grad_norm": 0.7881097197532654, "learning_rate": 9.590040841532048e-05, "loss": 0.4516, "step": 54900 }, { "epoch": 1.852600377256804, "grad_norm": 0.7643069624900818, "learning_rate": 9.561971060055298e-05, "loss": 0.453, "step": 55000 }, { "epoch": 1.855968741579089, "grad_norm": 0.7741556167602539, "learning_rate": 9.533901278578547e-05, "loss": 0.4528, "step": 55100 }, { "epoch": 1.8593371059013744, "grad_norm": 0.8289052844047546, "learning_rate": 9.505831497101796e-05, "loss": 0.452, "step": 55200 }, { "epoch": 1.8627054702236594, "grad_norm": 0.7747401595115662, "learning_rate": 9.477761715625044e-05, "loss": 0.453, "step": 55300 }, { "epoch": 1.8660738345459444, "grad_norm": 0.837910532951355, "learning_rate": 9.449691934148293e-05, "loss": 0.4532, "step": 55400 }, { "epoch": 1.8694421988682297, "grad_norm": 0.7754988670349121, "learning_rate": 9.421622152671542e-05, "loss": 0.4497, "step": 55500 }, { "epoch": 1.8728105631905145, "grad_norm": 0.8681314587593079, "learning_rate": 9.39355237119479e-05, "loss": 0.451, "step": 55600 }, { "epoch": 1.8761789275127998, "grad_norm": 0.8410942554473877, "learning_rate": 9.365482589718039e-05, "loss": 0.451, "step": 55700 }, { "epoch": 1.8795472918350848, "grad_norm": 0.8542850613594055, "learning_rate": 9.337412808241288e-05, "loss": 0.4524, "step": 55800 }, { "epoch": 1.8829156561573699, "grad_norm": 0.806122899055481, "learning_rate": 9.309343026764538e-05, "loss": 0.4535, "step": 55900 }, { "epoch": 1.8862840204796552, "grad_norm": 0.8103610277175903, "learning_rate": 9.281273245287786e-05, "loss": 0.4514, "step": 56000 }, { "epoch": 1.8896523848019402, "grad_norm": 0.7982548475265503, "learning_rate": 9.253203463811035e-05, "loss": 0.4504, "step": 56100 }, { "epoch": 1.8930207491242252, "grad_norm": 0.8081793189048767, "learning_rate": 9.225133682334284e-05, "loss": 0.4522, "step": 56200 }, { "epoch": 1.8963891134465105, "grad_norm": 0.8284481763839722, "learning_rate": 9.197063900857532e-05, "loss": 0.4501, "step": 56300 }, { "epoch": 1.8997574777687953, "grad_norm": 0.7722172737121582, "learning_rate": 9.168994119380781e-05, "loss": 0.4528, "step": 56400 }, { "epoch": 1.9031258420910806, "grad_norm": 0.8065896034240723, "learning_rate": 9.14092433790403e-05, "loss": 0.4527, "step": 56500 }, { "epoch": 1.9064942064133656, "grad_norm": 0.8267763257026672, "learning_rate": 9.112854556427278e-05, "loss": 0.4547, "step": 56600 }, { "epoch": 1.9098625707356507, "grad_norm": 0.803359866142273, "learning_rate": 9.084784774950527e-05, "loss": 0.4506, "step": 56700 }, { "epoch": 1.913230935057936, "grad_norm": 0.7984471321105957, "learning_rate": 9.056714993473776e-05, "loss": 0.4498, "step": 56800 }, { "epoch": 1.916599299380221, "grad_norm": 0.8118926286697388, "learning_rate": 9.028645211997026e-05, "loss": 0.4511, "step": 56900 }, { "epoch": 1.919967663702506, "grad_norm": 0.7954909205436707, "learning_rate": 9.000575430520273e-05, "loss": 0.45, "step": 57000 }, { "epoch": 1.9233360280247913, "grad_norm": 0.7925546765327454, "learning_rate": 8.972505649043523e-05, "loss": 0.4503, "step": 57100 }, { "epoch": 1.9267043923470761, "grad_norm": 0.7257952690124512, "learning_rate": 8.944435867566772e-05, "loss": 0.4501, "step": 57200 }, { "epoch": 1.9300727566693614, "grad_norm": 0.7644702196121216, "learning_rate": 8.91636608609002e-05, "loss": 0.4502, "step": 57300 }, { "epoch": 1.9334411209916464, "grad_norm": 0.8492504358291626, "learning_rate": 8.888296304613269e-05, "loss": 0.451, "step": 57400 }, { "epoch": 1.9368094853139315, "grad_norm": 0.7511376142501831, "learning_rate": 8.860226523136518e-05, "loss": 0.4511, "step": 57500 }, { "epoch": 1.9401778496362168, "grad_norm": 0.8549360036849976, "learning_rate": 8.832156741659766e-05, "loss": 0.4504, "step": 57600 }, { "epoch": 1.9435462139585018, "grad_norm": 0.7821473479270935, "learning_rate": 8.804086960183015e-05, "loss": 0.4508, "step": 57700 }, { "epoch": 1.9469145782807868, "grad_norm": 0.8527407050132751, "learning_rate": 8.776017178706264e-05, "loss": 0.4514, "step": 57800 }, { "epoch": 1.9502829426030721, "grad_norm": 0.8409647941589355, "learning_rate": 8.747947397229514e-05, "loss": 0.4498, "step": 57900 }, { "epoch": 1.953651306925357, "grad_norm": 0.8430731296539307, "learning_rate": 8.719877615752761e-05, "loss": 0.4498, "step": 58000 }, { "epoch": 1.9570196712476422, "grad_norm": 0.8346706032752991, "learning_rate": 8.69180783427601e-05, "loss": 0.4475, "step": 58100 }, { "epoch": 1.9603880355699272, "grad_norm": 0.7488289475440979, "learning_rate": 8.66373805279926e-05, "loss": 0.4488, "step": 58200 }, { "epoch": 1.9637563998922123, "grad_norm": 0.836130678653717, "learning_rate": 8.635668271322508e-05, "loss": 0.451, "step": 58300 }, { "epoch": 1.9671247642144976, "grad_norm": 0.7900556921958923, "learning_rate": 8.607598489845757e-05, "loss": 0.4463, "step": 58400 }, { "epoch": 1.9704931285367824, "grad_norm": 0.8496758341789246, "learning_rate": 8.579528708369006e-05, "loss": 0.4475, "step": 58500 }, { "epoch": 1.9738614928590676, "grad_norm": 0.8665506839752197, "learning_rate": 8.551458926892254e-05, "loss": 0.4474, "step": 58600 }, { "epoch": 1.9772298571813527, "grad_norm": 0.8058724999427795, "learning_rate": 8.523389145415503e-05, "loss": 0.4496, "step": 58700 }, { "epoch": 1.9805982215036377, "grad_norm": 0.8007978796958923, "learning_rate": 8.495319363938752e-05, "loss": 0.4494, "step": 58800 }, { "epoch": 1.983966585825923, "grad_norm": 0.7893068790435791, "learning_rate": 8.467249582462001e-05, "loss": 0.4477, "step": 58900 }, { "epoch": 1.987334950148208, "grad_norm": 0.8267046213150024, "learning_rate": 8.439179800985249e-05, "loss": 0.4479, "step": 59000 }, { "epoch": 1.990703314470493, "grad_norm": 0.8301923274993896, "learning_rate": 8.411110019508498e-05, "loss": 0.4486, "step": 59100 }, { "epoch": 1.9940716787927784, "grad_norm": 0.7466899156570435, "learning_rate": 8.383040238031748e-05, "loss": 0.4481, "step": 59200 }, { "epoch": 1.9974400431150632, "grad_norm": 0.8137242794036865, "learning_rate": 8.354970456554995e-05, "loss": 0.4501, "step": 59300 }, { "epoch": 2.0, "eval_loss": 0.44603702425956726, "eval_runtime": 7.7293, "eval_samples_per_second": 646.889, "eval_steps_per_second": 10.221, "step": 59376 } ], "logging_steps": 100, "max_steps": 89064, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2413913702400000.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }