{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.118375, "eval_steps": 500, "global_step": 2900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00125, "grad_norm": 0.36102330684661865, "learning_rate": 5.5665e-06, "loss": 2.681707572937012, "step": 10 }, { "epoch": 0.0025, "grad_norm": 0.34577861428260803, "learning_rate": 1.17515e-05, "loss": 2.6720260620117187, "step": 20 }, { "epoch": 0.00375, "grad_norm": 0.3295978009700775, "learning_rate": 1.79365e-05, "loss": 2.672147750854492, "step": 30 }, { "epoch": 0.005, "grad_norm": 0.32688695192337036, "learning_rate": 2.41215e-05, "loss": 2.675041389465332, "step": 40 }, { "epoch": 0.00625, "grad_norm": 0.3257655203342438, "learning_rate": 3.03065e-05, "loss": 2.675174522399902, "step": 50 }, { "epoch": 0.0075, "grad_norm": 0.336309552192688, "learning_rate": 3.6491499999999994e-05, "loss": 2.6966915130615234, "step": 60 }, { "epoch": 0.00875, "grad_norm": 0.3346744179725647, "learning_rate": 4.26765e-05, "loss": 2.6632720947265627, "step": 70 }, { "epoch": 0.01, "grad_norm": 0.32752859592437744, "learning_rate": 4.88615e-05, "loss": 2.695608139038086, "step": 80 }, { "epoch": 0.01125, "grad_norm": 0.32975664734840393, "learning_rate": 5.50465e-05, "loss": 2.6731294631958007, "step": 90 }, { "epoch": 0.0125, "grad_norm": 0.33192330598831177, "learning_rate": 6.12315e-05, "loss": 2.6482282638549806, "step": 100 }, { "epoch": 0.01375, "grad_norm": 0.3244248032569885, "learning_rate": 6.74165e-05, "loss": 2.700460433959961, "step": 110 }, { "epoch": 0.015, "grad_norm": 0.32452520728111267, "learning_rate": 7.36015e-05, "loss": 2.6673652648925783, "step": 120 }, { "epoch": 0.01625, "grad_norm": 0.32950156927108765, "learning_rate": 7.97865e-05, "loss": 2.66015510559082, "step": 130 }, { "epoch": 0.0175, "grad_norm": 0.3157300651073456, "learning_rate": 8.597149999999999e-05, "loss": 2.653401184082031, "step": 140 }, { "epoch": 0.01875, "grad_norm": 0.3447306156158447, "learning_rate": 9.21565e-05, "loss": 2.638433837890625, "step": 150 }, { "epoch": 0.02, "grad_norm": 0.33060336112976074, "learning_rate": 9.834150000000001e-05, "loss": 2.6599313735961916, "step": 160 }, { "epoch": 0.02125, "grad_norm": 0.33370116353034973, "learning_rate": 0.00010452649999999999, "loss": 2.675436019897461, "step": 170 }, { "epoch": 0.0225, "grad_norm": 0.32309311628341675, "learning_rate": 0.0001107115, "loss": 2.682134246826172, "step": 180 }, { "epoch": 0.02375, "grad_norm": 0.3298942446708679, "learning_rate": 0.0001168965, "loss": 2.6672037124633787, "step": 190 }, { "epoch": 0.025, "grad_norm": 0.3257051408290863, "learning_rate": 0.0001230815, "loss": 2.6710464477539064, "step": 200 }, { "epoch": 0.02625, "grad_norm": 0.32734546065330505, "learning_rate": 0.00012369959364576377, "loss": 2.6694522857666017, "step": 210 }, { "epoch": 0.0275, "grad_norm": 0.3286871016025543, "learning_rate": 0.00012369818897130838, "loss": 2.67569580078125, "step": 220 }, { "epoch": 0.02875, "grad_norm": 0.3206029534339905, "learning_rate": 0.0001236957809826964, "loss": 2.671968460083008, "step": 230 }, { "epoch": 0.03, "grad_norm": 0.32244956493377686, "learning_rate": 0.0001236923697189907, "loss": 2.6653528213500977, "step": 240 }, { "epoch": 0.03125, "grad_norm": 0.3286353647708893, "learning_rate": 0.00012368795523552952, "loss": 2.644626998901367, "step": 250 }, { "epoch": 0.0325, "grad_norm": 0.31846532225608826, "learning_rate": 0.00012368253760392556, "loss": 2.6661434173583984, "step": 260 }, { "epoch": 0.03375, "grad_norm": 0.34063664078712463, "learning_rate": 0.00012367611691206466, "loss": 2.658544921875, "step": 270 }, { "epoch": 0.035, "grad_norm": 0.3394038677215576, "learning_rate": 0.00012366869326410474, "loss": 2.671076202392578, "step": 280 }, { "epoch": 0.03625, "grad_norm": 0.3454046845436096, "learning_rate": 0.00012366026678047368, "loss": 2.690570068359375, "step": 290 }, { "epoch": 0.0375, "grad_norm": 0.32945406436920166, "learning_rate": 0.00012365083759786766, "loss": 2.6626564025878907, "step": 300 }, { "epoch": 0.03875, "grad_norm": 0.3266613483428955, "learning_rate": 0.00012364040586924886, "loss": 2.6811601638793947, "step": 310 }, { "epoch": 0.04, "grad_norm": 0.32137027382850647, "learning_rate": 0.0001236289717638429, "loss": 2.656772422790527, "step": 320 }, { "epoch": 0.04125, "grad_norm": 0.31430286169052124, "learning_rate": 0.00012361653546713627, "loss": 2.667566680908203, "step": 330 }, { "epoch": 0.0425, "grad_norm": 0.3187640905380249, "learning_rate": 0.00012360309718087312, "loss": 2.6774127960205076, "step": 340 }, { "epoch": 0.04375, "grad_norm": 0.3238705098628998, "learning_rate": 0.00012358865712305212, "loss": 2.650909423828125, "step": 350 }, { "epoch": 0.045, "grad_norm": 0.3178948163986206, "learning_rate": 0.00012357321552792288, "loss": 2.6466007232666016, "step": 360 }, { "epoch": 0.04625, "grad_norm": 0.3393631875514984, "learning_rate": 0.0001235567726459822, "loss": 2.6694786071777346, "step": 370 }, { "epoch": 0.0475, "grad_norm": 0.33097463846206665, "learning_rate": 0.00012353932874396988, "loss": 2.6705909729003907, "step": 380 }, { "epoch": 0.04875, "grad_norm": 0.3237457275390625, "learning_rate": 0.00012352088410486452, "loss": 2.666813087463379, "step": 390 }, { "epoch": 0.05, "grad_norm": 0.32804396748542786, "learning_rate": 0.0001235014390278789, "loss": 2.6341262817382813, "step": 400 }, { "epoch": 0.05125, "grad_norm": 0.3098997473716736, "learning_rate": 0.0001234809938284551, "loss": 2.653286361694336, "step": 410 }, { "epoch": 0.0525, "grad_norm": 0.31869447231292725, "learning_rate": 0.00012345954883825937, "loss": 2.6676279067993165, "step": 420 }, { "epoch": 0.05375, "grad_norm": 0.3462599813938141, "learning_rate": 0.0001234371044051768, "loss": 2.6937137603759767, "step": 430 }, { "epoch": 0.055, "grad_norm": 0.33410680294036865, "learning_rate": 0.00012341366089330566, "loss": 2.6624752044677735, "step": 440 }, { "epoch": 0.05625, "grad_norm": 0.3401891589164734, "learning_rate": 0.00012338921868295142, "loss": 2.6673324584960936, "step": 450 }, { "epoch": 0.0575, "grad_norm": 0.3144513964653015, "learning_rate": 0.00012336377817062075, "loss": 2.6684280395507813, "step": 460 }, { "epoch": 0.05875, "grad_norm": 0.31319352984428406, "learning_rate": 0.00012333733976901485, "loss": 2.6631874084472655, "step": 470 }, { "epoch": 0.06, "grad_norm": 0.3231050670146942, "learning_rate": 0.00012330990390702298, "loss": 2.6671581268310547, "step": 480 }, { "epoch": 0.06125, "grad_norm": 0.3283950686454773, "learning_rate": 0.00012328147102971544, "loss": 2.6682722091674806, "step": 490 }, { "epoch": 0.0625, "grad_norm": 0.3203584849834442, "learning_rate": 0.0001232520415983362, "loss": 2.6619497299194337, "step": 500 }, { "epoch": 0.06375, "grad_norm": 0.3314996063709259, "learning_rate": 0.00012322161609029563, "loss": 2.675333023071289, "step": 510 }, { "epoch": 0.065, "grad_norm": 0.3124040961265564, "learning_rate": 0.00012319019499916267, "loss": 2.674266052246094, "step": 520 }, { "epoch": 0.06625, "grad_norm": 0.334187775850296, "learning_rate": 0.0001231577788346567, "loss": 2.6644060134887697, "step": 530 }, { "epoch": 0.0675, "grad_norm": 0.33853819966316223, "learning_rate": 0.00012312436812263953, "loss": 2.6285802841186525, "step": 540 }, { "epoch": 0.06875, "grad_norm": 0.3226993680000305, "learning_rate": 0.00012308996340510664, "loss": 2.6620355606079102, "step": 550 }, { "epoch": 0.07, "grad_norm": 0.31965890526771545, "learning_rate": 0.0001230545652401785, "loss": 2.669430160522461, "step": 560 }, { "epoch": 0.07125, "grad_norm": 0.323632150888443, "learning_rate": 0.00012301817420209152, "loss": 2.6710559844970705, "step": 570 }, { "epoch": 0.0725, "grad_norm": 0.3202168047428131, "learning_rate": 0.00012298079088118863, "loss": 2.6743343353271483, "step": 580 }, { "epoch": 0.07375, "grad_norm": 0.3278695344924927, "learning_rate": 0.00012294241588390982, "loss": 2.643411636352539, "step": 590 }, { "epoch": 0.075, "grad_norm": 0.3302673101425171, "learning_rate": 0.0001229030498327823, "loss": 2.7156848907470703, "step": 600 }, { "epoch": 0.07625, "grad_norm": 0.31964629888534546, "learning_rate": 0.00012286269336641027, "loss": 2.6369789123535154, "step": 610 }, { "epoch": 0.0775, "grad_norm": 0.32528844475746155, "learning_rate": 0.00012282134713946472, "loss": 2.655129241943359, "step": 620 }, { "epoch": 0.07875, "grad_norm": 0.3346538245677948, "learning_rate": 0.00012277901182267275, "loss": 2.6634849548339843, "step": 630 }, { "epoch": 0.08, "grad_norm": 0.32035592198371887, "learning_rate": 0.00012273568810280665, "loss": 2.6622406005859376, "step": 640 }, { "epoch": 0.08125, "grad_norm": 0.32753705978393555, "learning_rate": 0.00012269137668267276, "loss": 2.6673862457275392, "step": 650 }, { "epoch": 0.0825, "grad_norm": 0.3323623538017273, "learning_rate": 0.00012264607828110018, "loss": 2.6660182952880858, "step": 660 }, { "epoch": 0.08375, "grad_norm": 0.3228432238101959, "learning_rate": 0.0001225997936329289, "loss": 2.690377044677734, "step": 670 }, { "epoch": 0.085, "grad_norm": 0.3340938687324524, "learning_rate": 0.00012255252348899816, "loss": 2.6579252243041993, "step": 680 }, { "epoch": 0.08625, "grad_norm": 0.32717493176460266, "learning_rate": 0.00012250426861613406, "loss": 2.6669349670410156, "step": 690 }, { "epoch": 0.0875, "grad_norm": 0.3213510513305664, "learning_rate": 0.0001224550297971371, "loss": 2.658818817138672, "step": 700 }, { "epoch": 0.08875, "grad_norm": 0.3103785216808319, "learning_rate": 0.00012240480783076967, "loss": 2.64670467376709, "step": 710 }, { "epoch": 0.09, "grad_norm": 0.3206445276737213, "learning_rate": 0.00012235360353174288, "loss": 2.649314117431641, "step": 720 }, { "epoch": 0.09125, "grad_norm": 0.3210267722606659, "learning_rate": 0.00012230141773070355, "loss": 2.6637636184692384, "step": 730 }, { "epoch": 0.0925, "grad_norm": 0.312549352645874, "learning_rate": 0.00012224825127422055, "loss": 2.6725765228271485, "step": 740 }, { "epoch": 0.09375, "grad_norm": 0.32557615637779236, "learning_rate": 0.00012219410502477114, "loss": 2.6337608337402343, "step": 750 }, { "epoch": 0.095, "grad_norm": 0.31713125109672546, "learning_rate": 0.00012213897986072705, "loss": 2.6361785888671876, "step": 760 }, { "epoch": 0.09625, "grad_norm": 0.3173486590385437, "learning_rate": 0.00012208287667634017, "loss": 2.6491493225097655, "step": 770 }, { "epoch": 0.0975, "grad_norm": 0.32202011346817017, "learning_rate": 0.00012202579638172791, "loss": 2.665495681762695, "step": 780 }, { "epoch": 0.09875, "grad_norm": 0.31751731038093567, "learning_rate": 0.0001219677399028587, "loss": 2.670880889892578, "step": 790 }, { "epoch": 0.1, "grad_norm": 0.3310263156890869, "learning_rate": 0.00012190870818153682, "loss": 2.6745986938476562, "step": 800 }, { "epoch": 0.10125, "grad_norm": 0.3246520757675171, "learning_rate": 0.00012184870217538704, "loss": 2.6367824554443358, "step": 810 }, { "epoch": 0.1025, "grad_norm": 0.31728002429008484, "learning_rate": 0.0001217877228578393, "loss": 2.657224655151367, "step": 820 }, { "epoch": 0.10375, "grad_norm": 0.32666370272636414, "learning_rate": 0.00012172577121811272, "loss": 2.629240798950195, "step": 830 }, { "epoch": 0.105, "grad_norm": 0.32864195108413696, "learning_rate": 0.00012166284826119965, "loss": 2.6314460754394533, "step": 840 }, { "epoch": 0.10625, "grad_norm": 0.331391304731369, "learning_rate": 0.00012159895500784936, "loss": 2.6207229614257814, "step": 850 }, { "epoch": 0.1075, "grad_norm": 0.32856595516204834, "learning_rate": 0.00012153409249455148, "loss": 2.6828586578369142, "step": 860 }, { "epoch": 0.10875, "grad_norm": 0.3259557783603668, "learning_rate": 0.00012146826177351913, "loss": 2.6800840377807615, "step": 870 }, { "epoch": 0.11, "grad_norm": 0.3368566930294037, "learning_rate": 0.00012140146391267196, "loss": 2.644548797607422, "step": 880 }, { "epoch": 0.11125, "grad_norm": 0.3319634199142456, "learning_rate": 0.00012133369999561872, "loss": 2.6457305908203126, "step": 890 }, { "epoch": 0.1125, "grad_norm": 0.31302639842033386, "learning_rate": 0.00012126497112163972, "loss": 2.6418832778930663, "step": 900 }, { "epoch": 0.11375, "grad_norm": 0.32079464197158813, "learning_rate": 0.00012119527840566905, "loss": 2.6311697006225585, "step": 910 }, { "epoch": 0.115, "grad_norm": 0.32719048857688904, "learning_rate": 0.00012112462297827639, "loss": 2.641567611694336, "step": 920 }, { "epoch": 0.11625, "grad_norm": 0.32264548540115356, "learning_rate": 0.00012105300598564874, "loss": 2.6696403503417967, "step": 930 }, { "epoch": 0.1175, "grad_norm": 0.3197903335094452, "learning_rate": 0.00012098042858957183, "loss": 2.6566593170166017, "step": 940 }, { "epoch": 0.11875, "grad_norm": 0.3231068253517151, "learning_rate": 0.00012090689196741124, "loss": 2.63052978515625, "step": 950 }, { "epoch": 0.12, "grad_norm": 0.3268223702907562, "learning_rate": 0.00012083239731209331, "loss": 2.6513845443725588, "step": 960 }, { "epoch": 0.12125, "grad_norm": 0.3304605484008789, "learning_rate": 0.00012075694583208578, "loss": 2.6264434814453126, "step": 970 }, { "epoch": 0.1225, "grad_norm": 0.3171931505203247, "learning_rate": 0.00012068053875137824, "loss": 2.636788558959961, "step": 980 }, { "epoch": 0.12375, "grad_norm": 0.3341807425022125, "learning_rate": 0.00012060317730946224, "loss": 2.6531208038330076, "step": 990 }, { "epoch": 0.125, "grad_norm": 0.3334127962589264, "learning_rate": 0.00012052486276131108, "loss": 2.6705049514770507, "step": 1000 }, { "epoch": 0.12625, "grad_norm": 0.307980477809906, "learning_rate": 0.00012044559637735965, "loss": 2.6561138153076174, "step": 1010 }, { "epoch": 0.1275, "grad_norm": 0.31699395179748535, "learning_rate": 0.00012036537944348368, "loss": 2.633596420288086, "step": 1020 }, { "epoch": 0.12875, "grad_norm": 0.32349589467048645, "learning_rate": 0.0001202842132609789, "loss": 2.651826858520508, "step": 1030 }, { "epoch": 0.13, "grad_norm": 0.3407875895500183, "learning_rate": 0.00012020209914653999, "loss": 2.6381755828857423, "step": 1040 }, { "epoch": 0.13125, "grad_norm": 0.31691980361938477, "learning_rate": 0.00012011903843223914, "loss": 2.6360122680664064, "step": 1050 }, { "epoch": 0.1325, "grad_norm": 0.31067660450935364, "learning_rate": 0.0001200350324655045, "loss": 2.6421882629394533, "step": 1060 }, { "epoch": 0.13375, "grad_norm": 0.32634156942367554, "learning_rate": 0.0001199500826090983, "loss": 2.63830509185791, "step": 1070 }, { "epoch": 0.135, "grad_norm": 0.3369225263595581, "learning_rate": 0.00011986419024109472, "loss": 2.63408203125, "step": 1080 }, { "epoch": 0.13625, "grad_norm": 0.3302381932735443, "learning_rate": 0.0001197773567548576, "loss": 2.6358100891113283, "step": 1090 }, { "epoch": 0.1375, "grad_norm": 0.33104801177978516, "learning_rate": 0.00011968958355901778, "loss": 2.6341053009033204, "step": 1100 }, { "epoch": 0.13875, "grad_norm": 0.3302455544471741, "learning_rate": 0.00011960087207745023, "loss": 2.659340667724609, "step": 1110 }, { "epoch": 0.14, "grad_norm": 0.318013995885849, "learning_rate": 0.00011951122374925103, "loss": 2.6539737701416017, "step": 1120 }, { "epoch": 0.14125, "grad_norm": 0.31688031554222107, "learning_rate": 0.00011942064002871398, "loss": 2.650745391845703, "step": 1130 }, { "epoch": 0.1425, "grad_norm": 0.3218444883823395, "learning_rate": 0.00011932912238530696, "loss": 2.6293779373168946, "step": 1140 }, { "epoch": 0.14375, "grad_norm": 0.31668025255203247, "learning_rate": 0.0001192366723036482, "loss": 2.652189254760742, "step": 1150 }, { "epoch": 0.145, "grad_norm": 0.32894524931907654, "learning_rate": 0.0001191432912834821, "loss": 2.6034008026123048, "step": 1160 }, { "epoch": 0.14625, "grad_norm": 0.326031357049942, "learning_rate": 0.00011904898083965494, "loss": 2.6356990814208983, "step": 1170 }, { "epoch": 0.1475, "grad_norm": 0.3148091733455658, "learning_rate": 0.00011895374250209033, "loss": 2.6438148498535154, "step": 1180 }, { "epoch": 0.14875, "grad_norm": 0.3154153823852539, "learning_rate": 0.00011885757781576434, "loss": 2.653242301940918, "step": 1190 }, { "epoch": 0.15, "grad_norm": 0.31809449195861816, "learning_rate": 0.00011876048834068046, "loss": 2.6228126525878905, "step": 1200 }, { "epoch": 0.15125, "grad_norm": 0.32725268602371216, "learning_rate": 0.0001186624756518443, "loss": 2.6216796875, "step": 1210 }, { "epoch": 0.1525, "grad_norm": 0.32540032267570496, "learning_rate": 0.00011856354133923805, "loss": 2.67537841796875, "step": 1220 }, { "epoch": 0.15375, "grad_norm": 0.3263508975505829, "learning_rate": 0.00011846368700779467, "loss": 2.6610176086425783, "step": 1230 }, { "epoch": 0.155, "grad_norm": 0.3205776512622833, "learning_rate": 0.00011836291427737183, "loss": 2.6613521575927734, "step": 1240 }, { "epoch": 0.15625, "grad_norm": 0.31028124690055847, "learning_rate": 0.00011826122478272567, "loss": 2.633769416809082, "step": 1250 }, { "epoch": 0.1575, "grad_norm": 0.31673797965049744, "learning_rate": 0.00011815862017348429, "loss": 2.624924087524414, "step": 1260 }, { "epoch": 0.15875, "grad_norm": 0.32373106479644775, "learning_rate": 0.00011805510211412097, "loss": 2.6462501525878905, "step": 1270 }, { "epoch": 0.16, "grad_norm": 0.31725797057151794, "learning_rate": 0.0001179506722839271, "loss": 2.6365428924560548, "step": 1280 }, { "epoch": 0.16125, "grad_norm": 0.3195420205593109, "learning_rate": 0.00011784533237698511, "loss": 2.6311481475830076, "step": 1290 }, { "epoch": 0.1625, "grad_norm": 0.3341420888900757, "learning_rate": 0.00011773908410214081, "loss": 2.642291450500488, "step": 1300 }, { "epoch": 0.16375, "grad_norm": 0.3230491876602173, "learning_rate": 0.00011763192918297575, "loss": 2.638113594055176, "step": 1310 }, { "epoch": 0.165, "grad_norm": 0.3223067820072174, "learning_rate": 0.0001175238693577793, "loss": 2.6444271087646483, "step": 1320 }, { "epoch": 0.16625, "grad_norm": 0.31934627890586853, "learning_rate": 0.00011741490637952035, "loss": 2.6657215118408204, "step": 1330 }, { "epoch": 0.1675, "grad_norm": 0.3097170889377594, "learning_rate": 0.00011730504201581893, "loss": 2.645807647705078, "step": 1340 }, { "epoch": 0.16875, "grad_norm": 0.32414084672927856, "learning_rate": 0.00011719427804891757, "loss": 2.641864776611328, "step": 1350 }, { "epoch": 0.17, "grad_norm": 0.31383687257766724, "learning_rate": 0.00011708261627565232, "loss": 2.662236785888672, "step": 1360 }, { "epoch": 0.17125, "grad_norm": 0.31501343846321106, "learning_rate": 0.00011697005850742364, "loss": 2.6557693481445312, "step": 1370 }, { "epoch": 0.1725, "grad_norm": 0.31809887290000916, "learning_rate": 0.00011685660657016701, "loss": 2.6280593872070312, "step": 1380 }, { "epoch": 0.17375, "grad_norm": 0.31885311007499695, "learning_rate": 0.0001167422623043233, "loss": 2.6564004898071287, "step": 1390 }, { "epoch": 0.175, "grad_norm": 0.3105798065662384, "learning_rate": 0.00011662702756480891, "loss": 2.64355354309082, "step": 1400 }, { "epoch": 0.17625, "grad_norm": 0.3361447751522064, "learning_rate": 0.00011651090422098569, "loss": 2.6594215393066407, "step": 1410 }, { "epoch": 0.1775, "grad_norm": 0.32253745198249817, "learning_rate": 0.00011639389415663065, "loss": 2.642239570617676, "step": 1420 }, { "epoch": 0.17875, "grad_norm": 0.32338932156562805, "learning_rate": 0.00011627599926990531, "loss": 2.6702959060668947, "step": 1430 }, { "epoch": 0.18, "grad_norm": 0.3116281032562256, "learning_rate": 0.00011615722147332501, "loss": 2.6370218276977537, "step": 1440 }, { "epoch": 0.18125, "grad_norm": 0.3282069265842438, "learning_rate": 0.00011603756269372781, "loss": 2.589012336730957, "step": 1450 }, { "epoch": 0.1825, "grad_norm": 0.32347872853279114, "learning_rate": 0.00011591702487224326, "loss": 2.638626480102539, "step": 1460 }, { "epoch": 0.18375, "grad_norm": 0.31963029503822327, "learning_rate": 0.0001157956099642609, "loss": 2.6150590896606447, "step": 1470 }, { "epoch": 0.185, "grad_norm": 0.31573331356048584, "learning_rate": 0.00011567331993939861, "loss": 2.6242300033569337, "step": 1480 }, { "epoch": 0.18625, "grad_norm": 0.318210631608963, "learning_rate": 0.00011555015678147051, "loss": 2.6236839294433594, "step": 1490 }, { "epoch": 0.1875, "grad_norm": 0.3299921751022339, "learning_rate": 0.0001154261224884549, "loss": 2.633551597595215, "step": 1500 }, { "epoch": 0.18875, "grad_norm": 0.32802239060401917, "learning_rate": 0.00011530121907246187, "loss": 2.650678253173828, "step": 1510 }, { "epoch": 0.19, "grad_norm": 0.3139156401157379, "learning_rate": 0.0001151754485597005, "loss": 2.6056631088256834, "step": 1520 }, { "epoch": 0.19125, "grad_norm": 0.320236474275589, "learning_rate": 0.00011504881299044619, "loss": 2.6355617523193358, "step": 1530 }, { "epoch": 0.1925, "grad_norm": 0.3379780054092407, "learning_rate": 0.00011492131441900742, "loss": 2.6405055999755858, "step": 1540 }, { "epoch": 0.19375, "grad_norm": 0.3395773470401764, "learning_rate": 0.00011479295491369245, "loss": 2.6217134475708006, "step": 1550 }, { "epoch": 0.195, "grad_norm": 0.33206456899642944, "learning_rate": 0.00011466373655677584, "loss": 2.6553268432617188, "step": 1560 }, { "epoch": 0.19625, "grad_norm": 0.3266463577747345, "learning_rate": 0.00011453366144446457, "loss": 2.615655517578125, "step": 1570 }, { "epoch": 0.1975, "grad_norm": 0.3166464567184448, "learning_rate": 0.0001144027316868641, "loss": 2.6240345001220704, "step": 1580 }, { "epoch": 0.19875, "grad_norm": 0.31986290216445923, "learning_rate": 0.00011427094940794416, "loss": 2.6230613708496096, "step": 1590 }, { "epoch": 0.2, "grad_norm": 0.3255802392959595, "learning_rate": 0.00011413831674550421, "loss": 2.6539276123046873, "step": 1600 }, { "epoch": 0.20125, "grad_norm": 0.3255312144756317, "learning_rate": 0.00011400483585113883, "loss": 2.6217121124267577, "step": 1610 }, { "epoch": 0.2025, "grad_norm": 0.3323643207550049, "learning_rate": 0.0001138705088902028, "loss": 2.652513885498047, "step": 1620 }, { "epoch": 0.20375, "grad_norm": 0.3227868974208832, "learning_rate": 0.00011373533804177592, "loss": 2.630014991760254, "step": 1630 }, { "epoch": 0.205, "grad_norm": 0.31701064109802246, "learning_rate": 0.00011359932549862779, "loss": 2.639967346191406, "step": 1640 }, { "epoch": 0.20625, "grad_norm": 0.3187071681022644, "learning_rate": 0.00011346247346718207, "loss": 2.6362884521484373, "step": 1650 }, { "epoch": 0.2075, "grad_norm": 0.31707099080085754, "learning_rate": 0.00011332478416748083, "loss": 2.649311065673828, "step": 1660 }, { "epoch": 0.20875, "grad_norm": 0.3297825753688812, "learning_rate": 0.00011318625983314848, "loss": 2.6421716690063475, "step": 1670 }, { "epoch": 0.21, "grad_norm": 0.3198815584182739, "learning_rate": 0.00011304690271135548, "loss": 2.633087730407715, "step": 1680 }, { "epoch": 0.21125, "grad_norm": 0.3226505219936371, "learning_rate": 0.00011290671506278205, "loss": 2.6442310333251955, "step": 1690 }, { "epoch": 0.2125, "grad_norm": 0.33370015025138855, "learning_rate": 0.00011276569916158123, "loss": 2.6304306030273437, "step": 1700 }, { "epoch": 0.21375, "grad_norm": 0.3307320773601532, "learning_rate": 0.0001126238572953423, "loss": 2.6353145599365235, "step": 1710 }, { "epoch": 0.215, "grad_norm": 0.31320619583129883, "learning_rate": 0.00011248119176505343, "loss": 2.6117172241210938, "step": 1720 }, { "epoch": 0.21625, "grad_norm": 0.3411354422569275, "learning_rate": 0.00011233770488506444, "loss": 2.6199378967285156, "step": 1730 }, { "epoch": 0.2175, "grad_norm": 0.3345658779144287, "learning_rate": 0.0001121933989830493, "loss": 2.617340850830078, "step": 1740 }, { "epoch": 0.21875, "grad_norm": 0.328173965215683, "learning_rate": 0.0001120482763999683, "loss": 2.646270751953125, "step": 1750 }, { "epoch": 0.22, "grad_norm": 0.31834596395492554, "learning_rate": 0.00011190233949003007, "loss": 2.6598697662353517, "step": 1760 }, { "epoch": 0.22125, "grad_norm": 0.32211023569107056, "learning_rate": 0.00011175559062065348, "loss": 2.617197036743164, "step": 1770 }, { "epoch": 0.2225, "grad_norm": 0.30770230293273926, "learning_rate": 0.00011160803217242911, "loss": 2.6376068115234377, "step": 1780 }, { "epoch": 0.22375, "grad_norm": 0.3243764042854309, "learning_rate": 0.00011145966653908078, "loss": 2.606427764892578, "step": 1790 }, { "epoch": 0.225, "grad_norm": 0.33548685908317566, "learning_rate": 0.00011131049612742655, "loss": 2.6384208679199217, "step": 1800 }, { "epoch": 0.22625, "grad_norm": 0.3262486159801483, "learning_rate": 0.00011116052335733979, "loss": 2.658290672302246, "step": 1810 }, { "epoch": 0.2275, "grad_norm": 0.31495559215545654, "learning_rate": 0.00011100975066170992, "loss": 2.662753105163574, "step": 1820 }, { "epoch": 0.22875, "grad_norm": 0.3250574469566345, "learning_rate": 0.00011085818048640288, "loss": 2.6388259887695313, "step": 1830 }, { "epoch": 0.23, "grad_norm": 0.34293144941329956, "learning_rate": 0.00011070581529022152, "loss": 2.6388187408447266, "step": 1840 }, { "epoch": 0.23125, "grad_norm": 0.31609639525413513, "learning_rate": 0.00011055265754486565, "loss": 2.637576675415039, "step": 1850 }, { "epoch": 0.2325, "grad_norm": 0.3181133270263672, "learning_rate": 0.00011039870973489204, "loss": 2.634903907775879, "step": 1860 }, { "epoch": 0.23375, "grad_norm": 0.3416786193847656, "learning_rate": 0.00011024397435767398, "loss": 2.616485023498535, "step": 1870 }, { "epoch": 0.235, "grad_norm": 0.315266489982605, "learning_rate": 0.00011008845392336087, "loss": 2.6373340606689455, "step": 1880 }, { "epoch": 0.23625, "grad_norm": 0.3316870927810669, "learning_rate": 0.0001099321509548375, "loss": 2.6363605499267577, "step": 1890 }, { "epoch": 0.2375, "grad_norm": 0.3230259418487549, "learning_rate": 0.00010977506798768303, "loss": 2.5958734512329102, "step": 1900 }, { "epoch": 0.23875, "grad_norm": 0.32272425293922424, "learning_rate": 0.00010961720757012995, "loss": 2.608958435058594, "step": 1910 }, { "epoch": 0.24, "grad_norm": 0.3131502568721771, "learning_rate": 0.00010945857226302276, "loss": 2.6321544647216797, "step": 1920 }, { "epoch": 0.24125, "grad_norm": 0.32143065333366394, "learning_rate": 0.00010929916463977628, "loss": 2.613364410400391, "step": 1930 }, { "epoch": 0.2425, "grad_norm": 0.3152971565723419, "learning_rate": 0.00010913898728633408, "loss": 2.613265800476074, "step": 1940 }, { "epoch": 0.24375, "grad_norm": 0.32848265767097473, "learning_rate": 0.00010897804280112643, "loss": 2.6013004302978517, "step": 1950 }, { "epoch": 1.000875, "grad_norm": 0.3237718939781189, "learning_rate": 0.00010881633379502814, "loss": 2.8611122131347657, "step": 1960 }, { "epoch": 1.002125, "grad_norm": 0.3280915915966034, "learning_rate": 0.00010865386289131632, "loss": 2.5412445068359375, "step": 1970 }, { "epoch": 1.003375, "grad_norm": 0.33189550042152405, "learning_rate": 0.00010849063272562764, "loss": 2.559256362915039, "step": 1980 }, { "epoch": 1.004625, "grad_norm": 0.3265272378921509, "learning_rate": 0.00010832664594591574, "loss": 2.5583423614501952, "step": 1990 }, { "epoch": 1.005875, "grad_norm": 0.3453090488910675, "learning_rate": 0.00010816190521240819, "loss": 2.5712684631347655, "step": 2000 }, { "epoch": 1.007125, "grad_norm": 0.3423366844654083, "learning_rate": 0.00010799641319756335, "loss": 2.5412336349487306, "step": 2010 }, { "epoch": 1.008375, "grad_norm": 0.32097378373146057, "learning_rate": 0.00010783017258602704, "loss": 2.5253084182739256, "step": 2020 }, { "epoch": 1.009625, "grad_norm": 0.3252958357334137, "learning_rate": 0.00010766318607458898, "loss": 2.5738031387329103, "step": 2030 }, { "epoch": 1.010875, "grad_norm": 0.3372173011302948, "learning_rate": 0.00010749545637213897, "loss": 2.54388370513916, "step": 2040 }, { "epoch": 1.012125, "grad_norm": 0.33359599113464355, "learning_rate": 0.00010732698619962306, "loss": 2.55248908996582, "step": 2050 }, { "epoch": 1.013375, "grad_norm": 0.34591928124427795, "learning_rate": 0.00010715777828999937, "loss": 2.5376352310180663, "step": 2060 }, { "epoch": 1.014625, "grad_norm": 0.35322073101997375, "learning_rate": 0.00010698783538819372, "loss": 2.534122085571289, "step": 2070 }, { "epoch": 1.015875, "grad_norm": 0.3539016544818878, "learning_rate": 0.00010681716025105512, "loss": 2.492664337158203, "step": 2080 }, { "epoch": 1.017125, "grad_norm": 0.3429170548915863, "learning_rate": 0.00010664575564731107, "loss": 2.5008804321289064, "step": 2090 }, { "epoch": 1.018375, "grad_norm": 0.3576091229915619, "learning_rate": 0.00010647362435752263, "loss": 2.5176633834838866, "step": 2100 }, { "epoch": 1.019625, "grad_norm": 0.3297135829925537, "learning_rate": 0.00010630076917403929, "loss": 2.500911331176758, "step": 2110 }, { "epoch": 1.020875, "grad_norm": 0.33164292573928833, "learning_rate": 0.00010612719290095374, "loss": 2.513214111328125, "step": 2120 }, { "epoch": 1.022125, "grad_norm": 0.32680895924568176, "learning_rate": 0.00010595289835405624, "loss": 2.501193809509277, "step": 2130 }, { "epoch": 1.023375, "grad_norm": 0.33465543389320374, "learning_rate": 0.00010577788836078916, "loss": 2.4999351501464844, "step": 2140 }, { "epoch": 1.024625, "grad_norm": 0.3344171941280365, "learning_rate": 0.00010560216576020092, "loss": 2.4867813110351564, "step": 2150 }, { "epoch": 1.025875, "grad_norm": 0.34607550501823425, "learning_rate": 0.00010542573340289998, "loss": 2.503824234008789, "step": 2160 }, { "epoch": 1.027125, "grad_norm": 0.33892592787742615, "learning_rate": 0.00010524859415100871, "loss": 2.4990135192871095, "step": 2170 }, { "epoch": 1.028375, "grad_norm": 0.3448082208633423, "learning_rate": 0.00010507075087811677, "loss": 2.4324840545654296, "step": 2180 }, { "epoch": 1.029625, "grad_norm": 0.3321894407272339, "learning_rate": 0.00010489220646923464, "loss": 2.4842708587646483, "step": 2190 }, { "epoch": 1.030875, "grad_norm": 0.3443576395511627, "learning_rate": 0.0001047129638207468, "loss": 2.485816764831543, "step": 2200 }, { "epoch": 1.032125, "grad_norm": 0.3381134271621704, "learning_rate": 0.00010453302584036468, "loss": 2.4841537475585938, "step": 2210 }, { "epoch": 1.033375, "grad_norm": 0.3401469588279724, "learning_rate": 0.00010435239544707952, "loss": 2.48382453918457, "step": 2220 }, { "epoch": 1.034625, "grad_norm": 0.35364025831222534, "learning_rate": 0.00010417107557111507, "loss": 2.4872058868408202, "step": 2230 }, { "epoch": 1.035875, "grad_norm": 0.3584776818752289, "learning_rate": 0.00010398906915388, "loss": 2.455089569091797, "step": 2240 }, { "epoch": 1.037125, "grad_norm": 0.3385666608810425, "learning_rate": 0.00010380637914792015, "loss": 2.4457998275756836, "step": 2250 }, { "epoch": 1.038375, "grad_norm": 0.3520835340023041, "learning_rate": 0.00010362300851687071, "loss": 2.479095458984375, "step": 2260 }, { "epoch": 1.039625, "grad_norm": 0.34799060225486755, "learning_rate": 0.00010343896023540814, "loss": 2.4659198760986327, "step": 2270 }, { "epoch": 1.040875, "grad_norm": 0.35186630487442017, "learning_rate": 0.00010325423728920182, "loss": 2.4467798233032227, "step": 2280 }, { "epoch": 1.042125, "grad_norm": 0.3423445224761963, "learning_rate": 0.00010306884267486574, "loss": 2.4702438354492187, "step": 2290 }, { "epoch": 1.043375, "grad_norm": 0.3398495018482208, "learning_rate": 0.00010288277939990981, "loss": 2.471152496337891, "step": 2300 }, { "epoch": 1.044625, "grad_norm": 0.34717217087745667, "learning_rate": 0.00010269605048269109, "loss": 2.4720317840576174, "step": 2310 }, { "epoch": 1.045875, "grad_norm": 0.34331125020980835, "learning_rate": 0.00010250865895236482, "loss": 2.4562469482421876, "step": 2320 }, { "epoch": 1.047125, "grad_norm": 0.35022589564323425, "learning_rate": 0.00010232060784883528, "loss": 2.461803436279297, "step": 2330 }, { "epoch": 1.048375, "grad_norm": 0.3725920617580414, "learning_rate": 0.00010213190022270653, "loss": 2.4350805282592773, "step": 2340 }, { "epoch": 1.049625, "grad_norm": 0.3634240925312042, "learning_rate": 0.00010194253913523282, "loss": 2.454206848144531, "step": 2350 }, { "epoch": 1.050875, "grad_norm": 0.35172227025032043, "learning_rate": 0.000101752527658269, "loss": 2.4318115234375, "step": 2360 }, { "epoch": 1.052125, "grad_norm": 0.35827362537384033, "learning_rate": 0.00010156186887422071, "loss": 2.4692001342773438, "step": 2370 }, { "epoch": 1.053375, "grad_norm": 0.36834755539894104, "learning_rate": 0.00010137056587599428, "loss": 2.4683910369873048, "step": 2380 }, { "epoch": 1.054625, "grad_norm": 0.3573245108127594, "learning_rate": 0.00010117862176694666, "loss": 2.4428688049316407, "step": 2390 }, { "epoch": 1.055875, "grad_norm": 0.33202221989631653, "learning_rate": 0.00010098603966083503, "loss": 2.4585454940795897, "step": 2400 }, { "epoch": 1.057125, "grad_norm": 0.35598650574684143, "learning_rate": 0.00010079282268176628, "loss": 2.4740036010742186, "step": 2410 }, { "epoch": 1.058375, "grad_norm": 0.36041730642318726, "learning_rate": 0.00010059897396414633, "loss": 2.4598981857299806, "step": 2420 }, { "epoch": 1.059625, "grad_norm": 0.3481718599796295, "learning_rate": 0.00010040449665262931, "loss": 2.4539608001708983, "step": 2430 }, { "epoch": 1.060875, "grad_norm": 0.3672044277191162, "learning_rate": 0.00010020939390206654, "loss": 2.433728790283203, "step": 2440 }, { "epoch": 1.062125, "grad_norm": 0.35078802704811096, "learning_rate": 0.00010001366887745531, "loss": 2.454706573486328, "step": 2450 }, { "epoch": 1.063375, "grad_norm": 0.36530086398124695, "learning_rate": 9.981732475388758e-05, "loss": 2.4748252868652343, "step": 2460 }, { "epoch": 1.064625, "grad_norm": 0.3578907251358032, "learning_rate": 9.962036471649851e-05, "loss": 2.480423355102539, "step": 2470 }, { "epoch": 1.065875, "grad_norm": 0.370403528213501, "learning_rate": 9.942279196041466e-05, "loss": 2.480521011352539, "step": 2480 }, { "epoch": 1.067125, "grad_norm": 0.36263224482536316, "learning_rate": 9.922460969070231e-05, "loss": 2.4786655426025392, "step": 2490 }, { "epoch": 1.068375, "grad_norm": 0.35858920216560364, "learning_rate": 9.902582112231533e-05, "loss": 2.461780548095703, "step": 2500 }, { "epoch": 1.069625, "grad_norm": 0.35088691115379333, "learning_rate": 9.882642948004314e-05, "loss": 2.4797664642333985, "step": 2510 }, { "epoch": 1.070875, "grad_norm": 0.36078205704689026, "learning_rate": 9.862643799845839e-05, "loss": 2.4529985427856444, "step": 2520 }, { "epoch": 1.072125, "grad_norm": 0.35207876563072205, "learning_rate": 9.842584992186434e-05, "loss": 2.4753444671630858, "step": 2530 }, { "epoch": 1.073375, "grad_norm": 0.36091098189353943, "learning_rate": 9.822466850424243e-05, "loss": 2.4327056884765623, "step": 2540 }, { "epoch": 1.074625, "grad_norm": 0.3577967584133148, "learning_rate": 9.802289700919933e-05, "loss": 2.461964416503906, "step": 2550 }, { "epoch": 1.075875, "grad_norm": 0.36178645491600037, "learning_rate": 9.782053870991414e-05, "loss": 2.4669708251953124, "step": 2560 }, { "epoch": 1.077125, "grad_norm": 0.3427974581718445, "learning_rate": 9.761759688908519e-05, "loss": 2.4416053771972654, "step": 2570 }, { "epoch": 1.078375, "grad_norm": 0.3656075894832611, "learning_rate": 9.741407483887678e-05, "loss": 2.4402462005615235, "step": 2580 }, { "epoch": 1.079625, "grad_norm": 0.37002095580101013, "learning_rate": 9.720997586086587e-05, "loss": 2.451791191101074, "step": 2590 }, { "epoch": 1.080875, "grad_norm": 0.3515098989009857, "learning_rate": 9.700530326598842e-05, "loss": 2.459187889099121, "step": 2600 }, { "epoch": 1.082125, "grad_norm": 0.3765217959880829, "learning_rate": 9.680006037448575e-05, "loss": 2.4384769439697265, "step": 2610 }, { "epoch": 1.083375, "grad_norm": 0.5566070675849915, "learning_rate": 9.659425051585065e-05, "loss": 2.4481531143188477, "step": 2620 }, { "epoch": 1.084625, "grad_norm": 0.3657555878162384, "learning_rate": 9.638787702877333e-05, "loss": 2.470143508911133, "step": 2630 }, { "epoch": 1.085875, "grad_norm": 0.35306495428085327, "learning_rate": 9.618094326108734e-05, "loss": 2.4623140335083007, "step": 2640 }, { "epoch": 1.087125, "grad_norm": 0.35708507895469666, "learning_rate": 9.597345256971521e-05, "loss": 2.4393037796020507, "step": 2650 }, { "epoch": 1.088375, "grad_norm": 0.36429449915885925, "learning_rate": 9.576540832061398e-05, "loss": 2.4460866928100584, "step": 2660 }, { "epoch": 1.089625, "grad_norm": 0.3617342710494995, "learning_rate": 9.555681388872065e-05, "loss": 2.476423454284668, "step": 2670 }, { "epoch": 1.090875, "grad_norm": 0.3526591360569, "learning_rate": 9.534767265789737e-05, "loss": 2.460892105102539, "step": 2680 }, { "epoch": 1.092125, "grad_norm": 0.3697713613510132, "learning_rate": 9.51379880208766e-05, "loss": 2.46860294342041, "step": 2690 }, { "epoch": 1.093375, "grad_norm": 0.37604451179504395, "learning_rate": 9.492776337920603e-05, "loss": 2.465809631347656, "step": 2700 }, { "epoch": 1.094625, "grad_norm": 0.37269482016563416, "learning_rate": 9.471700214319343e-05, "loss": 2.4291683197021485, "step": 2710 }, { "epoch": 1.095875, "grad_norm": 0.37273484468460083, "learning_rate": 9.45057077318513e-05, "loss": 2.447264862060547, "step": 2720 }, { "epoch": 1.097125, "grad_norm": 0.3633696734905243, "learning_rate": 9.429388357284143e-05, "loss": 2.471749114990234, "step": 2730 }, { "epoch": 1.098375, "grad_norm": 0.35682767629623413, "learning_rate": 9.40815331024193e-05, "loss": 2.42556209564209, "step": 2740 }, { "epoch": 1.099625, "grad_norm": 0.3471936285495758, "learning_rate": 9.386865976537827e-05, "loss": 2.446389007568359, "step": 2750 }, { "epoch": 1.100875, "grad_norm": 0.38089418411254883, "learning_rate": 9.365526701499384e-05, "loss": 2.4501571655273438, "step": 2760 }, { "epoch": 1.102125, "grad_norm": 0.3654205799102783, "learning_rate": 9.344135831296749e-05, "loss": 2.439041519165039, "step": 2770 }, { "epoch": 1.103375, "grad_norm": 0.3512708842754364, "learning_rate": 9.322693712937054e-05, "loss": 2.4336933135986327, "step": 2780 }, { "epoch": 1.104625, "grad_norm": 0.36569294333457947, "learning_rate": 9.301200694258795e-05, "loss": 2.444048309326172, "step": 2790 }, { "epoch": 1.105875, "grad_norm": 0.36901962757110596, "learning_rate": 9.279657123926178e-05, "loss": 2.4316547393798826, "step": 2800 }, { "epoch": 1.107125, "grad_norm": 0.36593225598335266, "learning_rate": 9.25806335142348e-05, "loss": 2.4314062118530275, "step": 2810 }, { "epoch": 1.108375, "grad_norm": 0.371039479970932, "learning_rate": 9.236419727049352e-05, "loss": 2.4478275299072267, "step": 2820 }, { "epoch": 1.109625, "grad_norm": 0.3607841730117798, "learning_rate": 9.214726601911162e-05, "loss": 2.471347999572754, "step": 2830 }, { "epoch": 1.110875, "grad_norm": 0.35733747482299805, "learning_rate": 9.192984327919289e-05, "loss": 2.4454570770263673, "step": 2840 }, { "epoch": 1.112125, "grad_norm": 0.3512793183326721, "learning_rate": 9.171193257781413e-05, "loss": 2.4474578857421876, "step": 2850 }, { "epoch": 1.113375, "grad_norm": 0.3591439127922058, "learning_rate": 9.149353744996798e-05, "loss": 2.3968666076660154, "step": 2860 }, { "epoch": 1.114625, "grad_norm": 0.37512722611427307, "learning_rate": 9.127466143850551e-05, "loss": 2.4625476837158202, "step": 2870 }, { "epoch": 1.115875, "grad_norm": 0.3683817982673645, "learning_rate": 9.105530809407877e-05, "loss": 2.4239782333374023, "step": 2880 }, { "epoch": 1.117125, "grad_norm": 0.350392609834671, "learning_rate": 9.08354809750833e-05, "loss": 2.4604770660400392, "step": 2890 }, { "epoch": 1.118375, "grad_norm": 0.3627133071422577, "learning_rate": 9.061518364760018e-05, "loss": 2.4404422760009767, "step": 2900 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.8847708770441626e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }