{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999946314489719, "eval_steps": 500, "global_step": 27939, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005368551028077522, "grad_norm": 173.3736114501953, "learning_rate": 5.959475566150179e-07, "loss": 8.043, "step": 50 }, { "epoch": 0.010737102056155044, "grad_norm": 34.7974739074707, "learning_rate": 1.1918951132300359e-06, "loss": 6.5425, "step": 100 }, { "epoch": 0.016105653084232566, "grad_norm": 31.309803009033203, "learning_rate": 1.7878426698450538e-06, "loss": 5.5084, "step": 150 }, { "epoch": 0.02147420411231009, "grad_norm": 30.26763153076172, "learning_rate": 2.3837902264600717e-06, "loss": 4.6124, "step": 200 }, { "epoch": 0.02684275514038761, "grad_norm": 24.171663284301758, "learning_rate": 2.9797377830750894e-06, "loss": 4.0755, "step": 250 }, { "epoch": 0.03221130616846513, "grad_norm": 19.654661178588867, "learning_rate": 3.5756853396901076e-06, "loss": 3.4494, "step": 300 }, { "epoch": 0.03757985719654265, "grad_norm": 19.887666702270508, "learning_rate": 4.171632896305126e-06, "loss": 3.401, "step": 350 }, { "epoch": 0.04294840822462018, "grad_norm": 27.6500301361084, "learning_rate": 4.7675804529201435e-06, "loss": 3.0908, "step": 400 }, { "epoch": 0.048316959252697696, "grad_norm": 20.856115341186523, "learning_rate": 5.363528009535162e-06, "loss": 3.3005, "step": 450 }, { "epoch": 0.05368551028077522, "grad_norm": 22.7480525970459, "learning_rate": 5.959475566150179e-06, "loss": 2.8446, "step": 500 }, { "epoch": 0.05905406130885274, "grad_norm": 20.75531578063965, "learning_rate": 6.5554231227651975e-06, "loss": 2.7879, "step": 550 }, { "epoch": 0.06442261233693027, "grad_norm": 25.86386489868164, "learning_rate": 7.151370679380215e-06, "loss": 2.7259, "step": 600 }, { "epoch": 0.06979116336500779, "grad_norm": 18.4228458404541, "learning_rate": 7.747318235995233e-06, "loss": 2.7685, "step": 650 }, { "epoch": 0.0751597143930853, "grad_norm": 25.71515655517578, "learning_rate": 8.343265792610251e-06, "loss": 2.8611, "step": 700 }, { "epoch": 0.08052826542116283, "grad_norm": 14.991035461425781, "learning_rate": 8.939213349225268e-06, "loss": 2.7934, "step": 750 }, { "epoch": 0.08589681644924035, "grad_norm": 11.806473731994629, "learning_rate": 9.535160905840287e-06, "loss": 2.6654, "step": 800 }, { "epoch": 0.09126536747731787, "grad_norm": 17.34215545654297, "learning_rate": 9.999995934757252e-06, "loss": 2.552, "step": 850 }, { "epoch": 0.09663391850539539, "grad_norm": 16.925975799560547, "learning_rate": 9.999874985890243e-06, "loss": 2.425, "step": 900 }, { "epoch": 0.10200246953347292, "grad_norm": 26.792736053466797, "learning_rate": 9.99958605643335e-06, "loss": 2.4877, "step": 950 }, { "epoch": 0.10737102056155044, "grad_norm": 12.235079765319824, "learning_rate": 9.999129156093722e-06, "loss": 2.3378, "step": 1000 }, { "epoch": 0.11273957158962795, "grad_norm": 15.029677391052246, "learning_rate": 9.998504300221821e-06, "loss": 2.5849, "step": 1050 }, { "epoch": 0.11810812261770548, "grad_norm": 17.80719757080078, "learning_rate": 9.997711509810904e-06, "loss": 2.3923, "step": 1100 }, { "epoch": 0.123476673645783, "grad_norm": 13.069009780883789, "learning_rate": 9.996750811496319e-06, "loss": 2.3446, "step": 1150 }, { "epoch": 0.12884522467386053, "grad_norm": 12.205385208129883, "learning_rate": 9.995622237554603e-06, "loss": 2.627, "step": 1200 }, { "epoch": 0.13421377570193804, "grad_norm": 17.56980323791504, "learning_rate": 9.994325825902411e-06, "loss": 2.3043, "step": 1250 }, { "epoch": 0.13958232673001558, "grad_norm": 11.382326126098633, "learning_rate": 9.992861620095228e-06, "loss": 2.2816, "step": 1300 }, { "epoch": 0.1449508777580931, "grad_norm": 10.679113388061523, "learning_rate": 9.991229669325917e-06, "loss": 2.6668, "step": 1350 }, { "epoch": 0.1503194287861706, "grad_norm": 15.005497932434082, "learning_rate": 9.98943002842306e-06, "loss": 2.5911, "step": 1400 }, { "epoch": 0.15568797981424815, "grad_norm": 10.959671020507812, "learning_rate": 9.987462757849114e-06, "loss": 2.4963, "step": 1450 }, { "epoch": 0.16105653084232566, "grad_norm": 13.526322364807129, "learning_rate": 9.985327923698387e-06, "loss": 2.4527, "step": 1500 }, { "epoch": 0.16642508187040317, "grad_norm": 11.613245964050293, "learning_rate": 9.983025597694814e-06, "loss": 2.3502, "step": 1550 }, { "epoch": 0.1717936328984807, "grad_norm": 19.931095123291016, "learning_rate": 9.980555857189542e-06, "loss": 2.2207, "step": 1600 }, { "epoch": 0.17716218392655822, "grad_norm": 13.534950256347656, "learning_rate": 9.977918785158341e-06, "loss": 2.2449, "step": 1650 }, { "epoch": 0.18253073495463573, "grad_norm": 21.363548278808594, "learning_rate": 9.97511447019881e-06, "loss": 2.6087, "step": 1700 }, { "epoch": 0.18789928598271327, "grad_norm": 22.931884765625, "learning_rate": 9.9721430065274e-06, "loss": 2.4396, "step": 1750 }, { "epoch": 0.19326783701079078, "grad_norm": 12.380692481994629, "learning_rate": 9.96900449397625e-06, "loss": 2.4232, "step": 1800 }, { "epoch": 0.19863638803886832, "grad_norm": 13.600720405578613, "learning_rate": 9.965699037989835e-06, "loss": 2.6398, "step": 1850 }, { "epoch": 0.20400493906694583, "grad_norm": 23.817028045654297, "learning_rate": 9.962226749621423e-06, "loss": 1.9173, "step": 1900 }, { "epoch": 0.20937349009502335, "grad_norm": 22.198123931884766, "learning_rate": 9.958587745529338e-06, "loss": 2.02, "step": 1950 }, { "epoch": 0.21474204112310089, "grad_norm": 19.710922241210938, "learning_rate": 9.954782147973048e-06, "loss": 1.9257, "step": 2000 }, { "epoch": 0.2201105921511784, "grad_norm": 11.56153678894043, "learning_rate": 9.950810084809056e-06, "loss": 2.4561, "step": 2050 }, { "epoch": 0.2254791431792559, "grad_norm": 11.210532188415527, "learning_rate": 9.946671689486598e-06, "loss": 2.2825, "step": 2100 }, { "epoch": 0.23084769420733345, "grad_norm": 22.60122299194336, "learning_rate": 9.94236710104317e-06, "loss": 2.3433, "step": 2150 }, { "epoch": 0.23621624523541096, "grad_norm": 21.053020477294922, "learning_rate": 9.937896464099849e-06, "loss": 2.3371, "step": 2200 }, { "epoch": 0.24158479626348847, "grad_norm": 11.02442455291748, "learning_rate": 9.933259928856438e-06, "loss": 1.9893, "step": 2250 }, { "epoch": 0.246953347291566, "grad_norm": 23.212339401245117, "learning_rate": 9.928457651086414e-06, "loss": 2.2982, "step": 2300 }, { "epoch": 0.2523218983196435, "grad_norm": 12.233504295349121, "learning_rate": 9.923489792131701e-06, "loss": 2.3542, "step": 2350 }, { "epoch": 0.25769044934772106, "grad_norm": 22.132793426513672, "learning_rate": 9.918356518897252e-06, "loss": 2.217, "step": 2400 }, { "epoch": 0.26305900037579855, "grad_norm": 9.651283264160156, "learning_rate": 9.91305800384543e-06, "loss": 2.2106, "step": 2450 }, { "epoch": 0.2684275514038761, "grad_norm": 11.630431175231934, "learning_rate": 9.907594424990221e-06, "loss": 2.4414, "step": 2500 }, { "epoch": 0.2737961024319536, "grad_norm": 11.314878463745117, "learning_rate": 9.901965965891255e-06, "loss": 2.1554, "step": 2550 }, { "epoch": 0.27916465346003116, "grad_norm": 8.078882217407227, "learning_rate": 9.896172815647638e-06, "loss": 2.17, "step": 2600 }, { "epoch": 0.28453320448810865, "grad_norm": 11.758085250854492, "learning_rate": 9.890215168891593e-06, "loss": 2.5758, "step": 2650 }, { "epoch": 0.2899017555161862, "grad_norm": 7.850560188293457, "learning_rate": 9.88409322578193e-06, "loss": 2.2779, "step": 2700 }, { "epoch": 0.2952703065442637, "grad_norm": 9.131479263305664, "learning_rate": 9.877807191997314e-06, "loss": 2.1076, "step": 2750 }, { "epoch": 0.3006388575723412, "grad_norm": 10.000419616699219, "learning_rate": 9.871357278729355e-06, "loss": 2.2511, "step": 2800 }, { "epoch": 0.30600740860041875, "grad_norm": 8.93471622467041, "learning_rate": 9.86474370267552e-06, "loss": 2.764, "step": 2850 }, { "epoch": 0.3113759596284963, "grad_norm": 12.075305938720703, "learning_rate": 9.857966686031848e-06, "loss": 2.3072, "step": 2900 }, { "epoch": 0.3167445106565738, "grad_norm": 20.350278854370117, "learning_rate": 9.85102645648548e-06, "loss": 1.9735, "step": 2950 }, { "epoch": 0.3221130616846513, "grad_norm": 19.958290100097656, "learning_rate": 9.843923247207018e-06, "loss": 2.3562, "step": 3000 }, { "epoch": 0.32748161271272885, "grad_norm": 19.133190155029297, "learning_rate": 9.836657296842684e-06, "loss": 2.2974, "step": 3050 }, { "epoch": 0.33285016374080634, "grad_norm": 8.62128734588623, "learning_rate": 9.82922884950631e-06, "loss": 2.0612, "step": 3100 }, { "epoch": 0.3382187147688839, "grad_norm": 17.61878204345703, "learning_rate": 9.821638154771125e-06, "loss": 2.164, "step": 3150 }, { "epoch": 0.3435872657969614, "grad_norm": 12.352161407470703, "learning_rate": 9.813885467661386e-06, "loss": 2.3342, "step": 3200 }, { "epoch": 0.3489558168250389, "grad_norm": 17.905899047851562, "learning_rate": 9.805971048643792e-06, "loss": 2.5445, "step": 3250 }, { "epoch": 0.35432436785311644, "grad_norm": 12.0066499710083, "learning_rate": 9.79789516361875e-06, "loss": 1.9144, "step": 3300 }, { "epoch": 0.359692918881194, "grad_norm": 8.261287689208984, "learning_rate": 9.789658083911428e-06, "loss": 2.2585, "step": 3350 }, { "epoch": 0.36506146990927146, "grad_norm": 9.940215110778809, "learning_rate": 9.781260086262645e-06, "loss": 2.3342, "step": 3400 }, { "epoch": 0.370430020937349, "grad_norm": 14.417253494262695, "learning_rate": 9.77270145281958e-06, "loss": 2.121, "step": 3450 }, { "epoch": 0.37579857196542654, "grad_norm": 8.75595474243164, "learning_rate": 9.763982471126277e-06, "loss": 2.5303, "step": 3500 }, { "epoch": 0.381167122993504, "grad_norm": 10.554227828979492, "learning_rate": 9.755103434113998e-06, "loss": 2.3582, "step": 3550 }, { "epoch": 0.38653567402158157, "grad_norm": 7.218326091766357, "learning_rate": 9.74606464009138e-06, "loss": 2.1692, "step": 3600 }, { "epoch": 0.3919042250496591, "grad_norm": 9.922089576721191, "learning_rate": 9.736866392734402e-06, "loss": 2.3558, "step": 3650 }, { "epoch": 0.39727277607773664, "grad_norm": 7.799227714538574, "learning_rate": 9.727509001076197e-06, "loss": 2.3932, "step": 3700 }, { "epoch": 0.40264132710581413, "grad_norm": 20.772369384765625, "learning_rate": 9.71799277949666e-06, "loss": 1.7941, "step": 3750 }, { "epoch": 0.40800987813389167, "grad_norm": 7.913593292236328, "learning_rate": 9.708318047711883e-06, "loss": 2.0766, "step": 3800 }, { "epoch": 0.4133784291619692, "grad_norm": 11.947430610656738, "learning_rate": 9.698485130763428e-06, "loss": 1.9943, "step": 3850 }, { "epoch": 0.4187469801900467, "grad_norm": 6.923953533172607, "learning_rate": 9.688494359007392e-06, "loss": 2.3402, "step": 3900 }, { "epoch": 0.42411553121812423, "grad_norm": 10.6170072555542, "learning_rate": 9.678346068103312e-06, "loss": 2.2038, "step": 3950 }, { "epoch": 0.42948408224620177, "grad_norm": 8.604026794433594, "learning_rate": 9.668040599002893e-06, "loss": 2.2163, "step": 4000 }, { "epoch": 0.43485263327427925, "grad_norm": 6.9104323387146, "learning_rate": 9.657578297938547e-06, "loss": 2.4159, "step": 4050 }, { "epoch": 0.4402211843023568, "grad_norm": 7.583080768585205, "learning_rate": 9.646959516411765e-06, "loss": 2.4815, "step": 4100 }, { "epoch": 0.44558973533043433, "grad_norm": 10.75753402709961, "learning_rate": 9.636184611181301e-06, "loss": 2.5791, "step": 4150 }, { "epoch": 0.4509582863585118, "grad_norm": 8.691010475158691, "learning_rate": 9.625253944251193e-06, "loss": 2.1982, "step": 4200 }, { "epoch": 0.45632683738658936, "grad_norm": 7.544954299926758, "learning_rate": 9.614167882858602e-06, "loss": 2.4019, "step": 4250 }, { "epoch": 0.4616953884146669, "grad_norm": 8.283909797668457, "learning_rate": 9.602926799461466e-06, "loss": 2.2008, "step": 4300 }, { "epoch": 0.4670639394427444, "grad_norm": 8.905241966247559, "learning_rate": 9.591531071725992e-06, "loss": 2.4518, "step": 4350 }, { "epoch": 0.4724324904708219, "grad_norm": 6.864408016204834, "learning_rate": 9.579981082513963e-06, "loss": 2.4392, "step": 4400 }, { "epoch": 0.47780104149889946, "grad_norm": 9.704352378845215, "learning_rate": 9.568277219869887e-06, "loss": 2.5323, "step": 4450 }, { "epoch": 0.48316959252697694, "grad_norm": 10.778401374816895, "learning_rate": 9.556419877007938e-06, "loss": 1.8399, "step": 4500 }, { "epoch": 0.4885381435550545, "grad_norm": 7.707949161529541, "learning_rate": 9.544409452298773e-06, "loss": 2.2951, "step": 4550 }, { "epoch": 0.493906694583132, "grad_norm": 10.046951293945312, "learning_rate": 9.532246349256122e-06, "loss": 2.2967, "step": 4600 }, { "epoch": 0.49927524561120956, "grad_norm": 6.738362789154053, "learning_rate": 9.51993097652325e-06, "loss": 2.5509, "step": 4650 }, { "epoch": 0.504643796639287, "grad_norm": 8.04515552520752, "learning_rate": 9.507463747859217e-06, "loss": 1.8399, "step": 4700 }, { "epoch": 0.5100123476673646, "grad_norm": 7.664340019226074, "learning_rate": 9.49484508212498e-06, "loss": 1.8151, "step": 4750 }, { "epoch": 0.5153808986954421, "grad_norm": 7.901899814605713, "learning_rate": 9.48207540326932e-06, "loss": 2.1866, "step": 4800 }, { "epoch": 0.5207494497235197, "grad_norm": 13.268074989318848, "learning_rate": 9.469155140314604e-06, "loss": 1.752, "step": 4850 }, { "epoch": 0.5261180007515971, "grad_norm": 7.320032596588135, "learning_rate": 9.45608472734236e-06, "loss": 2.0824, "step": 4900 }, { "epoch": 0.5314865517796746, "grad_norm": 11.113451957702637, "learning_rate": 9.442864603478709e-06, "loss": 2.2699, "step": 4950 }, { "epoch": 0.5368551028077522, "grad_norm": 9.071028709411621, "learning_rate": 9.429495212879593e-06, "loss": 2.4293, "step": 5000 }, { "epoch": 0.5422236538358297, "grad_norm": 7.577270984649658, "learning_rate": 9.415977004715868e-06, "loss": 2.095, "step": 5050 }, { "epoch": 0.5475922048639073, "grad_norm": 10.385214805603027, "learning_rate": 9.402310433158206e-06, "loss": 2.2864, "step": 5100 }, { "epoch": 0.5529607558919848, "grad_norm": 8.965862274169922, "learning_rate": 9.388495957361836e-06, "loss": 2.8198, "step": 5150 }, { "epoch": 0.5583293069200623, "grad_norm": 10.5585355758667, "learning_rate": 9.374534041451124e-06, "loss": 1.8603, "step": 5200 }, { "epoch": 0.5636978579481398, "grad_norm": 9.451448440551758, "learning_rate": 9.360425154503969e-06, "loss": 2.0864, "step": 5250 }, { "epoch": 0.5690664089762173, "grad_norm": 13.569538116455078, "learning_rate": 9.346169770536056e-06, "loss": 1.8889, "step": 5300 }, { "epoch": 0.5744349600042948, "grad_norm": 16.18905258178711, "learning_rate": 9.331768368484918e-06, "loss": 2.2455, "step": 5350 }, { "epoch": 0.5798035110323724, "grad_norm": 8.802836418151855, "learning_rate": 9.317221432193859e-06, "loss": 2.1094, "step": 5400 }, { "epoch": 0.5851720620604499, "grad_norm": 18.47078514099121, "learning_rate": 9.302529450395682e-06, "loss": 2.9082, "step": 5450 }, { "epoch": 0.5905406130885275, "grad_norm": 8.37303352355957, "learning_rate": 9.287692916696287e-06, "loss": 2.1908, "step": 5500 }, { "epoch": 0.5959091641166049, "grad_norm": 6.674489498138428, "learning_rate": 9.27271232955807e-06, "loss": 1.985, "step": 5550 }, { "epoch": 0.6012777151446824, "grad_norm": 6.357884883880615, "learning_rate": 9.257588192283189e-06, "loss": 2.4121, "step": 5600 }, { "epoch": 0.60664626617276, "grad_norm": 20.018545150756836, "learning_rate": 9.242321012996649e-06, "loss": 2.3254, "step": 5650 }, { "epoch": 0.6120148172008375, "grad_norm": 6.632571220397949, "learning_rate": 9.226911304629231e-06, "loss": 2.0863, "step": 5700 }, { "epoch": 0.617383368228915, "grad_norm": 6.925139427185059, "learning_rate": 9.211359584900261e-06, "loss": 2.2034, "step": 5750 }, { "epoch": 0.6227519192569926, "grad_norm": 17.126575469970703, "learning_rate": 9.195666376300212e-06, "loss": 2.4107, "step": 5800 }, { "epoch": 0.62812047028507, "grad_norm": 8.107414245605469, "learning_rate": 9.179832206073152e-06, "loss": 2.436, "step": 5850 }, { "epoch": 0.6334890213131475, "grad_norm": 7.757444381713867, "learning_rate": 9.163857606199039e-06, "loss": 2.3477, "step": 5900 }, { "epoch": 0.6388575723412251, "grad_norm": 18.574289321899414, "learning_rate": 9.147743113375827e-06, "loss": 2.0063, "step": 5950 }, { "epoch": 0.6442261233693026, "grad_norm": 8.169580459594727, "learning_rate": 9.13148926900146e-06, "loss": 2.4438, "step": 6000 }, { "epoch": 0.6495946743973802, "grad_norm": 6.801086902618408, "learning_rate": 9.115096619155663e-06, "loss": 2.0501, "step": 6050 }, { "epoch": 0.6549632254254577, "grad_norm": 10.179924964904785, "learning_rate": 9.098565714581601e-06, "loss": 2.3302, "step": 6100 }, { "epoch": 0.6603317764535352, "grad_norm": 9.375894546508789, "learning_rate": 9.081897110667388e-06, "loss": 2.4207, "step": 6150 }, { "epoch": 0.6657003274816127, "grad_norm": 18.672060012817383, "learning_rate": 9.065091367427401e-06, "loss": 2.4269, "step": 6200 }, { "epoch": 0.6710688785096902, "grad_norm": 9.124485969543457, "learning_rate": 9.048149049483497e-06, "loss": 2.6344, "step": 6250 }, { "epoch": 0.6764374295377678, "grad_norm": 17.736295700073242, "learning_rate": 9.031070726046014e-06, "loss": 2.0867, "step": 6300 }, { "epoch": 0.6818059805658453, "grad_norm": 10.588594436645508, "learning_rate": 9.013856970894672e-06, "loss": 2.3375, "step": 6350 }, { "epoch": 0.6871745315939228, "grad_norm": 19.190420150756836, "learning_rate": 8.996508362359278e-06, "loss": 2.3877, "step": 6400 }, { "epoch": 0.6925430826220004, "grad_norm": 17.127389907836914, "learning_rate": 8.979025483300305e-06, "loss": 2.4791, "step": 6450 }, { "epoch": 0.6979116336500778, "grad_norm": 17.82516860961914, "learning_rate": 8.961408921089304e-06, "loss": 2.13, "step": 6500 }, { "epoch": 0.7032801846781553, "grad_norm": 7.146808624267578, "learning_rate": 8.943659267589177e-06, "loss": 2.2693, "step": 6550 }, { "epoch": 0.7086487357062329, "grad_norm": 16.019311904907227, "learning_rate": 8.925777119134288e-06, "loss": 2.0913, "step": 6600 }, { "epoch": 0.7140172867343104, "grad_norm": 18.300996780395508, "learning_rate": 8.90776307651043e-06, "loss": 2.5383, "step": 6650 }, { "epoch": 0.719385837762388, "grad_norm": 17.373537063598633, "learning_rate": 8.889617744934632e-06, "loss": 2.2624, "step": 6700 }, { "epoch": 0.7247543887904655, "grad_norm": 8.753660202026367, "learning_rate": 8.871341734034835e-06, "loss": 2.5488, "step": 6750 }, { "epoch": 0.7301229398185429, "grad_norm": 10.378632545471191, "learning_rate": 8.852935657829414e-06, "loss": 1.8973, "step": 6800 }, { "epoch": 0.7354914908466205, "grad_norm": 6.974676132202148, "learning_rate": 8.834400134706538e-06, "loss": 1.7114, "step": 6850 }, { "epoch": 0.740860041874698, "grad_norm": 17.7410888671875, "learning_rate": 8.815735787403397e-06, "loss": 2.063, "step": 6900 }, { "epoch": 0.7462285929027755, "grad_norm": 18.06396484375, "learning_rate": 8.796943242985283e-06, "loss": 2.3759, "step": 6950 }, { "epoch": 0.7515971439308531, "grad_norm": 7.956383228302002, "learning_rate": 8.778023132824523e-06, "loss": 1.8869, "step": 7000 }, { "epoch": 0.7569656949589306, "grad_norm": 7.235781669616699, "learning_rate": 8.758976092579263e-06, "loss": 2.0486, "step": 7050 }, { "epoch": 0.762334245987008, "grad_norm": 8.756217956542969, "learning_rate": 8.739802762172112e-06, "loss": 2.3013, "step": 7100 }, { "epoch": 0.7677027970150856, "grad_norm": 10.677332878112793, "learning_rate": 8.72050378576865e-06, "loss": 2.2763, "step": 7150 }, { "epoch": 0.7730713480431631, "grad_norm": 7.494720935821533, "learning_rate": 8.701079811755775e-06, "loss": 2.4137, "step": 7200 }, { "epoch": 0.7784398990712407, "grad_norm": 8.669584274291992, "learning_rate": 8.681531492719924e-06, "loss": 2.0786, "step": 7250 }, { "epoch": 0.7838084500993182, "grad_norm": 6.614152431488037, "learning_rate": 8.661859485425153e-06, "loss": 2.0609, "step": 7300 }, { "epoch": 0.7891770011273957, "grad_norm": 6.329990863800049, "learning_rate": 8.642064450791063e-06, "loss": 2.1517, "step": 7350 }, { "epoch": 0.7945455521554733, "grad_norm": 8.025002479553223, "learning_rate": 8.622147053870603e-06, "loss": 1.9102, "step": 7400 }, { "epoch": 0.7999141031835507, "grad_norm": 6.091482639312744, "learning_rate": 8.60210796382772e-06, "loss": 1.824, "step": 7450 }, { "epoch": 0.8052826542116283, "grad_norm": 9.765973091125488, "learning_rate": 8.58194785391488e-06, "loss": 2.4761, "step": 7500 }, { "epoch": 0.8106512052397058, "grad_norm": 8.054783821105957, "learning_rate": 8.561667401450448e-06, "loss": 2.152, "step": 7550 }, { "epoch": 0.8160197562677833, "grad_norm": 8.503512382507324, "learning_rate": 8.541267287795936e-06, "loss": 2.3627, "step": 7600 }, { "epoch": 0.8213883072958609, "grad_norm": 6.793158054351807, "learning_rate": 8.520748198333104e-06, "loss": 2.0025, "step": 7650 }, { "epoch": 0.8267568583239384, "grad_norm": 8.554807662963867, "learning_rate": 8.50011082244094e-06, "loss": 2.8647, "step": 7700 }, { "epoch": 0.8321254093520158, "grad_norm": 8.009889602661133, "learning_rate": 8.479355853472492e-06, "loss": 2.1245, "step": 7750 }, { "epoch": 0.8374939603800934, "grad_norm": 5.670645713806152, "learning_rate": 8.458483988731585e-06, "loss": 2.0752, "step": 7800 }, { "epoch": 0.8428625114081709, "grad_norm": 7.713712692260742, "learning_rate": 8.43749592944938e-06, "loss": 2.413, "step": 7850 }, { "epoch": 0.8482310624362485, "grad_norm": 8.770386695861816, "learning_rate": 8.41639238076082e-06, "loss": 1.9887, "step": 7900 }, { "epoch": 0.853599613464326, "grad_norm": 7.535435199737549, "learning_rate": 8.39517405168095e-06, "loss": 1.9605, "step": 7950 }, { "epoch": 0.8589681644924035, "grad_norm": 8.992109298706055, "learning_rate": 8.373841655081077e-06, "loss": 1.9232, "step": 8000 }, { "epoch": 0.864336715520481, "grad_norm": 5.412756443023682, "learning_rate": 8.352395907664832e-06, "loss": 2.3468, "step": 8050 }, { "epoch": 0.8697052665485585, "grad_norm": 6.559614181518555, "learning_rate": 8.330837529944093e-06, "loss": 2.0389, "step": 8100 }, { "epoch": 0.875073817576636, "grad_norm": 8.850929260253906, "learning_rate": 8.309167246214771e-06, "loss": 2.3683, "step": 8150 }, { "epoch": 0.8804423686047136, "grad_norm": 17.323158264160156, "learning_rate": 8.287385784532475e-06, "loss": 2.2053, "step": 8200 }, { "epoch": 0.8858109196327911, "grad_norm": 6.870123863220215, "learning_rate": 8.265493876688062e-06, "loss": 2.4002, "step": 8250 }, { "epoch": 0.8911794706608687, "grad_norm": 8.322813034057617, "learning_rate": 8.243492258183038e-06, "loss": 2.2789, "step": 8300 }, { "epoch": 0.8965480216889462, "grad_norm": 6.7904839515686035, "learning_rate": 8.221381668204858e-06, "loss": 2.5743, "step": 8350 }, { "epoch": 0.9019165727170236, "grad_norm": 8.629620552062988, "learning_rate": 8.199162849602083e-06, "loss": 2.1342, "step": 8400 }, { "epoch": 0.9072851237451012, "grad_norm": 6.57612943649292, "learning_rate": 8.176836548859426e-06, "loss": 2.3242, "step": 8450 }, { "epoch": 0.9126536747731787, "grad_norm": 18.26816177368164, "learning_rate": 8.15440351607268e-06, "loss": 2.2392, "step": 8500 }, { "epoch": 0.9180222258012563, "grad_norm": 7.219480037689209, "learning_rate": 8.131864504923501e-06, "loss": 1.9592, "step": 8550 }, { "epoch": 0.9233907768293338, "grad_norm": 17.576231002807617, "learning_rate": 8.109220272654103e-06, "loss": 2.2499, "step": 8600 }, { "epoch": 0.9287593278574113, "grad_norm": 8.521632194519043, "learning_rate": 8.086471580041806e-06, "loss": 2.2565, "step": 8650 }, { "epoch": 0.9341278788854888, "grad_norm": 7.680962562561035, "learning_rate": 8.063619191373478e-06, "loss": 2.114, "step": 8700 }, { "epoch": 0.9394964299135663, "grad_norm": 6.641688346862793, "learning_rate": 8.040663874419863e-06, "loss": 2.3469, "step": 8750 }, { "epoch": 0.9448649809416438, "grad_norm": 7.556726932525635, "learning_rate": 8.017606400409781e-06, "loss": 2.233, "step": 8800 }, { "epoch": 0.9502335319697214, "grad_norm": 7.783173561096191, "learning_rate": 7.994447544004215e-06, "loss": 2.0393, "step": 8850 }, { "epoch": 0.9556020829977989, "grad_norm": 17.22361946105957, "learning_rate": 7.971188083270294e-06, "loss": 2.0588, "step": 8900 }, { "epoch": 0.9609706340258765, "grad_norm": 8.3529052734375, "learning_rate": 7.947828799655142e-06, "loss": 1.9373, "step": 8950 }, { "epoch": 0.9663391850539539, "grad_norm": 15.621068954467773, "learning_rate": 7.92437047795963e-06, "loss": 1.8505, "step": 9000 }, { "epoch": 0.9717077360820314, "grad_norm": 7.267556190490723, "learning_rate": 7.900813906312004e-06, "loss": 1.8633, "step": 9050 }, { "epoch": 0.977076287110109, "grad_norm": 6.835626602172852, "learning_rate": 7.877159876141415e-06, "loss": 2.0578, "step": 9100 }, { "epoch": 0.9824448381381865, "grad_norm": 6.728379726409912, "learning_rate": 7.853409182151321e-06, "loss": 2.4776, "step": 9150 }, { "epoch": 0.987813389166264, "grad_norm": 7.951884746551514, "learning_rate": 7.829562622292788e-06, "loss": 2.405, "step": 9200 }, { "epoch": 0.9931819401943416, "grad_norm": 7.328428268432617, "learning_rate": 7.805620997737691e-06, "loss": 2.5417, "step": 9250 }, { "epoch": 0.9985504912224191, "grad_norm": 6.334090709686279, "learning_rate": 7.781585112851778e-06, "loss": 1.9777, "step": 9300 }, { "epoch": 1.0039727277607773, "grad_norm": 15.359502792358398, "learning_rate": 7.757455775167669e-06, "loss": 1.7902, "step": 9350 }, { "epoch": 1.009341278788855, "grad_norm": 7.517678737640381, "learning_rate": 7.733233795357706e-06, "loss": 1.7571, "step": 9400 }, { "epoch": 1.0147098298169324, "grad_norm": 9.919081687927246, "learning_rate": 7.708919987206727e-06, "loss": 1.6278, "step": 9450 }, { "epoch": 1.02007838084501, "grad_norm": 16.150758743286133, "learning_rate": 7.684515167584725e-06, "loss": 1.9163, "step": 9500 }, { "epoch": 1.0254469318730874, "grad_norm": 5.594913482666016, "learning_rate": 7.660020156419398e-06, "loss": 2.0141, "step": 9550 }, { "epoch": 1.0308154829011649, "grad_norm": 6.763994216918945, "learning_rate": 7.63543577666861e-06, "loss": 1.7438, "step": 9600 }, { "epoch": 1.0361840339292425, "grad_norm": 7.240082740783691, "learning_rate": 7.6107628542927305e-06, "loss": 1.859, "step": 9650 }, { "epoch": 1.04155258495732, "grad_norm": 8.067387580871582, "learning_rate": 7.5860022182269e-06, "loss": 1.7783, "step": 9700 }, { "epoch": 1.0469211359853976, "grad_norm": 6.464083194732666, "learning_rate": 7.561154700353166e-06, "loss": 1.4824, "step": 9750 }, { "epoch": 1.052289687013475, "grad_norm": 6.964838027954102, "learning_rate": 7.536221135472545e-06, "loss": 1.533, "step": 9800 }, { "epoch": 1.0576582380415527, "grad_norm": 6.4511823654174805, "learning_rate": 7.511202361276966e-06, "loss": 1.5956, "step": 9850 }, { "epoch": 1.06302678906963, "grad_norm": 17.920740127563477, "learning_rate": 7.486099218321138e-06, "loss": 1.7306, "step": 9900 }, { "epoch": 1.0683953400977075, "grad_norm": 8.541760444641113, "learning_rate": 7.4609125499942995e-06, "loss": 1.7691, "step": 9950 }, { "epoch": 1.0737638911257852, "grad_norm": 6.779469966888428, "learning_rate": 7.435643202491884e-06, "loss": 1.7331, "step": 10000 }, { "epoch": 1.0791324421538626, "grad_norm": 8.191193580627441, "learning_rate": 7.410292024787106e-06, "loss": 1.7757, "step": 10050 }, { "epoch": 1.0845009931819403, "grad_norm": 7.109296798706055, "learning_rate": 7.384859868602411e-06, "loss": 1.5935, "step": 10100 }, { "epoch": 1.0898695442100177, "grad_norm": 6.142228126525879, "learning_rate": 7.359347588380886e-06, "loss": 1.5366, "step": 10150 }, { "epoch": 1.0952380952380953, "grad_norm": 6.949032306671143, "learning_rate": 7.333756041257537e-06, "loss": 1.5345, "step": 10200 }, { "epoch": 1.1006066462661728, "grad_norm": 8.204275131225586, "learning_rate": 7.308086087030498e-06, "loss": 1.6411, "step": 10250 }, { "epoch": 1.1059751972942502, "grad_norm": 9.171077728271484, "learning_rate": 7.282338588132143e-06, "loss": 1.5583, "step": 10300 }, { "epoch": 1.1113437483223279, "grad_norm": 7.4853105545043945, "learning_rate": 7.256514409600108e-06, "loss": 1.6944, "step": 10350 }, { "epoch": 1.1167122993504053, "grad_norm": 5.683228492736816, "learning_rate": 7.23061441904824e-06, "loss": 1.7684, "step": 10400 }, { "epoch": 1.122080850378483, "grad_norm": 7.478188514709473, "learning_rate": 7.2046394866374295e-06, "loss": 1.8792, "step": 10450 }, { "epoch": 1.1274494014065604, "grad_norm": 6.323553085327148, "learning_rate": 7.17859048504639e-06, "loss": 1.6032, "step": 10500 }, { "epoch": 1.132817952434638, "grad_norm": 17.744308471679688, "learning_rate": 7.152468289442334e-06, "loss": 1.3883, "step": 10550 }, { "epoch": 1.1381865034627154, "grad_norm": 18.162912368774414, "learning_rate": 7.126273777451572e-06, "loss": 2.0817, "step": 10600 }, { "epoch": 1.1435550544907929, "grad_norm": 7.005634307861328, "learning_rate": 7.100007829130021e-06, "loss": 2.039, "step": 10650 }, { "epoch": 1.1489236055188705, "grad_norm": 7.600114345550537, "learning_rate": 7.073671326933645e-06, "loss": 1.7712, "step": 10700 }, { "epoch": 1.154292156546948, "grad_norm": 5.979006290435791, "learning_rate": 7.047265155688798e-06, "loss": 1.6261, "step": 10750 }, { "epoch": 1.1596607075750256, "grad_norm": 7.593403339385986, "learning_rate": 7.020790202562513e-06, "loss": 1.6303, "step": 10800 }, { "epoch": 1.165029258603103, "grad_norm": 6.739507675170898, "learning_rate": 6.994247357032672e-06, "loss": 1.7067, "step": 10850 }, { "epoch": 1.1703978096311805, "grad_norm": 7.7881598472595215, "learning_rate": 6.967637510858145e-06, "loss": 1.7556, "step": 10900 }, { "epoch": 1.175766360659258, "grad_norm": 8.353170394897461, "learning_rate": 6.940961558048814e-06, "loss": 1.7988, "step": 10950 }, { "epoch": 1.1811349116873355, "grad_norm": 17.190288543701172, "learning_rate": 6.914220394835547e-06, "loss": 1.8766, "step": 11000 }, { "epoch": 1.1865034627154132, "grad_norm": 7.401528835296631, "learning_rate": 6.88741491964008e-06, "loss": 1.5798, "step": 11050 }, { "epoch": 1.1918720137434906, "grad_norm": 9.883387565612793, "learning_rate": 6.860546033044836e-06, "loss": 1.6789, "step": 11100 }, { "epoch": 1.197240564771568, "grad_norm": 6.592789173126221, "learning_rate": 6.833614637762671e-06, "loss": 1.6847, "step": 11150 }, { "epoch": 1.2026091157996457, "grad_norm": 8.399685859680176, "learning_rate": 6.806621638606541e-06, "loss": 1.9617, "step": 11200 }, { "epoch": 1.2079776668277231, "grad_norm": 9.318310737609863, "learning_rate": 6.779567942459106e-06, "loss": 1.6214, "step": 11250 }, { "epoch": 1.2133462178558008, "grad_norm": 9.2510404586792, "learning_rate": 6.7524544582422556e-06, "loss": 1.7572, "step": 11300 }, { "epoch": 1.2187147688838782, "grad_norm": 7.251592636108398, "learning_rate": 6.725282096886584e-06, "loss": 1.6631, "step": 11350 }, { "epoch": 1.2240833199119558, "grad_norm": 7.429468631744385, "learning_rate": 6.698051771300772e-06, "loss": 1.8303, "step": 11400 }, { "epoch": 1.2294518709400333, "grad_norm": 8.812094688415527, "learning_rate": 6.670764396340924e-06, "loss": 1.7698, "step": 11450 }, { "epoch": 1.2348204219681107, "grad_norm": 6.430805206298828, "learning_rate": 6.643420888779832e-06, "loss": 1.6816, "step": 11500 }, { "epoch": 1.2401889729961884, "grad_norm": 5.964927673339844, "learning_rate": 6.61602216727617e-06, "loss": 1.9627, "step": 11550 }, { "epoch": 1.2455575240242658, "grad_norm": 8.360040664672852, "learning_rate": 6.588569152343636e-06, "loss": 1.6678, "step": 11600 }, { "epoch": 1.2509260750523434, "grad_norm": 8.492232322692871, "learning_rate": 6.561062766320015e-06, "loss": 1.5811, "step": 11650 }, { "epoch": 1.2562946260804209, "grad_norm": 6.507018566131592, "learning_rate": 6.533503933336207e-06, "loss": 1.8282, "step": 11700 }, { "epoch": 1.2616631771084985, "grad_norm": 6.434554100036621, "learning_rate": 6.505893579285164e-06, "loss": 1.6284, "step": 11750 }, { "epoch": 1.267031728136576, "grad_norm": 5.991467475891113, "learning_rate": 6.478232631790792e-06, "loss": 1.6377, "step": 11800 }, { "epoch": 1.2724002791646534, "grad_norm": 17.064252853393555, "learning_rate": 6.45052202017678e-06, "loss": 1.5149, "step": 11850 }, { "epoch": 1.277768830192731, "grad_norm": 18.899658203125, "learning_rate": 6.422762675435387e-06, "loss": 1.9017, "step": 11900 }, { "epoch": 1.2831373812208084, "grad_norm": 6.97517728805542, "learning_rate": 6.3949555301961474e-06, "loss": 1.5649, "step": 11950 }, { "epoch": 1.288505932248886, "grad_norm": 18.426116943359375, "learning_rate": 6.367101518694554e-06, "loss": 1.8782, "step": 12000 }, { "epoch": 1.2938744832769635, "grad_norm": 18.379648208618164, "learning_rate": 6.3392015767406626e-06, "loss": 1.8358, "step": 12050 }, { "epoch": 1.2992430343050412, "grad_norm": 10.581155776977539, "learning_rate": 6.311256641687648e-06, "loss": 1.8926, "step": 12100 }, { "epoch": 1.3046115853331186, "grad_norm": 6.98642635345459, "learning_rate": 6.283267652400323e-06, "loss": 1.6466, "step": 12150 }, { "epoch": 1.309980136361196, "grad_norm": 6.588789939880371, "learning_rate": 6.25523554922358e-06, "loss": 1.825, "step": 12200 }, { "epoch": 1.3153486873892737, "grad_norm": 19.694665908813477, "learning_rate": 6.227161273950818e-06, "loss": 1.7454, "step": 12250 }, { "epoch": 1.3207172384173511, "grad_norm": 8.332229614257812, "learning_rate": 6.199045769792279e-06, "loss": 1.7778, "step": 12300 }, { "epoch": 1.3260857894454285, "grad_norm": 7.123226642608643, "learning_rate": 6.170889981343378e-06, "loss": 1.8883, "step": 12350 }, { "epoch": 1.3314543404735062, "grad_norm": 11.233098030090332, "learning_rate": 6.142694854552957e-06, "loss": 1.7369, "step": 12400 }, { "epoch": 1.3368228915015838, "grad_norm": 6.440243244171143, "learning_rate": 6.114461336691505e-06, "loss": 1.5687, "step": 12450 }, { "epoch": 1.3421914425296613, "grad_norm": 19.100027084350586, "learning_rate": 6.0861903763193374e-06, "loss": 1.8765, "step": 12500 }, { "epoch": 1.3475599935577387, "grad_norm": 17.5304012298584, "learning_rate": 6.05788292325472e-06, "loss": 1.4992, "step": 12550 }, { "epoch": 1.3529285445858164, "grad_norm": 17.435745239257812, "learning_rate": 6.029539928541965e-06, "loss": 1.7109, "step": 12600 }, { "epoch": 1.3582970956138938, "grad_norm": 19.75895118713379, "learning_rate": 6.001162344419477e-06, "loss": 1.7342, "step": 12650 }, { "epoch": 1.3636656466419712, "grad_norm": 6.659576892852783, "learning_rate": 5.9727511242877565e-06, "loss": 1.4889, "step": 12700 }, { "epoch": 1.3690341976700489, "grad_norm": 7.136165142059326, "learning_rate": 5.944307222677372e-06, "loss": 1.6115, "step": 12750 }, { "epoch": 1.3744027486981265, "grad_norm": 7.5163092613220215, "learning_rate": 5.915831595216894e-06, "loss": 1.5552, "step": 12800 }, { "epoch": 1.379771299726204, "grad_norm": 6.865508079528809, "learning_rate": 5.88732519860078e-06, "loss": 1.8203, "step": 12850 }, { "epoch": 1.3851398507542814, "grad_norm": 6.863709449768066, "learning_rate": 5.858788990557239e-06, "loss": 1.7349, "step": 12900 }, { "epoch": 1.390508401782359, "grad_norm": 7.734783172607422, "learning_rate": 5.8302239298160565e-06, "loss": 1.5744, "step": 12950 }, { "epoch": 1.3958769528104364, "grad_norm": 19.227140426635742, "learning_rate": 5.8016309760763755e-06, "loss": 1.7058, "step": 13000 }, { "epoch": 1.4012455038385139, "grad_norm": 6.083110332489014, "learning_rate": 5.773011089974464e-06, "loss": 1.7991, "step": 13050 }, { "epoch": 1.4066140548665915, "grad_norm": 7.670594215393066, "learning_rate": 5.7443652330514335e-06, "loss": 1.6516, "step": 13100 }, { "epoch": 1.411982605894669, "grad_norm": 6.880539894104004, "learning_rate": 5.715694367720932e-06, "loss": 1.7352, "step": 13150 }, { "epoch": 1.4173511569227466, "grad_norm": 7.04241418838501, "learning_rate": 5.686999457236823e-06, "loss": 1.6688, "step": 13200 }, { "epoch": 1.422719707950824, "grad_norm": 6.067574977874756, "learning_rate": 5.658281465660804e-06, "loss": 1.5615, "step": 13250 }, { "epoch": 1.4280882589789017, "grad_norm": 6.4984331130981445, "learning_rate": 5.629541357830035e-06, "loss": 1.7143, "step": 13300 }, { "epoch": 1.433456810006979, "grad_norm": 8.660819053649902, "learning_rate": 5.600780099324711e-06, "loss": 1.7287, "step": 13350 }, { "epoch": 1.4388253610350565, "grad_norm": 7.762180328369141, "learning_rate": 5.571998656435624e-06, "loss": 1.5272, "step": 13400 }, { "epoch": 1.4441939120631342, "grad_norm": 5.679063320159912, "learning_rate": 5.543197996131704e-06, "loss": 1.9106, "step": 13450 }, { "epoch": 1.4495624630912116, "grad_norm": 18.31028175354004, "learning_rate": 5.514379086027525e-06, "loss": 1.5766, "step": 13500 }, { "epoch": 1.4549310141192893, "grad_norm": 7.93739128112793, "learning_rate": 5.485542894350797e-06, "loss": 1.6056, "step": 13550 }, { "epoch": 1.4602995651473667, "grad_norm": 19.02863883972168, "learning_rate": 5.456690389909844e-06, "loss": 1.8463, "step": 13600 }, { "epoch": 1.4656681161754443, "grad_norm": 20.583894729614258, "learning_rate": 5.427822542061043e-06, "loss": 1.8393, "step": 13650 }, { "epoch": 1.4710366672035218, "grad_norm": 7.795589447021484, "learning_rate": 5.398940320676268e-06, "loss": 1.5547, "step": 13700 }, { "epoch": 1.4764052182315992, "grad_norm": 17.66240882873535, "learning_rate": 5.3700446961102945e-06, "loss": 1.8426, "step": 13750 }, { "epoch": 1.4817737692596769, "grad_norm": 7.43621826171875, "learning_rate": 5.3411366391682114e-06, "loss": 1.6478, "step": 13800 }, { "epoch": 1.4871423202877543, "grad_norm": 8.103897094726562, "learning_rate": 5.31221712107279e-06, "loss": 1.5381, "step": 13850 }, { "epoch": 1.492510871315832, "grad_norm": 20.500654220581055, "learning_rate": 5.283287113431867e-06, "loss": 1.69, "step": 13900 }, { "epoch": 1.4978794223439094, "grad_norm": 6.227882385253906, "learning_rate": 5.2543475882056936e-06, "loss": 1.7197, "step": 13950 }, { "epoch": 1.503247973371987, "grad_norm": 6.347196578979492, "learning_rate": 5.225399517674282e-06, "loss": 1.6418, "step": 14000 }, { "epoch": 1.5086165244000644, "grad_norm": 6.504974365234375, "learning_rate": 5.196443874404744e-06, "loss": 1.6719, "step": 14050 }, { "epoch": 1.5139850754281419, "grad_norm": 6.531764507293701, "learning_rate": 5.167481631218608e-06, "loss": 1.7098, "step": 14100 }, { "epoch": 1.5193536264562195, "grad_norm": 10.471376419067383, "learning_rate": 5.138513761159144e-06, "loss": 1.5199, "step": 14150 }, { "epoch": 1.524722177484297, "grad_norm": 9.339461326599121, "learning_rate": 5.109541237458664e-06, "loss": 1.5637, "step": 14200 }, { "epoch": 1.5300907285123744, "grad_norm": 6.214099884033203, "learning_rate": 5.08056503350583e-06, "loss": 1.6646, "step": 14250 }, { "epoch": 1.535459279540452, "grad_norm": 7.6688055992126465, "learning_rate": 5.0515861228129495e-06, "loss": 1.7032, "step": 14300 }, { "epoch": 1.5408278305685297, "grad_norm": 6.576687335968018, "learning_rate": 5.022605478983268e-06, "loss": 1.6774, "step": 14350 }, { "epoch": 1.546196381596607, "grad_norm": 7.902665138244629, "learning_rate": 4.993624075678259e-06, "loss": 1.5937, "step": 14400 }, { "epoch": 1.5515649326246845, "grad_norm": 19.9635009765625, "learning_rate": 4.964642886584911e-06, "loss": 1.6069, "step": 14450 }, { "epoch": 1.5569334836527622, "grad_norm": 6.8427958488464355, "learning_rate": 4.935662885383017e-06, "loss": 1.5762, "step": 14500 }, { "epoch": 1.5623020346808396, "grad_norm": 8.082759857177734, "learning_rate": 4.906685045712461e-06, "loss": 1.5261, "step": 14550 }, { "epoch": 1.567670585708917, "grad_norm": 5.131589412689209, "learning_rate": 4.877710341140504e-06, "loss": 1.4827, "step": 14600 }, { "epoch": 1.5730391367369947, "grad_norm": 17.903608322143555, "learning_rate": 4.84873974512908e-06, "loss": 1.689, "step": 14650 }, { "epoch": 1.5784076877650723, "grad_norm": 5.094648838043213, "learning_rate": 4.819774231002085e-06, "loss": 1.8171, "step": 14700 }, { "epoch": 1.5837762387931498, "grad_norm": 7.117594242095947, "learning_rate": 4.790814771912681e-06, "loss": 1.6111, "step": 14750 }, { "epoch": 1.5891447898212272, "grad_norm": 6.486269950866699, "learning_rate": 4.7618623408105956e-06, "loss": 1.4813, "step": 14800 }, { "epoch": 1.5945133408493048, "grad_norm": 7.995445728302002, "learning_rate": 4.7329179104094456e-06, "loss": 1.6475, "step": 14850 }, { "epoch": 1.5998818918773823, "grad_norm": 7.533879280090332, "learning_rate": 4.703982453154041e-06, "loss": 1.6606, "step": 14900 }, { "epoch": 1.6052504429054597, "grad_norm": 17.664257049560547, "learning_rate": 4.6750569411877244e-06, "loss": 1.6459, "step": 14950 }, { "epoch": 1.6106189939335374, "grad_norm": 20.12204933166504, "learning_rate": 4.646142346319705e-06, "loss": 1.5996, "step": 15000 }, { "epoch": 1.615987544961615, "grad_norm": 6.255960464477539, "learning_rate": 4.617239639992411e-06, "loss": 1.7002, "step": 15050 }, { "epoch": 1.6213560959896924, "grad_norm": 20.118432998657227, "learning_rate": 4.588349793248856e-06, "loss": 1.6454, "step": 15100 }, { "epoch": 1.6267246470177699, "grad_norm": 7.044247627258301, "learning_rate": 4.559473776700007e-06, "loss": 1.7084, "step": 15150 }, { "epoch": 1.6320931980458475, "grad_norm": 7.60048246383667, "learning_rate": 4.530612560492178e-06, "loss": 1.9412, "step": 15200 }, { "epoch": 1.637461749073925, "grad_norm": 18.018789291381836, "learning_rate": 4.501767114274436e-06, "loss": 1.7135, "step": 15250 }, { "epoch": 1.6428303001020024, "grad_norm": 7.586131572723389, "learning_rate": 4.4729384071660295e-06, "loss": 1.5691, "step": 15300 }, { "epoch": 1.64819885113008, "grad_norm": 13.612800598144531, "learning_rate": 4.444127407723819e-06, "loss": 1.9744, "step": 15350 }, { "epoch": 1.6535674021581577, "grad_norm": 7.3399577140808105, "learning_rate": 4.4153350839097415e-06, "loss": 1.4499, "step": 15400 }, { "epoch": 1.6589359531862349, "grad_norm": 6.747891426086426, "learning_rate": 4.386562403058292e-06, "loss": 1.8181, "step": 15450 }, { "epoch": 1.6643045042143125, "grad_norm": 7.361255645751953, "learning_rate": 4.357810331844017e-06, "loss": 1.6833, "step": 15500 }, { "epoch": 1.6696730552423902, "grad_norm": 5.927125453948975, "learning_rate": 4.329079836249051e-06, "loss": 1.7711, "step": 15550 }, { "epoch": 1.6750416062704676, "grad_norm": 7.615528106689453, "learning_rate": 4.300371881530645e-06, "loss": 1.5959, "step": 15600 }, { "epoch": 1.680410157298545, "grad_norm": 20.07931900024414, "learning_rate": 4.271687432188749e-06, "loss": 1.7049, "step": 15650 }, { "epoch": 1.6857787083266227, "grad_norm": 7.868457794189453, "learning_rate": 4.243027451933599e-06, "loss": 1.6376, "step": 15700 }, { "epoch": 1.6911472593547001, "grad_norm": 8.05305290222168, "learning_rate": 4.214392903653351e-06, "loss": 1.6639, "step": 15750 }, { "epoch": 1.6965158103827775, "grad_norm": 9.143363952636719, "learning_rate": 4.185784749381721e-06, "loss": 1.6835, "step": 15800 }, { "epoch": 1.7018843614108552, "grad_norm": 7.359554767608643, "learning_rate": 4.157203950265665e-06, "loss": 1.414, "step": 15850 }, { "epoch": 1.7072529124389328, "grad_norm": 7.3185834884643555, "learning_rate": 4.12865146653309e-06, "loss": 1.8536, "step": 15900 }, { "epoch": 1.7126214634670103, "grad_norm": 7.609386920928955, "learning_rate": 4.100128257460595e-06, "loss": 1.8839, "step": 15950 }, { "epoch": 1.7179900144950877, "grad_norm": 17.042022705078125, "learning_rate": 4.071635281341235e-06, "loss": 1.7974, "step": 16000 }, { "epoch": 1.7233585655231654, "grad_norm": 9.913634300231934, "learning_rate": 4.043173495452332e-06, "loss": 1.6566, "step": 16050 }, { "epoch": 1.7287271165512428, "grad_norm": 6.3825907707214355, "learning_rate": 4.0147438560233134e-06, "loss": 1.9744, "step": 16100 }, { "epoch": 1.7340956675793202, "grad_norm": 9.182840347290039, "learning_rate": 3.986347318203575e-06, "loss": 1.7298, "step": 16150 }, { "epoch": 1.7394642186073979, "grad_norm": 5.4667582511901855, "learning_rate": 3.957984836030413e-06, "loss": 1.6783, "step": 16200 }, { "epoch": 1.7448327696354755, "grad_norm": 7.24221134185791, "learning_rate": 3.929657362396945e-06, "loss": 1.9194, "step": 16250 }, { "epoch": 1.750201320663553, "grad_norm": 8.579157829284668, "learning_rate": 3.9013658490201125e-06, "loss": 1.717, "step": 16300 }, { "epoch": 1.7555698716916304, "grad_norm": 6.769927024841309, "learning_rate": 3.8731112464087025e-06, "loss": 1.7442, "step": 16350 }, { "epoch": 1.760938422719708, "grad_norm": 6.856928825378418, "learning_rate": 3.844894503831414e-06, "loss": 1.8871, "step": 16400 }, { "epoch": 1.7663069737477854, "grad_norm": 6.8800859451293945, "learning_rate": 3.816716569284961e-06, "loss": 1.642, "step": 16450 }, { "epoch": 1.7716755247758629, "grad_norm": 20.131942749023438, "learning_rate": 3.7885783894622275e-06, "loss": 1.6477, "step": 16500 }, { "epoch": 1.7770440758039405, "grad_norm": 6.804838180541992, "learning_rate": 3.7604809097204573e-06, "loss": 1.6398, "step": 16550 }, { "epoch": 1.7824126268320182, "grad_norm": 8.101078033447266, "learning_rate": 3.7324250740494965e-06, "loss": 1.6002, "step": 16600 }, { "epoch": 1.7877811778600956, "grad_norm": 18.09836769104004, "learning_rate": 3.7044118250400817e-06, "loss": 1.788, "step": 16650 }, { "epoch": 1.793149728888173, "grad_norm": 7.4799346923828125, "learning_rate": 3.6764421038521605e-06, "loss": 1.4205, "step": 16700 }, { "epoch": 1.7985182799162507, "grad_norm": 7.4728498458862305, "learning_rate": 3.648516850183281e-06, "loss": 1.7957, "step": 16750 }, { "epoch": 1.8038868309443281, "grad_norm": 6.709610462188721, "learning_rate": 3.6206370022370154e-06, "loss": 1.5291, "step": 16800 }, { "epoch": 1.8092553819724055, "grad_norm": 9.4188814163208, "learning_rate": 3.5928034966914488e-06, "loss": 1.7005, "step": 16850 }, { "epoch": 1.8146239330004832, "grad_norm": 8.755097389221191, "learning_rate": 3.5650172686676955e-06, "loss": 1.7735, "step": 16900 }, { "epoch": 1.8199924840285608, "grad_norm": 7.697582721710205, "learning_rate": 3.5372792516984915e-06, "loss": 1.7826, "step": 16950 }, { "epoch": 1.825361035056638, "grad_norm": 19.516481399536133, "learning_rate": 3.5095903776968277e-06, "loss": 1.536, "step": 17000 }, { "epoch": 1.8307295860847157, "grad_norm": 17.64841651916504, "learning_rate": 3.4819515769246398e-06, "loss": 1.8811, "step": 17050 }, { "epoch": 1.8360981371127933, "grad_norm": 7.726692199707031, "learning_rate": 3.4543637779615574e-06, "loss": 1.3036, "step": 17100 }, { "epoch": 1.8414666881408708, "grad_norm": 7.466884136199951, "learning_rate": 3.4268279076737042e-06, "loss": 1.8029, "step": 17150 }, { "epoch": 1.8468352391689482, "grad_norm": 18.704017639160156, "learning_rate": 3.3993448911825577e-06, "loss": 1.5885, "step": 17200 }, { "epoch": 1.8522037901970259, "grad_norm": 9.161140441894531, "learning_rate": 3.371915651833866e-06, "loss": 1.7349, "step": 17250 }, { "epoch": 1.8575723412251035, "grad_norm": 17.896249771118164, "learning_rate": 3.3445411111666343e-06, "loss": 2.0384, "step": 17300 }, { "epoch": 1.8629408922531807, "grad_norm": 7.49798583984375, "learning_rate": 3.317222188882154e-06, "loss": 1.6774, "step": 17350 }, { "epoch": 1.8683094432812584, "grad_norm": 10.30838680267334, "learning_rate": 3.289959802813111e-06, "loss": 1.7086, "step": 17400 }, { "epoch": 1.873677994309336, "grad_norm": 9.715036392211914, "learning_rate": 3.262754868892742e-06, "loss": 1.9072, "step": 17450 }, { "epoch": 1.8790465453374134, "grad_norm": 8.555960655212402, "learning_rate": 3.235608301124071e-06, "loss": 1.8842, "step": 17500 }, { "epoch": 1.8844150963654909, "grad_norm": 8.199530601501465, "learning_rate": 3.2085210115491966e-06, "loss": 1.5834, "step": 17550 }, { "epoch": 1.8897836473935685, "grad_norm": 10.413174629211426, "learning_rate": 3.1814939102186472e-06, "loss": 1.8812, "step": 17600 }, { "epoch": 1.895152198421646, "grad_norm": 10.915915489196777, "learning_rate": 3.1545279051608113e-06, "loss": 1.7046, "step": 17650 }, { "epoch": 1.9005207494497234, "grad_norm": 6.244101047515869, "learning_rate": 3.1276239023514255e-06, "loss": 1.5147, "step": 17700 }, { "epoch": 1.905889300477801, "grad_norm": 18.936601638793945, "learning_rate": 3.1007828056831467e-06, "loss": 1.6445, "step": 17750 }, { "epoch": 1.9112578515058787, "grad_norm": 17.59870147705078, "learning_rate": 3.07400551693517e-06, "loss": 1.8792, "step": 17800 }, { "epoch": 1.916626402533956, "grad_norm": 8.35571002960205, "learning_rate": 3.0472929357429414e-06, "loss": 1.7538, "step": 17850 }, { "epoch": 1.9219949535620335, "grad_norm": 19.334714889526367, "learning_rate": 3.020645959567926e-06, "loss": 1.7389, "step": 17900 }, { "epoch": 1.9273635045901112, "grad_norm": 8.158848762512207, "learning_rate": 2.994065483667468e-06, "loss": 1.6112, "step": 17950 }, { "epoch": 1.9327320556181886, "grad_norm": 9.61613941192627, "learning_rate": 2.9675524010646974e-06, "loss": 1.9104, "step": 18000 }, { "epoch": 1.938100606646266, "grad_norm": 9.808588027954102, "learning_rate": 2.9411076025185366e-06, "loss": 1.4322, "step": 18050 }, { "epoch": 1.9434691576743437, "grad_norm": 7.1503729820251465, "learning_rate": 2.9147319764937725e-06, "loss": 1.6654, "step": 18100 }, { "epoch": 1.9488377087024213, "grad_norm": 7.38853120803833, "learning_rate": 2.888426409131201e-06, "loss": 1.5595, "step": 18150 }, { "epoch": 1.9542062597304988, "grad_norm": 6.872980117797852, "learning_rate": 2.8621917842178693e-06, "loss": 1.5195, "step": 18200 }, { "epoch": 1.9595748107585762, "grad_norm": 6.323190689086914, "learning_rate": 2.836028983157365e-06, "loss": 1.5121, "step": 18250 }, { "epoch": 1.9649433617866539, "grad_norm": 5.4187469482421875, "learning_rate": 2.809938884940219e-06, "loss": 1.4725, "step": 18300 }, { "epoch": 1.9703119128147313, "grad_norm": 5.585220813751221, "learning_rate": 2.7839223661143606e-06, "loss": 1.7173, "step": 18350 }, { "epoch": 1.9756804638428087, "grad_norm": 5.7172017097473145, "learning_rate": 2.757980300755685e-06, "loss": 1.6042, "step": 18400 }, { "epoch": 1.9810490148708864, "grad_norm": 8.703761100769043, "learning_rate": 2.7321135604386713e-06, "loss": 1.9222, "step": 18450 }, { "epoch": 1.986417565898964, "grad_norm": 6.097997665405273, "learning_rate": 2.706323014207106e-06, "loss": 1.918, "step": 18500 }, { "epoch": 1.9917861169270414, "grad_norm": 6.828339576721191, "learning_rate": 2.6806095285448887e-06, "loss": 1.664, "step": 18550 }, { "epoch": 1.9971546679551189, "grad_norm": 7.003544330596924, "learning_rate": 2.654973967346914e-06, "loss": 1.6381, "step": 18600 }, { "epoch": 2.0025769044934774, "grad_norm": 6.588607311248779, "learning_rate": 2.6294171918900592e-06, "loss": 1.4981, "step": 18650 }, { "epoch": 2.0079454555215546, "grad_norm": 7.204668998718262, "learning_rate": 2.603940060804234e-06, "loss": 1.2035, "step": 18700 }, { "epoch": 2.013314006549632, "grad_norm": 8.035957336425781, "learning_rate": 2.5785434300435406e-06, "loss": 1.0785, "step": 18750 }, { "epoch": 2.01868255757771, "grad_norm": 9.361004829406738, "learning_rate": 2.5532281528575154e-06, "loss": 1.2245, "step": 18800 }, { "epoch": 2.0240511086057875, "grad_norm": 6.5703253746032715, "learning_rate": 2.5279950797624654e-06, "loss": 1.1247, "step": 18850 }, { "epoch": 2.0294196596338647, "grad_norm": 6.012766361236572, "learning_rate": 2.5028450585128854e-06, "loss": 1.1375, "step": 18900 }, { "epoch": 2.0347882106619424, "grad_norm": 5.649380683898926, "learning_rate": 2.4777789340729836e-06, "loss": 1.1421, "step": 18950 }, { "epoch": 2.04015676169002, "grad_norm": 16.015594482421875, "learning_rate": 2.45279754858829e-06, "loss": 1.1345, "step": 19000 }, { "epoch": 2.0455253127180972, "grad_norm": 16.384191513061523, "learning_rate": 2.4279017413573606e-06, "loss": 1.075, "step": 19050 }, { "epoch": 2.050893863746175, "grad_norm": 8.830488204956055, "learning_rate": 2.4030923488035896e-06, "loss": 1.1915, "step": 19100 }, { "epoch": 2.0562624147742525, "grad_norm": 6.353893756866455, "learning_rate": 2.3783702044470948e-06, "loss": 1.1907, "step": 19150 }, { "epoch": 2.0616309658023297, "grad_norm": 8.547567367553711, "learning_rate": 2.3537361388767215e-06, "loss": 1.1703, "step": 19200 }, { "epoch": 2.0669995168304074, "grad_norm": 7.108630180358887, "learning_rate": 2.329190979722134e-06, "loss": 1.1327, "step": 19250 }, { "epoch": 2.072368067858485, "grad_norm": 8.99742603302002, "learning_rate": 2.304735551626017e-06, "loss": 1.1121, "step": 19300 }, { "epoch": 2.0777366188865627, "grad_norm": 6.972029209136963, "learning_rate": 2.2803706762163603e-06, "loss": 1.116, "step": 19350 }, { "epoch": 2.08310516991464, "grad_norm": 6.052910804748535, "learning_rate": 2.2560971720788577e-06, "loss": 1.106, "step": 19400 }, { "epoch": 2.0884737209427175, "grad_norm": 14.27530574798584, "learning_rate": 2.2319158547294096e-06, "loss": 1.2028, "step": 19450 }, { "epoch": 2.093842271970795, "grad_norm": 9.214370727539062, "learning_rate": 2.2078275365867162e-06, "loss": 1.1224, "step": 19500 }, { "epoch": 2.0992108229988724, "grad_norm": 17.302961349487305, "learning_rate": 2.183833026944995e-06, "loss": 1.1746, "step": 19550 }, { "epoch": 2.10457937402695, "grad_norm": 6.132236480712891, "learning_rate": 2.159933131946777e-06, "loss": 1.1734, "step": 19600 }, { "epoch": 2.1099479250550277, "grad_norm": 8.298233032226562, "learning_rate": 2.1361286545558295e-06, "loss": 1.1839, "step": 19650 }, { "epoch": 2.1153164760831054, "grad_norm": 8.142345428466797, "learning_rate": 2.1124203945301786e-06, "loss": 1.1485, "step": 19700 }, { "epoch": 2.1206850271111826, "grad_norm": 7.334796905517578, "learning_rate": 2.0888091483952433e-06, "loss": 1.253, "step": 19750 }, { "epoch": 2.12605357813926, "grad_norm": 7.935271739959717, "learning_rate": 2.065295709417067e-06, "loss": 1.2222, "step": 19800 }, { "epoch": 2.131422129167338, "grad_norm": 7.456075191497803, "learning_rate": 2.041880867575671e-06, "loss": 1.1955, "step": 19850 }, { "epoch": 2.136790680195415, "grad_norm": 6.429117679595947, "learning_rate": 2.0185654095385124e-06, "loss": 1.1424, "step": 19900 }, { "epoch": 2.1421592312234927, "grad_norm": 10.556108474731445, "learning_rate": 1.995350118634058e-06, "loss": 1.1228, "step": 19950 }, { "epoch": 2.1475277822515704, "grad_norm": 8.033760070800781, "learning_rate": 1.9722357748254593e-06, "loss": 1.1683, "step": 20000 }, { "epoch": 2.152896333279648, "grad_norm": 6.466451168060303, "learning_rate": 1.949223154684355e-06, "loss": 1.1262, "step": 20050 }, { "epoch": 2.1582648843077252, "grad_norm": 9.730595588684082, "learning_rate": 1.9263130313647765e-06, "loss": 1.1169, "step": 20100 }, { "epoch": 2.163633435335803, "grad_norm": 8.086485862731934, "learning_rate": 1.9035061745771744e-06, "loss": 1.1748, "step": 20150 }, { "epoch": 2.1690019863638805, "grad_norm": 14.71091365814209, "learning_rate": 1.88080335056256e-06, "loss": 1.1721, "step": 20200 }, { "epoch": 2.1743705373919577, "grad_norm": 18.664920806884766, "learning_rate": 1.8582053220667573e-06, "loss": 1.1807, "step": 20250 }, { "epoch": 2.1797390884200354, "grad_norm": 6.661670684814453, "learning_rate": 1.8357128483147806e-06, "loss": 1.1184, "step": 20300 }, { "epoch": 2.185107639448113, "grad_norm": 16.419658660888672, "learning_rate": 1.8133266849853247e-06, "loss": 1.1751, "step": 20350 }, { "epoch": 2.1904761904761907, "grad_norm": 16.346141815185547, "learning_rate": 1.7910475841853786e-06, "loss": 1.1732, "step": 20400 }, { "epoch": 2.195844741504268, "grad_norm": 6.543334484100342, "learning_rate": 1.7688762944249582e-06, "loss": 1.1495, "step": 20450 }, { "epoch": 2.2012132925323455, "grad_norm": 7.164591312408447, "learning_rate": 1.7468135605919528e-06, "loss": 1.1306, "step": 20500 }, { "epoch": 2.206581843560423, "grad_norm": 9.77757453918457, "learning_rate": 1.7248601239271045e-06, "loss": 1.1395, "step": 20550 }, { "epoch": 2.2119503945885004, "grad_norm": 18.20372200012207, "learning_rate": 1.703016721999103e-06, "loss": 1.1361, "step": 20600 }, { "epoch": 2.217318945616578, "grad_norm": 18.00674819946289, "learning_rate": 1.6812840886798043e-06, "loss": 1.1528, "step": 20650 }, { "epoch": 2.2226874966446557, "grad_norm": 8.286600112915039, "learning_rate": 1.6596629541195787e-06, "loss": 1.111, "step": 20700 }, { "epoch": 2.228056047672733, "grad_norm": 11.050477027893066, "learning_rate": 1.6381540447227728e-06, "loss": 1.095, "step": 20750 }, { "epoch": 2.2334245987008106, "grad_norm": 8.50864315032959, "learning_rate": 1.6167580831233166e-06, "loss": 1.1602, "step": 20800 }, { "epoch": 2.238793149728888, "grad_norm": 7.250463962554932, "learning_rate": 1.595475788160431e-06, "loss": 1.1188, "step": 20850 }, { "epoch": 2.244161700756966, "grad_norm": 9.344785690307617, "learning_rate": 1.5743078748544854e-06, "loss": 1.1872, "step": 20900 }, { "epoch": 2.249530251785043, "grad_norm": 10.801837921142578, "learning_rate": 1.553255054382975e-06, "loss": 1.1003, "step": 20950 }, { "epoch": 2.2548988028131207, "grad_norm": 9.372284889221191, "learning_rate": 1.5323180340566247e-06, "loss": 1.1206, "step": 21000 }, { "epoch": 2.2602673538411984, "grad_norm": 12.740575790405273, "learning_rate": 1.5114975172956247e-06, "loss": 1.1476, "step": 21050 }, { "epoch": 2.265635904869276, "grad_norm": 16.75154685974121, "learning_rate": 1.4907942036060057e-06, "loss": 1.1752, "step": 21100 }, { "epoch": 2.2710044558973532, "grad_norm": 17.161603927612305, "learning_rate": 1.470208788556126e-06, "loss": 1.1481, "step": 21150 }, { "epoch": 2.276373006925431, "grad_norm": 9.21768569946289, "learning_rate": 1.4497419637533116e-06, "loss": 1.1411, "step": 21200 }, { "epoch": 2.2817415579535085, "grad_norm": 10.822429656982422, "learning_rate": 1.429394416820613e-06, "loss": 1.147, "step": 21250 }, { "epoch": 2.2871101089815857, "grad_norm": 9.155590057373047, "learning_rate": 1.4091668313737133e-06, "loss": 1.1169, "step": 21300 }, { "epoch": 2.2924786600096634, "grad_norm": 16.39679527282715, "learning_rate": 1.3890598869979494e-06, "loss": 1.1333, "step": 21350 }, { "epoch": 2.297847211037741, "grad_norm": 7.832981109619141, "learning_rate": 1.3690742592254874e-06, "loss": 1.1509, "step": 21400 }, { "epoch": 2.3032157620658182, "grad_norm": 18.698701858520508, "learning_rate": 1.3492106195126237e-06, "loss": 1.1706, "step": 21450 }, { "epoch": 2.308584313093896, "grad_norm": 9.106189727783203, "learning_rate": 1.3294696352172258e-06, "loss": 1.0814, "step": 21500 }, { "epoch": 2.3139528641219735, "grad_norm": 8.098555564880371, "learning_rate": 1.3098519695763169e-06, "loss": 1.2489, "step": 21550 }, { "epoch": 2.319321415150051, "grad_norm": 8.390243530273438, "learning_rate": 1.2903582816837844e-06, "loss": 1.1502, "step": 21600 }, { "epoch": 2.3246899661781284, "grad_norm": 8.757095336914062, "learning_rate": 1.2709892264682412e-06, "loss": 1.1508, "step": 21650 }, { "epoch": 2.330058517206206, "grad_norm": 7.823190689086914, "learning_rate": 1.25174545467102e-06, "loss": 1.2113, "step": 21700 }, { "epoch": 2.3354270682342837, "grad_norm": 9.674703598022461, "learning_rate": 1.2326276128243175e-06, "loss": 1.1866, "step": 21750 }, { "epoch": 2.340795619262361, "grad_norm": 8.03213882446289, "learning_rate": 1.2136363432294607e-06, "loss": 1.1158, "step": 21800 }, { "epoch": 2.3461641702904386, "grad_norm": 9.820610046386719, "learning_rate": 1.1947722839353375e-06, "loss": 1.1573, "step": 21850 }, { "epoch": 2.351532721318516, "grad_norm": 6.820501804351807, "learning_rate": 1.176036068716953e-06, "loss": 1.1848, "step": 21900 }, { "epoch": 2.356901272346594, "grad_norm": 10.612143516540527, "learning_rate": 1.157428327054147e-06, "loss": 1.1719, "step": 21950 }, { "epoch": 2.362269823374671, "grad_norm": 6.224195957183838, "learning_rate": 1.138949684110432e-06, "loss": 1.1361, "step": 22000 }, { "epoch": 2.3676383744027487, "grad_norm": 8.215645790100098, "learning_rate": 1.1206007607119989e-06, "loss": 1.0933, "step": 22050 }, { "epoch": 2.3730069254308264, "grad_norm": 16.23103141784668, "learning_rate": 1.1023821733268576e-06, "loss": 1.1502, "step": 22100 }, { "epoch": 2.3783754764589036, "grad_norm": 7.224851131439209, "learning_rate": 1.0842945340441207e-06, "loss": 1.1564, "step": 22150 }, { "epoch": 2.383744027486981, "grad_norm": 6.062022686004639, "learning_rate": 1.0663384505534486e-06, "loss": 1.1801, "step": 22200 }, { "epoch": 2.389112578515059, "grad_norm": 6.085788726806641, "learning_rate": 1.0485145261246222e-06, "loss": 1.1397, "step": 22250 }, { "epoch": 2.394481129543136, "grad_norm": 21.13031578063965, "learning_rate": 1.0308233595872823e-06, "loss": 1.1892, "step": 22300 }, { "epoch": 2.3998496805712137, "grad_norm": 8.92026138305664, "learning_rate": 1.013265545310807e-06, "loss": 1.102, "step": 22350 }, { "epoch": 2.4052182315992914, "grad_norm": 32.13631820678711, "learning_rate": 9.958416731843467e-07, "loss": 1.142, "step": 22400 }, { "epoch": 2.410586782627369, "grad_norm": 8.617265701293945, "learning_rate": 9.78552328597001e-07, "loss": 1.0888, "step": 22450 }, { "epoch": 2.4159553336554462, "grad_norm": 6.967668056488037, "learning_rate": 9.613980924181531e-07, "loss": 1.1972, "step": 22500 }, { "epoch": 2.421323884683524, "grad_norm": 14.281668663024902, "learning_rate": 9.44379540977956e-07, "loss": 1.2271, "step": 22550 }, { "epoch": 2.4266924357116015, "grad_norm": 9.496726036071777, "learning_rate": 9.274972460479659e-07, "loss": 1.1356, "step": 22600 }, { "epoch": 2.432060986739679, "grad_norm": 7.084106922149658, "learning_rate": 9.107517748219391e-07, "loss": 1.1693, "step": 22650 }, { "epoch": 2.4374295377677564, "grad_norm": 10.713268280029297, "learning_rate": 8.941436898967676e-07, "loss": 1.1308, "step": 22700 }, { "epoch": 2.442798088795834, "grad_norm": 6.2390031814575195, "learning_rate": 8.776735492535827e-07, "loss": 1.1522, "step": 22750 }, { "epoch": 2.4481666398239117, "grad_norm": 17.40299415588379, "learning_rate": 8.613419062390072e-07, "loss": 1.179, "step": 22800 }, { "epoch": 2.453535190851989, "grad_norm": 7.331134796142578, "learning_rate": 8.451493095465674e-07, "loss": 1.0968, "step": 22850 }, { "epoch": 2.4589037418800666, "grad_norm": 6.012197017669678, "learning_rate": 8.290963031982535e-07, "loss": 1.1, "step": 22900 }, { "epoch": 2.464272292908144, "grad_norm": 18.07862663269043, "learning_rate": 8.131834265262451e-07, "loss": 1.1539, "step": 22950 }, { "epoch": 2.4696408439362214, "grad_norm": 10.868839263916016, "learning_rate": 7.974112141547912e-07, "loss": 1.1659, "step": 23000 }, { "epoch": 2.475009394964299, "grad_norm": 18.099262237548828, "learning_rate": 7.81780195982248e-07, "loss": 1.1514, "step": 23050 }, { "epoch": 2.4803779459923767, "grad_norm": 6.631985187530518, "learning_rate": 7.662908971632777e-07, "loss": 1.1449, "step": 23100 }, { "epoch": 2.4857464970204544, "grad_norm": 10.29295539855957, "learning_rate": 7.509438380912021e-07, "loss": 1.1482, "step": 23150 }, { "epoch": 2.4911150480485316, "grad_norm": 7.769371509552002, "learning_rate": 7.35739534380519e-07, "loss": 1.124, "step": 23200 }, { "epoch": 2.496483599076609, "grad_norm": 8.667695045471191, "learning_rate": 7.206784968495823e-07, "loss": 1.1227, "step": 23250 }, { "epoch": 2.501852150104687, "grad_norm": 16.578887939453125, "learning_rate": 7.057612315034367e-07, "loss": 1.1566, "step": 23300 }, { "epoch": 2.5072207011327645, "grad_norm": 7.591117858886719, "learning_rate": 6.909882395168205e-07, "loss": 1.1766, "step": 23350 }, { "epoch": 2.5125892521608417, "grad_norm": 6.038825035095215, "learning_rate": 6.763600172173229e-07, "loss": 1.1973, "step": 23400 }, { "epoch": 2.5179578031889194, "grad_norm": 9.154902458190918, "learning_rate": 6.61877056068716e-07, "loss": 1.1941, "step": 23450 }, { "epoch": 2.523326354216997, "grad_norm": 16.231149673461914, "learning_rate": 6.475398426544372e-07, "loss": 1.1128, "step": 23500 }, { "epoch": 2.5286949052450742, "grad_norm": 8.24044418334961, "learning_rate": 6.33348858661243e-07, "loss": 1.122, "step": 23550 }, { "epoch": 2.534063456273152, "grad_norm": 6.951202392578125, "learning_rate": 6.193045808630255e-07, "loss": 1.1067, "step": 23600 }, { "epoch": 2.5394320073012295, "grad_norm": 48.8431510925293, "learning_rate": 6.054074811047972e-07, "loss": 1.1233, "step": 23650 }, { "epoch": 2.5448005583293067, "grad_norm": 20.162029266357422, "learning_rate": 5.916580262868338e-07, "loss": 1.2323, "step": 23700 }, { "epoch": 2.5501691093573844, "grad_norm": 7.568118095397949, "learning_rate": 5.780566783489927e-07, "loss": 1.184, "step": 23750 }, { "epoch": 2.555537660385462, "grad_norm": 18.249082565307617, "learning_rate": 5.646038942551885e-07, "loss": 1.1772, "step": 23800 }, { "epoch": 2.5609062114135392, "grad_norm": 8.595870018005371, "learning_rate": 5.513001259780432e-07, "loss": 1.1604, "step": 23850 }, { "epoch": 2.566274762441617, "grad_norm": 6.2978196144104, "learning_rate": 5.381458204836998e-07, "loss": 1.1265, "step": 23900 }, { "epoch": 2.5716433134696945, "grad_norm": 8.599082946777344, "learning_rate": 5.251414197168097e-07, "loss": 1.1018, "step": 23950 }, { "epoch": 2.577011864497772, "grad_norm": 9.089366912841797, "learning_rate": 5.122873605856788e-07, "loss": 1.0831, "step": 24000 }, { "epoch": 2.58238041552585, "grad_norm": 17.699949264526367, "learning_rate": 4.995840749475906e-07, "loss": 1.1397, "step": 24050 }, { "epoch": 2.587748966553927, "grad_norm": 9.30536937713623, "learning_rate": 4.870319895942993e-07, "loss": 1.1431, "step": 24100 }, { "epoch": 2.5931175175820047, "grad_norm": 7.9017229080200195, "learning_rate": 4.746315262376894e-07, "loss": 1.1171, "step": 24150 }, { "epoch": 2.5984860686100824, "grad_norm": 9.600709915161133, "learning_rate": 4.6238310149560815e-07, "loss": 1.1862, "step": 24200 }, { "epoch": 2.6038546196381596, "grad_norm": 15.521000862121582, "learning_rate": 4.5028712687786637e-07, "loss": 1.0834, "step": 24250 }, { "epoch": 2.609223170666237, "grad_norm": 17.337730407714844, "learning_rate": 4.3834400877241557e-07, "loss": 1.1675, "step": 24300 }, { "epoch": 2.614591721694315, "grad_norm": 9.940308570861816, "learning_rate": 4.2655414843169207e-07, "loss": 1.1595, "step": 24350 }, { "epoch": 2.619960272722392, "grad_norm": 11.373148918151855, "learning_rate": 4.1491794195914036e-07, "loss": 1.1601, "step": 24400 }, { "epoch": 2.6253288237504697, "grad_norm": 9.516183853149414, "learning_rate": 4.034357802958999e-07, "loss": 1.0979, "step": 24450 }, { "epoch": 2.6306973747785474, "grad_norm": 7.607344627380371, "learning_rate": 3.921080492076729e-07, "loss": 1.0952, "step": 24500 }, { "epoch": 2.6360659258066246, "grad_norm": 8.217754364013672, "learning_rate": 3.809351292717656e-07, "loss": 1.1319, "step": 24550 }, { "epoch": 2.6414344768347022, "grad_norm": 7.739314079284668, "learning_rate": 3.6991739586429875e-07, "loss": 1.1889, "step": 24600 }, { "epoch": 2.64680302786278, "grad_norm": 7.91011381149292, "learning_rate": 3.590552191476004e-07, "loss": 1.1452, "step": 24650 }, { "epoch": 2.652171578890857, "grad_norm": 18.33525848388672, "learning_rate": 3.483489640577653e-07, "loss": 1.1669, "step": 24700 }, { "epoch": 2.6575401299189347, "grad_norm": 10.001716613769531, "learning_rate": 3.3779899029239504e-07, "loss": 1.1424, "step": 24750 }, { "epoch": 2.6629086809470124, "grad_norm": 16.244098663330078, "learning_rate": 3.2740565229851473e-07, "loss": 1.1258, "step": 24800 }, { "epoch": 2.66827723197509, "grad_norm": 8.882587432861328, "learning_rate": 3.1716929926066563e-07, "loss": 1.0455, "step": 24850 }, { "epoch": 2.6736457830031677, "grad_norm": 18.185453414916992, "learning_rate": 3.070902750891708e-07, "loss": 1.1575, "step": 24900 }, { "epoch": 2.679014334031245, "grad_norm": 16.914825439453125, "learning_rate": 2.971689184085813e-07, "loss": 1.176, "step": 24950 }, { "epoch": 2.6843828850593225, "grad_norm": 7.199150562286377, "learning_rate": 2.8740556254630126e-07, "loss": 1.1321, "step": 25000 }, { "epoch": 2.6897514360874, "grad_norm": 7.281803607940674, "learning_rate": 2.778005355213859e-07, "loss": 1.1242, "step": 25050 }, { "epoch": 2.6951199871154774, "grad_norm": 7.644674301147461, "learning_rate": 2.683541600335271e-07, "loss": 1.1219, "step": 25100 }, { "epoch": 2.700488538143555, "grad_norm": 7.411130428314209, "learning_rate": 2.59066753452204e-07, "loss": 1.1559, "step": 25150 }, { "epoch": 2.7058570891716327, "grad_norm": 7.964588642120361, "learning_rate": 2.499386278060262e-07, "loss": 1.1, "step": 25200 }, { "epoch": 2.71122564019971, "grad_norm": 7.902915000915527, "learning_rate": 2.409700897722456e-07, "loss": 1.1239, "step": 25250 }, { "epoch": 2.7165941912277876, "grad_norm": 6.715937614440918, "learning_rate": 2.3216144066646073e-07, "loss": 1.0599, "step": 25300 }, { "epoch": 2.721962742255865, "grad_norm": 7.483630180358887, "learning_rate": 2.2351297643248337e-07, "loss": 1.1114, "step": 25350 }, { "epoch": 2.7273312932839424, "grad_norm": 7.820849418640137, "learning_rate": 2.1502498763240453e-07, "loss": 1.1012, "step": 25400 }, { "epoch": 2.73269984431202, "grad_norm": 17.91743278503418, "learning_rate": 2.0669775943682634e-07, "loss": 1.1322, "step": 25450 }, { "epoch": 2.7380683953400977, "grad_norm": 8.192693710327148, "learning_rate": 1.9853157161528468e-07, "loss": 1.0981, "step": 25500 }, { "epoch": 2.7434369463681754, "grad_norm": 17.21625328063965, "learning_rate": 1.9052669852684945e-07, "loss": 1.1364, "step": 25550 }, { "epoch": 2.748805497396253, "grad_norm": 18.796228408813477, "learning_rate": 1.8268340911090533e-07, "loss": 1.1526, "step": 25600 }, { "epoch": 2.7541740484243302, "grad_norm": 19.57398796081543, "learning_rate": 1.7500196687811776e-07, "loss": 1.1529, "step": 25650 }, { "epoch": 2.759542599452408, "grad_norm": 7.654101371765137, "learning_rate": 1.674826299015775e-07, "loss": 1.1793, "step": 25700 }, { "epoch": 2.7649111504804855, "grad_norm": 9.904139518737793, "learning_rate": 1.60125650808135e-07, "loss": 1.1499, "step": 25750 }, { "epoch": 2.7702797015085627, "grad_norm": 12.009369850158691, "learning_rate": 1.529312767699065e-07, "loss": 1.0964, "step": 25800 }, { "epoch": 2.7756482525366404, "grad_norm": 10.86778450012207, "learning_rate": 1.4589974949597463e-07, "loss": 1.1427, "step": 25850 }, { "epoch": 2.781016803564718, "grad_norm": 8.0640869140625, "learning_rate": 1.3903130522426589e-07, "loss": 1.1492, "step": 25900 }, { "epoch": 2.7863853545927952, "grad_norm": 9.057573318481445, "learning_rate": 1.3232617471361452e-07, "loss": 1.1511, "step": 25950 }, { "epoch": 2.791753905620873, "grad_norm": 8.094735145568848, "learning_rate": 1.2578458323600774e-07, "loss": 1.1299, "step": 26000 }, { "epoch": 2.7971224566489505, "grad_norm": 6.8636155128479, "learning_rate": 1.194067505690194e-07, "loss": 1.085, "step": 26050 }, { "epoch": 2.8024910076770277, "grad_norm": 14.724271774291992, "learning_rate": 1.1319289098842667e-07, "loss": 1.0734, "step": 26100 }, { "epoch": 2.8078595587051054, "grad_norm": 8.863556861877441, "learning_rate": 1.0714321326100895e-07, "loss": 1.1229, "step": 26150 }, { "epoch": 2.813228109733183, "grad_norm": 18.311233520507812, "learning_rate": 1.0125792063753415e-07, "loss": 1.1797, "step": 26200 }, { "epoch": 2.8185966607612607, "grad_norm": 10.026941299438477, "learning_rate": 9.553721084593182e-08, "loss": 1.1378, "step": 26250 }, { "epoch": 2.823965211789338, "grad_norm": 12.081165313720703, "learning_rate": 8.998127608464801e-08, "loss": 1.1052, "step": 26300 }, { "epoch": 2.8293337628174156, "grad_norm": 8.546296119689941, "learning_rate": 8.459030301618931e-08, "loss": 1.1473, "step": 26350 }, { "epoch": 2.834702313845493, "grad_norm": 8.367006301879883, "learning_rate": 7.936447276085224e-08, "loss": 1.1076, "step": 26400 }, { "epoch": 2.840070864873571, "grad_norm": 8.272223472595215, "learning_rate": 7.430396089063597e-08, "loss": 1.1833, "step": 26450 }, { "epoch": 2.845439415901648, "grad_norm": 16.885454177856445, "learning_rate": 6.940893742334587e-08, "loss": 1.0555, "step": 26500 }, { "epoch": 2.8508079669297257, "grad_norm": 10.456121444702148, "learning_rate": 6.46795668168787e-08, "loss": 1.1435, "step": 26550 }, { "epoch": 2.8561765179578034, "grad_norm": 8.240148544311523, "learning_rate": 6.011600796370032e-08, "loss": 1.0777, "step": 26600 }, { "epoch": 2.8615450689858806, "grad_norm": 15.448931694030762, "learning_rate": 5.5718414185506055e-08, "loss": 1.1292, "step": 26650 }, { "epoch": 2.866913620013958, "grad_norm": 12.672825813293457, "learning_rate": 5.148693322806986e-08, "loss": 1.1192, "step": 26700 }, { "epoch": 2.872282171042036, "grad_norm": 7.322881698608398, "learning_rate": 4.742170725627881e-08, "loss": 1.0856, "step": 26750 }, { "epoch": 2.877650722070113, "grad_norm": 6.888855934143066, "learning_rate": 4.3522872849359744e-08, "loss": 1.138, "step": 26800 }, { "epoch": 2.8830192730981907, "grad_norm": 9.260125160217285, "learning_rate": 3.979056099628842e-08, "loss": 1.2059, "step": 26850 }, { "epoch": 2.8883878241262684, "grad_norm": 10.245002746582031, "learning_rate": 3.622489709138921e-08, "loss": 1.1057, "step": 26900 }, { "epoch": 2.8937563751543456, "grad_norm": 8.351652145385742, "learning_rate": 3.282600093012234e-08, "loss": 1.1575, "step": 26950 }, { "epoch": 2.8991249261824232, "grad_norm": 6.738492012023926, "learning_rate": 2.959398670505986e-08, "loss": 1.1577, "step": 27000 }, { "epoch": 2.904493477210501, "grad_norm": 9.16823959350586, "learning_rate": 2.652896300204766e-08, "loss": 1.1307, "step": 27050 }, { "epoch": 2.9098620282385785, "grad_norm": 17.78186798095703, "learning_rate": 2.363103279655832e-08, "loss": 1.1314, "step": 27100 }, { "epoch": 2.915230579266656, "grad_norm": 10.113680839538574, "learning_rate": 2.0900293450231148e-08, "loss": 1.1145, "step": 27150 }, { "epoch": 2.9205991302947334, "grad_norm": 8.009809494018555, "learning_rate": 1.8336836707601446e-08, "loss": 1.1278, "step": 27200 }, { "epoch": 2.925967681322811, "grad_norm": 9.789654731750488, "learning_rate": 1.5940748693017426e-08, "loss": 1.0919, "step": 27250 }, { "epoch": 2.9313362323508887, "grad_norm": 16.743701934814453, "learning_rate": 1.3712109907748073e-08, "loss": 1.1995, "step": 27300 }, { "epoch": 2.936704783378966, "grad_norm": 6.287803649902344, "learning_rate": 1.1650995227276974e-08, "loss": 1.0879, "step": 27350 }, { "epoch": 2.9420733344070435, "grad_norm": 8.777688980102539, "learning_rate": 9.757473898786562e-09, "loss": 1.235, "step": 27400 }, { "epoch": 2.947441885435121, "grad_norm": 9.769450187683105, "learning_rate": 8.031609538834417e-09, "loss": 1.1746, "step": 27450 }, { "epoch": 2.9528104364631984, "grad_norm": 16.548913955688477, "learning_rate": 6.473460131212194e-09, "loss": 1.0666, "step": 27500 }, { "epoch": 2.958178987491276, "grad_norm": 8.09269905090332, "learning_rate": 5.083078025000521e-09, "loss": 1.0938, "step": 27550 }, { "epoch": 2.9635475385193537, "grad_norm": 10.418055534362793, "learning_rate": 3.860509932808732e-09, "loss": 1.1509, "step": 27600 }, { "epoch": 2.968916089547431, "grad_norm": 8.594873428344727, "learning_rate": 2.805796929205573e-09, "loss": 1.1935, "step": 27650 }, { "epoch": 2.9742846405755086, "grad_norm": 7.283506870269775, "learning_rate": 1.918974449339195e-09, "loss": 1.1609, "step": 27700 }, { "epoch": 2.979653191603586, "grad_norm": 6.869657039642334, "learning_rate": 1.2000722877469894e-09, "loss": 1.12, "step": 27750 }, { "epoch": 2.985021742631664, "grad_norm": 17.424175262451172, "learning_rate": 6.491145973558377e-10, "loss": 1.1747, "step": 27800 }, { "epoch": 2.990390293659741, "grad_norm": 6.791327476501465, "learning_rate": 2.661198886666494e-10, "loss": 1.1605, "step": 27850 }, { "epoch": 2.9957588446878187, "grad_norm": 7.1723785400390625, "learning_rate": 5.11010291376346e-11, "loss": 1.1334, "step": 27900 }, { "epoch": 2.999946314489719, "step": 27939, "total_flos": 6.674170939930627e+19, "train_loss": 1.7448459406648975, "train_runtime": 100944.9227, "train_samples_per_second": 8.857, "train_steps_per_second": 0.277 } ], "logging_steps": 50, "max_steps": 27939, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.674170939930627e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }