{ "best_metric": 2.9280490707606077e-05, "best_model_checkpoint": "./results/models/checkpoint-156240", "epoch": 8.0, "eval_steps": 500, "global_step": 156240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025601638504864313, "grad_norm": 0.0004177093505859375, "learning_rate": 0.0009994879672299028, "loss": 0.023, "step": 500 }, { "epoch": 0.051203277009728626, "grad_norm": 0.000274658203125, "learning_rate": 0.0009989759344598056, "loss": 0.0, "step": 1000 }, { "epoch": 0.07680491551459294, "grad_norm": 0.000270843505859375, "learning_rate": 0.0009984639016897081, "loss": 0.0, "step": 1500 }, { "epoch": 0.10240655401945725, "grad_norm": 0.0002231597900390625, "learning_rate": 0.000997951868919611, "loss": 0.0, "step": 2000 }, { "epoch": 0.12800819252432155, "grad_norm": 0.00021457672119140625, "learning_rate": 0.0009974398361495137, "loss": 0.0, "step": 2500 }, { "epoch": 0.15360983102918588, "grad_norm": 0.00020694732666015625, "learning_rate": 0.0009969278033794163, "loss": 0.0, "step": 3000 }, { "epoch": 0.17921146953405018, "grad_norm": 0.00020313262939453125, "learning_rate": 0.000996415770609319, "loss": 0.0, "step": 3500 }, { "epoch": 0.2048131080389145, "grad_norm": 0.000202178955078125, "learning_rate": 0.0009959037378392218, "loss": 0.0, "step": 4000 }, { "epoch": 0.2304147465437788, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0009953917050691244, "loss": 0.0, "step": 4500 }, { "epoch": 0.2560163850486431, "grad_norm": 0.0024566650390625, "learning_rate": 0.0009948796722990272, "loss": 0.0813, "step": 5000 }, { "epoch": 0.2816180235535074, "grad_norm": 0.0025482177734375, "learning_rate": 0.00099436763952893, "loss": 0.001, "step": 5500 }, { "epoch": 0.30721966205837176, "grad_norm": 0.00048828125, "learning_rate": 0.0009938556067588325, "loss": 0.0005, "step": 6000 }, { "epoch": 0.33282130056323606, "grad_norm": 0.00058746337890625, "learning_rate": 0.0009933435739887353, "loss": 0.0004, "step": 6500 }, { "epoch": 0.35842293906810035, "grad_norm": 0.000614166259765625, "learning_rate": 0.000992831541218638, "loss": 0.0004, "step": 7000 }, { "epoch": 0.38402457757296465, "grad_norm": 0.014404296875, "learning_rate": 0.0009923195084485406, "loss": 0.0002, "step": 7500 }, { "epoch": 0.409626216077829, "grad_norm": 0.000820159912109375, "learning_rate": 0.0009918074756784434, "loss": 0.0002, "step": 8000 }, { "epoch": 0.4352278545826933, "grad_norm": 0.000713348388671875, "learning_rate": 0.0009912954429083462, "loss": 0.0002, "step": 8500 }, { "epoch": 0.4608294930875576, "grad_norm": 0.00057220458984375, "learning_rate": 0.000990783410138249, "loss": 0.0002, "step": 9000 }, { "epoch": 0.4864311315924219, "grad_norm": 0.0026702880859375, "learning_rate": 0.0009902713773681515, "loss": 0.0002, "step": 9500 }, { "epoch": 0.5120327700972862, "grad_norm": 0.0026702880859375, "learning_rate": 0.0009897593445980543, "loss": 0.0001, "step": 10000 }, { "epoch": 0.5376344086021505, "grad_norm": 0.0005035400390625, "learning_rate": 0.000989247311827957, "loss": 0.0001, "step": 10500 }, { "epoch": 0.5632360471070148, "grad_norm": 0.00052642822265625, "learning_rate": 0.0009887352790578599, "loss": 0.0002, "step": 11000 }, { "epoch": 0.5888376856118792, "grad_norm": 0.000667572021484375, "learning_rate": 0.0009882232462877624, "loss": 0.0001, "step": 11500 }, { "epoch": 0.6144393241167435, "grad_norm": 0.0003376007080078125, "learning_rate": 0.0009877112135176652, "loss": 0.0001, "step": 12000 }, { "epoch": 0.6400409626216078, "grad_norm": 0.00029754638671875, "learning_rate": 0.000987199180747568, "loss": 0.0001, "step": 12500 }, { "epoch": 0.6656426011264721, "grad_norm": 0.000308990478515625, "learning_rate": 0.0009866871479774705, "loss": 0.0001, "step": 13000 }, { "epoch": 0.6912442396313364, "grad_norm": 0.0003147125244140625, "learning_rate": 0.0009861751152073733, "loss": 0.0001, "step": 13500 }, { "epoch": 0.7168458781362007, "grad_norm": 0.0004367828369140625, "learning_rate": 0.0009856630824372759, "loss": 0.0001, "step": 14000 }, { "epoch": 0.742447516641065, "grad_norm": 0.0002880096435546875, "learning_rate": 0.0009851510496671787, "loss": 0.0001, "step": 14500 }, { "epoch": 0.7680491551459293, "grad_norm": 0.000274658203125, "learning_rate": 0.0009846390168970814, "loss": 0.0001, "step": 15000 }, { "epoch": 0.7936507936507936, "grad_norm": 0.0003528594970703125, "learning_rate": 0.000984126984126984, "loss": 0.0001, "step": 15500 }, { "epoch": 0.819252432155658, "grad_norm": 0.0002880096435546875, "learning_rate": 0.0009836149513568868, "loss": 0.0001, "step": 16000 }, { "epoch": 0.8448540706605223, "grad_norm": 0.0002536773681640625, "learning_rate": 0.0009831029185867896, "loss": 0.0001, "step": 16500 }, { "epoch": 0.8704557091653866, "grad_norm": 0.0003509521484375, "learning_rate": 0.0009825908858166923, "loss": 0.0001, "step": 17000 }, { "epoch": 0.8960573476702509, "grad_norm": 0.000244140625, "learning_rate": 0.0009820788530465951, "loss": 0.0, "step": 17500 }, { "epoch": 0.9216589861751152, "grad_norm": 0.000568389892578125, "learning_rate": 0.0009815668202764977, "loss": 0.0007, "step": 18000 }, { "epoch": 0.9472606246799795, "grad_norm": 0.0005645751953125, "learning_rate": 0.0009810547875064005, "loss": 0.0001, "step": 18500 }, { "epoch": 0.9728622631848438, "grad_norm": 0.00037384033203125, "learning_rate": 0.0009805427547363032, "loss": 0.0001, "step": 19000 }, { "epoch": 0.9984639016897081, "grad_norm": 0.000507354736328125, "learning_rate": 0.000980030721966206, "loss": 0.0001, "step": 19500 }, { "epoch": 1.0, "eval_loss": 6.09988892392721e-05, "eval_runtime": 0.5651, "eval_samples_per_second": 1769.543, "eval_steps_per_second": 3.539, "step": 19530 }, { "epoch": 1.0240655401945724, "grad_norm": 0.00133514404296875, "learning_rate": 0.0009795186891961086, "loss": 0.0001, "step": 20000 }, { "epoch": 1.0496671786994367, "grad_norm": 0.0004024505615234375, "learning_rate": 0.0009790066564260114, "loss": 0.0001, "step": 20500 }, { "epoch": 1.075268817204301, "grad_norm": 0.00031280517578125, "learning_rate": 0.000978494623655914, "loss": 0.0002, "step": 21000 }, { "epoch": 1.1008704557091653, "grad_norm": 0.0002689361572265625, "learning_rate": 0.0009779825908858167, "loss": 0.0001, "step": 21500 }, { "epoch": 1.1264720942140296, "grad_norm": 0.00023365020751953125, "learning_rate": 0.0009774705581157195, "loss": 0.0001, "step": 22000 }, { "epoch": 1.1520737327188941, "grad_norm": 0.00032806396484375, "learning_rate": 0.000976958525345622, "loss": 0.0001, "step": 22500 }, { "epoch": 1.1776753712237582, "grad_norm": 0.000274658203125, "learning_rate": 0.0009764464925755249, "loss": 0.0001, "step": 23000 }, { "epoch": 1.2032770097286227, "grad_norm": 0.0002651214599609375, "learning_rate": 0.0009759344598054276, "loss": 0.0001, "step": 23500 }, { "epoch": 1.228878648233487, "grad_norm": 0.00029754638671875, "learning_rate": 0.0009754224270353303, "loss": 0.0001, "step": 24000 }, { "epoch": 1.2544802867383513, "grad_norm": 0.000308990478515625, "learning_rate": 0.0009749103942652329, "loss": 0.0001, "step": 24500 }, { "epoch": 1.2800819252432156, "grad_norm": 0.000263214111328125, "learning_rate": 0.0009743983614951357, "loss": 0.0001, "step": 25000 }, { "epoch": 1.30568356374808, "grad_norm": 0.00074005126953125, "learning_rate": 0.0009738863287250385, "loss": 0.0001, "step": 25500 }, { "epoch": 1.3312852022529442, "grad_norm": 0.000354766845703125, "learning_rate": 0.000973374295954941, "loss": 0.0001, "step": 26000 }, { "epoch": 1.3568868407578085, "grad_norm": 0.0003108978271484375, "learning_rate": 0.0009728622631848438, "loss": 0.0001, "step": 26500 }, { "epoch": 1.3824884792626728, "grad_norm": 0.000278472900390625, "learning_rate": 0.0009723502304147466, "loss": 0.0001, "step": 27000 }, { "epoch": 1.4080901177675371, "grad_norm": 0.000293731689453125, "learning_rate": 0.0009718381976446493, "loss": 0.0, "step": 27500 }, { "epoch": 1.4336917562724014, "grad_norm": 0.000263214111328125, "learning_rate": 0.000971326164874552, "loss": 0.0, "step": 28000 }, { "epoch": 1.4592933947772657, "grad_norm": 0.000293731689453125, "learning_rate": 0.0009708141321044547, "loss": 0.0, "step": 28500 }, { "epoch": 1.48489503328213, "grad_norm": 0.000278472900390625, "learning_rate": 0.0009703020993343574, "loss": 0.0001, "step": 29000 }, { "epoch": 1.5104966717869943, "grad_norm": 0.00030517578125, "learning_rate": 0.0009697900665642602, "loss": 0.0001, "step": 29500 }, { "epoch": 1.5360983102918588, "grad_norm": 0.0002899169921875, "learning_rate": 0.0009692780337941628, "loss": 0.0, "step": 30000 }, { "epoch": 1.561699948796723, "grad_norm": 0.000766754150390625, "learning_rate": 0.0009687660010240655, "loss": 0.0, "step": 30500 }, { "epoch": 1.5873015873015874, "grad_norm": 0.00032806396484375, "learning_rate": 0.0009682539682539683, "loss": 0.0001, "step": 31000 }, { "epoch": 1.6129032258064515, "grad_norm": 0.000301361083984375, "learning_rate": 0.000967741935483871, "loss": 0.0001, "step": 31500 }, { "epoch": 1.638504864311316, "grad_norm": 0.003997802734375, "learning_rate": 0.0009672299027137736, "loss": 0.0001, "step": 32000 }, { "epoch": 1.66410650281618, "grad_norm": 0.0003452301025390625, "learning_rate": 0.0009667178699436764, "loss": 0.0001, "step": 32500 }, { "epoch": 1.6897081413210446, "grad_norm": 0.0002918243408203125, "learning_rate": 0.0009662058371735791, "loss": 0.0001, "step": 33000 }, { "epoch": 1.7153097798259087, "grad_norm": 0.0002422332763671875, "learning_rate": 0.0009656938044034819, "loss": 0.0001, "step": 33500 }, { "epoch": 1.7409114183307732, "grad_norm": 0.00124359130859375, "learning_rate": 0.0009651817716333846, "loss": 0.0, "step": 34000 }, { "epoch": 1.7665130568356375, "grad_norm": 0.0009002685546875, "learning_rate": 0.0009646697388632872, "loss": 0.0001, "step": 34500 }, { "epoch": 1.7921146953405018, "grad_norm": 0.00030517578125, "learning_rate": 0.00096415770609319, "loss": 0.0, "step": 35000 }, { "epoch": 1.8177163338453661, "grad_norm": 0.00041961669921875, "learning_rate": 0.0009636456733230928, "loss": 0.0001, "step": 35500 }, { "epoch": 1.8433179723502304, "grad_norm": 0.0003108978271484375, "learning_rate": 0.0009631336405529954, "loss": 0.0, "step": 36000 }, { "epoch": 1.8689196108550947, "grad_norm": 0.0002727508544921875, "learning_rate": 0.0009626216077828981, "loss": 0.0, "step": 36500 }, { "epoch": 1.894521249359959, "grad_norm": 0.0002613067626953125, "learning_rate": 0.0009621095750128009, "loss": 0.0, "step": 37000 }, { "epoch": 1.9201228878648233, "grad_norm": 0.0003204345703125, "learning_rate": 0.0009615975422427036, "loss": 0.0, "step": 37500 }, { "epoch": 1.9457245263696876, "grad_norm": 0.000255584716796875, "learning_rate": 0.0009610855094726063, "loss": 0.0, "step": 38000 }, { "epoch": 1.971326164874552, "grad_norm": 0.000263214111328125, "learning_rate": 0.0009605734767025089, "loss": 0.0, "step": 38500 }, { "epoch": 1.9969278033794162, "grad_norm": 0.00048828125, "learning_rate": 0.0009600614439324117, "loss": 0.0001, "step": 39000 }, { "epoch": 2.0, "eval_loss": 3.46598717442248e-05, "eval_runtime": 0.5684, "eval_samples_per_second": 1759.346, "eval_steps_per_second": 3.519, "step": 39060 }, { "epoch": 2.0225294418842807, "grad_norm": 0.000423431396484375, "learning_rate": 0.0009595494111623145, "loss": 0.0, "step": 39500 }, { "epoch": 2.048131080389145, "grad_norm": 0.0002899169921875, "learning_rate": 0.0009590373783922171, "loss": 0.0, "step": 40000 }, { "epoch": 2.0737327188940093, "grad_norm": 0.0002593994140625, "learning_rate": 0.0009585253456221198, "loss": 0.0, "step": 40500 }, { "epoch": 2.0993343573988734, "grad_norm": 0.00042724609375, "learning_rate": 0.0009580133128520226, "loss": 0.0, "step": 41000 }, { "epoch": 2.124935995903738, "grad_norm": 0.00032806396484375, "learning_rate": 0.0009575012800819252, "loss": 0.0, "step": 41500 }, { "epoch": 2.150537634408602, "grad_norm": 0.0002918243408203125, "learning_rate": 0.000956989247311828, "loss": 0.0, "step": 42000 }, { "epoch": 2.1761392729134665, "grad_norm": 0.00250244140625, "learning_rate": 0.0009564772145417307, "loss": 0.0, "step": 42500 }, { "epoch": 2.2017409114183306, "grad_norm": 0.0002803802490234375, "learning_rate": 0.0009559651817716334, "loss": 0.0, "step": 43000 }, { "epoch": 2.227342549923195, "grad_norm": 0.0003681182861328125, "learning_rate": 0.0009554531490015361, "loss": 0.0, "step": 43500 }, { "epoch": 2.252944188428059, "grad_norm": 0.0002689361572265625, "learning_rate": 0.0009549411162314389, "loss": 0.0, "step": 44000 }, { "epoch": 2.2785458269329237, "grad_norm": 0.000370025634765625, "learning_rate": 0.0009544290834613415, "loss": 0.0001, "step": 44500 }, { "epoch": 2.3041474654377883, "grad_norm": 0.0002536773681640625, "learning_rate": 0.0009539170506912443, "loss": 0.0, "step": 45000 }, { "epoch": 2.3297491039426523, "grad_norm": 0.000591278076171875, "learning_rate": 0.0009534050179211469, "loss": 0.0, "step": 45500 }, { "epoch": 2.3553507424475164, "grad_norm": 0.000514984130859375, "learning_rate": 0.0009528929851510497, "loss": 0.0, "step": 46000 }, { "epoch": 2.380952380952381, "grad_norm": 0.00025177001953125, "learning_rate": 0.0009523809523809524, "loss": 0.0, "step": 46500 }, { "epoch": 2.4065540194572455, "grad_norm": 0.0002384185791015625, "learning_rate": 0.000951868919610855, "loss": 0.0, "step": 47000 }, { "epoch": 2.4321556579621095, "grad_norm": 0.000247955322265625, "learning_rate": 0.0009513568868407578, "loss": 0.0, "step": 47500 }, { "epoch": 2.457757296466974, "grad_norm": 0.000240325927734375, "learning_rate": 0.0009508448540706606, "loss": 0.0, "step": 48000 }, { "epoch": 2.483358934971838, "grad_norm": 0.000274658203125, "learning_rate": 0.0009503328213005633, "loss": 0.0, "step": 48500 }, { "epoch": 2.5089605734767026, "grad_norm": 0.0002269744873046875, "learning_rate": 0.000949820788530466, "loss": 0.0, "step": 49000 }, { "epoch": 2.5345622119815667, "grad_norm": 0.0003204345703125, "learning_rate": 0.0009493087557603687, "loss": 0.0, "step": 49500 }, { "epoch": 2.5601638504864312, "grad_norm": 0.00787353515625, "learning_rate": 0.0009487967229902714, "loss": 0.0, "step": 50000 }, { "epoch": 2.5857654889912953, "grad_norm": 0.00022983551025390625, "learning_rate": 0.0009482846902201742, "loss": 0.0, "step": 50500 }, { "epoch": 2.61136712749616, "grad_norm": 0.0002651214599609375, "learning_rate": 0.0009477726574500767, "loss": 0.0, "step": 51000 }, { "epoch": 2.636968766001024, "grad_norm": 0.0002346038818359375, "learning_rate": 0.0009472606246799795, "loss": 0.0, "step": 51500 }, { "epoch": 2.6625704045058884, "grad_norm": 0.00021839141845703125, "learning_rate": 0.0009467485919098823, "loss": 0.0, "step": 52000 }, { "epoch": 2.688172043010753, "grad_norm": 0.0004177093505859375, "learning_rate": 0.000946236559139785, "loss": 0.0, "step": 52500 }, { "epoch": 2.713773681515617, "grad_norm": 0.000247955322265625, "learning_rate": 0.0009457245263696876, "loss": 0.0, "step": 53000 }, { "epoch": 2.739375320020481, "grad_norm": 0.0002269744873046875, "learning_rate": 0.0009452124935995904, "loss": 0.0, "step": 53500 }, { "epoch": 2.7649769585253456, "grad_norm": 0.00025177001953125, "learning_rate": 0.0009447004608294931, "loss": 0.0, "step": 54000 }, { "epoch": 2.79057859703021, "grad_norm": 0.0002536773681640625, "learning_rate": 0.0009441884280593959, "loss": 0.0, "step": 54500 }, { "epoch": 2.8161802355350742, "grad_norm": 0.0002346038818359375, "learning_rate": 0.0009436763952892985, "loss": 0.0, "step": 55000 }, { "epoch": 2.8417818740399383, "grad_norm": 0.0013427734375, "learning_rate": 0.0009431643625192012, "loss": 0.0, "step": 55500 }, { "epoch": 2.867383512544803, "grad_norm": 0.0003528594970703125, "learning_rate": 0.000942652329749104, "loss": 0.0, "step": 56000 }, { "epoch": 2.8929851510496674, "grad_norm": 0.00023746490478515625, "learning_rate": 0.0009421402969790068, "loss": 0.0, "step": 56500 }, { "epoch": 2.9185867895545314, "grad_norm": 0.00055694580078125, "learning_rate": 0.0009416282642089093, "loss": 0.0, "step": 57000 }, { "epoch": 2.944188428059396, "grad_norm": 0.0002307891845703125, "learning_rate": 0.0009411162314388121, "loss": 0.0, "step": 57500 }, { "epoch": 2.96979006656426, "grad_norm": 0.000286102294921875, "learning_rate": 0.0009406041986687148, "loss": 0.0, "step": 58000 }, { "epoch": 2.9953917050691246, "grad_norm": 0.00019550323486328125, "learning_rate": 0.0009400921658986176, "loss": 0.0, "step": 58500 }, { "epoch": 3.0, "eval_loss": 3.896626367350109e-05, "eval_runtime": 0.5618, "eval_samples_per_second": 1780.014, "eval_steps_per_second": 3.56, "step": 58590 }, { "epoch": 3.0209933435739886, "grad_norm": 0.000255584716796875, "learning_rate": 0.0009395801331285202, "loss": 0.0, "step": 59000 }, { "epoch": 3.046594982078853, "grad_norm": 0.000209808349609375, "learning_rate": 0.0009390681003584229, "loss": 0.0, "step": 59500 }, { "epoch": 3.0721966205837172, "grad_norm": 0.00021457672119140625, "learning_rate": 0.0009385560675883257, "loss": 0.0, "step": 60000 }, { "epoch": 3.0977982590885818, "grad_norm": 0.0013580322265625, "learning_rate": 0.0009380440348182285, "loss": 0.0, "step": 60500 }, { "epoch": 3.123399897593446, "grad_norm": 0.0002002716064453125, "learning_rate": 0.000937532002048131, "loss": 0.0, "step": 61000 }, { "epoch": 3.1490015360983103, "grad_norm": 0.0002727508544921875, "learning_rate": 0.0009370199692780338, "loss": 0.0, "step": 61500 }, { "epoch": 3.1746031746031744, "grad_norm": 0.0003662109375, "learning_rate": 0.0009365079365079366, "loss": 0.0, "step": 62000 }, { "epoch": 3.200204813108039, "grad_norm": 0.0002040863037109375, "learning_rate": 0.0009359959037378392, "loss": 0.0, "step": 62500 }, { "epoch": 3.225806451612903, "grad_norm": 0.00020694732666015625, "learning_rate": 0.0009354838709677419, "loss": 0.0, "step": 63000 }, { "epoch": 3.2514080901177675, "grad_norm": 0.000213623046875, "learning_rate": 0.0009349718381976447, "loss": 0.0, "step": 63500 }, { "epoch": 3.277009728622632, "grad_norm": 0.00020313262939453125, "learning_rate": 0.0009344598054275474, "loss": 0.0, "step": 64000 }, { "epoch": 3.302611367127496, "grad_norm": 0.0001983642578125, "learning_rate": 0.0009339477726574501, "loss": 0.0, "step": 64500 }, { "epoch": 3.32821300563236, "grad_norm": 0.0002117156982421875, "learning_rate": 0.0009334357398873528, "loss": 0.0, "step": 65000 }, { "epoch": 3.3538146441372247, "grad_norm": 0.0004444122314453125, "learning_rate": 0.0009329237071172555, "loss": 0.0, "step": 65500 }, { "epoch": 3.3794162826420893, "grad_norm": 0.00024318695068359375, "learning_rate": 0.0009324116743471583, "loss": 0.0, "step": 66000 }, { "epoch": 3.4050179211469533, "grad_norm": 0.004608154296875, "learning_rate": 0.0009318996415770609, "loss": 0.0, "step": 66500 }, { "epoch": 3.430619559651818, "grad_norm": 0.00022029876708984375, "learning_rate": 0.0009313876088069637, "loss": 0.0, "step": 67000 }, { "epoch": 3.456221198156682, "grad_norm": 0.00021648406982421875, "learning_rate": 0.0009308755760368664, "loss": 0.0, "step": 67500 }, { "epoch": 3.4818228366615465, "grad_norm": 0.0002841949462890625, "learning_rate": 0.000930363543266769, "loss": 0.0, "step": 68000 }, { "epoch": 3.5074244751664105, "grad_norm": 0.0002269744873046875, "learning_rate": 0.0009298515104966718, "loss": 0.0, "step": 68500 }, { "epoch": 3.533026113671275, "grad_norm": 0.00021648406982421875, "learning_rate": 0.0009293394777265746, "loss": 0.0, "step": 69000 }, { "epoch": 3.558627752176139, "grad_norm": 0.0002079010009765625, "learning_rate": 0.0009288274449564772, "loss": 0.0, "step": 69500 }, { "epoch": 3.5842293906810037, "grad_norm": 0.00022125244140625, "learning_rate": 0.00092831541218638, "loss": 0.0, "step": 70000 }, { "epoch": 3.6098310291858677, "grad_norm": 0.00021457672119140625, "learning_rate": 0.0009278033794162827, "loss": 0.0, "step": 70500 }, { "epoch": 3.6354326676907323, "grad_norm": 0.000209808349609375, "learning_rate": 0.0009272913466461854, "loss": 0.0, "step": 71000 }, { "epoch": 3.6610343061955968, "grad_norm": 0.0002727508544921875, "learning_rate": 0.0009267793138760881, "loss": 0.0, "step": 71500 }, { "epoch": 3.686635944700461, "grad_norm": 0.00020313262939453125, "learning_rate": 0.0009262672811059907, "loss": 0.0, "step": 72000 }, { "epoch": 3.712237583205325, "grad_norm": 0.0002288818359375, "learning_rate": 0.0009257552483358935, "loss": 0.0, "step": 72500 }, { "epoch": 3.7378392217101895, "grad_norm": 0.0010528564453125, "learning_rate": 0.0009252432155657963, "loss": 0.0, "step": 73000 }, { "epoch": 3.763440860215054, "grad_norm": 0.00019550323486328125, "learning_rate": 0.0009247311827956989, "loss": 0.0, "step": 73500 }, { "epoch": 3.789042498719918, "grad_norm": 0.00021266937255859375, "learning_rate": 0.0009242191500256016, "loss": 0.0, "step": 74000 }, { "epoch": 3.814644137224782, "grad_norm": 0.0003509521484375, "learning_rate": 0.0009237071172555044, "loss": 0.0, "step": 74500 }, { "epoch": 3.8402457757296466, "grad_norm": 0.00020599365234375, "learning_rate": 0.0009231950844854071, "loss": 0.0, "step": 75000 }, { "epoch": 3.865847414234511, "grad_norm": 0.0002002716064453125, "learning_rate": 0.0009226830517153098, "loss": 0.0, "step": 75500 }, { "epoch": 3.8914490527393752, "grad_norm": 0.00021076202392578125, "learning_rate": 0.0009221710189452125, "loss": 0.0, "step": 76000 }, { "epoch": 3.9170506912442398, "grad_norm": 0.0002918243408203125, "learning_rate": 0.0009216589861751152, "loss": 0.0, "step": 76500 }, { "epoch": 3.942652329749104, "grad_norm": 0.0003223419189453125, "learning_rate": 0.000921146953405018, "loss": 0.0, "step": 77000 }, { "epoch": 3.9682539682539684, "grad_norm": 0.0003032684326171875, "learning_rate": 0.0009206349206349207, "loss": 0.0, "step": 77500 }, { "epoch": 3.9938556067588324, "grad_norm": 0.000698089599609375, "learning_rate": 0.0009201228878648233, "loss": 0.0, "step": 78000 }, { "epoch": 4.0, "eval_loss": 3.7267222069203854e-05, "eval_runtime": 0.546, "eval_samples_per_second": 1831.41, "eval_steps_per_second": 3.663, "step": 78120 }, { "epoch": 4.0194572452636965, "grad_norm": 0.0002651214599609375, "learning_rate": 0.0009196108550947261, "loss": 0.0, "step": 78500 }, { "epoch": 4.0450588837685615, "grad_norm": 0.00020885467529296875, "learning_rate": 0.0009190988223246288, "loss": 0.0, "step": 79000 }, { "epoch": 4.070660522273426, "grad_norm": 0.00019741058349609375, "learning_rate": 0.0009185867895545314, "loss": 0.0, "step": 79500 }, { "epoch": 4.09626216077829, "grad_norm": 0.00021839141845703125, "learning_rate": 0.0009180747567844342, "loss": 0.0, "step": 80000 }, { "epoch": 4.121863799283154, "grad_norm": 0.00020313262939453125, "learning_rate": 0.0009175627240143369, "loss": 0.0, "step": 80500 }, { "epoch": 4.147465437788019, "grad_norm": 0.00023555755615234375, "learning_rate": 0.0009170506912442397, "loss": 0.0, "step": 81000 }, { "epoch": 4.173067076292883, "grad_norm": 0.0003204345703125, "learning_rate": 0.0009165386584741425, "loss": 0.0, "step": 81500 }, { "epoch": 4.198668714797747, "grad_norm": 0.00022411346435546875, "learning_rate": 0.000916026625704045, "loss": 0.0, "step": 82000 }, { "epoch": 4.224270353302611, "grad_norm": 0.0004634857177734375, "learning_rate": 0.0009155145929339478, "loss": 0.0, "step": 82500 }, { "epoch": 4.249871991807476, "grad_norm": 0.00023651123046875, "learning_rate": 0.0009150025601638506, "loss": 0.0, "step": 83000 }, { "epoch": 4.27547363031234, "grad_norm": 0.000202178955078125, "learning_rate": 0.0009144905273937532, "loss": 0.0, "step": 83500 }, { "epoch": 4.301075268817204, "grad_norm": 0.0003528594970703125, "learning_rate": 0.0009139784946236559, "loss": 0.0, "step": 84000 }, { "epoch": 4.326676907322069, "grad_norm": 0.0003204345703125, "learning_rate": 0.0009134664618535587, "loss": 0.0, "step": 84500 }, { "epoch": 4.352278545826933, "grad_norm": 0.000598907470703125, "learning_rate": 0.0009129544290834614, "loss": 0.0, "step": 85000 }, { "epoch": 4.377880184331797, "grad_norm": 0.000286102294921875, "learning_rate": 0.0009124423963133641, "loss": 0.0, "step": 85500 }, { "epoch": 4.403481822836661, "grad_norm": 0.00021839141845703125, "learning_rate": 0.0009119303635432667, "loss": 0.0, "step": 86000 }, { "epoch": 4.429083461341526, "grad_norm": 0.000225067138671875, "learning_rate": 0.0009114183307731695, "loss": 0.0, "step": 86500 }, { "epoch": 4.45468509984639, "grad_norm": 0.00141143798828125, "learning_rate": 0.0009109062980030723, "loss": 0.0, "step": 87000 }, { "epoch": 4.480286738351254, "grad_norm": 0.00022602081298828125, "learning_rate": 0.0009103942652329749, "loss": 0.0, "step": 87500 }, { "epoch": 4.505888376856118, "grad_norm": 0.000335693359375, "learning_rate": 0.0009098822324628776, "loss": 0.0, "step": 88000 }, { "epoch": 4.531490015360983, "grad_norm": 0.00019073486328125, "learning_rate": 0.0009093701996927804, "loss": 0.0, "step": 88500 }, { "epoch": 4.5570916538658475, "grad_norm": 0.00019931793212890625, "learning_rate": 0.000908858166922683, "loss": 0.0, "step": 89000 }, { "epoch": 4.5826932923707115, "grad_norm": 0.00021839141845703125, "learning_rate": 0.0009083461341525858, "loss": 0.0, "step": 89500 }, { "epoch": 4.6082949308755765, "grad_norm": 0.001983642578125, "learning_rate": 0.0009078341013824885, "loss": 0.0, "step": 90000 }, { "epoch": 4.633896569380441, "grad_norm": 0.00020503997802734375, "learning_rate": 0.0009073220686123912, "loss": 0.0, "step": 90500 }, { "epoch": 4.659498207885305, "grad_norm": 0.0002689361572265625, "learning_rate": 0.000906810035842294, "loss": 0.0, "step": 91000 }, { "epoch": 4.685099846390169, "grad_norm": 0.0002307891845703125, "learning_rate": 0.0009062980030721967, "loss": 0.0, "step": 91500 }, { "epoch": 4.710701484895033, "grad_norm": 0.00020885467529296875, "learning_rate": 0.0009057859703020993, "loss": 0.0, "step": 92000 }, { "epoch": 4.736303123399898, "grad_norm": 0.0002536773681640625, "learning_rate": 0.0009052739375320021, "loss": 0.0, "step": 92500 }, { "epoch": 4.761904761904762, "grad_norm": 0.0002040863037109375, "learning_rate": 0.0009047619047619047, "loss": 0.0, "step": 93000 }, { "epoch": 4.787506400409626, "grad_norm": 0.00174713134765625, "learning_rate": 0.0009042498719918075, "loss": 0.0, "step": 93500 }, { "epoch": 4.813108038914491, "grad_norm": 0.00020313262939453125, "learning_rate": 0.0009037378392217102, "loss": 0.0, "step": 94000 }, { "epoch": 4.838709677419355, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0009032258064516129, "loss": 0.0, "step": 94500 }, { "epoch": 4.864311315924219, "grad_norm": 0.000186920166015625, "learning_rate": 0.0009027137736815156, "loss": 0.0, "step": 95000 }, { "epoch": 4.889912954429083, "grad_norm": 0.00019359588623046875, "learning_rate": 0.0009022017409114184, "loss": 0.0, "step": 95500 }, { "epoch": 4.915514592933948, "grad_norm": 0.00018310546875, "learning_rate": 0.000901689708141321, "loss": 0.0, "step": 96000 }, { "epoch": 4.941116231438812, "grad_norm": 0.000579833984375, "learning_rate": 0.0009011776753712238, "loss": 0.0, "step": 96500 }, { "epoch": 4.966717869943676, "grad_norm": 0.00017833709716796875, "learning_rate": 0.0009006656426011265, "loss": 0.0, "step": 97000 }, { "epoch": 4.99231950844854, "grad_norm": 0.000507354736328125, "learning_rate": 0.0009001536098310292, "loss": 0.0, "step": 97500 }, { "epoch": 5.0, "eval_loss": 3.1982614018488675e-05, "eval_runtime": 0.5464, "eval_samples_per_second": 1830.037, "eval_steps_per_second": 3.66, "step": 97650 }, { "epoch": 5.017921146953405, "grad_norm": 0.00020313262939453125, "learning_rate": 0.000899641577060932, "loss": 0.0, "step": 98000 }, { "epoch": 5.043522785458269, "grad_norm": 0.0002651214599609375, "learning_rate": 0.0008991295442908345, "loss": 0.0, "step": 98500 }, { "epoch": 5.0691244239631335, "grad_norm": 0.000335693359375, "learning_rate": 0.0008986175115207373, "loss": 0.0, "step": 99000 }, { "epoch": 5.0947260624679975, "grad_norm": 0.000186920166015625, "learning_rate": 0.0008981054787506401, "loss": 0.0, "step": 99500 }, { "epoch": 5.1203277009728625, "grad_norm": 0.0001850128173828125, "learning_rate": 0.0008975934459805428, "loss": 0.0, "step": 100000 }, { "epoch": 5.145929339477727, "grad_norm": 0.00022411346435546875, "learning_rate": 0.0008970814132104454, "loss": 0.0, "step": 100500 }, { "epoch": 5.171530977982591, "grad_norm": 0.0002002716064453125, "learning_rate": 0.0008965693804403482, "loss": 0.0, "step": 101000 }, { "epoch": 5.197132616487456, "grad_norm": 0.00019550323486328125, "learning_rate": 0.0008960573476702509, "loss": 0.0, "step": 101500 }, { "epoch": 5.22273425499232, "grad_norm": 0.00019931793212890625, "learning_rate": 0.0008955453149001537, "loss": 0.0, "step": 102000 }, { "epoch": 5.248335893497184, "grad_norm": 0.0002918243408203125, "learning_rate": 0.0008950332821300563, "loss": 0.0, "step": 102500 }, { "epoch": 5.273937532002048, "grad_norm": 0.000217437744140625, "learning_rate": 0.000894521249359959, "loss": 0.0, "step": 103000 }, { "epoch": 5.299539170506913, "grad_norm": 0.00018405914306640625, "learning_rate": 0.0008940092165898618, "loss": 0.0, "step": 103500 }, { "epoch": 5.325140809011777, "grad_norm": 0.0009918212890625, "learning_rate": 0.0008934971838197646, "loss": 0.0, "step": 104000 }, { "epoch": 5.350742447516641, "grad_norm": 0.00086212158203125, "learning_rate": 0.0008929851510496671, "loss": 0.0, "step": 104500 }, { "epoch": 5.376344086021505, "grad_norm": 0.00020122528076171875, "learning_rate": 0.0008924731182795699, "loss": 0.0, "step": 105000 }, { "epoch": 5.40194572452637, "grad_norm": 0.00019168853759765625, "learning_rate": 0.0008919610855094726, "loss": 0.0, "step": 105500 }, { "epoch": 5.427547363031234, "grad_norm": 0.0001964569091796875, "learning_rate": 0.0008914490527393754, "loss": 0.0, "step": 106000 }, { "epoch": 5.453149001536098, "grad_norm": 0.0001811981201171875, "learning_rate": 0.000890937019969278, "loss": 0.0, "step": 106500 }, { "epoch": 5.478750640040962, "grad_norm": 0.000179290771484375, "learning_rate": 0.0008904249871991807, "loss": 0.0, "step": 107000 }, { "epoch": 5.504352278545827, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0008899129544290835, "loss": 0.0, "step": 107500 }, { "epoch": 5.529953917050691, "grad_norm": 0.00018310546875, "learning_rate": 0.0008894009216589863, "loss": 0.0, "step": 108000 }, { "epoch": 5.555555555555555, "grad_norm": 0.00018596649169921875, "learning_rate": 0.0008888888888888888, "loss": 0.0, "step": 108500 }, { "epoch": 5.58115719406042, "grad_norm": 0.0002689361572265625, "learning_rate": 0.0008883768561187916, "loss": 0.0, "step": 109000 }, { "epoch": 5.606758832565284, "grad_norm": 0.00018215179443359375, "learning_rate": 0.0008878648233486944, "loss": 0.0, "step": 109500 }, { "epoch": 5.6323604710701485, "grad_norm": 0.0002613067626953125, "learning_rate": 0.000887352790578597, "loss": 0.0, "step": 110000 }, { "epoch": 5.6579621095750126, "grad_norm": 0.0002613067626953125, "learning_rate": 0.0008868407578084997, "loss": 0.0, "step": 110500 }, { "epoch": 5.683563748079877, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0008863287250384025, "loss": 0.0, "step": 111000 }, { "epoch": 5.709165386584742, "grad_norm": 0.00018024444580078125, "learning_rate": 0.0008858166922683052, "loss": 0.0, "step": 111500 }, { "epoch": 5.734767025089606, "grad_norm": 0.000186920166015625, "learning_rate": 0.000885304659498208, "loss": 0.0, "step": 112000 }, { "epoch": 5.76036866359447, "grad_norm": 0.00018024444580078125, "learning_rate": 0.0008847926267281106, "loss": 0.0, "step": 112500 }, { "epoch": 5.785970302099335, "grad_norm": 0.000347137451171875, "learning_rate": 0.0008842805939580133, "loss": 0.0, "step": 113000 }, { "epoch": 5.811571940604199, "grad_norm": 0.00023174285888671875, "learning_rate": 0.0008837685611879161, "loss": 0.0, "step": 113500 }, { "epoch": 5.837173579109063, "grad_norm": 0.000392913818359375, "learning_rate": 0.0008832565284178187, "loss": 0.0, "step": 114000 }, { "epoch": 5.862775217613927, "grad_norm": 0.0003032684326171875, "learning_rate": 0.0008827444956477215, "loss": 0.0, "step": 114500 }, { "epoch": 5.888376856118792, "grad_norm": 0.0001964569091796875, "learning_rate": 0.0008822324628776242, "loss": 0.0, "step": 115000 }, { "epoch": 5.913978494623656, "grad_norm": 0.00020122528076171875, "learning_rate": 0.0008817204301075269, "loss": 0.0, "step": 115500 }, { "epoch": 5.93958013312852, "grad_norm": 0.00019550323486328125, "learning_rate": 0.0008812083973374296, "loss": 0.0, "step": 116000 }, { "epoch": 5.965181771633384, "grad_norm": 0.000179290771484375, "learning_rate": 0.0008806963645673324, "loss": 0.0, "step": 116500 }, { "epoch": 5.990783410138249, "grad_norm": 0.00018024444580078125, "learning_rate": 0.000880184331797235, "loss": 0.0, "step": 117000 }, { "epoch": 6.0, "eval_loss": 3.100566391367465e-05, "eval_runtime": 0.5504, "eval_samples_per_second": 1816.722, "eval_steps_per_second": 3.633, "step": 117180 }, { "epoch": 6.016385048643113, "grad_norm": 0.0001983642578125, "learning_rate": 0.0008796722990271378, "loss": 0.0, "step": 117500 }, { "epoch": 6.041986687147977, "grad_norm": 0.00018978118896484375, "learning_rate": 0.0008791602662570405, "loss": 0.0, "step": 118000 }, { "epoch": 6.067588325652842, "grad_norm": 0.00017833709716796875, "learning_rate": 0.0008786482334869432, "loss": 0.0, "step": 118500 }, { "epoch": 6.093189964157706, "grad_norm": 0.00019168853759765625, "learning_rate": 0.0008781362007168459, "loss": 0.0, "step": 119000 }, { "epoch": 6.11879160266257, "grad_norm": 0.000209808349609375, "learning_rate": 0.0008776241679467485, "loss": 0.0, "step": 119500 }, { "epoch": 6.1443932411674345, "grad_norm": 0.0001964569091796875, "learning_rate": 0.0008771121351766513, "loss": 0.0, "step": 120000 }, { "epoch": 6.169994879672299, "grad_norm": 0.0002002716064453125, "learning_rate": 0.0008766001024065541, "loss": 0.0, "step": 120500 }, { "epoch": 6.1955965181771635, "grad_norm": 0.0001773834228515625, "learning_rate": 0.0008760880696364567, "loss": 0.0, "step": 121000 }, { "epoch": 6.221198156682028, "grad_norm": 0.0001773834228515625, "learning_rate": 0.0008755760368663594, "loss": 0.0, "step": 121500 }, { "epoch": 6.246799795186892, "grad_norm": 0.0002040863037109375, "learning_rate": 0.0008750640040962622, "loss": 0.0, "step": 122000 }, { "epoch": 6.272401433691757, "grad_norm": 0.0002498626708984375, "learning_rate": 0.0008745519713261649, "loss": 0.0, "step": 122500 }, { "epoch": 6.298003072196621, "grad_norm": 0.000179290771484375, "learning_rate": 0.0008740399385560676, "loss": 0.0, "step": 123000 }, { "epoch": 6.323604710701485, "grad_norm": 0.00018405914306640625, "learning_rate": 0.0008735279057859703, "loss": 0.0, "step": 123500 }, { "epoch": 6.349206349206349, "grad_norm": 0.00017452239990234375, "learning_rate": 0.000873015873015873, "loss": 0.0, "step": 124000 }, { "epoch": 6.374807987711214, "grad_norm": 0.00018024444580078125, "learning_rate": 0.0008725038402457758, "loss": 0.0, "step": 124500 }, { "epoch": 6.400409626216078, "grad_norm": 0.00026702880859375, "learning_rate": 0.0008719918074756785, "loss": 0.0, "step": 125000 }, { "epoch": 6.426011264720942, "grad_norm": 0.00018978118896484375, "learning_rate": 0.0008714797747055811, "loss": 0.0, "step": 125500 }, { "epoch": 6.451612903225806, "grad_norm": 0.00018405914306640625, "learning_rate": 0.0008709677419354839, "loss": 0.0, "step": 126000 }, { "epoch": 6.477214541730671, "grad_norm": 0.00018596649169921875, "learning_rate": 0.0008704557091653866, "loss": 0.0, "step": 126500 }, { "epoch": 6.502816180235535, "grad_norm": 0.000568389892578125, "learning_rate": 0.0008699436763952893, "loss": 0.0, "step": 127000 }, { "epoch": 6.528417818740399, "grad_norm": 0.00060272216796875, "learning_rate": 0.000869431643625192, "loss": 0.0, "step": 127500 }, { "epoch": 6.554019457245264, "grad_norm": 0.000179290771484375, "learning_rate": 0.0008689196108550947, "loss": 0.0, "step": 128000 }, { "epoch": 6.579621095750128, "grad_norm": 0.000255584716796875, "learning_rate": 0.0008684075780849975, "loss": 0.0, "step": 128500 }, { "epoch": 6.605222734254992, "grad_norm": 0.00018978118896484375, "learning_rate": 0.0008678955453149003, "loss": 0.0, "step": 129000 }, { "epoch": 6.630824372759856, "grad_norm": 0.00019550323486328125, "learning_rate": 0.0008673835125448028, "loss": 0.0, "step": 129500 }, { "epoch": 6.65642601126472, "grad_norm": 0.0004329681396484375, "learning_rate": 0.0008668714797747056, "loss": 0.0, "step": 130000 }, { "epoch": 6.682027649769585, "grad_norm": 0.00040435791015625, "learning_rate": 0.0008663594470046084, "loss": 0.0, "step": 130500 }, { "epoch": 6.7076292882744495, "grad_norm": 0.000392913818359375, "learning_rate": 0.000865847414234511, "loss": 0.0, "step": 131000 }, { "epoch": 6.733230926779314, "grad_norm": 0.0001811981201171875, "learning_rate": 0.0008653353814644137, "loss": 0.0, "step": 131500 }, { "epoch": 6.7588325652841785, "grad_norm": 0.00018310546875, "learning_rate": 0.0008648233486943165, "loss": 0.0, "step": 132000 }, { "epoch": 6.784434203789043, "grad_norm": 0.00022125244140625, "learning_rate": 0.0008643113159242192, "loss": 0.0, "step": 132500 }, { "epoch": 6.810035842293907, "grad_norm": 0.00018310546875, "learning_rate": 0.000863799283154122, "loss": 0.0, "step": 133000 }, { "epoch": 6.835637480798771, "grad_norm": 0.00022125244140625, "learning_rate": 0.0008632872503840245, "loss": 0.0, "step": 133500 }, { "epoch": 6.861239119303636, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0008627752176139273, "loss": 0.0, "step": 134000 }, { "epoch": 6.8868407578085, "grad_norm": 0.00023365020751953125, "learning_rate": 0.0008622631848438301, "loss": 0.0, "step": 134500 }, { "epoch": 6.912442396313364, "grad_norm": 0.00018787384033203125, "learning_rate": 0.0008617511520737327, "loss": 0.0, "step": 135000 }, { "epoch": 6.938044034818228, "grad_norm": 0.0003833770751953125, "learning_rate": 0.0008612391193036354, "loss": 0.0, "step": 135500 }, { "epoch": 6.963645673323093, "grad_norm": 0.0003833770751953125, "learning_rate": 0.0008607270865335382, "loss": 0.0, "step": 136000 }, { "epoch": 6.989247311827957, "grad_norm": 0.0004425048828125, "learning_rate": 0.0008602150537634409, "loss": 0.0, "step": 136500 }, { "epoch": 7.0, "eval_loss": 3.138924512313679e-05, "eval_runtime": 0.5584, "eval_samples_per_second": 1790.97, "eval_steps_per_second": 3.582, "step": 136710 }, { "epoch": 7.014848950332821, "grad_norm": 0.0001811981201171875, "learning_rate": 0.0008597030209933436, "loss": 0.0, "step": 137000 }, { "epoch": 7.040450588837686, "grad_norm": 0.0002536773681640625, "learning_rate": 0.0008591909882232463, "loss": 0.0, "step": 137500 }, { "epoch": 7.06605222734255, "grad_norm": 0.0004558563232421875, "learning_rate": 0.000858678955453149, "loss": 0.0, "step": 138000 }, { "epoch": 7.091653865847414, "grad_norm": 0.00017833709716796875, "learning_rate": 0.0008581669226830518, "loss": 0.0, "step": 138500 }, { "epoch": 7.117255504352278, "grad_norm": 0.0001773834228515625, "learning_rate": 0.0008576548899129545, "loss": 0.0, "step": 139000 }, { "epoch": 7.142857142857143, "grad_norm": 0.0004482269287109375, "learning_rate": 0.0008571428571428571, "loss": 0.0, "step": 139500 }, { "epoch": 7.168458781362007, "grad_norm": 0.00019359588623046875, "learning_rate": 0.0008566308243727599, "loss": 0.0, "step": 140000 }, { "epoch": 7.194060419866871, "grad_norm": 0.0001811981201171875, "learning_rate": 0.0008561187916026625, "loss": 0.0, "step": 140500 }, { "epoch": 7.2196620583717355, "grad_norm": 0.000576019287109375, "learning_rate": 0.0008556067588325653, "loss": 0.0, "step": 141000 }, { "epoch": 7.2452636968766, "grad_norm": 0.00018787384033203125, "learning_rate": 0.000855094726062468, "loss": 0.0, "step": 141500 }, { "epoch": 7.2708653353814645, "grad_norm": 0.0002155303955078125, "learning_rate": 0.0008545826932923707, "loss": 0.0, "step": 142000 }, { "epoch": 7.296466973886329, "grad_norm": 0.00018215179443359375, "learning_rate": 0.0008540706605222734, "loss": 0.0, "step": 142500 }, { "epoch": 7.322068612391193, "grad_norm": 0.0002346038818359375, "learning_rate": 0.0008535586277521762, "loss": 0.0, "step": 143000 }, { "epoch": 7.347670250896058, "grad_norm": 0.00023937225341796875, "learning_rate": 0.0008530465949820788, "loss": 0.0, "step": 143500 }, { "epoch": 7.373271889400922, "grad_norm": 0.0002460479736328125, "learning_rate": 0.0008525345622119816, "loss": 0.0, "step": 144000 }, { "epoch": 7.398873527905786, "grad_norm": 0.0002613067626953125, "learning_rate": 0.0008520225294418843, "loss": 0.0, "step": 144500 }, { "epoch": 7.42447516641065, "grad_norm": 0.00017642974853515625, "learning_rate": 0.000851510496671787, "loss": 0.0, "step": 145000 }, { "epoch": 7.450076804915515, "grad_norm": 0.000232696533203125, "learning_rate": 0.0008509984639016898, "loss": 0.0, "step": 145500 }, { "epoch": 7.475678443420379, "grad_norm": 0.0001697540283203125, "learning_rate": 0.0008504864311315924, "loss": 0.0, "step": 146000 }, { "epoch": 7.501280081925243, "grad_norm": 0.00017833709716796875, "learning_rate": 0.0008499743983614951, "loss": 0.0, "step": 146500 }, { "epoch": 7.526881720430108, "grad_norm": 0.000377655029296875, "learning_rate": 0.0008494623655913979, "loss": 0.0, "step": 147000 }, { "epoch": 7.552483358934972, "grad_norm": 0.000194549560546875, "learning_rate": 0.0008489503328213006, "loss": 0.0, "step": 147500 }, { "epoch": 7.578084997439836, "grad_norm": 0.0004825592041015625, "learning_rate": 0.0008484383000512033, "loss": 0.0, "step": 148000 }, { "epoch": 7.6036866359447, "grad_norm": 0.0001773834228515625, "learning_rate": 0.000847926267281106, "loss": 0.0, "step": 148500 }, { "epoch": 7.629288274449565, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0008474142345110087, "loss": 0.0, "step": 149000 }, { "epoch": 7.654889912954429, "grad_norm": 0.00018215179443359375, "learning_rate": 0.0008469022017409115, "loss": 0.0, "step": 149500 }, { "epoch": 7.680491551459293, "grad_norm": 0.00019073486328125, "learning_rate": 0.0008463901689708142, "loss": 0.0, "step": 150000 }, { "epoch": 7.706093189964157, "grad_norm": 0.00017833709716796875, "learning_rate": 0.0008458781362007168, "loss": 0.0, "step": 150500 }, { "epoch": 7.731694828469022, "grad_norm": 0.0001926422119140625, "learning_rate": 0.0008453661034306196, "loss": 0.0, "step": 151000 }, { "epoch": 7.757296466973886, "grad_norm": 0.000274658203125, "learning_rate": 0.0008448540706605224, "loss": 0.0, "step": 151500 }, { "epoch": 7.7828981054787505, "grad_norm": 0.00017452239990234375, "learning_rate": 0.0008443420378904249, "loss": 0.0, "step": 152000 }, { "epoch": 7.808499743983615, "grad_norm": 0.00017833709716796875, "learning_rate": 0.0008438300051203277, "loss": 0.0, "step": 152500 }, { "epoch": 7.8341013824884795, "grad_norm": 0.000301361083984375, "learning_rate": 0.0008433179723502304, "loss": 0.0, "step": 153000 }, { "epoch": 7.859703020993344, "grad_norm": 0.0001697540283203125, "learning_rate": 0.0008428059395801332, "loss": 0.0, "step": 153500 }, { "epoch": 7.885304659498208, "grad_norm": 0.00020313262939453125, "learning_rate": 0.0008422939068100358, "loss": 0.0, "step": 154000 }, { "epoch": 7.910906298003072, "grad_norm": 0.0004425048828125, "learning_rate": 0.0008417818740399385, "loss": 0.0, "step": 154500 }, { "epoch": 7.936507936507937, "grad_norm": 0.00017452239990234375, "learning_rate": 0.0008412698412698413, "loss": 0.0, "step": 155000 }, { "epoch": 7.962109575012801, "grad_norm": 0.0002346038818359375, "learning_rate": 0.0008407578084997441, "loss": 0.0, "step": 155500 }, { "epoch": 7.987711213517665, "grad_norm": 0.0001773834228515625, "learning_rate": 0.0008402457757296466, "loss": 0.0, "step": 156000 }, { "epoch": 8.0, "eval_loss": 2.9280490707606077e-05, "eval_runtime": 0.5361, "eval_samples_per_second": 1865.365, "eval_steps_per_second": 3.731, "step": 156240 } ], "logging_steps": 500, "max_steps": 976500, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.132510754955264e+18, "train_batch_size": 512, "trial_name": null, "trial_params": null }