{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 16503, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030298439630359036, "grad_norm": 1.5333168506622314, "learning_rate": 9.970308428770528e-06, "loss": 2.3086, "step": 50 }, { "epoch": 0.006059687926071807, "grad_norm": 1.427223563194275, "learning_rate": 9.9400109071078e-06, "loss": 2.222, "step": 100 }, { "epoch": 0.00908953188910771, "grad_norm": 1.435951590538025, "learning_rate": 9.909713385445072e-06, "loss": 2.1361, "step": 150 }, { "epoch": 0.012119375852143614, "grad_norm": 1.7074558734893799, "learning_rate": 9.879415863782343e-06, "loss": 2.0743, "step": 200 }, { "epoch": 0.015149219815179518, "grad_norm": 1.5592257976531982, "learning_rate": 9.849118342119616e-06, "loss": 2.0526, "step": 250 }, { "epoch": 0.01817906377821542, "grad_norm": 1.5479929447174072, "learning_rate": 9.818820820456887e-06, "loss": 2.0152, "step": 300 }, { "epoch": 0.021208907741251327, "grad_norm": 1.8903971910476685, "learning_rate": 9.78852329879416e-06, "loss": 1.9692, "step": 350 }, { "epoch": 0.02423875170428723, "grad_norm": 1.7212985754013062, "learning_rate": 9.758225777131432e-06, "loss": 1.9869, "step": 400 }, { "epoch": 0.027268595667323134, "grad_norm": 1.5003551244735718, "learning_rate": 9.727928255468703e-06, "loss": 1.9526, "step": 450 }, { "epoch": 0.030298439630359036, "grad_norm": 1.938314437866211, "learning_rate": 9.697630733805974e-06, "loss": 1.9265, "step": 500 }, { "epoch": 0.03332828359339494, "grad_norm": 2.260500431060791, "learning_rate": 9.667333212143247e-06, "loss": 1.9519, "step": 550 }, { "epoch": 0.03635812755643084, "grad_norm": 1.8720899820327759, "learning_rate": 9.63703569048052e-06, "loss": 1.9269, "step": 600 }, { "epoch": 0.03938797151946675, "grad_norm": 1.6958003044128418, "learning_rate": 9.606738168817791e-06, "loss": 1.8689, "step": 650 }, { "epoch": 0.042417815482502653, "grad_norm": 1.8129528760910034, "learning_rate": 9.576440647155064e-06, "loss": 1.9042, "step": 700 }, { "epoch": 0.04544765944553855, "grad_norm": 1.8754149675369263, "learning_rate": 9.546143125492335e-06, "loss": 1.8763, "step": 750 }, { "epoch": 0.04847750340857446, "grad_norm": 1.8043217658996582, "learning_rate": 9.515845603829608e-06, "loss": 1.8885, "step": 800 }, { "epoch": 0.05150734737161036, "grad_norm": 1.7908295392990112, "learning_rate": 9.48554808216688e-06, "loss": 1.894, "step": 850 }, { "epoch": 0.05453719133464627, "grad_norm": 1.8442623615264893, "learning_rate": 9.455250560504152e-06, "loss": 1.8439, "step": 900 }, { "epoch": 0.057567035297682166, "grad_norm": 2.4318511486053467, "learning_rate": 9.424953038841423e-06, "loss": 1.8462, "step": 950 }, { "epoch": 0.06059687926071807, "grad_norm": 1.9251976013183594, "learning_rate": 9.394655517178695e-06, "loss": 1.8359, "step": 1000 }, { "epoch": 0.06362672322375397, "grad_norm": 2.037912368774414, "learning_rate": 9.364357995515968e-06, "loss": 1.8454, "step": 1050 }, { "epoch": 0.06665656718678988, "grad_norm": 1.7860537767410278, "learning_rate": 9.33406047385324e-06, "loss": 1.8139, "step": 1100 }, { "epoch": 0.06968641114982578, "grad_norm": 2.082537889480591, "learning_rate": 9.303762952190512e-06, "loss": 1.8248, "step": 1150 }, { "epoch": 0.07271625511286169, "grad_norm": 1.863715648651123, "learning_rate": 9.273465430527783e-06, "loss": 1.8146, "step": 1200 }, { "epoch": 0.07574609907589759, "grad_norm": 1.9475610256195068, "learning_rate": 9.243167908865056e-06, "loss": 1.8335, "step": 1250 }, { "epoch": 0.0787759430389335, "grad_norm": 1.8209477663040161, "learning_rate": 9.212870387202327e-06, "loss": 1.841, "step": 1300 }, { "epoch": 0.0818057870019694, "grad_norm": 1.7817108631134033, "learning_rate": 9.1825728655396e-06, "loss": 1.8559, "step": 1350 }, { "epoch": 0.08483563096500531, "grad_norm": 2.1402652263641357, "learning_rate": 9.152275343876873e-06, "loss": 1.7597, "step": 1400 }, { "epoch": 0.08786547492804121, "grad_norm": 1.7914282083511353, "learning_rate": 9.121977822214144e-06, "loss": 1.7555, "step": 1450 }, { "epoch": 0.0908953188910771, "grad_norm": 1.8525111675262451, "learning_rate": 9.091680300551415e-06, "loss": 1.7685, "step": 1500 }, { "epoch": 0.09392516285411301, "grad_norm": 1.7323678731918335, "learning_rate": 9.061382778888688e-06, "loss": 1.7729, "step": 1550 }, { "epoch": 0.09695500681714891, "grad_norm": 1.8842493295669556, "learning_rate": 9.03108525722596e-06, "loss": 1.8152, "step": 1600 }, { "epoch": 0.09998485078018482, "grad_norm": 2.3138351440429688, "learning_rate": 9.000787735563232e-06, "loss": 1.7888, "step": 1650 }, { "epoch": 0.10301469474322072, "grad_norm": 1.5812338590621948, "learning_rate": 8.970490213900503e-06, "loss": 1.736, "step": 1700 }, { "epoch": 0.10604453870625663, "grad_norm": 1.7662309408187866, "learning_rate": 8.940192692237775e-06, "loss": 1.7532, "step": 1750 }, { "epoch": 0.10907438266929254, "grad_norm": 1.6153583526611328, "learning_rate": 8.909895170575048e-06, "loss": 1.7995, "step": 1800 }, { "epoch": 0.11210422663232844, "grad_norm": 1.7088361978530884, "learning_rate": 8.87959764891232e-06, "loss": 1.7612, "step": 1850 }, { "epoch": 0.11513407059536433, "grad_norm": 1.7324771881103516, "learning_rate": 8.849300127249592e-06, "loss": 1.7803, "step": 1900 }, { "epoch": 0.11816391455840024, "grad_norm": 2.0461599826812744, "learning_rate": 8.819002605586863e-06, "loss": 1.7642, "step": 1950 }, { "epoch": 0.12119375852143614, "grad_norm": 1.9488065242767334, "learning_rate": 8.788705083924136e-06, "loss": 1.7281, "step": 2000 }, { "epoch": 0.12422360248447205, "grad_norm": 1.6670262813568115, "learning_rate": 8.758407562261407e-06, "loss": 1.7446, "step": 2050 }, { "epoch": 0.12725344644750794, "grad_norm": 2.1428778171539307, "learning_rate": 8.72811004059868e-06, "loss": 1.7129, "step": 2100 }, { "epoch": 0.13028329041054384, "grad_norm": 1.863022804260254, "learning_rate": 8.697812518935953e-06, "loss": 1.7471, "step": 2150 }, { "epoch": 0.13331313437357975, "grad_norm": 1.9406341314315796, "learning_rate": 8.667514997273224e-06, "loss": 1.7381, "step": 2200 }, { "epoch": 0.13634297833661566, "grad_norm": 2.044616222381592, "learning_rate": 8.637217475610495e-06, "loss": 1.6925, "step": 2250 }, { "epoch": 0.13937282229965156, "grad_norm": 2.1815521717071533, "learning_rate": 8.606919953947768e-06, "loss": 1.7528, "step": 2300 }, { "epoch": 0.14240266626268747, "grad_norm": 1.943009853363037, "learning_rate": 8.57662243228504e-06, "loss": 1.7448, "step": 2350 }, { "epoch": 0.14543251022572337, "grad_norm": 1.806156873703003, "learning_rate": 8.546324910622311e-06, "loss": 1.7086, "step": 2400 }, { "epoch": 0.14846235418875928, "grad_norm": 1.6711620092391968, "learning_rate": 8.516027388959584e-06, "loss": 1.7235, "step": 2450 }, { "epoch": 0.15149219815179518, "grad_norm": 1.807391881942749, "learning_rate": 8.48633581773011e-06, "loss": 1.7185, "step": 2500 }, { "epoch": 0.1545220421148311, "grad_norm": 1.809409499168396, "learning_rate": 8.456038296067382e-06, "loss": 1.7292, "step": 2550 }, { "epoch": 0.157551886077867, "grad_norm": 1.953994870185852, "learning_rate": 8.425740774404655e-06, "loss": 1.7225, "step": 2600 }, { "epoch": 0.1605817300409029, "grad_norm": 1.776473045349121, "learning_rate": 8.395443252741926e-06, "loss": 1.707, "step": 2650 }, { "epoch": 0.1636115740039388, "grad_norm": 2.0671963691711426, "learning_rate": 8.365145731079199e-06, "loss": 1.6829, "step": 2700 }, { "epoch": 0.1666414179669747, "grad_norm": 1.7150015830993652, "learning_rate": 8.33484820941647e-06, "loss": 1.6755, "step": 2750 }, { "epoch": 0.16967126193001061, "grad_norm": 1.7115235328674316, "learning_rate": 8.304550687753743e-06, "loss": 1.7345, "step": 2800 }, { "epoch": 0.17270110589304652, "grad_norm": 1.6895759105682373, "learning_rate": 8.274253166091014e-06, "loss": 1.6911, "step": 2850 }, { "epoch": 0.17573094985608242, "grad_norm": 1.8337711095809937, "learning_rate": 8.243955644428287e-06, "loss": 1.717, "step": 2900 }, { "epoch": 0.1787607938191183, "grad_norm": 1.6573454141616821, "learning_rate": 8.21365812276556e-06, "loss": 1.6788, "step": 2950 }, { "epoch": 0.1817906377821542, "grad_norm": 1.6749043464660645, "learning_rate": 8.18336060110283e-06, "loss": 1.6658, "step": 3000 }, { "epoch": 0.1848204817451901, "grad_norm": 1.8717437982559204, "learning_rate": 8.153063079440102e-06, "loss": 1.6504, "step": 3050 }, { "epoch": 0.18785032570822602, "grad_norm": 1.8825408220291138, "learning_rate": 8.122765557777375e-06, "loss": 1.689, "step": 3100 }, { "epoch": 0.19088016967126192, "grad_norm": 1.528969645500183, "learning_rate": 8.092468036114647e-06, "loss": 1.6896, "step": 3150 }, { "epoch": 0.19391001363429783, "grad_norm": 1.6371997594833374, "learning_rate": 8.062170514451918e-06, "loss": 1.6924, "step": 3200 }, { "epoch": 0.19693985759733373, "grad_norm": 1.6460844278335571, "learning_rate": 8.03187299278919e-06, "loss": 1.7174, "step": 3250 }, { "epoch": 0.19996970156036964, "grad_norm": 1.8048419952392578, "learning_rate": 8.001575471126462e-06, "loss": 1.6752, "step": 3300 }, { "epoch": 0.20299954552340554, "grad_norm": 1.5663115978240967, "learning_rate": 7.971277949463735e-06, "loss": 1.67, "step": 3350 }, { "epoch": 0.20602938948644145, "grad_norm": 1.651563048362732, "learning_rate": 7.940980427801006e-06, "loss": 1.676, "step": 3400 }, { "epoch": 0.20905923344947736, "grad_norm": 1.7851747274398804, "learning_rate": 7.910682906138279e-06, "loss": 1.6498, "step": 3450 }, { "epoch": 0.21208907741251326, "grad_norm": 1.839625358581543, "learning_rate": 7.88038538447555e-06, "loss": 1.6772, "step": 3500 }, { "epoch": 0.21511892137554917, "grad_norm": 1.769681453704834, "learning_rate": 7.850087862812823e-06, "loss": 1.6827, "step": 3550 }, { "epoch": 0.21814876533858507, "grad_norm": 1.9082216024398804, "learning_rate": 7.819790341150094e-06, "loss": 1.6918, "step": 3600 }, { "epoch": 0.22117860930162098, "grad_norm": 1.7255308628082275, "learning_rate": 7.789492819487367e-06, "loss": 1.6702, "step": 3650 }, { "epoch": 0.22420845326465688, "grad_norm": 1.5342808961868286, "learning_rate": 7.75919529782464e-06, "loss": 1.6514, "step": 3700 }, { "epoch": 0.2272382972276928, "grad_norm": 1.6255989074707031, "learning_rate": 7.72889777616191e-06, "loss": 1.6619, "step": 3750 }, { "epoch": 0.23026814119072866, "grad_norm": 1.726608157157898, "learning_rate": 7.698600254499182e-06, "loss": 1.6691, "step": 3800 }, { "epoch": 0.23329798515376457, "grad_norm": 1.7505511045455933, "learning_rate": 7.668302732836454e-06, "loss": 1.6603, "step": 3850 }, { "epoch": 0.23632782911680048, "grad_norm": 1.7649147510528564, "learning_rate": 7.638005211173727e-06, "loss": 1.7008, "step": 3900 }, { "epoch": 0.23935767307983638, "grad_norm": 1.7847344875335693, "learning_rate": 7.607707689510999e-06, "loss": 1.6701, "step": 3950 }, { "epoch": 0.24238751704287229, "grad_norm": 1.7049511671066284, "learning_rate": 7.57741016784827e-06, "loss": 1.6446, "step": 4000 }, { "epoch": 0.2454173610059082, "grad_norm": 1.5229992866516113, "learning_rate": 7.547112646185543e-06, "loss": 1.6482, "step": 4050 }, { "epoch": 0.2484472049689441, "grad_norm": 1.7003657817840576, "learning_rate": 7.516815124522815e-06, "loss": 1.6154, "step": 4100 }, { "epoch": 0.25147704893198003, "grad_norm": 2.140820026397705, "learning_rate": 7.486517602860086e-06, "loss": 1.6522, "step": 4150 }, { "epoch": 0.2545068928950159, "grad_norm": 1.580733060836792, "learning_rate": 7.456220081197358e-06, "loss": 1.6252, "step": 4200 }, { "epoch": 0.2575367368580518, "grad_norm": 1.912180781364441, "learning_rate": 7.425922559534631e-06, "loss": 1.6581, "step": 4250 }, { "epoch": 0.2605665808210877, "grad_norm": 1.6341553926467896, "learning_rate": 7.395625037871903e-06, "loss": 1.6569, "step": 4300 }, { "epoch": 0.2635964247841236, "grad_norm": 1.6689584255218506, "learning_rate": 7.365327516209174e-06, "loss": 1.67, "step": 4350 }, { "epoch": 0.2666262687471595, "grad_norm": 1.5784380435943604, "learning_rate": 7.335029994546447e-06, "loss": 1.6469, "step": 4400 }, { "epoch": 0.2696561127101954, "grad_norm": 1.8624850511550903, "learning_rate": 7.3047324728837186e-06, "loss": 1.6654, "step": 4450 }, { "epoch": 0.2726859566732313, "grad_norm": 1.8794349431991577, "learning_rate": 7.274434951220991e-06, "loss": 1.6662, "step": 4500 }, { "epoch": 0.2757158006362672, "grad_norm": 1.8854588270187378, "learning_rate": 7.2441374295582624e-06, "loss": 1.6837, "step": 4550 }, { "epoch": 0.2787456445993031, "grad_norm": 2.1526780128479004, "learning_rate": 7.213839907895534e-06, "loss": 1.6632, "step": 4600 }, { "epoch": 0.281775488562339, "grad_norm": 1.9971776008605957, "learning_rate": 7.183542386232807e-06, "loss": 1.6099, "step": 4650 }, { "epoch": 0.28480533252537493, "grad_norm": 1.598711609840393, "learning_rate": 7.153244864570079e-06, "loss": 1.6607, "step": 4700 }, { "epoch": 0.28783517648841084, "grad_norm": 1.4923897981643677, "learning_rate": 7.12294734290735e-06, "loss": 1.6116, "step": 4750 }, { "epoch": 0.29086502045144674, "grad_norm": 1.8050360679626465, "learning_rate": 7.092649821244623e-06, "loss": 1.6655, "step": 4800 }, { "epoch": 0.29389486441448265, "grad_norm": 1.7371289730072021, "learning_rate": 7.062352299581895e-06, "loss": 1.6276, "step": 4850 }, { "epoch": 0.29692470837751855, "grad_norm": 1.7267305850982666, "learning_rate": 7.032054777919167e-06, "loss": 1.6399, "step": 4900 }, { "epoch": 0.29995455234055446, "grad_norm": 1.5485690832138062, "learning_rate": 7.001757256256439e-06, "loss": 1.6188, "step": 4950 }, { "epoch": 0.30298439630359036, "grad_norm": 1.78826904296875, "learning_rate": 6.971459734593711e-06, "loss": 1.6426, "step": 5000 }, { "epoch": 0.30601424026662627, "grad_norm": 1.8727006912231445, "learning_rate": 6.941768163364238e-06, "loss": 1.6329, "step": 5050 }, { "epoch": 0.3090440842296622, "grad_norm": 1.8767826557159424, "learning_rate": 6.91147064170151e-06, "loss": 1.6724, "step": 5100 }, { "epoch": 0.3120739281926981, "grad_norm": 1.4456415176391602, "learning_rate": 6.881173120038781e-06, "loss": 1.6698, "step": 5150 }, { "epoch": 0.315103772155734, "grad_norm": 1.652795433998108, "learning_rate": 6.8508755983760535e-06, "loss": 1.6425, "step": 5200 }, { "epoch": 0.3181336161187699, "grad_norm": 1.609587550163269, "learning_rate": 6.8205780767133255e-06, "loss": 1.5815, "step": 5250 }, { "epoch": 0.3211634600818058, "grad_norm": 1.6337134838104248, "learning_rate": 6.790280555050598e-06, "loss": 1.6195, "step": 5300 }, { "epoch": 0.3241933040448417, "grad_norm": 1.3837682008743286, "learning_rate": 6.759983033387869e-06, "loss": 1.6397, "step": 5350 }, { "epoch": 0.3272231480078776, "grad_norm": 1.5774863958358765, "learning_rate": 6.729685511725141e-06, "loss": 1.6391, "step": 5400 }, { "epoch": 0.3302529919709135, "grad_norm": 1.6407530307769775, "learning_rate": 6.699387990062414e-06, "loss": 1.6216, "step": 5450 }, { "epoch": 0.3332828359339494, "grad_norm": 1.7346445322036743, "learning_rate": 6.669090468399686e-06, "loss": 1.593, "step": 5500 }, { "epoch": 0.3363126798969853, "grad_norm": 1.907456636428833, "learning_rate": 6.638792946736957e-06, "loss": 1.5871, "step": 5550 }, { "epoch": 0.33934252386002123, "grad_norm": 1.8702911138534546, "learning_rate": 6.60849542507423e-06, "loss": 1.6464, "step": 5600 }, { "epoch": 0.34237236782305713, "grad_norm": 1.750834345817566, "learning_rate": 6.578197903411502e-06, "loss": 1.5719, "step": 5650 }, { "epoch": 0.34540221178609304, "grad_norm": 1.632150650024414, "learning_rate": 6.547900381748773e-06, "loss": 1.6209, "step": 5700 }, { "epoch": 0.34843205574912894, "grad_norm": 1.6225661039352417, "learning_rate": 6.517602860086045e-06, "loss": 1.6307, "step": 5750 }, { "epoch": 0.35146189971216485, "grad_norm": 1.6222798824310303, "learning_rate": 6.487305338423318e-06, "loss": 1.6103, "step": 5800 }, { "epoch": 0.35449174367520075, "grad_norm": 1.9228448867797852, "learning_rate": 6.45700781676059e-06, "loss": 1.6603, "step": 5850 }, { "epoch": 0.3575215876382366, "grad_norm": 1.9268038272857666, "learning_rate": 6.426710295097861e-06, "loss": 1.6057, "step": 5900 }, { "epoch": 0.3605514316012725, "grad_norm": 1.4592260122299194, "learning_rate": 6.3964127734351335e-06, "loss": 1.639, "step": 5950 }, { "epoch": 0.3635812755643084, "grad_norm": 1.8319112062454224, "learning_rate": 6.366115251772405e-06, "loss": 1.6403, "step": 6000 }, { "epoch": 0.3666111195273443, "grad_norm": 1.6500194072723389, "learning_rate": 6.335817730109678e-06, "loss": 1.6216, "step": 6050 }, { "epoch": 0.3696409634903802, "grad_norm": 1.6576268672943115, "learning_rate": 6.305520208446949e-06, "loss": 1.6326, "step": 6100 }, { "epoch": 0.37267080745341613, "grad_norm": 2.2077317237854004, "learning_rate": 6.275222686784221e-06, "loss": 1.609, "step": 6150 }, { "epoch": 0.37570065141645204, "grad_norm": 1.4579232931137085, "learning_rate": 6.244925165121494e-06, "loss": 1.5816, "step": 6200 }, { "epoch": 0.37873049537948794, "grad_norm": 1.6647648811340332, "learning_rate": 6.214627643458766e-06, "loss": 1.6208, "step": 6250 }, { "epoch": 0.38176033934252385, "grad_norm": 1.413583755493164, "learning_rate": 6.184330121796037e-06, "loss": 1.6206, "step": 6300 }, { "epoch": 0.38479018330555975, "grad_norm": 1.743116021156311, "learning_rate": 6.15403260013331e-06, "loss": 1.5973, "step": 6350 }, { "epoch": 0.38782002726859566, "grad_norm": 1.8276968002319336, "learning_rate": 6.123735078470582e-06, "loss": 1.6384, "step": 6400 }, { "epoch": 0.39084987123163156, "grad_norm": 1.3672616481781006, "learning_rate": 6.093437556807854e-06, "loss": 1.5883, "step": 6450 }, { "epoch": 0.39387971519466747, "grad_norm": 1.6729201078414917, "learning_rate": 6.063140035145126e-06, "loss": 1.6074, "step": 6500 }, { "epoch": 0.3969095591577034, "grad_norm": 1.433160662651062, "learning_rate": 6.0328425134823976e-06, "loss": 1.6359, "step": 6550 }, { "epoch": 0.3999394031207393, "grad_norm": 1.6422028541564941, "learning_rate": 6.0025449918196695e-06, "loss": 1.6121, "step": 6600 }, { "epoch": 0.4029692470837752, "grad_norm": 1.6269747018814087, "learning_rate": 5.972247470156942e-06, "loss": 1.6143, "step": 6650 }, { "epoch": 0.4059990910468111, "grad_norm": 1.5140916109085083, "learning_rate": 5.941949948494213e-06, "loss": 1.5386, "step": 6700 }, { "epoch": 0.409028935009847, "grad_norm": 1.6418771743774414, "learning_rate": 5.911652426831485e-06, "loss": 1.5685, "step": 6750 }, { "epoch": 0.4120587789728829, "grad_norm": 2.016139268875122, "learning_rate": 5.881354905168758e-06, "loss": 1.6092, "step": 6800 }, { "epoch": 0.4150886229359188, "grad_norm": 1.8047841787338257, "learning_rate": 5.851057383506029e-06, "loss": 1.6405, "step": 6850 }, { "epoch": 0.4181184668989547, "grad_norm": 1.426552653312683, "learning_rate": 5.820759861843301e-06, "loss": 1.5648, "step": 6900 }, { "epoch": 0.4211483108619906, "grad_norm": 1.8354159593582153, "learning_rate": 5.790462340180574e-06, "loss": 1.6048, "step": 6950 }, { "epoch": 0.4241781548250265, "grad_norm": 1.475692868232727, "learning_rate": 5.760164818517846e-06, "loss": 1.5869, "step": 7000 }, { "epoch": 0.4272079987880624, "grad_norm": 1.6546450853347778, "learning_rate": 5.729867296855117e-06, "loss": 1.5866, "step": 7050 }, { "epoch": 0.43023784275109833, "grad_norm": 1.5041762590408325, "learning_rate": 5.69956977519239e-06, "loss": 1.6071, "step": 7100 }, { "epoch": 0.43326768671413424, "grad_norm": 1.5460017919540405, "learning_rate": 5.669272253529662e-06, "loss": 1.5823, "step": 7150 }, { "epoch": 0.43629753067717014, "grad_norm": 1.6852233409881592, "learning_rate": 5.6389747318669345e-06, "loss": 1.6109, "step": 7200 }, { "epoch": 0.43932737464020605, "grad_norm": 1.5145368576049805, "learning_rate": 5.6086772102042056e-06, "loss": 1.5887, "step": 7250 }, { "epoch": 0.44235721860324195, "grad_norm": 1.6494066715240479, "learning_rate": 5.5783796885414775e-06, "loss": 1.5881, "step": 7300 }, { "epoch": 0.44538706256627786, "grad_norm": 1.7732048034667969, "learning_rate": 5.54808216687875e-06, "loss": 1.5923, "step": 7350 }, { "epoch": 0.44841690652931376, "grad_norm": 2.078328847885132, "learning_rate": 5.517784645216022e-06, "loss": 1.6117, "step": 7400 }, { "epoch": 0.45144675049234967, "grad_norm": 1.5237518548965454, "learning_rate": 5.487487123553293e-06, "loss": 1.6206, "step": 7450 }, { "epoch": 0.4544765944553856, "grad_norm": 1.5857949256896973, "learning_rate": 5.457189601890566e-06, "loss": 1.5938, "step": 7500 }, { "epoch": 0.4575064384184214, "grad_norm": 1.8254096508026123, "learning_rate": 5.426892080227838e-06, "loss": 1.6217, "step": 7550 }, { "epoch": 0.46053628238145733, "grad_norm": 1.7777276039123535, "learning_rate": 5.39659455856511e-06, "loss": 1.5695, "step": 7600 }, { "epoch": 0.46356612634449323, "grad_norm": 1.6662938594818115, "learning_rate": 5.366297036902381e-06, "loss": 1.5875, "step": 7650 }, { "epoch": 0.46659597030752914, "grad_norm": 1.6636159420013428, "learning_rate": 5.335999515239654e-06, "loss": 1.5969, "step": 7700 }, { "epoch": 0.46962581427056505, "grad_norm": 1.4926279783248901, "learning_rate": 5.305701993576926e-06, "loss": 1.5905, "step": 7750 }, { "epoch": 0.47265565823360095, "grad_norm": 1.566446304321289, "learning_rate": 5.275404471914199e-06, "loss": 1.6126, "step": 7800 }, { "epoch": 0.47568550219663686, "grad_norm": 1.8684853315353394, "learning_rate": 5.24510695025147e-06, "loss": 1.5737, "step": 7850 }, { "epoch": 0.47871534615967276, "grad_norm": 1.8654356002807617, "learning_rate": 5.214809428588742e-06, "loss": 1.5964, "step": 7900 }, { "epoch": 0.48174519012270867, "grad_norm": 1.3818305730819702, "learning_rate": 5.184511906926014e-06, "loss": 1.5757, "step": 7950 }, { "epoch": 0.48477503408574457, "grad_norm": 1.6214017868041992, "learning_rate": 5.154214385263286e-06, "loss": 1.559, "step": 8000 }, { "epoch": 0.4878048780487805, "grad_norm": 1.5026991367340088, "learning_rate": 5.1239168636005574e-06, "loss": 1.6033, "step": 8050 }, { "epoch": 0.4908347220118164, "grad_norm": 1.4133880138397217, "learning_rate": 5.09361934193783e-06, "loss": 1.5947, "step": 8100 }, { "epoch": 0.4938645659748523, "grad_norm": 1.5185195207595825, "learning_rate": 5.063321820275102e-06, "loss": 1.5858, "step": 8150 }, { "epoch": 0.4968944099378882, "grad_norm": 1.5284918546676636, "learning_rate": 5.033024298612373e-06, "loss": 1.5998, "step": 8200 }, { "epoch": 0.4999242539009241, "grad_norm": 1.5977320671081543, "learning_rate": 5.002726776949646e-06, "loss": 1.5609, "step": 8250 }, { "epoch": 0.5029540978639601, "grad_norm": 1.6577131748199463, "learning_rate": 4.972429255286918e-06, "loss": 1.5684, "step": 8300 }, { "epoch": 0.5059839418269959, "grad_norm": 1.5592504739761353, "learning_rate": 4.94213173362419e-06, "loss": 1.597, "step": 8350 }, { "epoch": 0.5090137857900318, "grad_norm": 1.6733429431915283, "learning_rate": 4.911834211961462e-06, "loss": 1.5803, "step": 8400 }, { "epoch": 0.5120436297530677, "grad_norm": 1.390070915222168, "learning_rate": 4.881536690298734e-06, "loss": 1.5868, "step": 8450 }, { "epoch": 0.5150734737161036, "grad_norm": 1.6423391103744507, "learning_rate": 4.851239168636006e-06, "loss": 1.6197, "step": 8500 }, { "epoch": 0.5181033176791395, "grad_norm": 1.5211403369903564, "learning_rate": 4.820941646973278e-06, "loss": 1.5985, "step": 8550 }, { "epoch": 0.5211331616421754, "grad_norm": 1.3367249965667725, "learning_rate": 4.7906441253105504e-06, "loss": 1.563, "step": 8600 }, { "epoch": 0.5241630056052113, "grad_norm": 1.4820023775100708, "learning_rate": 4.7603466036478215e-06, "loss": 1.5519, "step": 8650 }, { "epoch": 0.5271928495682472, "grad_norm": 1.5887796878814697, "learning_rate": 4.7306550324183485e-06, "loss": 1.5901, "step": 8700 }, { "epoch": 0.5302226935312832, "grad_norm": 2.0754947662353516, "learning_rate": 4.7003575107556205e-06, "loss": 1.58, "step": 8750 }, { "epoch": 0.533252537494319, "grad_norm": 1.545721173286438, "learning_rate": 4.670059989092892e-06, "loss": 1.5588, "step": 8800 }, { "epoch": 0.536282381457355, "grad_norm": 1.69772469997406, "learning_rate": 4.639762467430164e-06, "loss": 1.5847, "step": 8850 }, { "epoch": 0.5393122254203908, "grad_norm": 1.5981988906860352, "learning_rate": 4.609464945767437e-06, "loss": 1.5508, "step": 8900 }, { "epoch": 0.5423420693834268, "grad_norm": 1.884867787361145, "learning_rate": 4.579167424104708e-06, "loss": 1.5583, "step": 8950 }, { "epoch": 0.5453719133464626, "grad_norm": 1.8997677564620972, "learning_rate": 4.548869902441981e-06, "loss": 1.5681, "step": 9000 }, { "epoch": 0.5484017573094986, "grad_norm": 1.6099635362625122, "learning_rate": 4.518572380779253e-06, "loss": 1.5838, "step": 9050 }, { "epoch": 0.5514316012725344, "grad_norm": 1.659430742263794, "learning_rate": 4.488274859116525e-06, "loss": 1.5876, "step": 9100 }, { "epoch": 0.5544614452355704, "grad_norm": 1.72199547290802, "learning_rate": 4.457977337453797e-06, "loss": 1.5776, "step": 9150 }, { "epoch": 0.5574912891986062, "grad_norm": 1.5597950220108032, "learning_rate": 4.427679815791069e-06, "loss": 1.5383, "step": 9200 }, { "epoch": 0.5605211331616422, "grad_norm": 1.8483946323394775, "learning_rate": 4.397382294128341e-06, "loss": 1.6229, "step": 9250 }, { "epoch": 0.563550977124678, "grad_norm": 1.459078311920166, "learning_rate": 4.367084772465613e-06, "loss": 1.5796, "step": 9300 }, { "epoch": 0.566580821087714, "grad_norm": 1.5624710321426392, "learning_rate": 4.336787250802885e-06, "loss": 1.5648, "step": 9350 }, { "epoch": 0.5696106650507499, "grad_norm": 1.5045188665390015, "learning_rate": 4.306489729140157e-06, "loss": 1.6078, "step": 9400 }, { "epoch": 0.5726405090137858, "grad_norm": 1.5284470319747925, "learning_rate": 4.2761922074774285e-06, "loss": 1.5581, "step": 9450 }, { "epoch": 0.5756703529768217, "grad_norm": 1.7991974353790283, "learning_rate": 4.2458946858147e-06, "loss": 1.5629, "step": 9500 }, { "epoch": 0.5787001969398576, "grad_norm": 1.5898367166519165, "learning_rate": 4.215597164151972e-06, "loss": 1.6102, "step": 9550 }, { "epoch": 0.5817300409028935, "grad_norm": 1.5055047273635864, "learning_rate": 4.185299642489244e-06, "loss": 1.5602, "step": 9600 }, { "epoch": 0.5847598848659294, "grad_norm": 1.4318591356277466, "learning_rate": 4.155002120826517e-06, "loss": 1.5558, "step": 9650 }, { "epoch": 0.5877897288289653, "grad_norm": 1.6692588329315186, "learning_rate": 4.124704599163788e-06, "loss": 1.573, "step": 9700 }, { "epoch": 0.5908195727920013, "grad_norm": 1.4996757507324219, "learning_rate": 4.094407077501061e-06, "loss": 1.5641, "step": 9750 }, { "epoch": 0.5938494167550371, "grad_norm": 1.491071343421936, "learning_rate": 4.064109555838333e-06, "loss": 1.5546, "step": 9800 }, { "epoch": 0.5968792607180731, "grad_norm": 1.5985503196716309, "learning_rate": 4.033812034175605e-06, "loss": 1.5443, "step": 9850 }, { "epoch": 0.5999091046811089, "grad_norm": 1.6403000354766846, "learning_rate": 4.003514512512877e-06, "loss": 1.591, "step": 9900 }, { "epoch": 0.6029389486441449, "grad_norm": 1.3770664930343628, "learning_rate": 3.973216990850149e-06, "loss": 1.5142, "step": 9950 }, { "epoch": 0.6059687926071807, "grad_norm": 1.6218574047088623, "learning_rate": 3.942919469187421e-06, "loss": 1.5854, "step": 10000 }, { "epoch": 0.6089986365702167, "grad_norm": 1.2990355491638184, "learning_rate": 3.9126219475246926e-06, "loss": 1.5595, "step": 10050 }, { "epoch": 0.6120284805332525, "grad_norm": 1.4130761623382568, "learning_rate": 3.8823244258619645e-06, "loss": 1.5471, "step": 10100 }, { "epoch": 0.6150583244962884, "grad_norm": 1.4140211343765259, "learning_rate": 3.852026904199237e-06, "loss": 1.5071, "step": 10150 }, { "epoch": 0.6180881684593243, "grad_norm": 1.4824833869934082, "learning_rate": 3.821729382536508e-06, "loss": 1.5711, "step": 10200 }, { "epoch": 0.6211180124223602, "grad_norm": 1.6242071390151978, "learning_rate": 3.791431860873781e-06, "loss": 1.5181, "step": 10250 }, { "epoch": 0.6241478563853962, "grad_norm": 1.4817885160446167, "learning_rate": 3.7611343392110527e-06, "loss": 1.5468, "step": 10300 }, { "epoch": 0.627177700348432, "grad_norm": 1.3601789474487305, "learning_rate": 3.730836817548325e-06, "loss": 1.5761, "step": 10350 }, { "epoch": 0.630207544311468, "grad_norm": 1.3270106315612793, "learning_rate": 3.700539295885597e-06, "loss": 1.5336, "step": 10400 }, { "epoch": 0.6332373882745038, "grad_norm": 1.4662185907363892, "learning_rate": 3.670241774222869e-06, "loss": 1.5958, "step": 10450 }, { "epoch": 0.6362672322375398, "grad_norm": 1.480850100517273, "learning_rate": 3.639944252560141e-06, "loss": 1.565, "step": 10500 }, { "epoch": 0.6392970762005756, "grad_norm": 1.4275442361831665, "learning_rate": 3.6096467308974132e-06, "loss": 1.5316, "step": 10550 }, { "epoch": 0.6423269201636116, "grad_norm": 1.709631323814392, "learning_rate": 3.5793492092346847e-06, "loss": 1.5568, "step": 10600 }, { "epoch": 0.6453567641266474, "grad_norm": 1.4881377220153809, "learning_rate": 3.549051687571957e-06, "loss": 1.5452, "step": 10650 }, { "epoch": 0.6483866080896834, "grad_norm": 1.3687924146652222, "learning_rate": 3.518754165909229e-06, "loss": 1.567, "step": 10700 }, { "epoch": 0.6514164520527193, "grad_norm": 1.2314640283584595, "learning_rate": 3.4884566442465006e-06, "loss": 1.5678, "step": 10750 }, { "epoch": 0.6544462960157552, "grad_norm": 1.5389395952224731, "learning_rate": 3.458159122583773e-06, "loss": 1.5624, "step": 10800 }, { "epoch": 0.6574761399787911, "grad_norm": 1.401017189025879, "learning_rate": 3.427861600921045e-06, "loss": 1.5287, "step": 10850 }, { "epoch": 0.660505983941827, "grad_norm": 1.9174494743347168, "learning_rate": 3.397564079258317e-06, "loss": 1.547, "step": 10900 }, { "epoch": 0.6635358279048629, "grad_norm": 1.5186442136764526, "learning_rate": 3.3672665575955887e-06, "loss": 1.5489, "step": 10950 }, { "epoch": 0.6665656718678988, "grad_norm": 1.3896212577819824, "learning_rate": 3.336969035932861e-06, "loss": 1.5368, "step": 11000 }, { "epoch": 0.6695955158309347, "grad_norm": 1.7711433172225952, "learning_rate": 3.3072774647033872e-06, "loss": 1.5129, "step": 11050 }, { "epoch": 0.6726253597939706, "grad_norm": 1.5082119703292847, "learning_rate": 3.2769799430406596e-06, "loss": 1.5491, "step": 11100 }, { "epoch": 0.6756552037570065, "grad_norm": 1.3804906606674194, "learning_rate": 3.246682421377931e-06, "loss": 1.5687, "step": 11150 }, { "epoch": 0.6786850477200425, "grad_norm": 1.6013615131378174, "learning_rate": 3.2163848997152035e-06, "loss": 1.5891, "step": 11200 }, { "epoch": 0.6817148916830783, "grad_norm": 1.6114518642425537, "learning_rate": 3.1860873780524754e-06, "loss": 1.5891, "step": 11250 }, { "epoch": 0.6847447356461143, "grad_norm": 1.5464235544204712, "learning_rate": 3.1557898563897478e-06, "loss": 1.5576, "step": 11300 }, { "epoch": 0.6877745796091501, "grad_norm": 1.3998115062713623, "learning_rate": 3.1254923347270193e-06, "loss": 1.543, "step": 11350 }, { "epoch": 0.6908044235721861, "grad_norm": 1.5496994256973267, "learning_rate": 3.0951948130642917e-06, "loss": 1.5751, "step": 11400 }, { "epoch": 0.6938342675352219, "grad_norm": 1.4767708778381348, "learning_rate": 3.0648972914015636e-06, "loss": 1.5892, "step": 11450 }, { "epoch": 0.6968641114982579, "grad_norm": 1.5664912462234497, "learning_rate": 3.0345997697388355e-06, "loss": 1.5674, "step": 11500 }, { "epoch": 0.6998939554612937, "grad_norm": 1.5581713914871216, "learning_rate": 3.0043022480761075e-06, "loss": 1.5157, "step": 11550 }, { "epoch": 0.7029237994243297, "grad_norm": 1.4770848751068115, "learning_rate": 2.97400472641338e-06, "loss": 1.5241, "step": 11600 }, { "epoch": 0.7059536433873655, "grad_norm": 1.5183898210525513, "learning_rate": 2.9437072047506514e-06, "loss": 1.5752, "step": 11650 }, { "epoch": 0.7089834873504015, "grad_norm": 1.3331156969070435, "learning_rate": 2.9134096830879237e-06, "loss": 1.5524, "step": 11700 }, { "epoch": 0.7120133313134374, "grad_norm": 1.3551660776138306, "learning_rate": 2.8831121614251957e-06, "loss": 1.5653, "step": 11750 }, { "epoch": 0.7150431752764732, "grad_norm": 1.4175201654434204, "learning_rate": 2.852814639762468e-06, "loss": 1.5538, "step": 11800 }, { "epoch": 0.7180730192395092, "grad_norm": 1.482858657836914, "learning_rate": 2.8225171180997395e-06, "loss": 1.5411, "step": 11850 }, { "epoch": 0.721102863202545, "grad_norm": 1.3337117433547974, "learning_rate": 2.792219596437012e-06, "loss": 1.5402, "step": 11900 }, { "epoch": 0.724132707165581, "grad_norm": 1.529326319694519, "learning_rate": 2.761922074774284e-06, "loss": 1.5357, "step": 11950 }, { "epoch": 0.7271625511286168, "grad_norm": 1.40908944606781, "learning_rate": 2.7316245531115558e-06, "loss": 1.521, "step": 12000 }, { "epoch": 0.7301923950916528, "grad_norm": 1.434410572052002, "learning_rate": 2.7013270314488277e-06, "loss": 1.5723, "step": 12050 }, { "epoch": 0.7332222390546886, "grad_norm": 1.5077331066131592, "learning_rate": 2.6710295097861e-06, "loss": 1.5979, "step": 12100 }, { "epoch": 0.7362520830177246, "grad_norm": 1.4637471437454224, "learning_rate": 2.6407319881233716e-06, "loss": 1.5465, "step": 12150 }, { "epoch": 0.7392819269807605, "grad_norm": 1.7819764614105225, "learning_rate": 2.6104344664606435e-06, "loss": 1.5425, "step": 12200 }, { "epoch": 0.7423117709437964, "grad_norm": 1.3142591714859009, "learning_rate": 2.580136944797916e-06, "loss": 1.5659, "step": 12250 }, { "epoch": 0.7453416149068323, "grad_norm": 1.4326512813568115, "learning_rate": 2.5498394231351874e-06, "loss": 1.5506, "step": 12300 }, { "epoch": 0.7483714588698682, "grad_norm": 1.3926914930343628, "learning_rate": 2.5195419014724598e-06, "loss": 1.519, "step": 12350 }, { "epoch": 0.7514013028329041, "grad_norm": 1.5886470079421997, "learning_rate": 2.4892443798097317e-06, "loss": 1.5323, "step": 12400 }, { "epoch": 0.75443114679594, "grad_norm": 1.6596341133117676, "learning_rate": 2.4589468581470036e-06, "loss": 1.5572, "step": 12450 }, { "epoch": 0.7574609907589759, "grad_norm": 1.5624167919158936, "learning_rate": 2.428649336484276e-06, "loss": 1.517, "step": 12500 }, { "epoch": 0.7604908347220118, "grad_norm": 1.5052602291107178, "learning_rate": 2.398351814821548e-06, "loss": 1.5634, "step": 12550 }, { "epoch": 0.7635206786850477, "grad_norm": 1.4568336009979248, "learning_rate": 2.36805429315882e-06, "loss": 1.5581, "step": 12600 }, { "epoch": 0.7665505226480837, "grad_norm": 1.4686074256896973, "learning_rate": 2.337756771496092e-06, "loss": 1.5383, "step": 12650 }, { "epoch": 0.7695803666111195, "grad_norm": 1.53707754611969, "learning_rate": 2.3074592498333638e-06, "loss": 1.5654, "step": 12700 }, { "epoch": 0.7726102105741555, "grad_norm": 1.5452001094818115, "learning_rate": 2.277161728170636e-06, "loss": 1.5772, "step": 12750 }, { "epoch": 0.7756400545371913, "grad_norm": 1.553062915802002, "learning_rate": 2.2468642065079076e-06, "loss": 1.5178, "step": 12800 }, { "epoch": 0.7786698985002273, "grad_norm": 1.328934669494629, "learning_rate": 2.2165666848451796e-06, "loss": 1.5718, "step": 12850 }, { "epoch": 0.7816997424632631, "grad_norm": 1.7127735614776611, "learning_rate": 2.186269163182452e-06, "loss": 1.5341, "step": 12900 }, { "epoch": 0.7847295864262991, "grad_norm": 1.728936791419983, "learning_rate": 2.155971641519724e-06, "loss": 1.5546, "step": 12950 }, { "epoch": 0.7877594303893349, "grad_norm": 1.6442391872406006, "learning_rate": 2.125674119856996e-06, "loss": 1.56, "step": 13000 }, { "epoch": 0.7907892743523709, "grad_norm": 1.4701004028320312, "learning_rate": 2.0953765981942678e-06, "loss": 1.5182, "step": 13050 }, { "epoch": 0.7938191183154067, "grad_norm": 1.2920253276824951, "learning_rate": 2.0650790765315397e-06, "loss": 1.5291, "step": 13100 }, { "epoch": 0.7968489622784427, "grad_norm": 1.3979644775390625, "learning_rate": 2.034781554868812e-06, "loss": 1.5246, "step": 13150 }, { "epoch": 0.7998788062414786, "grad_norm": 1.7545385360717773, "learning_rate": 2.004484033206084e-06, "loss": 1.5752, "step": 13200 }, { "epoch": 0.8029086502045145, "grad_norm": 1.347290277481079, "learning_rate": 1.974186511543356e-06, "loss": 1.5264, "step": 13250 }, { "epoch": 0.8059384941675504, "grad_norm": 1.4548012018203735, "learning_rate": 1.943888989880628e-06, "loss": 1.5995, "step": 13300 }, { "epoch": 0.8089683381305863, "grad_norm": 1.6475197076797485, "learning_rate": 1.9135914682179e-06, "loss": 1.5506, "step": 13350 }, { "epoch": 0.8119981820936222, "grad_norm": 1.497528314590454, "learning_rate": 1.883293946555172e-06, "loss": 1.5702, "step": 13400 }, { "epoch": 0.815028026056658, "grad_norm": 1.791481852531433, "learning_rate": 1.852996424892444e-06, "loss": 1.5474, "step": 13450 }, { "epoch": 0.818057870019694, "grad_norm": 1.5969312191009521, "learning_rate": 1.822698903229716e-06, "loss": 1.5416, "step": 13500 }, { "epoch": 0.8210877139827298, "grad_norm": 1.4239813089370728, "learning_rate": 1.792401381566988e-06, "loss": 1.5252, "step": 13550 }, { "epoch": 0.8241175579457658, "grad_norm": 1.579437017440796, "learning_rate": 1.76210385990426e-06, "loss": 1.5764, "step": 13600 }, { "epoch": 0.8271474019088016, "grad_norm": 1.6514719724655151, "learning_rate": 1.731806338241532e-06, "loss": 1.545, "step": 13650 }, { "epoch": 0.8301772458718376, "grad_norm": 1.5468313694000244, "learning_rate": 1.701508816578804e-06, "loss": 1.5377, "step": 13700 }, { "epoch": 0.8332070898348735, "grad_norm": 1.8032771348953247, "learning_rate": 1.6712112949160762e-06, "loss": 1.5183, "step": 13750 }, { "epoch": 0.8362369337979094, "grad_norm": 2.1406657695770264, "learning_rate": 1.6409137732533481e-06, "loss": 1.5268, "step": 13800 }, { "epoch": 0.8392667777609453, "grad_norm": 1.7589893341064453, "learning_rate": 1.61061625159062e-06, "loss": 1.5054, "step": 13850 }, { "epoch": 0.8422966217239812, "grad_norm": 1.619626522064209, "learning_rate": 1.5803187299278922e-06, "loss": 1.5747, "step": 13900 }, { "epoch": 0.8453264656870171, "grad_norm": 1.3544018268585205, "learning_rate": 1.5500212082651641e-06, "loss": 1.5264, "step": 13950 }, { "epoch": 0.848356309650053, "grad_norm": 1.3174818754196167, "learning_rate": 1.5197236866024359e-06, "loss": 1.4991, "step": 14000 }, { "epoch": 0.8513861536130889, "grad_norm": 1.3408424854278564, "learning_rate": 1.489426164939708e-06, "loss": 1.5574, "step": 14050 }, { "epoch": 0.8544159975761249, "grad_norm": 1.5097547769546509, "learning_rate": 1.45912864327698e-06, "loss": 1.5423, "step": 14100 }, { "epoch": 0.8574458415391607, "grad_norm": 1.5766481161117554, "learning_rate": 1.4288311216142519e-06, "loss": 1.5253, "step": 14150 }, { "epoch": 0.8604756855021967, "grad_norm": 1.2671705484390259, "learning_rate": 1.398533599951524e-06, "loss": 1.5421, "step": 14200 }, { "epoch": 0.8635055294652325, "grad_norm": 1.622070074081421, "learning_rate": 1.368236078288796e-06, "loss": 1.5283, "step": 14250 }, { "epoch": 0.8665353734282685, "grad_norm": 1.5320727825164795, "learning_rate": 1.3379385566260681e-06, "loss": 1.5386, "step": 14300 }, { "epoch": 0.8695652173913043, "grad_norm": 1.5714218616485596, "learning_rate": 1.30764103496334e-06, "loss": 1.558, "step": 14350 }, { "epoch": 0.8725950613543403, "grad_norm": 1.5528987646102905, "learning_rate": 1.277343513300612e-06, "loss": 1.5058, "step": 14400 }, { "epoch": 0.8756249053173761, "grad_norm": 1.4519600868225098, "learning_rate": 1.2470459916378842e-06, "loss": 1.535, "step": 14450 }, { "epoch": 0.8786547492804121, "grad_norm": 1.9346317052841187, "learning_rate": 1.2173544204084107e-06, "loss": 1.5321, "step": 14500 }, { "epoch": 0.881684593243448, "grad_norm": 1.4452751874923706, "learning_rate": 1.1870568987456827e-06, "loss": 1.5147, "step": 14550 }, { "epoch": 0.8847144372064839, "grad_norm": 1.4155831336975098, "learning_rate": 1.1567593770829546e-06, "loss": 1.5439, "step": 14600 }, { "epoch": 0.8877442811695198, "grad_norm": 1.3461506366729736, "learning_rate": 1.1264618554202268e-06, "loss": 1.517, "step": 14650 }, { "epoch": 0.8907741251325557, "grad_norm": 1.4958879947662354, "learning_rate": 1.0961643337574987e-06, "loss": 1.5288, "step": 14700 }, { "epoch": 0.8938039690955916, "grad_norm": 1.610373616218567, "learning_rate": 1.0658668120947706e-06, "loss": 1.5272, "step": 14750 }, { "epoch": 0.8968338130586275, "grad_norm": 1.5633084774017334, "learning_rate": 1.0355692904320428e-06, "loss": 1.524, "step": 14800 }, { "epoch": 0.8998636570216634, "grad_norm": 1.5080559253692627, "learning_rate": 1.0052717687693147e-06, "loss": 1.5302, "step": 14850 }, { "epoch": 0.9028935009846993, "grad_norm": 1.5691548585891724, "learning_rate": 9.749742471065869e-07, "loss": 1.5573, "step": 14900 }, { "epoch": 0.9059233449477352, "grad_norm": 1.44107985496521, "learning_rate": 9.446767254438588e-07, "loss": 1.5266, "step": 14950 }, { "epoch": 0.9089531889107711, "grad_norm": 1.5320124626159668, "learning_rate": 9.143792037811309e-07, "loss": 1.5842, "step": 15000 }, { "epoch": 0.911983032873807, "grad_norm": 1.8644306659698486, "learning_rate": 8.840816821184028e-07, "loss": 1.5547, "step": 15050 }, { "epoch": 0.9150128768368428, "grad_norm": 1.5941940546035767, "learning_rate": 8.537841604556748e-07, "loss": 1.5481, "step": 15100 }, { "epoch": 0.9180427207998788, "grad_norm": 1.3055516481399536, "learning_rate": 8.234866387929468e-07, "loss": 1.5227, "step": 15150 }, { "epoch": 0.9210725647629147, "grad_norm": 1.5031893253326416, "learning_rate": 7.931891171302187e-07, "loss": 1.5489, "step": 15200 }, { "epoch": 0.9241024087259506, "grad_norm": 1.575303554534912, "learning_rate": 7.628915954674908e-07, "loss": 1.5486, "step": 15250 }, { "epoch": 0.9271322526889865, "grad_norm": 1.5559568405151367, "learning_rate": 7.325940738047628e-07, "loss": 1.4914, "step": 15300 }, { "epoch": 0.9301620966520224, "grad_norm": 1.3393162488937378, "learning_rate": 7.022965521420348e-07, "loss": 1.5474, "step": 15350 }, { "epoch": 0.9331919406150583, "grad_norm": 1.404003381729126, "learning_rate": 6.719990304793069e-07, "loss": 1.5088, "step": 15400 }, { "epoch": 0.9362217845780942, "grad_norm": 1.5367882251739502, "learning_rate": 6.417015088165788e-07, "loss": 1.5545, "step": 15450 }, { "epoch": 0.9392516285411301, "grad_norm": 1.4406241178512573, "learning_rate": 6.114039871538509e-07, "loss": 1.527, "step": 15500 }, { "epoch": 0.942281472504166, "grad_norm": 1.3530008792877197, "learning_rate": 5.811064654911229e-07, "loss": 1.5504, "step": 15550 }, { "epoch": 0.9453113164672019, "grad_norm": 1.4533566236495972, "learning_rate": 5.508089438283949e-07, "loss": 1.518, "step": 15600 }, { "epoch": 0.9483411604302379, "grad_norm": 1.5848952531814575, "learning_rate": 5.205114221656669e-07, "loss": 1.524, "step": 15650 }, { "epoch": 0.9513710043932737, "grad_norm": 1.5302923917770386, "learning_rate": 4.902139005029388e-07, "loss": 1.5194, "step": 15700 }, { "epoch": 0.9544008483563097, "grad_norm": 1.381064772605896, "learning_rate": 4.599163788402109e-07, "loss": 1.5345, "step": 15750 }, { "epoch": 0.9574306923193455, "grad_norm": 1.5697840452194214, "learning_rate": 4.2961885717748293e-07, "loss": 1.5332, "step": 15800 }, { "epoch": 0.9604605362823815, "grad_norm": 1.5636427402496338, "learning_rate": 3.99321335514755e-07, "loss": 1.5104, "step": 15850 }, { "epoch": 0.9634903802454173, "grad_norm": 1.45536208152771, "learning_rate": 3.690238138520269e-07, "loss": 1.5505, "step": 15900 }, { "epoch": 0.9665202242084533, "grad_norm": 1.49254310131073, "learning_rate": 3.387262921892989e-07, "loss": 1.512, "step": 15950 }, { "epoch": 0.9695500681714891, "grad_norm": 1.603808045387268, "learning_rate": 3.0842877052657095e-07, "loss": 1.5309, "step": 16000 }, { "epoch": 0.9725799121345251, "grad_norm": 1.5627918243408203, "learning_rate": 2.7813124886384294e-07, "loss": 1.5498, "step": 16050 }, { "epoch": 0.975609756097561, "grad_norm": 1.4155369997024536, "learning_rate": 2.47833727201115e-07, "loss": 1.567, "step": 16100 }, { "epoch": 0.9786396000605969, "grad_norm": 1.3787842988967896, "learning_rate": 2.1753620553838698e-07, "loss": 1.5511, "step": 16150 }, { "epoch": 0.9816694440236328, "grad_norm": 1.3937129974365234, "learning_rate": 1.87238683875659e-07, "loss": 1.4971, "step": 16200 }, { "epoch": 0.9846992879866687, "grad_norm": 1.588771104812622, "learning_rate": 1.5694116221293101e-07, "loss": 1.5384, "step": 16250 }, { "epoch": 0.9877291319497046, "grad_norm": 1.402185320854187, "learning_rate": 1.26643640550203e-07, "loss": 1.517, "step": 16300 }, { "epoch": 0.9907589759127405, "grad_norm": 1.557084321975708, "learning_rate": 9.634611888747501e-08, "loss": 1.5527, "step": 16350 }, { "epoch": 0.9937888198757764, "grad_norm": 1.4594271183013916, "learning_rate": 6.604859722474703e-08, "loss": 1.5216, "step": 16400 }, { "epoch": 0.9968186638388123, "grad_norm": 1.467354655265808, "learning_rate": 3.5751075562019034e-08, "loss": 1.5684, "step": 16450 }, { "epoch": 0.9998485078018482, "grad_norm": 1.326578140258789, "learning_rate": 5.453553899291039e-09, "loss": 1.5009, "step": 16500 }, { "epoch": 1.0, "eval_loss": 1.4647282361984253, "eval_runtime": 301.5458, "eval_samples_per_second": 48.646, "eval_steps_per_second": 6.082, "step": 16503 } ], "logging_steps": 50, "max_steps": 16503, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.4495512772608e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }