{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 1000, "global_step": 9753, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03076804738279297, "grad_norm": 2.239741325378418, "learning_rate": 4.85e-05, "loss": 2.6548, "step": 100 }, { "epoch": 0.06153609476558594, "grad_norm": 1.6382789611816406, "learning_rate": 4.94975655236714e-05, "loss": 2.3258, "step": 200 }, { "epoch": 0.09230414214837891, "grad_norm": 1.4356640577316284, "learning_rate": 4.89795918367347e-05, "loss": 2.2618, "step": 300 }, { "epoch": 0.12307218953117188, "grad_norm": 1.5954153537750244, "learning_rate": 4.846161814979799e-05, "loss": 2.2213, "step": 400 }, { "epoch": 0.15384023691396484, "grad_norm": 1.5558085441589355, "learning_rate": 4.794364446286129e-05, "loss": 2.2169, "step": 500 }, { "epoch": 0.18460828429675782, "grad_norm": 1.3300689458847046, "learning_rate": 4.7425670775924586e-05, "loss": 2.1835, "step": 600 }, { "epoch": 0.21537633167955078, "grad_norm": 1.5879923105239868, "learning_rate": 4.690769708898788e-05, "loss": 2.1786, "step": 700 }, { "epoch": 0.24614437906234377, "grad_norm": 1.4449992179870605, "learning_rate": 4.638972340205118e-05, "loss": 2.17, "step": 800 }, { "epoch": 0.27691242644513675, "grad_norm": 1.660819411277771, "learning_rate": 4.5871749715114475e-05, "loss": 2.1397, "step": 900 }, { "epoch": 0.3076804738279297, "grad_norm": 1.3502899408340454, "learning_rate": 4.535377602817777e-05, "loss": 2.1476, "step": 1000 }, { "epoch": 0.3076804738279297, "eval_loss": 2.0527570247650146, "eval_runtime": 4.7056, "eval_samples_per_second": 42.502, "eval_steps_per_second": 21.251, "step": 1000 }, { "epoch": 0.33844852121072266, "grad_norm": 1.5296121835708618, "learning_rate": 4.483580234124107e-05, "loss": 2.1347, "step": 1100 }, { "epoch": 0.36921656859351565, "grad_norm": 1.4386141300201416, "learning_rate": 4.4317828654304364e-05, "loss": 2.1128, "step": 1200 }, { "epoch": 0.3999846159763086, "grad_norm": 1.7359758615493774, "learning_rate": 4.379985496736766e-05, "loss": 2.1256, "step": 1300 }, { "epoch": 0.43075266335910156, "grad_norm": 1.2721539735794067, "learning_rate": 4.3281881280430956e-05, "loss": 2.0986, "step": 1400 }, { "epoch": 0.46152071074189455, "grad_norm": 1.507233738899231, "learning_rate": 4.276390759349425e-05, "loss": 2.1203, "step": 1500 }, { "epoch": 0.49228875812468753, "grad_norm": 1.5739669799804688, "learning_rate": 4.224593390655755e-05, "loss": 2.0817, "step": 1600 }, { "epoch": 0.5230568055074805, "grad_norm": 1.7889518737792969, "learning_rate": 4.1727960219620845e-05, "loss": 2.0951, "step": 1700 }, { "epoch": 0.5538248528902735, "grad_norm": 1.3344364166259766, "learning_rate": 4.120998653268414e-05, "loss": 2.0856, "step": 1800 }, { "epoch": 0.5845929002730664, "grad_norm": 1.4601390361785889, "learning_rate": 4.069201284574744e-05, "loss": 2.0758, "step": 1900 }, { "epoch": 0.6153609476558594, "grad_norm": 1.1662238836288452, "learning_rate": 4.0174039158810734e-05, "loss": 2.0747, "step": 2000 }, { "epoch": 0.6153609476558594, "eval_loss": 1.9568997621536255, "eval_runtime": 4.6999, "eval_samples_per_second": 42.554, "eval_steps_per_second": 21.277, "step": 2000 }, { "epoch": 0.6461289950386524, "grad_norm": 1.5872595310211182, "learning_rate": 3.965606547187403e-05, "loss": 2.0822, "step": 2100 }, { "epoch": 0.6768970424214453, "grad_norm": 1.1704156398773193, "learning_rate": 3.913809178493733e-05, "loss": 2.0962, "step": 2200 }, { "epoch": 0.7076650898042383, "grad_norm": 1.6314693689346313, "learning_rate": 3.8620118098000623e-05, "loss": 2.0639, "step": 2300 }, { "epoch": 0.7384331371870313, "grad_norm": 1.4110997915267944, "learning_rate": 3.810214441106392e-05, "loss": 2.0581, "step": 2400 }, { "epoch": 0.7692011845698242, "grad_norm": 1.2487901449203491, "learning_rate": 3.7584170724127216e-05, "loss": 2.0588, "step": 2500 }, { "epoch": 0.7999692319526172, "grad_norm": 1.5325045585632324, "learning_rate": 3.706619703719051e-05, "loss": 2.0731, "step": 2600 }, { "epoch": 0.8307372793354102, "grad_norm": 1.2173811197280884, "learning_rate": 3.654822335025381e-05, "loss": 2.0459, "step": 2700 }, { "epoch": 0.8615053267182031, "grad_norm": 1.6108065843582153, "learning_rate": 3.6030249663317105e-05, "loss": 2.0776, "step": 2800 }, { "epoch": 0.8922733741009962, "grad_norm": 1.7993882894515991, "learning_rate": 3.55122759763804e-05, "loss": 2.0391, "step": 2900 }, { "epoch": 0.9230414214837891, "grad_norm": 1.4659514427185059, "learning_rate": 3.49943022894437e-05, "loss": 2.0292, "step": 3000 }, { "epoch": 0.9230414214837891, "eval_loss": 1.8916079998016357, "eval_runtime": 4.8218, "eval_samples_per_second": 41.478, "eval_steps_per_second": 20.739, "step": 3000 }, { "epoch": 0.953809468866582, "grad_norm": 1.5378167629241943, "learning_rate": 3.4476328602506994e-05, "loss": 2.0525, "step": 3100 }, { "epoch": 0.9845775162493751, "grad_norm": 1.4396274089813232, "learning_rate": 3.395835491557029e-05, "loss": 2.0479, "step": 3200 }, { "epoch": 1.0150763432175685, "grad_norm": 1.1797999143600464, "learning_rate": 3.344038122863359e-05, "loss": 1.9838, "step": 3300 }, { "epoch": 1.0458443906003616, "grad_norm": 1.2820570468902588, "learning_rate": 3.292240754169688e-05, "loss": 1.9482, "step": 3400 }, { "epoch": 1.0766124379831545, "grad_norm": 1.4676374197006226, "learning_rate": 3.240443385476018e-05, "loss": 1.9538, "step": 3500 }, { "epoch": 1.1073804853659475, "grad_norm": 1.5519450902938843, "learning_rate": 3.1886460167823476e-05, "loss": 1.9727, "step": 3600 }, { "epoch": 1.1381485327487404, "grad_norm": 1.3620473146438599, "learning_rate": 3.136848648088677e-05, "loss": 1.96, "step": 3700 }, { "epoch": 1.1689165801315333, "grad_norm": 1.3301316499710083, "learning_rate": 3.085051279395007e-05, "loss": 1.9463, "step": 3800 }, { "epoch": 1.1996846275143263, "grad_norm": 1.3053226470947266, "learning_rate": 3.0332539107013365e-05, "loss": 1.9709, "step": 3900 }, { "epoch": 1.2304526748971194, "grad_norm": 1.2289018630981445, "learning_rate": 2.981456542007666e-05, "loss": 1.9847, "step": 4000 }, { "epoch": 1.2304526748971194, "eval_loss": 1.8471519947052002, "eval_runtime": 4.6843, "eval_samples_per_second": 42.696, "eval_steps_per_second": 21.348, "step": 4000 }, { "epoch": 1.2612207222799123, "grad_norm": 1.319795846939087, "learning_rate": 2.9296591733139954e-05, "loss": 1.9524, "step": 4100 }, { "epoch": 1.2919887696627053, "grad_norm": 1.1395128965377808, "learning_rate": 2.8778618046203254e-05, "loss": 1.9531, "step": 4200 }, { "epoch": 1.3227568170454982, "grad_norm": 1.2818642854690552, "learning_rate": 2.8260644359266553e-05, "loss": 1.946, "step": 4300 }, { "epoch": 1.3535248644282913, "grad_norm": 1.404370903968811, "learning_rate": 2.7742670672329846e-05, "loss": 1.9611, "step": 4400 }, { "epoch": 1.3842929118110843, "grad_norm": 1.3051358461380005, "learning_rate": 2.7224696985393146e-05, "loss": 1.9345, "step": 4500 }, { "epoch": 1.4150609591938772, "grad_norm": 1.2720344066619873, "learning_rate": 2.670672329845644e-05, "loss": 1.9596, "step": 4600 }, { "epoch": 1.4458290065766701, "grad_norm": 1.1316062211990356, "learning_rate": 2.6188749611519735e-05, "loss": 1.9655, "step": 4700 }, { "epoch": 1.476597053959463, "grad_norm": 1.1589127779006958, "learning_rate": 2.56759556614524e-05, "loss": 1.9336, "step": 4800 }, { "epoch": 1.507365101342256, "grad_norm": 1.418484091758728, "learning_rate": 2.5157981974515692e-05, "loss": 1.9538, "step": 4900 }, { "epoch": 1.538133148725049, "grad_norm": 1.1148117780685425, "learning_rate": 2.464000828757899e-05, "loss": 1.9373, "step": 5000 }, { "epoch": 1.538133148725049, "eval_loss": 1.8176252841949463, "eval_runtime": 4.6868, "eval_samples_per_second": 42.673, "eval_steps_per_second": 21.337, "step": 5000 }, { "epoch": 1.5689011961078418, "grad_norm": 1.409507155418396, "learning_rate": 2.4122034600642288e-05, "loss": 1.9487, "step": 5100 }, { "epoch": 1.599669243490635, "grad_norm": 1.4098600149154663, "learning_rate": 2.3604060913705588e-05, "loss": 1.9158, "step": 5200 }, { "epoch": 1.630437290873428, "grad_norm": 1.2400621175765991, "learning_rate": 2.308608722676888e-05, "loss": 1.9257, "step": 5300 }, { "epoch": 1.661205338256221, "grad_norm": 1.3149056434631348, "learning_rate": 2.2568113539832177e-05, "loss": 1.9254, "step": 5400 }, { "epoch": 1.691973385639014, "grad_norm": 1.306784749031067, "learning_rate": 2.2050139852895473e-05, "loss": 1.9446, "step": 5500 }, { "epoch": 1.722741433021807, "grad_norm": 1.3898966312408447, "learning_rate": 2.153216616595877e-05, "loss": 1.9151, "step": 5600 }, { "epoch": 1.7535094804045999, "grad_norm": 1.2994170188903809, "learning_rate": 2.1014192479022066e-05, "loss": 1.9464, "step": 5700 }, { "epoch": 1.7842775277873928, "grad_norm": 1.3611286878585815, "learning_rate": 2.0496218792085366e-05, "loss": 1.9206, "step": 5800 }, { "epoch": 1.8150455751701857, "grad_norm": 1.406435489654541, "learning_rate": 1.997824510514866e-05, "loss": 1.9303, "step": 5900 }, { "epoch": 1.8458136225529787, "grad_norm": 1.4985857009887695, "learning_rate": 1.9460271418211955e-05, "loss": 1.9306, "step": 6000 }, { "epoch": 1.8458136225529787, "eval_loss": 1.7914341688156128, "eval_runtime": 4.8041, "eval_samples_per_second": 41.631, "eval_steps_per_second": 20.816, "step": 6000 }, { "epoch": 1.8765816699357716, "grad_norm": 1.3169234991073608, "learning_rate": 1.894229773127525e-05, "loss": 1.9435, "step": 6100 }, { "epoch": 1.9073497173185645, "grad_norm": 1.4807860851287842, "learning_rate": 1.8424324044338548e-05, "loss": 1.9108, "step": 6200 }, { "epoch": 1.9381177647013577, "grad_norm": 1.5189799070358276, "learning_rate": 1.7906350357401844e-05, "loss": 1.9274, "step": 6300 }, { "epoch": 1.9688858120841506, "grad_norm": 1.37026846408844, "learning_rate": 1.7388376670465144e-05, "loss": 1.9307, "step": 6400 }, { "epoch": 1.9996538594669437, "grad_norm": 1.2851979732513428, "learning_rate": 1.687040298352844e-05, "loss": 1.9324, "step": 6500 }, { "epoch": 2.030152686435137, "grad_norm": 1.5312321186065674, "learning_rate": 1.6352429296591733e-05, "loss": 1.8818, "step": 6600 }, { "epoch": 2.0609207338179303, "grad_norm": 1.2317149639129639, "learning_rate": 1.583445560965503e-05, "loss": 1.8666, "step": 6700 }, { "epoch": 2.091688781200723, "grad_norm": 1.4043223857879639, "learning_rate": 1.5316481922718326e-05, "loss": 1.8588, "step": 6800 }, { "epoch": 2.122456828583516, "grad_norm": 1.3751603364944458, "learning_rate": 1.4798508235781622e-05, "loss": 1.873, "step": 6900 }, { "epoch": 2.153224875966309, "grad_norm": 1.605711579322815, "learning_rate": 1.428053454884492e-05, "loss": 1.8861, "step": 7000 }, { "epoch": 2.153224875966309, "eval_loss": 1.770365595817566, "eval_runtime": 4.8144, "eval_samples_per_second": 41.542, "eval_steps_per_second": 20.771, "step": 7000 }, { "epoch": 2.183992923349102, "grad_norm": 1.642946720123291, "learning_rate": 1.3762560861908216e-05, "loss": 1.8608, "step": 7100 }, { "epoch": 2.214760970731895, "grad_norm": 1.3048804998397827, "learning_rate": 1.3244587174971513e-05, "loss": 1.8653, "step": 7200 }, { "epoch": 2.245529018114688, "grad_norm": 1.3654470443725586, "learning_rate": 1.2726613488034809e-05, "loss": 1.8803, "step": 7300 }, { "epoch": 2.276297065497481, "grad_norm": 1.377312421798706, "learning_rate": 1.2208639801098105e-05, "loss": 1.8783, "step": 7400 }, { "epoch": 2.3070651128802737, "grad_norm": 1.2883929014205933, "learning_rate": 1.1690666114161402e-05, "loss": 1.8612, "step": 7500 }, { "epoch": 2.3378331602630666, "grad_norm": 1.3439315557479858, "learning_rate": 1.1172692427224698e-05, "loss": 1.8627, "step": 7600 }, { "epoch": 2.3686012076458596, "grad_norm": 1.1767522096633911, "learning_rate": 1.0654718740287994e-05, "loss": 1.8713, "step": 7700 }, { "epoch": 2.3993692550286525, "grad_norm": 1.2367137670516968, "learning_rate": 1.0141924790220656e-05, "loss": 1.8753, "step": 7800 }, { "epoch": 2.430137302411446, "grad_norm": 1.218923568725586, "learning_rate": 9.623951103283954e-06, "loss": 1.8924, "step": 7900 }, { "epoch": 2.460905349794239, "grad_norm": 1.481924057006836, "learning_rate": 9.10597741634725e-06, "loss": 1.875, "step": 8000 }, { "epoch": 2.460905349794239, "eval_loss": 1.7587065696716309, "eval_runtime": 4.6695, "eval_samples_per_second": 42.831, "eval_steps_per_second": 21.416, "step": 8000 }, { "epoch": 2.4916733971770317, "grad_norm": 1.5611909627914429, "learning_rate": 8.588003729410545e-06, "loss": 1.8847, "step": 8100 }, { "epoch": 2.5224414445598247, "grad_norm": 1.573014736175537, "learning_rate": 8.070030042473843e-06, "loss": 1.8645, "step": 8200 }, { "epoch": 2.5532094919426176, "grad_norm": 1.4010423421859741, "learning_rate": 7.5520563555371394e-06, "loss": 1.8716, "step": 8300 }, { "epoch": 2.5839775393254105, "grad_norm": 1.4731730222702026, "learning_rate": 7.034082668600435e-06, "loss": 1.8545, "step": 8400 }, { "epoch": 2.6147455867082035, "grad_norm": 1.5401034355163574, "learning_rate": 6.516108981663732e-06, "loss": 1.8701, "step": 8500 }, { "epoch": 2.6455136340909964, "grad_norm": 1.4150763750076294, "learning_rate": 5.998135294727028e-06, "loss": 1.8781, "step": 8600 }, { "epoch": 2.6762816814737893, "grad_norm": 1.2187877893447876, "learning_rate": 5.480161607790325e-06, "loss": 1.8554, "step": 8700 }, { "epoch": 2.7070497288565827, "grad_norm": 1.2359226942062378, "learning_rate": 4.962187920853621e-06, "loss": 1.8655, "step": 8800 }, { "epoch": 2.7378177762393756, "grad_norm": 1.5639677047729492, "learning_rate": 4.444214233916917e-06, "loss": 1.8702, "step": 8900 }, { "epoch": 2.7685858236221685, "grad_norm": 1.3015304803848267, "learning_rate": 3.926240546980214e-06, "loss": 1.8724, "step": 9000 }, { "epoch": 2.7685858236221685, "eval_loss": 1.7501640319824219, "eval_runtime": 4.6364, "eval_samples_per_second": 43.137, "eval_steps_per_second": 21.568, "step": 9000 }, { "epoch": 2.7993538710049615, "grad_norm": 1.2435898780822754, "learning_rate": 3.40826686004351e-06, "loss": 1.8699, "step": 9100 }, { "epoch": 2.8301219183877544, "grad_norm": 1.4046730995178223, "learning_rate": 2.890293173106806e-06, "loss": 1.874, "step": 9200 }, { "epoch": 2.8608899657705473, "grad_norm": 1.4343743324279785, "learning_rate": 2.3723194861701027e-06, "loss": 1.8844, "step": 9300 }, { "epoch": 2.8916580131533403, "grad_norm": 1.506251573562622, "learning_rate": 1.8543457992333989e-06, "loss": 1.8738, "step": 9400 }, { "epoch": 2.922426060536133, "grad_norm": 1.1848888397216797, "learning_rate": 1.3363721122966954e-06, "loss": 1.8549, "step": 9500 }, { "epoch": 2.953194107918926, "grad_norm": 1.301543951034546, "learning_rate": 8.183984253599917e-07, "loss": 1.8691, "step": 9600 }, { "epoch": 2.983962155301719, "grad_norm": 1.4360523223876953, "learning_rate": 3.004247384232881e-07, "loss": 1.8902, "step": 9700 } ], "logging_steps": 100, "max_steps": 9753, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.0763124744192e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }