{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 9753,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03076804738279297,
      "grad_norm": 2.239741325378418,
      "learning_rate": 4.85e-05,
      "loss": 2.6548,
      "step": 100
    },
    {
      "epoch": 0.06153609476558594,
      "grad_norm": 1.6382789611816406,
      "learning_rate": 4.94975655236714e-05,
      "loss": 2.3258,
      "step": 200
    },
    {
      "epoch": 0.09230414214837891,
      "grad_norm": 1.4356640577316284,
      "learning_rate": 4.89795918367347e-05,
      "loss": 2.2618,
      "step": 300
    },
    {
      "epoch": 0.12307218953117188,
      "grad_norm": 1.5954153537750244,
      "learning_rate": 4.846161814979799e-05,
      "loss": 2.2213,
      "step": 400
    },
    {
      "epoch": 0.15384023691396484,
      "grad_norm": 1.5558085441589355,
      "learning_rate": 4.794364446286129e-05,
      "loss": 2.2169,
      "step": 500
    },
    {
      "epoch": 0.18460828429675782,
      "grad_norm": 1.3300689458847046,
      "learning_rate": 4.7425670775924586e-05,
      "loss": 2.1835,
      "step": 600
    },
    {
      "epoch": 0.21537633167955078,
      "grad_norm": 1.5879923105239868,
      "learning_rate": 4.690769708898788e-05,
      "loss": 2.1786,
      "step": 700
    },
    {
      "epoch": 0.24614437906234377,
      "grad_norm": 1.4449992179870605,
      "learning_rate": 4.638972340205118e-05,
      "loss": 2.17,
      "step": 800
    },
    {
      "epoch": 0.27691242644513675,
      "grad_norm": 1.660819411277771,
      "learning_rate": 4.5871749715114475e-05,
      "loss": 2.1397,
      "step": 900
    },
    {
      "epoch": 0.3076804738279297,
      "grad_norm": 1.3502899408340454,
      "learning_rate": 4.535377602817777e-05,
      "loss": 2.1476,
      "step": 1000
    },
    {
      "epoch": 0.3076804738279297,
      "eval_loss": 2.0527570247650146,
      "eval_runtime": 4.7056,
      "eval_samples_per_second": 42.502,
      "eval_steps_per_second": 21.251,
      "step": 1000
    },
    {
      "epoch": 0.33844852121072266,
      "grad_norm": 1.5296121835708618,
      "learning_rate": 4.483580234124107e-05,
      "loss": 2.1347,
      "step": 1100
    },
    {
      "epoch": 0.36921656859351565,
      "grad_norm": 1.4386141300201416,
      "learning_rate": 4.4317828654304364e-05,
      "loss": 2.1128,
      "step": 1200
    },
    {
      "epoch": 0.3999846159763086,
      "grad_norm": 1.7359758615493774,
      "learning_rate": 4.379985496736766e-05,
      "loss": 2.1256,
      "step": 1300
    },
    {
      "epoch": 0.43075266335910156,
      "grad_norm": 1.2721539735794067,
      "learning_rate": 4.3281881280430956e-05,
      "loss": 2.0986,
      "step": 1400
    },
    {
      "epoch": 0.46152071074189455,
      "grad_norm": 1.507233738899231,
      "learning_rate": 4.276390759349425e-05,
      "loss": 2.1203,
      "step": 1500
    },
    {
      "epoch": 0.49228875812468753,
      "grad_norm": 1.5739669799804688,
      "learning_rate": 4.224593390655755e-05,
      "loss": 2.0817,
      "step": 1600
    },
    {
      "epoch": 0.5230568055074805,
      "grad_norm": 1.7889518737792969,
      "learning_rate": 4.1727960219620845e-05,
      "loss": 2.0951,
      "step": 1700
    },
    {
      "epoch": 0.5538248528902735,
      "grad_norm": 1.3344364166259766,
      "learning_rate": 4.120998653268414e-05,
      "loss": 2.0856,
      "step": 1800
    },
    {
      "epoch": 0.5845929002730664,
      "grad_norm": 1.4601390361785889,
      "learning_rate": 4.069201284574744e-05,
      "loss": 2.0758,
      "step": 1900
    },
    {
      "epoch": 0.6153609476558594,
      "grad_norm": 1.1662238836288452,
      "learning_rate": 4.0174039158810734e-05,
      "loss": 2.0747,
      "step": 2000
    },
    {
      "epoch": 0.6153609476558594,
      "eval_loss": 1.9568997621536255,
      "eval_runtime": 4.6999,
      "eval_samples_per_second": 42.554,
      "eval_steps_per_second": 21.277,
      "step": 2000
    },
    {
      "epoch": 0.6461289950386524,
      "grad_norm": 1.5872595310211182,
      "learning_rate": 3.965606547187403e-05,
      "loss": 2.0822,
      "step": 2100
    },
    {
      "epoch": 0.6768970424214453,
      "grad_norm": 1.1704156398773193,
      "learning_rate": 3.913809178493733e-05,
      "loss": 2.0962,
      "step": 2200
    },
    {
      "epoch": 0.7076650898042383,
      "grad_norm": 1.6314693689346313,
      "learning_rate": 3.8620118098000623e-05,
      "loss": 2.0639,
      "step": 2300
    },
    {
      "epoch": 0.7384331371870313,
      "grad_norm": 1.4110997915267944,
      "learning_rate": 3.810214441106392e-05,
      "loss": 2.0581,
      "step": 2400
    },
    {
      "epoch": 0.7692011845698242,
      "grad_norm": 1.2487901449203491,
      "learning_rate": 3.7584170724127216e-05,
      "loss": 2.0588,
      "step": 2500
    },
    {
      "epoch": 0.7999692319526172,
      "grad_norm": 1.5325045585632324,
      "learning_rate": 3.706619703719051e-05,
      "loss": 2.0731,
      "step": 2600
    },
    {
      "epoch": 0.8307372793354102,
      "grad_norm": 1.2173811197280884,
      "learning_rate": 3.654822335025381e-05,
      "loss": 2.0459,
      "step": 2700
    },
    {
      "epoch": 0.8615053267182031,
      "grad_norm": 1.6108065843582153,
      "learning_rate": 3.6030249663317105e-05,
      "loss": 2.0776,
      "step": 2800
    },
    {
      "epoch": 0.8922733741009962,
      "grad_norm": 1.7993882894515991,
      "learning_rate": 3.55122759763804e-05,
      "loss": 2.0391,
      "step": 2900
    },
    {
      "epoch": 0.9230414214837891,
      "grad_norm": 1.4659514427185059,
      "learning_rate": 3.49943022894437e-05,
      "loss": 2.0292,
      "step": 3000
    },
    {
      "epoch": 0.9230414214837891,
      "eval_loss": 1.8916079998016357,
      "eval_runtime": 4.8218,
      "eval_samples_per_second": 41.478,
      "eval_steps_per_second": 20.739,
      "step": 3000
    },
    {
      "epoch": 0.953809468866582,
      "grad_norm": 1.5378167629241943,
      "learning_rate": 3.4476328602506994e-05,
      "loss": 2.0525,
      "step": 3100
    },
    {
      "epoch": 0.9845775162493751,
      "grad_norm": 1.4396274089813232,
      "learning_rate": 3.395835491557029e-05,
      "loss": 2.0479,
      "step": 3200
    },
    {
      "epoch": 1.0150763432175685,
      "grad_norm": 1.1797999143600464,
      "learning_rate": 3.344038122863359e-05,
      "loss": 1.9838,
      "step": 3300
    },
    {
      "epoch": 1.0458443906003616,
      "grad_norm": 1.2820570468902588,
      "learning_rate": 3.292240754169688e-05,
      "loss": 1.9482,
      "step": 3400
    },
    {
      "epoch": 1.0766124379831545,
      "grad_norm": 1.4676374197006226,
      "learning_rate": 3.240443385476018e-05,
      "loss": 1.9538,
      "step": 3500
    },
    {
      "epoch": 1.1073804853659475,
      "grad_norm": 1.5519450902938843,
      "learning_rate": 3.1886460167823476e-05,
      "loss": 1.9727,
      "step": 3600
    },
    {
      "epoch": 1.1381485327487404,
      "grad_norm": 1.3620473146438599,
      "learning_rate": 3.136848648088677e-05,
      "loss": 1.96,
      "step": 3700
    },
    {
      "epoch": 1.1689165801315333,
      "grad_norm": 1.3301316499710083,
      "learning_rate": 3.085051279395007e-05,
      "loss": 1.9463,
      "step": 3800
    },
    {
      "epoch": 1.1996846275143263,
      "grad_norm": 1.3053226470947266,
      "learning_rate": 3.0332539107013365e-05,
      "loss": 1.9709,
      "step": 3900
    },
    {
      "epoch": 1.2304526748971194,
      "grad_norm": 1.2289018630981445,
      "learning_rate": 2.981456542007666e-05,
      "loss": 1.9847,
      "step": 4000
    },
    {
      "epoch": 1.2304526748971194,
      "eval_loss": 1.8471519947052002,
      "eval_runtime": 4.6843,
      "eval_samples_per_second": 42.696,
      "eval_steps_per_second": 21.348,
      "step": 4000
    },
    {
      "epoch": 1.2612207222799123,
      "grad_norm": 1.319795846939087,
      "learning_rate": 2.9296591733139954e-05,
      "loss": 1.9524,
      "step": 4100
    },
    {
      "epoch": 1.2919887696627053,
      "grad_norm": 1.1395128965377808,
      "learning_rate": 2.8778618046203254e-05,
      "loss": 1.9531,
      "step": 4200
    },
    {
      "epoch": 1.3227568170454982,
      "grad_norm": 1.2818642854690552,
      "learning_rate": 2.8260644359266553e-05,
      "loss": 1.946,
      "step": 4300
    },
    {
      "epoch": 1.3535248644282913,
      "grad_norm": 1.404370903968811,
      "learning_rate": 2.7742670672329846e-05,
      "loss": 1.9611,
      "step": 4400
    },
    {
      "epoch": 1.3842929118110843,
      "grad_norm": 1.3051358461380005,
      "learning_rate": 2.7224696985393146e-05,
      "loss": 1.9345,
      "step": 4500
    },
    {
      "epoch": 1.4150609591938772,
      "grad_norm": 1.2720344066619873,
      "learning_rate": 2.670672329845644e-05,
      "loss": 1.9596,
      "step": 4600
    },
    {
      "epoch": 1.4458290065766701,
      "grad_norm": 1.1316062211990356,
      "learning_rate": 2.6188749611519735e-05,
      "loss": 1.9655,
      "step": 4700
    },
    {
      "epoch": 1.476597053959463,
      "grad_norm": 1.1589127779006958,
      "learning_rate": 2.56759556614524e-05,
      "loss": 1.9336,
      "step": 4800
    },
    {
      "epoch": 1.507365101342256,
      "grad_norm": 1.418484091758728,
      "learning_rate": 2.5157981974515692e-05,
      "loss": 1.9538,
      "step": 4900
    },
    {
      "epoch": 1.538133148725049,
      "grad_norm": 1.1148117780685425,
      "learning_rate": 2.464000828757899e-05,
      "loss": 1.9373,
      "step": 5000
    },
    {
      "epoch": 1.538133148725049,
      "eval_loss": 1.8176252841949463,
      "eval_runtime": 4.6868,
      "eval_samples_per_second": 42.673,
      "eval_steps_per_second": 21.337,
      "step": 5000
    },
    {
      "epoch": 1.5689011961078418,
      "grad_norm": 1.409507155418396,
      "learning_rate": 2.4122034600642288e-05,
      "loss": 1.9487,
      "step": 5100
    },
    {
      "epoch": 1.599669243490635,
      "grad_norm": 1.4098600149154663,
      "learning_rate": 2.3604060913705588e-05,
      "loss": 1.9158,
      "step": 5200
    },
    {
      "epoch": 1.630437290873428,
      "grad_norm": 1.2400621175765991,
      "learning_rate": 2.308608722676888e-05,
      "loss": 1.9257,
      "step": 5300
    },
    {
      "epoch": 1.661205338256221,
      "grad_norm": 1.3149056434631348,
      "learning_rate": 2.2568113539832177e-05,
      "loss": 1.9254,
      "step": 5400
    },
    {
      "epoch": 1.691973385639014,
      "grad_norm": 1.306784749031067,
      "learning_rate": 2.2050139852895473e-05,
      "loss": 1.9446,
      "step": 5500
    },
    {
      "epoch": 1.722741433021807,
      "grad_norm": 1.3898966312408447,
      "learning_rate": 2.153216616595877e-05,
      "loss": 1.9151,
      "step": 5600
    },
    {
      "epoch": 1.7535094804045999,
      "grad_norm": 1.2994170188903809,
      "learning_rate": 2.1014192479022066e-05,
      "loss": 1.9464,
      "step": 5700
    },
    {
      "epoch": 1.7842775277873928,
      "grad_norm": 1.3611286878585815,
      "learning_rate": 2.0496218792085366e-05,
      "loss": 1.9206,
      "step": 5800
    },
    {
      "epoch": 1.8150455751701857,
      "grad_norm": 1.406435489654541,
      "learning_rate": 1.997824510514866e-05,
      "loss": 1.9303,
      "step": 5900
    },
    {
      "epoch": 1.8458136225529787,
      "grad_norm": 1.4985857009887695,
      "learning_rate": 1.9460271418211955e-05,
      "loss": 1.9306,
      "step": 6000
    },
    {
      "epoch": 1.8458136225529787,
      "eval_loss": 1.7914341688156128,
      "eval_runtime": 4.8041,
      "eval_samples_per_second": 41.631,
      "eval_steps_per_second": 20.816,
      "step": 6000
    },
    {
      "epoch": 1.8765816699357716,
      "grad_norm": 1.3169234991073608,
      "learning_rate": 1.894229773127525e-05,
      "loss": 1.9435,
      "step": 6100
    },
    {
      "epoch": 1.9073497173185645,
      "grad_norm": 1.4807860851287842,
      "learning_rate": 1.8424324044338548e-05,
      "loss": 1.9108,
      "step": 6200
    },
    {
      "epoch": 1.9381177647013577,
      "grad_norm": 1.5189799070358276,
      "learning_rate": 1.7906350357401844e-05,
      "loss": 1.9274,
      "step": 6300
    },
    {
      "epoch": 1.9688858120841506,
      "grad_norm": 1.37026846408844,
      "learning_rate": 1.7388376670465144e-05,
      "loss": 1.9307,
      "step": 6400
    },
    {
      "epoch": 1.9996538594669437,
      "grad_norm": 1.2851979732513428,
      "learning_rate": 1.687040298352844e-05,
      "loss": 1.9324,
      "step": 6500
    },
    {
      "epoch": 2.030152686435137,
      "grad_norm": 1.5312321186065674,
      "learning_rate": 1.6352429296591733e-05,
      "loss": 1.8818,
      "step": 6600
    },
    {
      "epoch": 2.0609207338179303,
      "grad_norm": 1.2317149639129639,
      "learning_rate": 1.583445560965503e-05,
      "loss": 1.8666,
      "step": 6700
    },
    {
      "epoch": 2.091688781200723,
      "grad_norm": 1.4043223857879639,
      "learning_rate": 1.5316481922718326e-05,
      "loss": 1.8588,
      "step": 6800
    },
    {
      "epoch": 2.122456828583516,
      "grad_norm": 1.3751603364944458,
      "learning_rate": 1.4798508235781622e-05,
      "loss": 1.873,
      "step": 6900
    },
    {
      "epoch": 2.153224875966309,
      "grad_norm": 1.605711579322815,
      "learning_rate": 1.428053454884492e-05,
      "loss": 1.8861,
      "step": 7000
    },
    {
      "epoch": 2.153224875966309,
      "eval_loss": 1.770365595817566,
      "eval_runtime": 4.8144,
      "eval_samples_per_second": 41.542,
      "eval_steps_per_second": 20.771,
      "step": 7000
    },
    {
      "epoch": 2.183992923349102,
      "grad_norm": 1.642946720123291,
      "learning_rate": 1.3762560861908216e-05,
      "loss": 1.8608,
      "step": 7100
    },
    {
      "epoch": 2.214760970731895,
      "grad_norm": 1.3048804998397827,
      "learning_rate": 1.3244587174971513e-05,
      "loss": 1.8653,
      "step": 7200
    },
    {
      "epoch": 2.245529018114688,
      "grad_norm": 1.3654470443725586,
      "learning_rate": 1.2726613488034809e-05,
      "loss": 1.8803,
      "step": 7300
    },
    {
      "epoch": 2.276297065497481,
      "grad_norm": 1.377312421798706,
      "learning_rate": 1.2208639801098105e-05,
      "loss": 1.8783,
      "step": 7400
    },
    {
      "epoch": 2.3070651128802737,
      "grad_norm": 1.2883929014205933,
      "learning_rate": 1.1690666114161402e-05,
      "loss": 1.8612,
      "step": 7500
    },
    {
      "epoch": 2.3378331602630666,
      "grad_norm": 1.3439315557479858,
      "learning_rate": 1.1172692427224698e-05,
      "loss": 1.8627,
      "step": 7600
    },
    {
      "epoch": 2.3686012076458596,
      "grad_norm": 1.1767522096633911,
      "learning_rate": 1.0654718740287994e-05,
      "loss": 1.8713,
      "step": 7700
    },
    {
      "epoch": 2.3993692550286525,
      "grad_norm": 1.2367137670516968,
      "learning_rate": 1.0141924790220656e-05,
      "loss": 1.8753,
      "step": 7800
    },
    {
      "epoch": 2.430137302411446,
      "grad_norm": 1.218923568725586,
      "learning_rate": 9.623951103283954e-06,
      "loss": 1.8924,
      "step": 7900
    },
    {
      "epoch": 2.460905349794239,
      "grad_norm": 1.481924057006836,
      "learning_rate": 9.10597741634725e-06,
      "loss": 1.875,
      "step": 8000
    },
    {
      "epoch": 2.460905349794239,
      "eval_loss": 1.7587065696716309,
      "eval_runtime": 4.6695,
      "eval_samples_per_second": 42.831,
      "eval_steps_per_second": 21.416,
      "step": 8000
    },
    {
      "epoch": 2.4916733971770317,
      "grad_norm": 1.5611909627914429,
      "learning_rate": 8.588003729410545e-06,
      "loss": 1.8847,
      "step": 8100
    },
    {
      "epoch": 2.5224414445598247,
      "grad_norm": 1.573014736175537,
      "learning_rate": 8.070030042473843e-06,
      "loss": 1.8645,
      "step": 8200
    },
    {
      "epoch": 2.5532094919426176,
      "grad_norm": 1.4010423421859741,
      "learning_rate": 7.5520563555371394e-06,
      "loss": 1.8716,
      "step": 8300
    },
    {
      "epoch": 2.5839775393254105,
      "grad_norm": 1.4731730222702026,
      "learning_rate": 7.034082668600435e-06,
      "loss": 1.8545,
      "step": 8400
    },
    {
      "epoch": 2.6147455867082035,
      "grad_norm": 1.5401034355163574,
      "learning_rate": 6.516108981663732e-06,
      "loss": 1.8701,
      "step": 8500
    },
    {
      "epoch": 2.6455136340909964,
      "grad_norm": 1.4150763750076294,
      "learning_rate": 5.998135294727028e-06,
      "loss": 1.8781,
      "step": 8600
    },
    {
      "epoch": 2.6762816814737893,
      "grad_norm": 1.2187877893447876,
      "learning_rate": 5.480161607790325e-06,
      "loss": 1.8554,
      "step": 8700
    },
    {
      "epoch": 2.7070497288565827,
      "grad_norm": 1.2359226942062378,
      "learning_rate": 4.962187920853621e-06,
      "loss": 1.8655,
      "step": 8800
    },
    {
      "epoch": 2.7378177762393756,
      "grad_norm": 1.5639677047729492,
      "learning_rate": 4.444214233916917e-06,
      "loss": 1.8702,
      "step": 8900
    },
    {
      "epoch": 2.7685858236221685,
      "grad_norm": 1.3015304803848267,
      "learning_rate": 3.926240546980214e-06,
      "loss": 1.8724,
      "step": 9000
    },
    {
      "epoch": 2.7685858236221685,
      "eval_loss": 1.7501640319824219,
      "eval_runtime": 4.6364,
      "eval_samples_per_second": 43.137,
      "eval_steps_per_second": 21.568,
      "step": 9000
    },
    {
      "epoch": 2.7993538710049615,
      "grad_norm": 1.2435898780822754,
      "learning_rate": 3.40826686004351e-06,
      "loss": 1.8699,
      "step": 9100
    },
    {
      "epoch": 2.8301219183877544,
      "grad_norm": 1.4046730995178223,
      "learning_rate": 2.890293173106806e-06,
      "loss": 1.874,
      "step": 9200
    },
    {
      "epoch": 2.8608899657705473,
      "grad_norm": 1.4343743324279785,
      "learning_rate": 2.3723194861701027e-06,
      "loss": 1.8844,
      "step": 9300
    },
    {
      "epoch": 2.8916580131533403,
      "grad_norm": 1.506251573562622,
      "learning_rate": 1.8543457992333989e-06,
      "loss": 1.8738,
      "step": 9400
    },
    {
      "epoch": 2.922426060536133,
      "grad_norm": 1.1848888397216797,
      "learning_rate": 1.3363721122966954e-06,
      "loss": 1.8549,
      "step": 9500
    },
    {
      "epoch": 2.953194107918926,
      "grad_norm": 1.301543951034546,
      "learning_rate": 8.183984253599917e-07,
      "loss": 1.8691,
      "step": 9600
    },
    {
      "epoch": 2.983962155301719,
      "grad_norm": 1.4360523223876953,
      "learning_rate": 3.004247384232881e-07,
      "loss": 1.8902,
      "step": 9700
    }
  ],
  "logging_steps": 100,
  "max_steps": 9753,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.0763124744192e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}