{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 708, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02824858757062147, "grad_norm": 5.842401568474987, "learning_rate": 5e-06, "loss": 2.0396, "step": 10 }, { "epoch": 0.05649717514124294, "grad_norm": 3.6388768595262957, "learning_rate": 4.997468222143782e-06, "loss": 1.7131, "step": 20 }, { "epoch": 0.0847457627118644, "grad_norm": 1.8710689080902667, "learning_rate": 4.989878016494418e-06, "loss": 1.5907, "step": 30 }, { "epoch": 0.11299435028248588, "grad_norm": 1.880666572553662, "learning_rate": 4.977244756423578e-06, "loss": 1.502, "step": 40 }, { "epoch": 0.14124293785310735, "grad_norm": 1.6520564729299156, "learning_rate": 4.959594029617741e-06, "loss": 1.4357, "step": 50 }, { "epoch": 0.1694915254237288, "grad_norm": 1.7284626155394651, "learning_rate": 4.9369615862523266e-06, "loss": 1.3794, "step": 60 }, { "epoch": 0.1977401129943503, "grad_norm": 1.7131842535751252, "learning_rate": 4.90939326658249e-06, "loss": 1.3396, "step": 70 }, { "epoch": 0.22598870056497175, "grad_norm": 1.6872621752920653, "learning_rate": 4.876944908097249e-06, "loss": 1.3119, "step": 80 }, { "epoch": 0.2542372881355932, "grad_norm": 1.6781047658941155, "learning_rate": 4.8396822324249915e-06, "loss": 1.2932, "step": 90 }, { "epoch": 0.2824858757062147, "grad_norm": 1.8225514648316858, "learning_rate": 4.797680712219421e-06, "loss": 1.2533, "step": 100 }, { "epoch": 0.3107344632768362, "grad_norm": 1.4304345855914073, "learning_rate": 4.751025418295565e-06, "loss": 1.2581, "step": 110 }, { "epoch": 0.3389830508474576, "grad_norm": 2.489693154849688, "learning_rate": 4.699810847325449e-06, "loss": 1.2615, "step": 120 }, { "epoch": 0.3672316384180791, "grad_norm": 1.497470889635536, "learning_rate": 4.644140730442432e-06, "loss": 1.2385, "step": 130 }, { "epoch": 0.3954802259887006, "grad_norm": 1.4852922561551125, "learning_rate": 4.584127823141855e-06, "loss": 1.2228, "step": 140 }, { "epoch": 0.423728813559322, "grad_norm": 1.4374567982916069, "learning_rate": 4.5198936769035504e-06, "loss": 1.2254, "step": 150 }, { "epoch": 0.4519774011299435, "grad_norm": 1.4496375733325404, "learning_rate": 4.451568392998767e-06, "loss": 1.2265, "step": 160 }, { "epoch": 0.480225988700565, "grad_norm": 1.2754388779607286, "learning_rate": 4.3792903589801515e-06, "loss": 1.1846, "step": 170 }, { "epoch": 0.5084745762711864, "grad_norm": 1.3975405752824914, "learning_rate": 4.30320596838852e-06, "loss": 1.178, "step": 180 }, { "epoch": 0.536723163841808, "grad_norm": 1.3749489429241677, "learning_rate": 4.223469324244115e-06, "loss": 1.1717, "step": 190 }, { "epoch": 0.5649717514124294, "grad_norm": 1.393315372114993, "learning_rate": 4.140241926922916e-06, "loss": 1.181, "step": 200 }, { "epoch": 0.5932203389830508, "grad_norm": 1.505577319035518, "learning_rate": 4.0536923470501775e-06, "loss": 1.1744, "step": 210 }, { "epoch": 0.6214689265536724, "grad_norm": 1.457055249771263, "learning_rate": 3.96399588407373e-06, "loss": 1.151, "step": 220 }, { "epoch": 0.6497175141242938, "grad_norm": 1.3812712992322103, "learning_rate": 3.8713342112085685e-06, "loss": 1.1787, "step": 230 }, { "epoch": 0.6779661016949152, "grad_norm": 1.528224921984006, "learning_rate": 3.775895007471876e-06, "loss": 1.158, "step": 240 }, { "epoch": 0.7062146892655368, "grad_norm": 1.4254969486921814, "learning_rate": 3.677871577553763e-06, "loss": 1.1662, "step": 250 }, { "epoch": 0.7344632768361582, "grad_norm": 1.4029062285962588, "learning_rate": 3.5774624602936344e-06, "loss": 1.1716, "step": 260 }, { "epoch": 0.7627118644067796, "grad_norm": 1.4621797650744357, "learning_rate": 3.474871026555204e-06, "loss": 1.1523, "step": 270 }, { "epoch": 0.7909604519774012, "grad_norm": 1.576846783408568, "learning_rate": 3.370305067314612e-06, "loss": 1.169, "step": 280 }, { "epoch": 0.8192090395480226, "grad_norm": 1.4553855180258597, "learning_rate": 3.2639763727959554e-06, "loss": 1.1374, "step": 290 }, { "epoch": 0.847457627118644, "grad_norm": 1.3037242619402352, "learning_rate": 3.1561003035066435e-06, "loss": 1.144, "step": 300 }, { "epoch": 0.8757062146892656, "grad_norm": 1.4064392305723843, "learning_rate": 3.0468953540414304e-06, "loss": 1.1396, "step": 310 }, { "epoch": 0.903954802259887, "grad_norm": 1.2072866070703256, "learning_rate": 2.936582710538593e-06, "loss": 1.1388, "step": 320 }, { "epoch": 0.9322033898305084, "grad_norm": 1.234511949962314, "learning_rate": 2.8253858026845958e-06, "loss": 1.149, "step": 330 }, { "epoch": 0.96045197740113, "grad_norm": 1.3682697011912646, "learning_rate": 2.7135298511746276e-06, "loss": 1.1463, "step": 340 }, { "epoch": 0.9887005649717514, "grad_norm": 1.4595117952046124, "learning_rate": 2.6012414115455826e-06, "loss": 1.1339, "step": 350 }, { "epoch": 1.0169491525423728, "grad_norm": 1.365355679432521, "learning_rate": 2.488747915305431e-06, "loss": 1.0592, "step": 360 }, { "epoch": 1.0451977401129944, "grad_norm": 1.4547925198477407, "learning_rate": 2.376277209288372e-06, "loss": 1.0288, "step": 370 }, { "epoch": 1.073446327683616, "grad_norm": 1.2787445662747539, "learning_rate": 2.2640570941687794e-06, "loss": 1.022, "step": 380 }, { "epoch": 1.1016949152542372, "grad_norm": 1.3427132830957944, "learning_rate": 2.1523148630686397e-06, "loss": 1.0311, "step": 390 }, { "epoch": 1.1299435028248588, "grad_norm": 1.446710010618482, "learning_rate": 2.0412768411929948e-06, "loss": 1.017, "step": 400 }, { "epoch": 1.1581920903954803, "grad_norm": 1.34481917501583, "learning_rate": 1.931167927425832e-06, "loss": 1.0266, "step": 410 }, { "epoch": 1.1864406779661016, "grad_norm": 1.3341116732215694, "learning_rate": 1.8222111388148678e-06, "loss": 1.0236, "step": 420 }, { "epoch": 1.2146892655367232, "grad_norm": 1.345613554920642, "learning_rate": 1.714627158867857e-06, "loss": 1.0109, "step": 430 }, { "epoch": 1.2429378531073447, "grad_norm": 1.2833581931894298, "learning_rate": 1.6086338905752883e-06, "loss": 0.9996, "step": 440 }, { "epoch": 1.271186440677966, "grad_norm": 1.3699670070856527, "learning_rate": 1.50444601506482e-06, "loss": 1.0128, "step": 450 }, { "epoch": 1.2994350282485876, "grad_norm": 1.2610914382885658, "learning_rate": 1.4022745567813334e-06, "loss": 1.0094, "step": 460 }, { "epoch": 1.327683615819209, "grad_norm": 1.2627951088836191, "learning_rate": 1.3023264560733268e-06, "loss": 1.0206, "step": 470 }, { "epoch": 1.3559322033898304, "grad_norm": 1.268968791060816, "learning_rate": 1.2048041500513136e-06, "loss": 1.0136, "step": 480 }, { "epoch": 1.384180790960452, "grad_norm": 1.2875833787086686, "learning_rate": 1.1099051625671928e-06, "loss": 1.0029, "step": 490 }, { "epoch": 1.4124293785310735, "grad_norm": 1.2562663969235288, "learning_rate": 1.0178217041450355e-06, "loss": 1.012, "step": 500 }, { "epoch": 1.4406779661016949, "grad_norm": 1.2156705609077063, "learning_rate": 9.287402826736089e-07, "loss": 1.0057, "step": 510 }, { "epoch": 1.4689265536723164, "grad_norm": 2.005409061506591, "learning_rate": 8.428413256491386e-07, "loss": 1.0042, "step": 520 }, { "epoch": 1.497175141242938, "grad_norm": 1.2904127978163176, "learning_rate": 7.602988147334372e-07, "loss": 1.0029, "step": 530 }, { "epoch": 1.5254237288135593, "grad_norm": 1.2327199709726766, "learning_rate": 6.81279933367571e-07, "loss": 0.999, "step": 540 }, { "epoch": 1.5536723163841808, "grad_norm": 1.3251476089228396, "learning_rate": 6.059447281547929e-07, "loss": 1.0063, "step": 550 }, { "epoch": 1.5819209039548023, "grad_norm": 1.2684033556573027, "learning_rate": 5.344457846985837e-07, "loss": 1.0041, "step": 560 }, { "epoch": 1.6101694915254239, "grad_norm": 1.274213451781422, "learning_rate": 4.6692791855237144e-07, "loss": 1.0102, "step": 570 }, { "epoch": 1.6384180790960452, "grad_norm": 1.2004747141578425, "learning_rate": 4.0352788190688245e-07, "loss": 0.9986, "step": 580 }, { "epoch": 1.6666666666666665, "grad_norm": 1.2138867752979392, "learning_rate": 3.443740866092074e-07, "loss": 0.9936, "step": 590 }, { "epoch": 1.694915254237288, "grad_norm": 1.3657230357705068, "learning_rate": 2.895863440745822e-07, "loss": 1.0085, "step": 600 }, { "epoch": 1.7231638418079096, "grad_norm": 1.2178452975739655, "learning_rate": 2.3927562261768095e-07, "loss": 1.0114, "step": 610 }, { "epoch": 1.7514124293785311, "grad_norm": 1.201511284318748, "learning_rate": 1.935438226949146e-07, "loss": 1.011, "step": 620 }, { "epoch": 1.7796610169491527, "grad_norm": 1.2103707783122326, "learning_rate": 1.5248357051297957e-07, "loss": 1.0053, "step": 630 }, { "epoch": 1.807909604519774, "grad_norm": 1.206125451520313, "learning_rate": 1.1617803042167142e-07, "loss": 0.9949, "step": 640 }, { "epoch": 1.8361581920903953, "grad_norm": 1.327611608679995, "learning_rate": 8.4700736470959e-08, "loss": 1.0107, "step": 650 }, { "epoch": 1.8644067796610169, "grad_norm": 1.2479460675046632, "learning_rate": 5.811544347348097e-08, "loss": 0.9932, "step": 660 }, { "epoch": 1.8926553672316384, "grad_norm": 1.2150911321344626, "learning_rate": 3.647599787412692e-08, "loss": 1.0028, "step": 670 }, { "epoch": 1.92090395480226, "grad_norm": 1.1712546508255446, "learning_rate": 1.9826228688248073e-08, "loss": 0.9951, "step": 680 }, { "epoch": 1.9491525423728815, "grad_norm": 1.2304982193614624, "learning_rate": 8.19985872939355e-09, "loss": 0.9928, "step": 690 }, { "epoch": 1.9774011299435028, "grad_norm": 1.2367698151490922, "learning_rate": 1.6204363063712647e-09, "loss": 0.9979, "step": 700 } ], "logging_steps": 10, "max_steps": 708, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 230913547960320.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }