{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 708,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02824858757062147,
      "grad_norm": 5.842401568474987,
      "learning_rate": 5e-06,
      "loss": 2.0396,
      "step": 10
    },
    {
      "epoch": 0.05649717514124294,
      "grad_norm": 3.6388768595262957,
      "learning_rate": 4.997468222143782e-06,
      "loss": 1.7131,
      "step": 20
    },
    {
      "epoch": 0.0847457627118644,
      "grad_norm": 1.8710689080902667,
      "learning_rate": 4.989878016494418e-06,
      "loss": 1.5907,
      "step": 30
    },
    {
      "epoch": 0.11299435028248588,
      "grad_norm": 1.880666572553662,
      "learning_rate": 4.977244756423578e-06,
      "loss": 1.502,
      "step": 40
    },
    {
      "epoch": 0.14124293785310735,
      "grad_norm": 1.6520564729299156,
      "learning_rate": 4.959594029617741e-06,
      "loss": 1.4357,
      "step": 50
    },
    {
      "epoch": 0.1694915254237288,
      "grad_norm": 1.7284626155394651,
      "learning_rate": 4.9369615862523266e-06,
      "loss": 1.3794,
      "step": 60
    },
    {
      "epoch": 0.1977401129943503,
      "grad_norm": 1.7131842535751252,
      "learning_rate": 4.90939326658249e-06,
      "loss": 1.3396,
      "step": 70
    },
    {
      "epoch": 0.22598870056497175,
      "grad_norm": 1.6872621752920653,
      "learning_rate": 4.876944908097249e-06,
      "loss": 1.3119,
      "step": 80
    },
    {
      "epoch": 0.2542372881355932,
      "grad_norm": 1.6781047658941155,
      "learning_rate": 4.8396822324249915e-06,
      "loss": 1.2932,
      "step": 90
    },
    {
      "epoch": 0.2824858757062147,
      "grad_norm": 1.8225514648316858,
      "learning_rate": 4.797680712219421e-06,
      "loss": 1.2533,
      "step": 100
    },
    {
      "epoch": 0.3107344632768362,
      "grad_norm": 1.4304345855914073,
      "learning_rate": 4.751025418295565e-06,
      "loss": 1.2581,
      "step": 110
    },
    {
      "epoch": 0.3389830508474576,
      "grad_norm": 2.489693154849688,
      "learning_rate": 4.699810847325449e-06,
      "loss": 1.2615,
      "step": 120
    },
    {
      "epoch": 0.3672316384180791,
      "grad_norm": 1.497470889635536,
      "learning_rate": 4.644140730442432e-06,
      "loss": 1.2385,
      "step": 130
    },
    {
      "epoch": 0.3954802259887006,
      "grad_norm": 1.4852922561551125,
      "learning_rate": 4.584127823141855e-06,
      "loss": 1.2228,
      "step": 140
    },
    {
      "epoch": 0.423728813559322,
      "grad_norm": 1.4374567982916069,
      "learning_rate": 4.5198936769035504e-06,
      "loss": 1.2254,
      "step": 150
    },
    {
      "epoch": 0.4519774011299435,
      "grad_norm": 1.4496375733325404,
      "learning_rate": 4.451568392998767e-06,
      "loss": 1.2265,
      "step": 160
    },
    {
      "epoch": 0.480225988700565,
      "grad_norm": 1.2754388779607286,
      "learning_rate": 4.3792903589801515e-06,
      "loss": 1.1846,
      "step": 170
    },
    {
      "epoch": 0.5084745762711864,
      "grad_norm": 1.3975405752824914,
      "learning_rate": 4.30320596838852e-06,
      "loss": 1.178,
      "step": 180
    },
    {
      "epoch": 0.536723163841808,
      "grad_norm": 1.3749489429241677,
      "learning_rate": 4.223469324244115e-06,
      "loss": 1.1717,
      "step": 190
    },
    {
      "epoch": 0.5649717514124294,
      "grad_norm": 1.393315372114993,
      "learning_rate": 4.140241926922916e-06,
      "loss": 1.181,
      "step": 200
    },
    {
      "epoch": 0.5932203389830508,
      "grad_norm": 1.505577319035518,
      "learning_rate": 4.0536923470501775e-06,
      "loss": 1.1744,
      "step": 210
    },
    {
      "epoch": 0.6214689265536724,
      "grad_norm": 1.457055249771263,
      "learning_rate": 3.96399588407373e-06,
      "loss": 1.151,
      "step": 220
    },
    {
      "epoch": 0.6497175141242938,
      "grad_norm": 1.3812712992322103,
      "learning_rate": 3.8713342112085685e-06,
      "loss": 1.1787,
      "step": 230
    },
    {
      "epoch": 0.6779661016949152,
      "grad_norm": 1.528224921984006,
      "learning_rate": 3.775895007471876e-06,
      "loss": 1.158,
      "step": 240
    },
    {
      "epoch": 0.7062146892655368,
      "grad_norm": 1.4254969486921814,
      "learning_rate": 3.677871577553763e-06,
      "loss": 1.1662,
      "step": 250
    },
    {
      "epoch": 0.7344632768361582,
      "grad_norm": 1.4029062285962588,
      "learning_rate": 3.5774624602936344e-06,
      "loss": 1.1716,
      "step": 260
    },
    {
      "epoch": 0.7627118644067796,
      "grad_norm": 1.4621797650744357,
      "learning_rate": 3.474871026555204e-06,
      "loss": 1.1523,
      "step": 270
    },
    {
      "epoch": 0.7909604519774012,
      "grad_norm": 1.576846783408568,
      "learning_rate": 3.370305067314612e-06,
      "loss": 1.169,
      "step": 280
    },
    {
      "epoch": 0.8192090395480226,
      "grad_norm": 1.4553855180258597,
      "learning_rate": 3.2639763727959554e-06,
      "loss": 1.1374,
      "step": 290
    },
    {
      "epoch": 0.847457627118644,
      "grad_norm": 1.3037242619402352,
      "learning_rate": 3.1561003035066435e-06,
      "loss": 1.144,
      "step": 300
    },
    {
      "epoch": 0.8757062146892656,
      "grad_norm": 1.4064392305723843,
      "learning_rate": 3.0468953540414304e-06,
      "loss": 1.1396,
      "step": 310
    },
    {
      "epoch": 0.903954802259887,
      "grad_norm": 1.2072866070703256,
      "learning_rate": 2.936582710538593e-06,
      "loss": 1.1388,
      "step": 320
    },
    {
      "epoch": 0.9322033898305084,
      "grad_norm": 1.234511949962314,
      "learning_rate": 2.8253858026845958e-06,
      "loss": 1.149,
      "step": 330
    },
    {
      "epoch": 0.96045197740113,
      "grad_norm": 1.3682697011912646,
      "learning_rate": 2.7135298511746276e-06,
      "loss": 1.1463,
      "step": 340
    },
    {
      "epoch": 0.9887005649717514,
      "grad_norm": 1.4595117952046124,
      "learning_rate": 2.6012414115455826e-06,
      "loss": 1.1339,
      "step": 350
    },
    {
      "epoch": 1.0169491525423728,
      "grad_norm": 1.365355679432521,
      "learning_rate": 2.488747915305431e-06,
      "loss": 1.0592,
      "step": 360
    },
    {
      "epoch": 1.0451977401129944,
      "grad_norm": 1.4547925198477407,
      "learning_rate": 2.376277209288372e-06,
      "loss": 1.0288,
      "step": 370
    },
    {
      "epoch": 1.073446327683616,
      "grad_norm": 1.2787445662747539,
      "learning_rate": 2.2640570941687794e-06,
      "loss": 1.022,
      "step": 380
    },
    {
      "epoch": 1.1016949152542372,
      "grad_norm": 1.3427132830957944,
      "learning_rate": 2.1523148630686397e-06,
      "loss": 1.0311,
      "step": 390
    },
    {
      "epoch": 1.1299435028248588,
      "grad_norm": 1.446710010618482,
      "learning_rate": 2.0412768411929948e-06,
      "loss": 1.017,
      "step": 400
    },
    {
      "epoch": 1.1581920903954803,
      "grad_norm": 1.34481917501583,
      "learning_rate": 1.931167927425832e-06,
      "loss": 1.0266,
      "step": 410
    },
    {
      "epoch": 1.1864406779661016,
      "grad_norm": 1.3341116732215694,
      "learning_rate": 1.8222111388148678e-06,
      "loss": 1.0236,
      "step": 420
    },
    {
      "epoch": 1.2146892655367232,
      "grad_norm": 1.345613554920642,
      "learning_rate": 1.714627158867857e-06,
      "loss": 1.0109,
      "step": 430
    },
    {
      "epoch": 1.2429378531073447,
      "grad_norm": 1.2833581931894298,
      "learning_rate": 1.6086338905752883e-06,
      "loss": 0.9996,
      "step": 440
    },
    {
      "epoch": 1.271186440677966,
      "grad_norm": 1.3699670070856527,
      "learning_rate": 1.50444601506482e-06,
      "loss": 1.0128,
      "step": 450
    },
    {
      "epoch": 1.2994350282485876,
      "grad_norm": 1.2610914382885658,
      "learning_rate": 1.4022745567813334e-06,
      "loss": 1.0094,
      "step": 460
    },
    {
      "epoch": 1.327683615819209,
      "grad_norm": 1.2627951088836191,
      "learning_rate": 1.3023264560733268e-06,
      "loss": 1.0206,
      "step": 470
    },
    {
      "epoch": 1.3559322033898304,
      "grad_norm": 1.268968791060816,
      "learning_rate": 1.2048041500513136e-06,
      "loss": 1.0136,
      "step": 480
    },
    {
      "epoch": 1.384180790960452,
      "grad_norm": 1.2875833787086686,
      "learning_rate": 1.1099051625671928e-06,
      "loss": 1.0029,
      "step": 490
    },
    {
      "epoch": 1.4124293785310735,
      "grad_norm": 1.2562663969235288,
      "learning_rate": 1.0178217041450355e-06,
      "loss": 1.012,
      "step": 500
    },
    {
      "epoch": 1.4406779661016949,
      "grad_norm": 1.2156705609077063,
      "learning_rate": 9.287402826736089e-07,
      "loss": 1.0057,
      "step": 510
    },
    {
      "epoch": 1.4689265536723164,
      "grad_norm": 2.005409061506591,
      "learning_rate": 8.428413256491386e-07,
      "loss": 1.0042,
      "step": 520
    },
    {
      "epoch": 1.497175141242938,
      "grad_norm": 1.2904127978163176,
      "learning_rate": 7.602988147334372e-07,
      "loss": 1.0029,
      "step": 530
    },
    {
      "epoch": 1.5254237288135593,
      "grad_norm": 1.2327199709726766,
      "learning_rate": 6.81279933367571e-07,
      "loss": 0.999,
      "step": 540
    },
    {
      "epoch": 1.5536723163841808,
      "grad_norm": 1.3251476089228396,
      "learning_rate": 6.059447281547929e-07,
      "loss": 1.0063,
      "step": 550
    },
    {
      "epoch": 1.5819209039548023,
      "grad_norm": 1.2684033556573027,
      "learning_rate": 5.344457846985837e-07,
      "loss": 1.0041,
      "step": 560
    },
    {
      "epoch": 1.6101694915254239,
      "grad_norm": 1.274213451781422,
      "learning_rate": 4.6692791855237144e-07,
      "loss": 1.0102,
      "step": 570
    },
    {
      "epoch": 1.6384180790960452,
      "grad_norm": 1.2004747141578425,
      "learning_rate": 4.0352788190688245e-07,
      "loss": 0.9986,
      "step": 580
    },
    {
      "epoch": 1.6666666666666665,
      "grad_norm": 1.2138867752979392,
      "learning_rate": 3.443740866092074e-07,
      "loss": 0.9936,
      "step": 590
    },
    {
      "epoch": 1.694915254237288,
      "grad_norm": 1.3657230357705068,
      "learning_rate": 2.895863440745822e-07,
      "loss": 1.0085,
      "step": 600
    },
    {
      "epoch": 1.7231638418079096,
      "grad_norm": 1.2178452975739655,
      "learning_rate": 2.3927562261768095e-07,
      "loss": 1.0114,
      "step": 610
    },
    {
      "epoch": 1.7514124293785311,
      "grad_norm": 1.201511284318748,
      "learning_rate": 1.935438226949146e-07,
      "loss": 1.011,
      "step": 620
    },
    {
      "epoch": 1.7796610169491527,
      "grad_norm": 1.2103707783122326,
      "learning_rate": 1.5248357051297957e-07,
      "loss": 1.0053,
      "step": 630
    },
    {
      "epoch": 1.807909604519774,
      "grad_norm": 1.206125451520313,
      "learning_rate": 1.1617803042167142e-07,
      "loss": 0.9949,
      "step": 640
    },
    {
      "epoch": 1.8361581920903953,
      "grad_norm": 1.327611608679995,
      "learning_rate": 8.4700736470959e-08,
      "loss": 1.0107,
      "step": 650
    },
    {
      "epoch": 1.8644067796610169,
      "grad_norm": 1.2479460675046632,
      "learning_rate": 5.811544347348097e-08,
      "loss": 0.9932,
      "step": 660
    },
    {
      "epoch": 1.8926553672316384,
      "grad_norm": 1.2150911321344626,
      "learning_rate": 3.647599787412692e-08,
      "loss": 1.0028,
      "step": 670
    },
    {
      "epoch": 1.92090395480226,
      "grad_norm": 1.1712546508255446,
      "learning_rate": 1.9826228688248073e-08,
      "loss": 0.9951,
      "step": 680
    },
    {
      "epoch": 1.9491525423728815,
      "grad_norm": 1.2304982193614624,
      "learning_rate": 8.19985872939355e-09,
      "loss": 0.9928,
      "step": 690
    },
    {
      "epoch": 1.9774011299435028,
      "grad_norm": 1.2367698151490922,
      "learning_rate": 1.6204363063712647e-09,
      "loss": 0.9979,
      "step": 700
    }
  ],
  "logging_steps": 10,
  "max_steps": 708,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 230913547960320.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}