{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.4454976303317535, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08616975441619991, "grad_norm": 1.8284112215042114, "learning_rate": 1.98e-05, "loss": 5.544659423828125, "step": 100 }, { "epoch": 0.17233950883239982, "grad_norm": 0.5319015383720398, "learning_rate": 3.979999999999999e-05, "loss": 3.90488037109375, "step": 200 }, { "epoch": 0.25850926324859974, "grad_norm": 1.3977950811386108, "learning_rate": 5.98e-05, "loss": 3.39756103515625, "step": 300 }, { "epoch": 0.34467901766479964, "grad_norm": 1.9291573762893677, "learning_rate": 7.98e-05, "loss": 3.019991149902344, "step": 400 }, { "epoch": 0.4308487720809996, "grad_norm": 1.4095340967178345, "learning_rate": 9.979999999999999e-05, "loss": 2.815445861816406, "step": 500 }, { "epoch": 0.5170185264971995, "grad_norm": 3.2716641426086426, "learning_rate": 0.00011979999999999998, "loss": 2.6590045166015623, "step": 600 }, { "epoch": 0.6031882809133994, "grad_norm": 1.3838716745376587, "learning_rate": 0.00013979999999999998, "loss": 2.543310089111328, "step": 700 }, { "epoch": 0.6893580353295993, "grad_norm": 1.069161057472229, "learning_rate": 0.00015979999999999998, "loss": 2.396273651123047, "step": 800 }, { "epoch": 0.7755277897457993, "grad_norm": 0.8585665822029114, "learning_rate": 0.0001798, "loss": 2.242165985107422, "step": 900 }, { "epoch": 0.8616975441619992, "grad_norm": 0.7467069625854492, "learning_rate": 0.0001998, "loss": 2.1027012634277344, "step": 1000 }, { "epoch": 0.9478672985781991, "grad_norm": 0.5805935859680176, "learning_rate": 0.00021979999999999998, "loss": 2.037454376220703, "step": 1100 }, { "epoch": 1.033606204222318, "grad_norm": 0.5948718786239624, "learning_rate": 0.00023979999999999997, "loss": 1.9681085205078126, "step": 1200 }, { "epoch": 1.1197759586385179, "grad_norm": 0.5413378477096558, "learning_rate": 0.00025979999999999997, "loss": 1.9135774230957032, "step": 1300 }, { "epoch": 1.2059457130547178, "grad_norm": 0.5196030139923096, "learning_rate": 0.00027979999999999997, "loss": 1.8392716979980468, "step": 1400 }, { "epoch": 1.2921154674709177, "grad_norm": 0.49619364738464355, "learning_rate": 0.00029979999999999997, "loss": 1.8049734497070313, "step": 1500 }, { "epoch": 1.3782852218871176, "grad_norm": 0.44414839148521423, "learning_rate": 0.000299991068233357, "loss": 1.7638165283203124, "step": 1600 }, { "epoch": 1.4644549763033177, "grad_norm": 0.46444711089134216, "learning_rate": 0.0002999639122316208, "loss": 1.7137832641601562, "step": 1700 }, { "epoch": 1.5506247307195173, "grad_norm": 0.5176238417625427, "learning_rate": 0.0002999185343831476, "loss": 1.675589599609375, "step": 1800 }, { "epoch": 1.6367944851357175, "grad_norm": 0.4177858829498291, "learning_rate": 0.0002998549402017187, "loss": 1.6349491882324219, "step": 1900 }, { "epoch": 1.7229642395519171, "grad_norm": 0.42198434472084045, "learning_rate": 0.0002997731374145493, "loss": 1.596505126953125, "step": 2000 }, { "epoch": 1.8091339939681172, "grad_norm": 0.4523915946483612, "learning_rate": 0.0002996731359613498, "loss": 1.5908058166503907, "step": 2100 }, { "epoch": 1.8953037483843171, "grad_norm": 0.3901713788509369, "learning_rate": 0.0002995549479931178, "loss": 1.5610142517089844, "step": 2200 }, { "epoch": 1.981473502800517, "grad_norm": 0.41816478967666626, "learning_rate": 0.00029941858787066206, "loss": 1.5319706726074218, "step": 2300 }, { "epoch": 2.067212408444636, "grad_norm": 0.3872755765914917, "learning_rate": 0.00029926407216285706, "loss": 1.5055549621582032, "step": 2400 }, { "epoch": 2.1533821628608356, "grad_norm": 0.4193103611469269, "learning_rate": 0.0002990914196446301, "loss": 1.4792218017578125, "step": 2500 }, { "epoch": 2.2395519172770357, "grad_norm": 0.4024358093738556, "learning_rate": 0.00029890065129467986, "loss": 1.4786280822753906, "step": 2600 }, { "epoch": 2.325721671693236, "grad_norm": 0.37588468194007874, "learning_rate": 0.0002986917902929273, "loss": 1.4545697021484374, "step": 2700 }, { "epoch": 2.4118914261094355, "grad_norm": 0.39736974239349365, "learning_rate": 0.0002984648620176991, "loss": 1.4498170471191407, "step": 2800 }, { "epoch": 2.4980611805256356, "grad_norm": 0.42380592226982117, "learning_rate": 0.00029821989404264424, "loss": 1.4262150573730468, "step": 2900 }, { "epoch": 2.5842309349418353, "grad_norm": 0.411803662776947, "learning_rate": 0.00029795691613338307, "loss": 1.417086181640625, "step": 3000 }, { "epoch": 2.6704006893580354, "grad_norm": 0.3662901818752289, "learning_rate": 0.000297675960243891, "loss": 1.3942941284179688, "step": 3100 }, { "epoch": 2.756570443774235, "grad_norm": 0.3642771244049072, "learning_rate": 0.00029737706051261557, "loss": 1.38471923828125, "step": 3200 }, { "epoch": 2.842740198190435, "grad_norm": 0.4138600826263428, "learning_rate": 0.00029706025325832857, "loss": 1.3765927124023438, "step": 3300 }, { "epoch": 2.9289099526066353, "grad_norm": 0.3687536418437958, "learning_rate": 0.0002967255769757127, "loss": 1.3617820739746094, "step": 3400 }, { "epoch": 3.014648858250754, "grad_norm": 0.3252148926258087, "learning_rate": 0.0002963730723306845, "loss": 1.3490205383300782, "step": 3500 }, { "epoch": 3.100818612666954, "grad_norm": 0.3874260187149048, "learning_rate": 0.0002960027821554529, "loss": 1.3380169677734375, "step": 3600 }, { "epoch": 3.1869883670831536, "grad_norm": 0.37778887152671814, "learning_rate": 0.00029561475144331467, "loss": 1.3190237426757812, "step": 3700 }, { "epoch": 3.2731581214993537, "grad_norm": 0.37266016006469727, "learning_rate": 0.00029520902734318766, "loss": 1.313209991455078, "step": 3800 }, { "epoch": 3.359327875915554, "grad_norm": 0.3792646527290344, "learning_rate": 0.00029478565915388153, "loss": 1.3055996704101562, "step": 3900 }, { "epoch": 3.4454976303317535, "grad_norm": 0.3583495318889618, "learning_rate": 0.00029434469831810764, "loss": 1.301021728515625, "step": 4000 } ], "logging_steps": 100, "max_steps": 30000, "num_input_tokens_seen": 0, "num_train_epochs": 26, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.9825523114901504e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }