{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5286343612334802, "eval_steps": 500, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00881057268722467, "grad_norm": 19.571447343122014, "learning_rate": 6.666666666666666e-07, "loss": 2.2003, "step": 10 }, { "epoch": 0.01762114537444934, "grad_norm": 6.597664324174228, "learning_rate": 1.3333333333333332e-06, "loss": 2.0343, "step": 20 }, { "epoch": 0.02643171806167401, "grad_norm": 3.014996781518845, "learning_rate": 2e-06, "loss": 1.8168, "step": 30 }, { "epoch": 0.03524229074889868, "grad_norm": 2.496924527736719, "learning_rate": 1.9995958747931082e-06, "loss": 1.7178, "step": 40 }, { "epoch": 0.04405286343612335, "grad_norm": 2.8973420754515464, "learning_rate": 1.9983838258067983e-06, "loss": 1.6557, "step": 50 }, { "epoch": 0.05286343612334802, "grad_norm": 2.3479520994877605, "learning_rate": 1.9963648326801653e-06, "loss": 1.595, "step": 60 }, { "epoch": 0.06167400881057269, "grad_norm": 2.2003866197934974, "learning_rate": 1.993540527265239e-06, "loss": 1.5428, "step": 70 }, { "epoch": 0.07048458149779736, "grad_norm": 2.222349491130299, "learning_rate": 1.98991319230804e-06, "loss": 1.5157, "step": 80 }, { "epoch": 0.07929515418502203, "grad_norm": 2.146284797118347, "learning_rate": 1.9854857596035474e-06, "loss": 1.4995, "step": 90 }, { "epoch": 0.0881057268722467, "grad_norm": 2.305597555128469, "learning_rate": 1.980261807626078e-06, "loss": 1.4886, "step": 100 }, { "epoch": 0.09691629955947137, "grad_norm": 2.199507218792448, "learning_rate": 1.974245558636978e-06, "loss": 1.4643, "step": 110 }, { "epoch": 0.10572687224669604, "grad_norm": 2.6946448901304674, "learning_rate": 1.967441875271983e-06, "loss": 1.4522, "step": 120 }, { "epoch": 0.1145374449339207, "grad_norm": 1.9626073275857734, "learning_rate": 1.959856256610988e-06, "loss": 1.4149, "step": 130 }, { "epoch": 0.12334801762114538, "grad_norm": 2.0441408462679065, "learning_rate": 1.951494833733414e-06, "loss": 1.385, "step": 140 }, { "epoch": 0.13215859030837004, "grad_norm": 2.0166534518054493, "learning_rate": 1.942364364762762e-06, "loss": 1.3718, "step": 150 }, { "epoch": 0.14096916299559473, "grad_norm": 1.9565509974119288, "learning_rate": 1.932472229404356e-06, "loss": 1.3775, "step": 160 }, { "epoch": 0.14977973568281938, "grad_norm": 2.1012455112006743, "learning_rate": 1.9218264229806917e-06, "loss": 1.3579, "step": 170 }, { "epoch": 0.15859030837004406, "grad_norm": 1.90439374473129, "learning_rate": 1.9104355499692162e-06, "loss": 1.3333, "step": 180 }, { "epoch": 0.16740088105726872, "grad_norm": 2.2252827256286922, "learning_rate": 1.8983088170477553e-06, "loss": 1.3458, "step": 190 }, { "epoch": 0.1762114537444934, "grad_norm": 1.958512432909627, "learning_rate": 1.8854560256532098e-06, "loss": 1.332, "step": 200 }, { "epoch": 0.18502202643171806, "grad_norm": 1.9065312694563097, "learning_rate": 1.871887564059543e-06, "loss": 1.3138, "step": 210 }, { "epoch": 0.19383259911894274, "grad_norm": 2.6199771240261436, "learning_rate": 1.8576143989814521e-06, "loss": 1.3229, "step": 220 }, { "epoch": 0.2026431718061674, "grad_norm": 2.217202276336257, "learning_rate": 1.8426480667105175e-06, "loss": 1.3205, "step": 230 }, { "epoch": 0.21145374449339208, "grad_norm": 1.8491248577424624, "learning_rate": 1.8270006637909905e-06, "loss": 1.3085, "step": 240 }, { "epoch": 0.22026431718061673, "grad_norm": 1.9823551473714545, "learning_rate": 1.8106848372427548e-06, "loss": 1.2956, "step": 250 }, { "epoch": 0.2290748898678414, "grad_norm": 2.1060962042553726, "learning_rate": 1.7937137743393693e-06, "loss": 1.2804, "step": 260 }, { "epoch": 0.23788546255506607, "grad_norm": 1.8695678284143167, "learning_rate": 1.7761011919494486e-06, "loss": 1.273, "step": 270 }, { "epoch": 0.24669603524229075, "grad_norm": 2.56804202001478, "learning_rate": 1.7578613254499968e-06, "loss": 1.2984, "step": 280 }, { "epoch": 0.2555066079295154, "grad_norm": 1.9594579835819614, "learning_rate": 1.739008917220659e-06, "loss": 1.2554, "step": 290 }, { "epoch": 0.2643171806167401, "grad_norm": 1.948264525063767, "learning_rate": 1.719559204728188e-06, "loss": 1.2733, "step": 300 }, { "epoch": 0.27312775330396477, "grad_norm": 1.9555361537181306, "learning_rate": 1.6995279082107537e-06, "loss": 1.2794, "step": 310 }, { "epoch": 0.28193832599118945, "grad_norm": 2.072778303156424, "learning_rate": 1.6789312179720546e-06, "loss": 1.2594, "step": 320 }, { "epoch": 0.2907488986784141, "grad_norm": 1.8797335867525704, "learning_rate": 1.6577857812954991e-06, "loss": 1.2485, "step": 330 }, { "epoch": 0.29955947136563876, "grad_norm": 2.039515984739043, "learning_rate": 1.6361086889890305e-06, "loss": 1.2645, "step": 340 }, { "epoch": 0.30837004405286345, "grad_norm": 2.0610642486240125, "learning_rate": 1.613917461571475e-06, "loss": 1.2531, "step": 350 }, { "epoch": 0.31718061674008813, "grad_norm": 2.06709238158364, "learning_rate": 1.5912300351115757e-06, "loss": 1.2491, "step": 360 }, { "epoch": 0.32599118942731276, "grad_norm": 1.8437333020014937, "learning_rate": 1.5680647467311555e-06, "loss": 1.2334, "step": 370 }, { "epoch": 0.33480176211453744, "grad_norm": 1.8965957390199724, "learning_rate": 1.5444403197841344e-06, "loss": 1.2172, "step": 380 }, { "epoch": 0.3436123348017621, "grad_norm": 10.745414670704703, "learning_rate": 1.5203758487233676e-06, "loss": 1.2306, "step": 390 }, { "epoch": 0.3524229074889868, "grad_norm": 1.978659425380664, "learning_rate": 1.4958907836675465e-06, "loss": 1.2425, "step": 400 }, { "epoch": 0.36123348017621143, "grad_norm": 1.8232345405717043, "learning_rate": 1.4710049146806346e-06, "loss": 1.2401, "step": 410 }, { "epoch": 0.3700440528634361, "grad_norm": 1.9694165773247603, "learning_rate": 1.4457383557765383e-06, "loss": 1.2137, "step": 420 }, { "epoch": 0.3788546255506608, "grad_norm": 1.857798327139117, "learning_rate": 1.4201115286619464e-06, "loss": 1.2328, "step": 430 }, { "epoch": 0.3876651982378855, "grad_norm": 1.9640307234634835, "learning_rate": 1.3941451462304777e-06, "loss": 1.2244, "step": 440 }, { "epoch": 0.3964757709251101, "grad_norm": 1.9269638055698532, "learning_rate": 1.3678601958214777e-06, "loss": 1.2399, "step": 450 }, { "epoch": 0.4052863436123348, "grad_norm": 1.8699071909555112, "learning_rate": 1.3412779222569906e-06, "loss": 1.2175, "step": 460 }, { "epoch": 0.41409691629955947, "grad_norm": 1.8481447316610542, "learning_rate": 1.3144198106706239e-06, "loss": 1.2238, "step": 470 }, { "epoch": 0.42290748898678415, "grad_norm": 1.8316730682769253, "learning_rate": 1.2873075691421806e-06, "loss": 1.2304, "step": 480 }, { "epoch": 0.43171806167400884, "grad_norm": 1.8960767894113884, "learning_rate": 1.2599631111520955e-06, "loss": 1.2036, "step": 490 }, { "epoch": 0.44052863436123346, "grad_norm": 1.9109891577097189, "learning_rate": 1.2324085378698529e-06, "loss": 1.1917, "step": 500 }, { "epoch": 0.44933920704845814, "grad_norm": 2.010796003658019, "learning_rate": 1.20466612029071e-06, "loss": 1.2279, "step": 510 }, { "epoch": 0.4581497797356828, "grad_norm": 2.6998433293606454, "learning_rate": 1.1767582812351548e-06, "loss": 1.1925, "step": 520 }, { "epoch": 0.4669603524229075, "grad_norm": 2.275146777332648, "learning_rate": 1.1487075772256517e-06, "loss": 1.1742, "step": 530 }, { "epoch": 0.47577092511013214, "grad_norm": 1.7756028193514886, "learning_rate": 1.1205366802553228e-06, "loss": 1.1833, "step": 540 }, { "epoch": 0.4845814977973568, "grad_norm": 2.042541280215003, "learning_rate": 1.092268359463302e-06, "loss": 1.1826, "step": 550 }, { "epoch": 0.4933920704845815, "grad_norm": 1.84844630790175, "learning_rate": 1.0639254627315658e-06, "loss": 1.2045, "step": 560 }, { "epoch": 0.5022026431718062, "grad_norm": 1.9498951674092562, "learning_rate": 1.0355308982181253e-06, "loss": 1.1895, "step": 570 }, { "epoch": 0.5110132158590308, "grad_norm": 1.961163999725279, "learning_rate": 1.0071076158414974e-06, "loss": 1.1898, "step": 580 }, { "epoch": 0.5198237885462555, "grad_norm": 1.91915126758986, "learning_rate": 9.786785887314253e-07, "loss": 1.2014, "step": 590 }, { "epoch": 0.5286343612334802, "grad_norm": 1.845963386436035, "learning_rate": 9.502667946608332e-07, "loss": 1.207, "step": 600 } ], "logging_steps": 10, "max_steps": 1135, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 97844723712000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }