{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3022, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01654533421575116, "grad_norm": 8.903882026672363, "learning_rate": 3.2343234323432342e-06, "loss": 0.411, "step": 50 }, { "epoch": 0.03309066843150232, "grad_norm": 14.360084533691406, "learning_rate": 6.534653465346535e-06, "loss": 0.1448, "step": 100 }, { "epoch": 0.04963600264725347, "grad_norm": 15.013335227966309, "learning_rate": 9.834983498349836e-06, "loss": 0.1364, "step": 150 }, { "epoch": 0.06618133686300463, "grad_norm": 8.873584747314453, "learning_rate": 1.3135313531353136e-05, "loss": 0.1064, "step": 200 }, { "epoch": 0.0827266710787558, "grad_norm": 5.249364376068115, "learning_rate": 1.6435643564356436e-05, "loss": 0.119, "step": 250 }, { "epoch": 0.09927200529450694, "grad_norm": 10.57626724243164, "learning_rate": 1.9735973597359735e-05, "loss": 0.1209, "step": 300 }, { "epoch": 0.1158173395102581, "grad_norm": 9.12836742401123, "learning_rate": 1.966164030893711e-05, "loss": 0.1335, "step": 350 }, { "epoch": 0.13236267372600927, "grad_norm": 7.3058061599731445, "learning_rate": 1.9293858036042665e-05, "loss": 0.122, "step": 400 }, { "epoch": 0.14890800794176043, "grad_norm": 9.762269020080566, "learning_rate": 1.8926075763148218e-05, "loss": 0.1231, "step": 450 }, { "epoch": 0.1654533421575116, "grad_norm": 9.085260391235352, "learning_rate": 1.855829349025377e-05, "loss": 0.1147, "step": 500 }, { "epoch": 0.18199867637326275, "grad_norm": 3.281120538711548, "learning_rate": 1.8190511217359325e-05, "loss": 0.1224, "step": 550 }, { "epoch": 0.1985440105890139, "grad_norm": 10.944239616394043, "learning_rate": 1.782272894446488e-05, "loss": 0.1113, "step": 600 }, { "epoch": 0.21508934480476505, "grad_norm": 6.012777328491211, "learning_rate": 1.7454946671570432e-05, "loss": 0.1054, "step": 650 }, { "epoch": 0.2316346790205162, "grad_norm": 7.861270904541016, "learning_rate": 1.7087164398675985e-05, "loss": 0.1013, "step": 700 }, { "epoch": 0.24818001323626737, "grad_norm": 6.786500930786133, "learning_rate": 1.671938212578154e-05, "loss": 0.1019, "step": 750 }, { "epoch": 0.26472534745201853, "grad_norm": 12.52272891998291, "learning_rate": 1.6351599852887092e-05, "loss": 0.0964, "step": 800 }, { "epoch": 0.2812706816677697, "grad_norm": 6.790607929229736, "learning_rate": 1.5983817579992645e-05, "loss": 0.0961, "step": 850 }, { "epoch": 0.29781601588352086, "grad_norm": 6.981550693511963, "learning_rate": 1.56160353070982e-05, "loss": 0.0927, "step": 900 }, { "epoch": 0.314361350099272, "grad_norm": 4.593362808227539, "learning_rate": 1.5248253034203752e-05, "loss": 0.0946, "step": 950 }, { "epoch": 0.3309066843150232, "grad_norm": 4.783601760864258, "learning_rate": 1.4880470761309307e-05, "loss": 0.0988, "step": 1000 }, { "epoch": 0.34745201853077434, "grad_norm": 5.365964889526367, "learning_rate": 1.4512688488414859e-05, "loss": 0.091, "step": 1050 }, { "epoch": 0.3639973527465255, "grad_norm": 7.908538341522217, "learning_rate": 1.4144906215520414e-05, "loss": 0.091, "step": 1100 }, { "epoch": 0.3805426869622766, "grad_norm": 6.80029821395874, "learning_rate": 1.3777123942625967e-05, "loss": 0.0802, "step": 1150 }, { "epoch": 0.3970880211780278, "grad_norm": 4.817410945892334, "learning_rate": 1.3409341669731519e-05, "loss": 0.0824, "step": 1200 }, { "epoch": 0.41363335539377893, "grad_norm": 2.9681406021118164, "learning_rate": 1.3041559396837074e-05, "loss": 0.0845, "step": 1250 }, { "epoch": 0.4301786896095301, "grad_norm": 6.426875591278076, "learning_rate": 1.2673777123942627e-05, "loss": 0.09, "step": 1300 }, { "epoch": 0.44672402382528126, "grad_norm": 8.284296035766602, "learning_rate": 1.2305994851048179e-05, "loss": 0.0866, "step": 1350 }, { "epoch": 0.4632693580410324, "grad_norm": 5.250367164611816, "learning_rate": 1.1938212578153734e-05, "loss": 0.0826, "step": 1400 }, { "epoch": 0.4798146922567836, "grad_norm": 4.941124439239502, "learning_rate": 1.1570430305259287e-05, "loss": 0.0735, "step": 1450 }, { "epoch": 0.49636002647253474, "grad_norm": 4.166671276092529, "learning_rate": 1.120264803236484e-05, "loss": 0.0801, "step": 1500 }, { "epoch": 0.5129053606882858, "grad_norm": 4.212963581085205, "learning_rate": 1.0834865759470394e-05, "loss": 0.0694, "step": 1550 }, { "epoch": 0.5294506949040371, "grad_norm": 4.659167766571045, "learning_rate": 1.0467083486575949e-05, "loss": 0.0681, "step": 1600 }, { "epoch": 0.5459960291197882, "grad_norm": 1.2746665477752686, "learning_rate": 1.00993012136815e-05, "loss": 0.0741, "step": 1650 }, { "epoch": 0.5625413633355394, "grad_norm": 5.447134017944336, "learning_rate": 9.731518940787054e-06, "loss": 0.0855, "step": 1700 }, { "epoch": 0.5790866975512905, "grad_norm": 5.253468036651611, "learning_rate": 9.363736667892607e-06, "loss": 0.0789, "step": 1750 }, { "epoch": 0.5956320317670417, "grad_norm": 9.73345947265625, "learning_rate": 8.995954394998163e-06, "loss": 0.061, "step": 1800 }, { "epoch": 0.6121773659827928, "grad_norm": 6.357212066650391, "learning_rate": 8.628172122103716e-06, "loss": 0.077, "step": 1850 }, { "epoch": 0.628722700198544, "grad_norm": 4.212811470031738, "learning_rate": 8.26038984920927e-06, "loss": 0.085, "step": 1900 }, { "epoch": 0.6452680344142951, "grad_norm": 5.546814441680908, "learning_rate": 7.892607576314823e-06, "loss": 0.0661, "step": 1950 }, { "epoch": 0.6618133686300464, "grad_norm": 5.276381492614746, "learning_rate": 7.524825303420375e-06, "loss": 0.0676, "step": 2000 }, { "epoch": 0.6783587028457975, "grad_norm": 4.1648077964782715, "learning_rate": 7.157043030525929e-06, "loss": 0.0598, "step": 2050 }, { "epoch": 0.6949040370615487, "grad_norm": 7.234411716461182, "learning_rate": 6.789260757631483e-06, "loss": 0.0734, "step": 2100 }, { "epoch": 0.7114493712772998, "grad_norm": 6.458765029907227, "learning_rate": 6.421478484737036e-06, "loss": 0.0653, "step": 2150 }, { "epoch": 0.727994705493051, "grad_norm": 4.6173930168151855, "learning_rate": 6.05369621184259e-06, "loss": 0.0751, "step": 2200 }, { "epoch": 0.7445400397088021, "grad_norm": 5.65716028213501, "learning_rate": 5.685913938948143e-06, "loss": 0.064, "step": 2250 }, { "epoch": 0.7610853739245532, "grad_norm": 4.843678951263428, "learning_rate": 5.318131666053696e-06, "loss": 0.0624, "step": 2300 }, { "epoch": 0.7776307081403044, "grad_norm": 4.933941841125488, "learning_rate": 4.95034939315925e-06, "loss": 0.064, "step": 2350 }, { "epoch": 0.7941760423560555, "grad_norm": 4.032420635223389, "learning_rate": 4.582567120264804e-06, "loss": 0.0584, "step": 2400 }, { "epoch": 0.8107213765718068, "grad_norm": 4.290740013122559, "learning_rate": 4.214784847370357e-06, "loss": 0.0577, "step": 2450 }, { "epoch": 0.8272667107875579, "grad_norm": 3.434802293777466, "learning_rate": 3.8470025744759105e-06, "loss": 0.0536, "step": 2500 }, { "epoch": 0.8438120450033091, "grad_norm": 4.708302021026611, "learning_rate": 3.479220301581464e-06, "loss": 0.0562, "step": 2550 }, { "epoch": 0.8603573792190602, "grad_norm": 3.8048012256622314, "learning_rate": 3.1114380286870177e-06, "loss": 0.0546, "step": 2600 }, { "epoch": 0.8769027134348114, "grad_norm": 5.102752208709717, "learning_rate": 2.7436557557925706e-06, "loss": 0.0551, "step": 2650 }, { "epoch": 0.8934480476505625, "grad_norm": 4.97892951965332, "learning_rate": 2.3758734828981244e-06, "loss": 0.0619, "step": 2700 }, { "epoch": 0.9099933818663137, "grad_norm": 8.729324340820312, "learning_rate": 2.008091210003678e-06, "loss": 0.0501, "step": 2750 }, { "epoch": 0.9265387160820648, "grad_norm": 6.92756462097168, "learning_rate": 1.6403089371092316e-06, "loss": 0.0488, "step": 2800 }, { "epoch": 0.9430840502978161, "grad_norm": 8.403130531311035, "learning_rate": 1.272526664214785e-06, "loss": 0.0596, "step": 2850 }, { "epoch": 0.9596293845135672, "grad_norm": 2.115785837173462, "learning_rate": 9.047443913203384e-07, "loss": 0.0514, "step": 2900 }, { "epoch": 0.9761747187293184, "grad_norm": 4.952437400817871, "learning_rate": 5.369621184258919e-07, "loss": 0.0556, "step": 2950 }, { "epoch": 0.9927200529450695, "grad_norm": 3.030841588973999, "learning_rate": 1.691798455314454e-07, "loss": 0.0395, "step": 3000 } ], "logging_steps": 50, "max_steps": 3022, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }