{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0384, "eval_steps": 1000, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00064, "grad_norm": 1.6144306659698486, "learning_rate": 1.1520000000000002e-08, "loss": 0.729, "step": 10 }, { "epoch": 0.00128, "grad_norm": 2.0952296257019043, "learning_rate": 2.4320000000000002e-08, "loss": 0.7295, "step": 20 }, { "epoch": 0.00192, "grad_norm": 1.3587689399719238, "learning_rate": 3.7120000000000004e-08, "loss": 0.73, "step": 30 }, { "epoch": 0.00256, "grad_norm": 1.2531732320785522, "learning_rate": 4.9920000000000006e-08, "loss": 0.7221, "step": 40 }, { "epoch": 0.0032, "grad_norm": 1.437932014465332, "learning_rate": 6.272000000000001e-08, "loss": 0.7209, "step": 50 }, { "epoch": 0.00384, "grad_norm": 1.418426752090454, "learning_rate": 7.552e-08, "loss": 0.729, "step": 60 }, { "epoch": 0.00448, "grad_norm": 1.9476298093795776, "learning_rate": 8.832e-08, "loss": 0.7242, "step": 70 }, { "epoch": 0.00512, "grad_norm": 1.7948051691055298, "learning_rate": 1.0112000000000001e-07, "loss": 0.7227, "step": 80 }, { "epoch": 0.00576, "grad_norm": 1.6534360647201538, "learning_rate": 1.1392e-07, "loss": 0.7234, "step": 90 }, { "epoch": 0.0064, "grad_norm": 1.0920158624649048, "learning_rate": 1.2672e-07, "loss": 0.7328, "step": 100 }, { "epoch": 0.00704, "grad_norm": 1.977837085723877, "learning_rate": 1.3952000000000002e-07, "loss": 0.7263, "step": 110 }, { "epoch": 0.00768, "grad_norm": 1.388983130455017, "learning_rate": 1.5232000000000003e-07, "loss": 0.7286, "step": 120 }, { "epoch": 0.00832, "grad_norm": 1.2956682443618774, "learning_rate": 1.6512e-07, "loss": 0.7251, "step": 130 }, { "epoch": 0.00896, "grad_norm": 1.8125052452087402, "learning_rate": 1.7792e-07, "loss": 0.7251, "step": 140 }, { "epoch": 0.0096, "grad_norm": 1.626846194267273, "learning_rate": 1.9072e-07, "loss": 0.727, "step": 150 }, { "epoch": 0.01024, "grad_norm": 2.3243086338043213, "learning_rate": 2.0352e-07, "loss": 0.726, "step": 160 }, { "epoch": 0.01088, "grad_norm": 1.4734737873077393, "learning_rate": 2.1632e-07, "loss": 0.7252, "step": 170 }, { "epoch": 0.01152, "grad_norm": 2.090498685836792, "learning_rate": 2.2912e-07, "loss": 0.7273, "step": 180 }, { "epoch": 0.01216, "grad_norm": 1.7563093900680542, "learning_rate": 2.4192000000000004e-07, "loss": 0.719, "step": 190 }, { "epoch": 0.0128, "grad_norm": 1.449843168258667, "learning_rate": 2.5472000000000005e-07, "loss": 0.7237, "step": 200 }, { "epoch": 0.01344, "grad_norm": 2.1326472759246826, "learning_rate": 2.6752000000000006e-07, "loss": 0.7305, "step": 210 }, { "epoch": 0.01408, "grad_norm": 2.21703839302063, "learning_rate": 2.8032e-07, "loss": 0.7167, "step": 220 }, { "epoch": 0.01472, "grad_norm": 1.6385700702667236, "learning_rate": 2.9312e-07, "loss": 0.7209, "step": 230 }, { "epoch": 0.01536, "grad_norm": 1.4293471574783325, "learning_rate": 3.0592000000000003e-07, "loss": 0.722, "step": 240 }, { "epoch": 0.016, "grad_norm": 2.1437904834747314, "learning_rate": 3.1872e-07, "loss": 0.717, "step": 250 }, { "epoch": 0.01664, "grad_norm": 2.014806032180786, "learning_rate": 3.3152000000000005e-07, "loss": 0.7182, "step": 260 }, { "epoch": 0.01728, "grad_norm": 1.7216386795043945, "learning_rate": 3.4432e-07, "loss": 0.7253, "step": 270 }, { "epoch": 0.01792, "grad_norm": 1.4267009496688843, "learning_rate": 3.5712e-07, "loss": 0.7189, "step": 280 }, { "epoch": 0.01856, "grad_norm": 2.222503185272217, "learning_rate": 3.6992e-07, "loss": 0.7198, "step": 290 }, { "epoch": 0.0192, "grad_norm": 1.578922986984253, "learning_rate": 3.8272000000000003e-07, "loss": 0.717, "step": 300 }, { "epoch": 0.01984, "grad_norm": 1.719905972480774, "learning_rate": 3.9552e-07, "loss": 0.709, "step": 310 }, { "epoch": 0.02048, "grad_norm": 1.4473963975906372, "learning_rate": 4.0832000000000005e-07, "loss": 0.7215, "step": 320 }, { "epoch": 0.02112, "grad_norm": 2.1639790534973145, "learning_rate": 4.2112e-07, "loss": 0.7175, "step": 330 }, { "epoch": 0.02176, "grad_norm": 1.2387958765029907, "learning_rate": 4.3392e-07, "loss": 0.7129, "step": 340 }, { "epoch": 0.0224, "grad_norm": 2.2797842025756836, "learning_rate": 4.4672000000000007e-07, "loss": 0.7159, "step": 350 }, { "epoch": 0.02304, "grad_norm": 1.5692473649978638, "learning_rate": 4.5952000000000003e-07, "loss": 0.7161, "step": 360 }, { "epoch": 0.02368, "grad_norm": 1.4270817041397095, "learning_rate": 4.723200000000001e-07, "loss": 0.7114, "step": 370 }, { "epoch": 0.02432, "grad_norm": 1.4091335535049438, "learning_rate": 4.8512e-07, "loss": 0.7127, "step": 380 }, { "epoch": 0.02496, "grad_norm": 1.8862844705581665, "learning_rate": 4.979200000000001e-07, "loss": 0.7153, "step": 390 }, { "epoch": 0.0256, "grad_norm": 1.9264376163482666, "learning_rate": 5.107200000000001e-07, "loss": 0.7109, "step": 400 }, { "epoch": 0.02624, "grad_norm": 1.4058727025985718, "learning_rate": 5.235200000000001e-07, "loss": 0.705, "step": 410 }, { "epoch": 0.02688, "grad_norm": 1.519445776939392, "learning_rate": 5.363200000000001e-07, "loss": 0.7131, "step": 420 }, { "epoch": 0.02752, "grad_norm": 1.6636698246002197, "learning_rate": 5.491200000000001e-07, "loss": 0.6916, "step": 430 }, { "epoch": 0.02816, "grad_norm": 1.5472590923309326, "learning_rate": 5.6192e-07, "loss": 0.705, "step": 440 }, { "epoch": 0.0288, "grad_norm": 1.4896206855773926, "learning_rate": 5.747200000000001e-07, "loss": 0.7046, "step": 450 }, { "epoch": 0.02944, "grad_norm": 2.2565503120422363, "learning_rate": 5.8752e-07, "loss": 0.7009, "step": 460 }, { "epoch": 0.03008, "grad_norm": 2.017638683319092, "learning_rate": 6.0032e-07, "loss": 0.7058, "step": 470 }, { "epoch": 0.03072, "grad_norm": 1.3399696350097656, "learning_rate": 6.1312e-07, "loss": 0.7003, "step": 480 }, { "epoch": 0.03136, "grad_norm": 1.3090866804122925, "learning_rate": 6.2592e-07, "loss": 0.7067, "step": 490 }, { "epoch": 0.032, "grad_norm": 1.4199142456054688, "learning_rate": 6.3872e-07, "loss": 0.7008, "step": 500 }, { "epoch": 0.03264, "grad_norm": 1.7174904346466064, "learning_rate": 6.515200000000001e-07, "loss": 0.7003, "step": 510 }, { "epoch": 0.03328, "grad_norm": 1.2983943223953247, "learning_rate": 6.643200000000001e-07, "loss": 0.698, "step": 520 }, { "epoch": 0.03392, "grad_norm": 1.8224154710769653, "learning_rate": 6.7712e-07, "loss": 0.7047, "step": 530 }, { "epoch": 0.03456, "grad_norm": 1.3605278730392456, "learning_rate": 6.899200000000001e-07, "loss": 0.6974, "step": 540 }, { "epoch": 0.0352, "grad_norm": 1.4932376146316528, "learning_rate": 7.027200000000001e-07, "loss": 0.6918, "step": 550 }, { "epoch": 0.03584, "grad_norm": 1.2169368267059326, "learning_rate": 7.155200000000001e-07, "loss": 0.6996, "step": 560 }, { "epoch": 0.03648, "grad_norm": 1.5690464973449707, "learning_rate": 7.2832e-07, "loss": 0.6942, "step": 570 }, { "epoch": 0.03712, "grad_norm": 1.541991949081421, "learning_rate": 7.4112e-07, "loss": 0.6973, "step": 580 }, { "epoch": 0.03776, "grad_norm": 1.7749661207199097, "learning_rate": 7.5392e-07, "loss": 0.6865, "step": 590 }, { "epoch": 0.0384, "grad_norm": 1.2169281244277954, "learning_rate": 7.667200000000001e-07, "loss": 0.6876, "step": 600 } ], "logging_steps": 10, "max_steps": 156250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5051732262912000.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }