{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 250.0, "eval_steps": 50, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.266666666666667, "grad_norm": 0.26255419850349426, "learning_rate": 0.00029265, "loss": 0.5343, "step": 50 }, { "epoch": 12.533333333333333, "grad_norm": 0.23443113267421722, "learning_rate": 0.00028514999999999997, "loss": 0.1262, "step": 100 }, { "epoch": 18.8, "grad_norm": 0.17284274101257324, "learning_rate": 0.00027764999999999995, "loss": 0.0524, "step": 150 }, { "epoch": 25.0, "grad_norm": 0.18162274360656738, "learning_rate": 0.00027015, "loss": 0.0297, "step": 200 }, { "epoch": 31.266666666666666, "grad_norm": 0.06401970237493515, "learning_rate": 0.00026264999999999996, "loss": 0.0201, "step": 250 }, { "epoch": 37.53333333333333, "grad_norm": 0.0907013937830925, "learning_rate": 0.00025515, "loss": 0.015, "step": 300 }, { "epoch": 43.8, "grad_norm": 0.03851782903075218, "learning_rate": 0.00024765, "loss": 0.0138, "step": 350 }, { "epoch": 50.0, "grad_norm": 0.11328539997339249, "learning_rate": 0.00024014999999999998, "loss": 0.0127, "step": 400 }, { "epoch": 56.266666666666666, "grad_norm": 0.05093704164028168, "learning_rate": 0.00023264999999999996, "loss": 0.0136, "step": 450 }, { "epoch": 62.53333333333333, "grad_norm": 0.09107606112957001, "learning_rate": 0.00022514999999999997, "loss": 0.0141, "step": 500 }, { "epoch": 68.8, "grad_norm": 0.03748961165547371, "learning_rate": 0.00021764999999999998, "loss": 0.0129, "step": 550 }, { "epoch": 75.0, "grad_norm": 0.02932876907289028, "learning_rate": 0.00021014999999999999, "loss": 0.0112, "step": 600 }, { "epoch": 81.26666666666667, "grad_norm": 0.019922535866498947, "learning_rate": 0.00020264999999999997, "loss": 0.011, "step": 650 }, { "epoch": 87.53333333333333, "grad_norm": 0.01521008089184761, "learning_rate": 0.00019514999999999997, "loss": 0.0101, "step": 700 }, { "epoch": 93.8, "grad_norm": 0.015646636486053467, "learning_rate": 0.00018764999999999998, "loss": 0.0098, "step": 750 }, { "epoch": 100.0, "grad_norm": 0.024294869974255562, "learning_rate": 0.00018015, "loss": 0.0098, "step": 800 }, { "epoch": 106.26666666666667, "grad_norm": 0.012117642909288406, "learning_rate": 0.00017265, "loss": 0.0097, "step": 850 }, { "epoch": 112.53333333333333, "grad_norm": 0.01629127934575081, "learning_rate": 0.00016514999999999998, "loss": 0.0097, "step": 900 }, { "epoch": 118.8, "grad_norm": 0.017040058970451355, "learning_rate": 0.00015764999999999998, "loss": 0.0097, "step": 950 }, { "epoch": 125.0, "grad_norm": 0.018876733258366585, "learning_rate": 0.00015014999999999996, "loss": 0.0097, "step": 1000 }, { "epoch": 131.26666666666668, "grad_norm": 0.012507513165473938, "learning_rate": 0.00014264999999999997, "loss": 0.0096, "step": 1050 }, { "epoch": 137.53333333333333, "grad_norm": 0.012686866335570812, "learning_rate": 0.00013514999999999998, "loss": 0.0096, "step": 1100 }, { "epoch": 143.8, "grad_norm": 0.01112140528857708, "learning_rate": 0.00012764999999999999, "loss": 0.0096, "step": 1150 }, { "epoch": 150.0, "grad_norm": 0.023951932787895203, "learning_rate": 0.00012014999999999999, "loss": 0.0096, "step": 1200 }, { "epoch": 156.26666666666668, "grad_norm": 0.010721893981099129, "learning_rate": 0.00011264999999999999, "loss": 0.0096, "step": 1250 }, { "epoch": 162.53333333333333, "grad_norm": 0.012511960230767727, "learning_rate": 0.00010514999999999998, "loss": 0.0096, "step": 1300 }, { "epoch": 168.8, "grad_norm": 0.010806918144226074, "learning_rate": 9.764999999999999e-05, "loss": 0.0096, "step": 1350 }, { "epoch": 175.0, "grad_norm": 0.018283583223819733, "learning_rate": 9.014999999999998e-05, "loss": 0.0096, "step": 1400 }, { "epoch": 181.26666666666668, "grad_norm": 0.010316784493625164, "learning_rate": 8.265e-05, "loss": 0.0095, "step": 1450 }, { "epoch": 187.53333333333333, "grad_norm": 0.011216912418603897, "learning_rate": 7.515e-05, "loss": 0.0095, "step": 1500 }, { "epoch": 193.8, "grad_norm": 0.01198404561728239, "learning_rate": 6.764999999999999e-05, "loss": 0.0095, "step": 1550 }, { "epoch": 200.0, "grad_norm": 0.014636659994721413, "learning_rate": 6.015e-05, "loss": 0.0095, "step": 1600 }, { "epoch": 206.26666666666668, "grad_norm": 0.010694563388824463, "learning_rate": 5.264999999999999e-05, "loss": 0.0095, "step": 1650 }, { "epoch": 212.53333333333333, "grad_norm": 0.011114147491753101, "learning_rate": 4.514999999999999e-05, "loss": 0.0095, "step": 1700 }, { "epoch": 218.8, "grad_norm": 0.010317948646843433, "learning_rate": 3.7649999999999994e-05, "loss": 0.0095, "step": 1750 }, { "epoch": 225.0, "grad_norm": 0.014378506690263748, "learning_rate": 3.0149999999999998e-05, "loss": 0.0094, "step": 1800 }, { "epoch": 231.26666666666668, "grad_norm": 0.011094390414655209, "learning_rate": 2.2649999999999998e-05, "loss": 0.0094, "step": 1850 }, { "epoch": 237.53333333333333, "grad_norm": 0.011555945500731468, "learning_rate": 1.5149999999999999e-05, "loss": 0.0094, "step": 1900 }, { "epoch": 243.8, "grad_norm": 0.009383700788021088, "learning_rate": 7.65e-06, "loss": 0.0094, "step": 1950 }, { "epoch": 250.0, "grad_norm": 0.014644854702055454, "learning_rate": 1.5e-07, "loss": 0.0094, "step": 2000 } ], "logging_steps": 50, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 286, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.0511261556736e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }