{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4708236471176765, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005885295588970956, "grad_norm": 29.875, "learning_rate": 3.9215686274509805e-05, "loss": 1.0686, "step": 100 }, { "epoch": 0.011770591177941912, "grad_norm": 11.375, "learning_rate": 7.843137254901961e-05, "loss": 0.6153, "step": 200 }, { "epoch": 0.01765588676691287, "grad_norm": 20.25, "learning_rate": 0.00011764705882352942, "loss": 0.5969, "step": 300 }, { "epoch": 0.023541182355883823, "grad_norm": 4.5, "learning_rate": 0.00015686274509803922, "loss": 0.5478, "step": 400 }, { "epoch": 0.02942647794485478, "grad_norm": 7.40625, "learning_rate": 0.000196078431372549, "loss": 0.5599, "step": 500 }, { "epoch": 0.03531177353382574, "grad_norm": 2.71875, "learning_rate": 0.00019998528443307886, "loss": 0.565, "step": 600 }, { "epoch": 0.04119706912279669, "grad_norm": 30.875, "learning_rate": 0.00019993442136695625, "loss": 0.5501, "step": 700 }, { "epoch": 0.047082364711767646, "grad_norm": 3.328125, "learning_rate": 0.00019984724760441856, "loss": 0.5355, "step": 800 }, { "epoch": 0.05296766030073861, "grad_norm": 6.03125, "learning_rate": 0.00019972379481963764, "loss": 0.5344, "step": 900 }, { "epoch": 0.05885295588970956, "grad_norm": 29.75, "learning_rate": 0.00019956410786859524, "loss": 0.5016, "step": 1000 }, { "epoch": 0.06473825147868052, "grad_norm": 14.0625, "learning_rate": 0.00019936824477278514, "loss": 0.5091, "step": 1100 }, { "epoch": 0.07062354706765148, "grad_norm": 24.75, "learning_rate": 0.00019913627669813103, "loss": 0.5005, "step": 1200 }, { "epoch": 0.07650884265662243, "grad_norm": 3.203125, "learning_rate": 0.00019886828792912894, "loss": 0.4961, "step": 1300 }, { "epoch": 0.08239413824559338, "grad_norm": 1.875, "learning_rate": 0.0001985643758382227, "loss": 0.4755, "step": 1400 }, { "epoch": 0.08827943383456434, "grad_norm": 7.46875, "learning_rate": 0.00019822465085042422, "loss": 0.4889, "step": 1500 }, { "epoch": 0.09416472942353529, "grad_norm": 3.859375, "learning_rate": 0.0001978492364031911, "loss": 0.5024, "step": 1600 }, { "epoch": 0.10005002501250625, "grad_norm": 14.4375, "learning_rate": 0.00019743826890157614, "loss": 0.4681, "step": 1700 }, { "epoch": 0.10593532060147721, "grad_norm": 10.375, "learning_rate": 0.0001969918976686652, "loss": 0.488, "step": 1800 }, { "epoch": 0.11182061619044817, "grad_norm": 9.5625, "learning_rate": 0.00019651028489132147, "loss": 0.4859, "step": 1900 }, { "epoch": 0.11770591177941912, "grad_norm": 15.125, "learning_rate": 0.0001959936055612557, "loss": 0.5028, "step": 2000 }, { "epoch": 0.12359120736839008, "grad_norm": 12.5625, "learning_rate": 0.0001954420474114435, "loss": 0.4937, "step": 2100 }, { "epoch": 0.12947650295736104, "grad_norm": 3.890625, "learning_rate": 0.00019485581084791376, "loss": 0.4801, "step": 2200 }, { "epoch": 0.13536179854633199, "grad_norm": 19.125, "learning_rate": 0.0001942351088769319, "loss": 0.4853, "step": 2300 }, { "epoch": 0.14124709413530295, "grad_norm": 11.8125, "learning_rate": 0.0001935801670276052, "loss": 0.4739, "step": 2400 }, { "epoch": 0.1471323897242739, "grad_norm": 35.5, "learning_rate": 0.00019289122326993777, "loss": 0.4868, "step": 2500 }, { "epoch": 0.15301768531324486, "grad_norm": 20.875, "learning_rate": 0.00019216852792836516, "loss": 0.4925, "step": 2600 }, { "epoch": 0.1589029809022158, "grad_norm": 12.5625, "learning_rate": 0.00019141234359080055, "loss": 0.4808, "step": 2700 }, { "epoch": 0.16478827649118677, "grad_norm": 8.6875, "learning_rate": 0.00019062294501322416, "loss": 0.4757, "step": 2800 }, { "epoch": 0.17067357208015774, "grad_norm": 20.625, "learning_rate": 0.0001898006190198525, "loss": 0.4805, "step": 2900 }, { "epoch": 0.17655886766912868, "grad_norm": 10.25, "learning_rate": 0.0001889456643989218, "loss": 0.4832, "step": 3000 }, { "epoch": 0.18244416325809965, "grad_norm": 20.25, "learning_rate": 0.00018805839179412485, "loss": 0.4559, "step": 3100 }, { "epoch": 0.18832945884707059, "grad_norm": 8.5625, "learning_rate": 0.00018713912359174, "loss": 0.497, "step": 3200 }, { "epoch": 0.19421475443604155, "grad_norm": 6.40625, "learning_rate": 0.00018618819380349382, "loss": 0.4776, "step": 3300 }, { "epoch": 0.2001000500250125, "grad_norm": 12.8125, "learning_rate": 0.00018520594794519941, "loss": 0.4915, "step": 3400 }, { "epoch": 0.20598534561398346, "grad_norm": 1.84375, "learning_rate": 0.00018419274291121485, "loss": 0.4498, "step": 3500 }, { "epoch": 0.21187064120295443, "grad_norm": 3.8125, "learning_rate": 0.00018314894684476736, "loss": 0.4625, "step": 3600 }, { "epoch": 0.21775593679192537, "grad_norm": 19.125, "learning_rate": 0.00018207493900419027, "loss": 0.4625, "step": 3700 }, { "epoch": 0.22364123238089634, "grad_norm": 11.5, "learning_rate": 0.00018097110962512128, "loss": 0.4655, "step": 3800 }, { "epoch": 0.22952652796986728, "grad_norm": 6.3125, "learning_rate": 0.00017983785977871209, "loss": 0.4488, "step": 3900 }, { "epoch": 0.23541182355883825, "grad_norm": 9.875, "learning_rate": 0.00017867560122590125, "loss": 0.4441, "step": 4000 }, { "epoch": 0.24129711914780919, "grad_norm": 12.875, "learning_rate": 0.00017748475626780277, "loss": 0.4732, "step": 4100 }, { "epoch": 0.24718241473678015, "grad_norm": 4.21875, "learning_rate": 0.0001762657575922649, "loss": 0.4544, "step": 4200 }, { "epoch": 0.2530677103257511, "grad_norm": 3.125, "learning_rate": 0.0001750190481166552, "loss": 0.4779, "step": 4300 }, { "epoch": 0.2589530059147221, "grad_norm": 2.1875, "learning_rate": 0.00017374508082692848, "loss": 0.4661, "step": 4400 }, { "epoch": 0.26483830150369303, "grad_norm": 26.25, "learning_rate": 0.0001724443186130367, "loss": 0.4916, "step": 4500 }, { "epoch": 0.27072359709266397, "grad_norm": 8.125, "learning_rate": 0.00017111723410073991, "loss": 0.449, "step": 4600 }, { "epoch": 0.2766088926816349, "grad_norm": 8.625, "learning_rate": 0.00016976430947988007, "loss": 0.45, "step": 4700 }, { "epoch": 0.2824941882706059, "grad_norm": 3.59375, "learning_rate": 0.00016838603632917954, "loss": 0.4593, "step": 4800 }, { "epoch": 0.28837948385957685, "grad_norm": 6.40625, "learning_rate": 0.0001669829154376285, "loss": 0.4847, "step": 4900 }, { "epoch": 0.2942647794485478, "grad_norm": 13.125, "learning_rate": 0.00016555545662252536, "loss": 0.4576, "step": 5000 }, { "epoch": 0.3001500750375188, "grad_norm": 14.3125, "learning_rate": 0.00016410417854423735, "loss": 0.4457, "step": 5100 }, { "epoch": 0.3060353706264897, "grad_norm": 29.0, "learning_rate": 0.00016262960851774752, "loss": 0.4972, "step": 5200 }, { "epoch": 0.31192066621546066, "grad_norm": 20.75, "learning_rate": 0.00016113228232105757, "loss": 0.4715, "step": 5300 }, { "epoch": 0.3178059618044316, "grad_norm": 22.5, "learning_rate": 0.0001596127440005152, "loss": 0.4696, "step": 5400 }, { "epoch": 0.3236912573934026, "grad_norm": 8.1875, "learning_rate": 0.00015807154567313775, "loss": 0.4629, "step": 5500 }, { "epoch": 0.32957655298237354, "grad_norm": 4.375, "learning_rate": 0.0001565092473260029, "loss": 0.475, "step": 5600 }, { "epoch": 0.3354618485713445, "grad_norm": 13.5, "learning_rate": 0.00015492641661278005, "loss": 0.4511, "step": 5700 }, { "epoch": 0.3413471441603155, "grad_norm": 3.5625, "learning_rate": 0.0001533236286474762, "loss": 0.4743, "step": 5800 }, { "epoch": 0.3472324397492864, "grad_norm": 11.8125, "learning_rate": 0.0001517014657954708, "loss": 0.4418, "step": 5900 }, { "epoch": 0.35311773533825735, "grad_norm": 26.125, "learning_rate": 0.00015006051746191626, "loss": 0.45, "step": 6000 }, { "epoch": 0.3590030309272283, "grad_norm": 15.375, "learning_rate": 0.00014840137987758028, "loss": 0.4463, "step": 6100 }, { "epoch": 0.3648883265161993, "grad_norm": 5.90625, "learning_rate": 0.00014672465588220837, "loss": 0.4559, "step": 6200 }, { "epoch": 0.37077362210517023, "grad_norm": 12.9375, "learning_rate": 0.0001450309547054846, "loss": 0.4398, "step": 6300 }, { "epoch": 0.37665891769414117, "grad_norm": 21.875, "learning_rate": 0.00014332089174567126, "loss": 0.4454, "step": 6400 }, { "epoch": 0.38254421328311217, "grad_norm": 16.875, "learning_rate": 0.00014159508834600657, "loss": 0.4443, "step": 6500 }, { "epoch": 0.3884295088720831, "grad_norm": 34.25, "learning_rate": 0.00013985417156894267, "loss": 0.4762, "step": 6600 }, { "epoch": 0.39431480446105405, "grad_norm": 4.5625, "learning_rate": 0.0001380987739683055, "loss": 0.4795, "step": 6700 }, { "epoch": 0.400200100050025, "grad_norm": 15.75, "learning_rate": 0.00013632953335945927, "loss": 0.4603, "step": 6800 }, { "epoch": 0.406085395638996, "grad_norm": 5.40625, "learning_rate": 0.00013454709258755942, "loss": 0.4674, "step": 6900 }, { "epoch": 0.4119706912279669, "grad_norm": 30.125, "learning_rate": 0.00013275209929397775, "loss": 0.4595, "step": 7000 }, { "epoch": 0.41785598681693786, "grad_norm": 16.875, "learning_rate": 0.0001309452056809851, "loss": 0.4398, "step": 7100 }, { "epoch": 0.42374128240590886, "grad_norm": 5.6875, "learning_rate": 0.00012912706827477671, "loss": 0.4693, "step": 7200 }, { "epoch": 0.4296265779948798, "grad_norm": 17.125, "learning_rate": 0.00012729834768692667, "loss": 0.4564, "step": 7300 }, { "epoch": 0.43551187358385074, "grad_norm": 9.75, "learning_rate": 0.00012545970837435756, "loss": 0.4732, "step": 7400 }, { "epoch": 0.4413971691728217, "grad_norm": 6.3125, "learning_rate": 0.00012361181839791357, "loss": 0.4647, "step": 7500 }, { "epoch": 0.4472824647617927, "grad_norm": 19.0, "learning_rate": 0.00012175534917962352, "loss": 0.4697, "step": 7600 }, { "epoch": 0.4531677603507636, "grad_norm": 19.375, "learning_rate": 0.00011989097525874294, "loss": 0.4814, "step": 7700 }, { "epoch": 0.45905305593973456, "grad_norm": 2.015625, "learning_rate": 0.00011801937404666336, "loss": 0.4688, "step": 7800 }, { "epoch": 0.46493835152870555, "grad_norm": 9.625, "learning_rate": 0.00011614122558077828, "loss": 0.4665, "step": 7900 }, { "epoch": 0.4708236471176765, "grad_norm": 21.875, "learning_rate": 0.00011425721227739465, "loss": 0.472, "step": 8000 } ], "logging_steps": 100, "max_steps": 16991, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.253476198349144e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }