{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.4708236471176765,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005885295588970956,
      "grad_norm": 29.875,
      "learning_rate": 3.9215686274509805e-05,
      "loss": 1.0686,
      "step": 100
    },
    {
      "epoch": 0.011770591177941912,
      "grad_norm": 11.375,
      "learning_rate": 7.843137254901961e-05,
      "loss": 0.6153,
      "step": 200
    },
    {
      "epoch": 0.01765588676691287,
      "grad_norm": 20.25,
      "learning_rate": 0.00011764705882352942,
      "loss": 0.5969,
      "step": 300
    },
    {
      "epoch": 0.023541182355883823,
      "grad_norm": 4.5,
      "learning_rate": 0.00015686274509803922,
      "loss": 0.5478,
      "step": 400
    },
    {
      "epoch": 0.02942647794485478,
      "grad_norm": 7.40625,
      "learning_rate": 0.000196078431372549,
      "loss": 0.5599,
      "step": 500
    },
    {
      "epoch": 0.03531177353382574,
      "grad_norm": 2.71875,
      "learning_rate": 0.00019998528443307886,
      "loss": 0.565,
      "step": 600
    },
    {
      "epoch": 0.04119706912279669,
      "grad_norm": 30.875,
      "learning_rate": 0.00019993442136695625,
      "loss": 0.5501,
      "step": 700
    },
    {
      "epoch": 0.047082364711767646,
      "grad_norm": 3.328125,
      "learning_rate": 0.00019984724760441856,
      "loss": 0.5355,
      "step": 800
    },
    {
      "epoch": 0.05296766030073861,
      "grad_norm": 6.03125,
      "learning_rate": 0.00019972379481963764,
      "loss": 0.5344,
      "step": 900
    },
    {
      "epoch": 0.05885295588970956,
      "grad_norm": 29.75,
      "learning_rate": 0.00019956410786859524,
      "loss": 0.5016,
      "step": 1000
    },
    {
      "epoch": 0.06473825147868052,
      "grad_norm": 14.0625,
      "learning_rate": 0.00019936824477278514,
      "loss": 0.5091,
      "step": 1100
    },
    {
      "epoch": 0.07062354706765148,
      "grad_norm": 24.75,
      "learning_rate": 0.00019913627669813103,
      "loss": 0.5005,
      "step": 1200
    },
    {
      "epoch": 0.07650884265662243,
      "grad_norm": 3.203125,
      "learning_rate": 0.00019886828792912894,
      "loss": 0.4961,
      "step": 1300
    },
    {
      "epoch": 0.08239413824559338,
      "grad_norm": 1.875,
      "learning_rate": 0.0001985643758382227,
      "loss": 0.4755,
      "step": 1400
    },
    {
      "epoch": 0.08827943383456434,
      "grad_norm": 7.46875,
      "learning_rate": 0.00019822465085042422,
      "loss": 0.4889,
      "step": 1500
    },
    {
      "epoch": 0.09416472942353529,
      "grad_norm": 3.859375,
      "learning_rate": 0.0001978492364031911,
      "loss": 0.5024,
      "step": 1600
    },
    {
      "epoch": 0.10005002501250625,
      "grad_norm": 14.4375,
      "learning_rate": 0.00019743826890157614,
      "loss": 0.4681,
      "step": 1700
    },
    {
      "epoch": 0.10593532060147721,
      "grad_norm": 10.375,
      "learning_rate": 0.0001969918976686652,
      "loss": 0.488,
      "step": 1800
    },
    {
      "epoch": 0.11182061619044817,
      "grad_norm": 9.5625,
      "learning_rate": 0.00019651028489132147,
      "loss": 0.4859,
      "step": 1900
    },
    {
      "epoch": 0.11770591177941912,
      "grad_norm": 15.125,
      "learning_rate": 0.0001959936055612557,
      "loss": 0.5028,
      "step": 2000
    },
    {
      "epoch": 0.12359120736839008,
      "grad_norm": 12.5625,
      "learning_rate": 0.0001954420474114435,
      "loss": 0.4937,
      "step": 2100
    },
    {
      "epoch": 0.12947650295736104,
      "grad_norm": 3.890625,
      "learning_rate": 0.00019485581084791376,
      "loss": 0.4801,
      "step": 2200
    },
    {
      "epoch": 0.13536179854633199,
      "grad_norm": 19.125,
      "learning_rate": 0.0001942351088769319,
      "loss": 0.4853,
      "step": 2300
    },
    {
      "epoch": 0.14124709413530295,
      "grad_norm": 11.8125,
      "learning_rate": 0.0001935801670276052,
      "loss": 0.4739,
      "step": 2400
    },
    {
      "epoch": 0.1471323897242739,
      "grad_norm": 35.5,
      "learning_rate": 0.00019289122326993777,
      "loss": 0.4868,
      "step": 2500
    },
    {
      "epoch": 0.15301768531324486,
      "grad_norm": 20.875,
      "learning_rate": 0.00019216852792836516,
      "loss": 0.4925,
      "step": 2600
    },
    {
      "epoch": 0.1589029809022158,
      "grad_norm": 12.5625,
      "learning_rate": 0.00019141234359080055,
      "loss": 0.4808,
      "step": 2700
    },
    {
      "epoch": 0.16478827649118677,
      "grad_norm": 8.6875,
      "learning_rate": 0.00019062294501322416,
      "loss": 0.4757,
      "step": 2800
    },
    {
      "epoch": 0.17067357208015774,
      "grad_norm": 20.625,
      "learning_rate": 0.0001898006190198525,
      "loss": 0.4805,
      "step": 2900
    },
    {
      "epoch": 0.17655886766912868,
      "grad_norm": 10.25,
      "learning_rate": 0.0001889456643989218,
      "loss": 0.4832,
      "step": 3000
    },
    {
      "epoch": 0.18244416325809965,
      "grad_norm": 20.25,
      "learning_rate": 0.00018805839179412485,
      "loss": 0.4559,
      "step": 3100
    },
    {
      "epoch": 0.18832945884707059,
      "grad_norm": 8.5625,
      "learning_rate": 0.00018713912359174,
      "loss": 0.497,
      "step": 3200
    },
    {
      "epoch": 0.19421475443604155,
      "grad_norm": 6.40625,
      "learning_rate": 0.00018618819380349382,
      "loss": 0.4776,
      "step": 3300
    },
    {
      "epoch": 0.2001000500250125,
      "grad_norm": 12.8125,
      "learning_rate": 0.00018520594794519941,
      "loss": 0.4915,
      "step": 3400
    },
    {
      "epoch": 0.20598534561398346,
      "grad_norm": 1.84375,
      "learning_rate": 0.00018419274291121485,
      "loss": 0.4498,
      "step": 3500
    },
    {
      "epoch": 0.21187064120295443,
      "grad_norm": 3.8125,
      "learning_rate": 0.00018314894684476736,
      "loss": 0.4625,
      "step": 3600
    },
    {
      "epoch": 0.21775593679192537,
      "grad_norm": 19.125,
      "learning_rate": 0.00018207493900419027,
      "loss": 0.4625,
      "step": 3700
    },
    {
      "epoch": 0.22364123238089634,
      "grad_norm": 11.5,
      "learning_rate": 0.00018097110962512128,
      "loss": 0.4655,
      "step": 3800
    },
    {
      "epoch": 0.22952652796986728,
      "grad_norm": 6.3125,
      "learning_rate": 0.00017983785977871209,
      "loss": 0.4488,
      "step": 3900
    },
    {
      "epoch": 0.23541182355883825,
      "grad_norm": 9.875,
      "learning_rate": 0.00017867560122590125,
      "loss": 0.4441,
      "step": 4000
    },
    {
      "epoch": 0.24129711914780919,
      "grad_norm": 12.875,
      "learning_rate": 0.00017748475626780277,
      "loss": 0.4732,
      "step": 4100
    },
    {
      "epoch": 0.24718241473678015,
      "grad_norm": 4.21875,
      "learning_rate": 0.0001762657575922649,
      "loss": 0.4544,
      "step": 4200
    },
    {
      "epoch": 0.2530677103257511,
      "grad_norm": 3.125,
      "learning_rate": 0.0001750190481166552,
      "loss": 0.4779,
      "step": 4300
    },
    {
      "epoch": 0.2589530059147221,
      "grad_norm": 2.1875,
      "learning_rate": 0.00017374508082692848,
      "loss": 0.4661,
      "step": 4400
    },
    {
      "epoch": 0.26483830150369303,
      "grad_norm": 26.25,
      "learning_rate": 0.0001724443186130367,
      "loss": 0.4916,
      "step": 4500
    },
    {
      "epoch": 0.27072359709266397,
      "grad_norm": 8.125,
      "learning_rate": 0.00017111723410073991,
      "loss": 0.449,
      "step": 4600
    },
    {
      "epoch": 0.2766088926816349,
      "grad_norm": 8.625,
      "learning_rate": 0.00016976430947988007,
      "loss": 0.45,
      "step": 4700
    },
    {
      "epoch": 0.2824941882706059,
      "grad_norm": 3.59375,
      "learning_rate": 0.00016838603632917954,
      "loss": 0.4593,
      "step": 4800
    },
    {
      "epoch": 0.28837948385957685,
      "grad_norm": 6.40625,
      "learning_rate": 0.0001669829154376285,
      "loss": 0.4847,
      "step": 4900
    },
    {
      "epoch": 0.2942647794485478,
      "grad_norm": 13.125,
      "learning_rate": 0.00016555545662252536,
      "loss": 0.4576,
      "step": 5000
    },
    {
      "epoch": 0.3001500750375188,
      "grad_norm": 14.3125,
      "learning_rate": 0.00016410417854423735,
      "loss": 0.4457,
      "step": 5100
    },
    {
      "epoch": 0.3060353706264897,
      "grad_norm": 29.0,
      "learning_rate": 0.00016262960851774752,
      "loss": 0.4972,
      "step": 5200
    },
    {
      "epoch": 0.31192066621546066,
      "grad_norm": 20.75,
      "learning_rate": 0.00016113228232105757,
      "loss": 0.4715,
      "step": 5300
    },
    {
      "epoch": 0.3178059618044316,
      "grad_norm": 22.5,
      "learning_rate": 0.0001596127440005152,
      "loss": 0.4696,
      "step": 5400
    },
    {
      "epoch": 0.3236912573934026,
      "grad_norm": 8.1875,
      "learning_rate": 0.00015807154567313775,
      "loss": 0.4629,
      "step": 5500
    },
    {
      "epoch": 0.32957655298237354,
      "grad_norm": 4.375,
      "learning_rate": 0.0001565092473260029,
      "loss": 0.475,
      "step": 5600
    },
    {
      "epoch": 0.3354618485713445,
      "grad_norm": 13.5,
      "learning_rate": 0.00015492641661278005,
      "loss": 0.4511,
      "step": 5700
    },
    {
      "epoch": 0.3413471441603155,
      "grad_norm": 3.5625,
      "learning_rate": 0.0001533236286474762,
      "loss": 0.4743,
      "step": 5800
    },
    {
      "epoch": 0.3472324397492864,
      "grad_norm": 11.8125,
      "learning_rate": 0.0001517014657954708,
      "loss": 0.4418,
      "step": 5900
    },
    {
      "epoch": 0.35311773533825735,
      "grad_norm": 26.125,
      "learning_rate": 0.00015006051746191626,
      "loss": 0.45,
      "step": 6000
    },
    {
      "epoch": 0.3590030309272283,
      "grad_norm": 15.375,
      "learning_rate": 0.00014840137987758028,
      "loss": 0.4463,
      "step": 6100
    },
    {
      "epoch": 0.3648883265161993,
      "grad_norm": 5.90625,
      "learning_rate": 0.00014672465588220837,
      "loss": 0.4559,
      "step": 6200
    },
    {
      "epoch": 0.37077362210517023,
      "grad_norm": 12.9375,
      "learning_rate": 0.0001450309547054846,
      "loss": 0.4398,
      "step": 6300
    },
    {
      "epoch": 0.37665891769414117,
      "grad_norm": 21.875,
      "learning_rate": 0.00014332089174567126,
      "loss": 0.4454,
      "step": 6400
    },
    {
      "epoch": 0.38254421328311217,
      "grad_norm": 16.875,
      "learning_rate": 0.00014159508834600657,
      "loss": 0.4443,
      "step": 6500
    },
    {
      "epoch": 0.3884295088720831,
      "grad_norm": 34.25,
      "learning_rate": 0.00013985417156894267,
      "loss": 0.4762,
      "step": 6600
    },
    {
      "epoch": 0.39431480446105405,
      "grad_norm": 4.5625,
      "learning_rate": 0.0001380987739683055,
      "loss": 0.4795,
      "step": 6700
    },
    {
      "epoch": 0.400200100050025,
      "grad_norm": 15.75,
      "learning_rate": 0.00013632953335945927,
      "loss": 0.4603,
      "step": 6800
    },
    {
      "epoch": 0.406085395638996,
      "grad_norm": 5.40625,
      "learning_rate": 0.00013454709258755942,
      "loss": 0.4674,
      "step": 6900
    },
    {
      "epoch": 0.4119706912279669,
      "grad_norm": 30.125,
      "learning_rate": 0.00013275209929397775,
      "loss": 0.4595,
      "step": 7000
    },
    {
      "epoch": 0.41785598681693786,
      "grad_norm": 16.875,
      "learning_rate": 0.0001309452056809851,
      "loss": 0.4398,
      "step": 7100
    },
    {
      "epoch": 0.42374128240590886,
      "grad_norm": 5.6875,
      "learning_rate": 0.00012912706827477671,
      "loss": 0.4693,
      "step": 7200
    },
    {
      "epoch": 0.4296265779948798,
      "grad_norm": 17.125,
      "learning_rate": 0.00012729834768692667,
      "loss": 0.4564,
      "step": 7300
    },
    {
      "epoch": 0.43551187358385074,
      "grad_norm": 9.75,
      "learning_rate": 0.00012545970837435756,
      "loss": 0.4732,
      "step": 7400
    },
    {
      "epoch": 0.4413971691728217,
      "grad_norm": 6.3125,
      "learning_rate": 0.00012361181839791357,
      "loss": 0.4647,
      "step": 7500
    },
    {
      "epoch": 0.4472824647617927,
      "grad_norm": 19.0,
      "learning_rate": 0.00012175534917962352,
      "loss": 0.4697,
      "step": 7600
    },
    {
      "epoch": 0.4531677603507636,
      "grad_norm": 19.375,
      "learning_rate": 0.00011989097525874294,
      "loss": 0.4814,
      "step": 7700
    },
    {
      "epoch": 0.45905305593973456,
      "grad_norm": 2.015625,
      "learning_rate": 0.00011801937404666336,
      "loss": 0.4688,
      "step": 7800
    },
    {
      "epoch": 0.46493835152870555,
      "grad_norm": 9.625,
      "learning_rate": 0.00011614122558077828,
      "loss": 0.4665,
      "step": 7900
    },
    {
      "epoch": 0.4708236471176765,
      "grad_norm": 21.875,
      "learning_rate": 0.00011425721227739465,
      "loss": 0.472,
      "step": 8000
    }
  ],
  "logging_steps": 100,
  "max_steps": 16991,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 4000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.253476198349144e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}