{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9971509971509973,
  "eval_steps": 500,
  "global_step": 876,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022792022792022793,
      "grad_norm": 3.7423104091436783,
      "learning_rate": 5e-06,
      "loss": 0.7531,
      "step": 10
    },
    {
      "epoch": 0.045584045584045586,
      "grad_norm": 0.804241817019037,
      "learning_rate": 5e-06,
      "loss": 0.699,
      "step": 20
    },
    {
      "epoch": 0.06837606837606838,
      "grad_norm": 0.9152853396823092,
      "learning_rate": 5e-06,
      "loss": 0.679,
      "step": 30
    },
    {
      "epoch": 0.09116809116809117,
      "grad_norm": 0.8622513053123825,
      "learning_rate": 5e-06,
      "loss": 0.6792,
      "step": 40
    },
    {
      "epoch": 0.11396011396011396,
      "grad_norm": 0.8036004831872618,
      "learning_rate": 5e-06,
      "loss": 0.6753,
      "step": 50
    },
    {
      "epoch": 0.13675213675213677,
      "grad_norm": 0.6816477255973206,
      "learning_rate": 5e-06,
      "loss": 0.6551,
      "step": 60
    },
    {
      "epoch": 0.15954415954415954,
      "grad_norm": 0.44513213813326225,
      "learning_rate": 5e-06,
      "loss": 0.652,
      "step": 70
    },
    {
      "epoch": 0.18233618233618235,
      "grad_norm": 0.3742970630530666,
      "learning_rate": 5e-06,
      "loss": 0.655,
      "step": 80
    },
    {
      "epoch": 0.20512820512820512,
      "grad_norm": 0.3165130129136532,
      "learning_rate": 5e-06,
      "loss": 0.6398,
      "step": 90
    },
    {
      "epoch": 0.22792022792022792,
      "grad_norm": 0.3042067201330251,
      "learning_rate": 5e-06,
      "loss": 0.6411,
      "step": 100
    },
    {
      "epoch": 0.25071225071225073,
      "grad_norm": 0.35376934688819983,
      "learning_rate": 5e-06,
      "loss": 0.6324,
      "step": 110
    },
    {
      "epoch": 0.27350427350427353,
      "grad_norm": 0.30824084149805553,
      "learning_rate": 5e-06,
      "loss": 0.6407,
      "step": 120
    },
    {
      "epoch": 0.2962962962962963,
      "grad_norm": 0.29372321657922484,
      "learning_rate": 5e-06,
      "loss": 0.6416,
      "step": 130
    },
    {
      "epoch": 0.3190883190883191,
      "grad_norm": 0.30117031377143266,
      "learning_rate": 5e-06,
      "loss": 0.6438,
      "step": 140
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.3067841663492602,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 150
    },
    {
      "epoch": 0.3646723646723647,
      "grad_norm": 0.3116337699832478,
      "learning_rate": 5e-06,
      "loss": 0.6414,
      "step": 160
    },
    {
      "epoch": 0.38746438746438744,
      "grad_norm": 0.3594095078967235,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 170
    },
    {
      "epoch": 0.41025641025641024,
      "grad_norm": 0.30116165319677957,
      "learning_rate": 5e-06,
      "loss": 0.6388,
      "step": 180
    },
    {
      "epoch": 0.43304843304843305,
      "grad_norm": 0.3022545956449521,
      "learning_rate": 5e-06,
      "loss": 0.6314,
      "step": 190
    },
    {
      "epoch": 0.45584045584045585,
      "grad_norm": 0.3069466305582857,
      "learning_rate": 5e-06,
      "loss": 0.6394,
      "step": 200
    },
    {
      "epoch": 0.47863247863247865,
      "grad_norm": 0.30814435636491544,
      "learning_rate": 5e-06,
      "loss": 0.6357,
      "step": 210
    },
    {
      "epoch": 0.5014245014245015,
      "grad_norm": 0.3071303249181051,
      "learning_rate": 5e-06,
      "loss": 0.6395,
      "step": 220
    },
    {
      "epoch": 0.5242165242165242,
      "grad_norm": 0.32490357531744296,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 230
    },
    {
      "epoch": 0.5470085470085471,
      "grad_norm": 0.29591883031381105,
      "learning_rate": 5e-06,
      "loss": 0.6309,
      "step": 240
    },
    {
      "epoch": 0.5698005698005698,
      "grad_norm": 0.33357471359245844,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 250
    },
    {
      "epoch": 0.5925925925925926,
      "grad_norm": 0.2981284989408182,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 260
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.3102186918626153,
      "learning_rate": 5e-06,
      "loss": 0.6312,
      "step": 270
    },
    {
      "epoch": 0.6381766381766382,
      "grad_norm": 0.32329286600519713,
      "learning_rate": 5e-06,
      "loss": 0.642,
      "step": 280
    },
    {
      "epoch": 0.6609686609686609,
      "grad_norm": 0.3242832742998967,
      "learning_rate": 5e-06,
      "loss": 0.6307,
      "step": 290
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.29295198198920075,
      "learning_rate": 5e-06,
      "loss": 0.6287,
      "step": 300
    },
    {
      "epoch": 0.7065527065527065,
      "grad_norm": 0.29895032078931544,
      "learning_rate": 5e-06,
      "loss": 0.6286,
      "step": 310
    },
    {
      "epoch": 0.7293447293447294,
      "grad_norm": 0.3447337206328429,
      "learning_rate": 5e-06,
      "loss": 0.6292,
      "step": 320
    },
    {
      "epoch": 0.7521367521367521,
      "grad_norm": 0.30639369318166265,
      "learning_rate": 5e-06,
      "loss": 0.6258,
      "step": 330
    },
    {
      "epoch": 0.7749287749287749,
      "grad_norm": 0.32323869936115435,
      "learning_rate": 5e-06,
      "loss": 0.6316,
      "step": 340
    },
    {
      "epoch": 0.7977207977207977,
      "grad_norm": 0.33615968390765805,
      "learning_rate": 5e-06,
      "loss": 0.6387,
      "step": 350
    },
    {
      "epoch": 0.8205128205128205,
      "grad_norm": 0.3134642477669063,
      "learning_rate": 5e-06,
      "loss": 0.6305,
      "step": 360
    },
    {
      "epoch": 0.8433048433048433,
      "grad_norm": 0.32819600455604525,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 370
    },
    {
      "epoch": 0.8660968660968661,
      "grad_norm": 0.30983155185254496,
      "learning_rate": 5e-06,
      "loss": 0.6365,
      "step": 380
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 0.3264724671732252,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 390
    },
    {
      "epoch": 0.9116809116809117,
      "grad_norm": 0.2944418898860399,
      "learning_rate": 5e-06,
      "loss": 0.6383,
      "step": 400
    },
    {
      "epoch": 0.9344729344729344,
      "grad_norm": 0.2885387150113508,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 410
    },
    {
      "epoch": 0.9572649572649573,
      "grad_norm": 0.29703808786645514,
      "learning_rate": 5e-06,
      "loss": 0.6214,
      "step": 420
    },
    {
      "epoch": 0.98005698005698,
      "grad_norm": 0.3420563577044849,
      "learning_rate": 5e-06,
      "loss": 0.6345,
      "step": 430
    },
    {
      "epoch": 0.9982905982905983,
      "eval_loss": 0.6251205205917358,
      "eval_runtime": 442.0682,
      "eval_samples_per_second": 26.745,
      "eval_steps_per_second": 0.418,
      "step": 438
    },
    {
      "epoch": 1.0034188034188034,
      "grad_norm": 0.34625889399732096,
      "learning_rate": 5e-06,
      "loss": 0.6472,
      "step": 440
    },
    {
      "epoch": 1.0262108262108263,
      "grad_norm": 0.3281982864617852,
      "learning_rate": 5e-06,
      "loss": 0.6017,
      "step": 450
    },
    {
      "epoch": 1.049002849002849,
      "grad_norm": 0.3141455005951138,
      "learning_rate": 5e-06,
      "loss": 0.5954,
      "step": 460
    },
    {
      "epoch": 1.0717948717948718,
      "grad_norm": 0.28996166460793643,
      "learning_rate": 5e-06,
      "loss": 0.5962,
      "step": 470
    },
    {
      "epoch": 1.0945868945868946,
      "grad_norm": 0.3076841135262722,
      "learning_rate": 5e-06,
      "loss": 0.6039,
      "step": 480
    },
    {
      "epoch": 1.1173789173789175,
      "grad_norm": 0.3297318480619313,
      "learning_rate": 5e-06,
      "loss": 0.6089,
      "step": 490
    },
    {
      "epoch": 1.1401709401709401,
      "grad_norm": 0.29043048157109264,
      "learning_rate": 5e-06,
      "loss": 0.5988,
      "step": 500
    },
    {
      "epoch": 1.162962962962963,
      "grad_norm": 0.31369187178492997,
      "learning_rate": 5e-06,
      "loss": 0.5966,
      "step": 510
    },
    {
      "epoch": 1.1857549857549858,
      "grad_norm": 0.295573169618973,
      "learning_rate": 5e-06,
      "loss": 0.6023,
      "step": 520
    },
    {
      "epoch": 1.2085470085470085,
      "grad_norm": 0.32173567699812744,
      "learning_rate": 5e-06,
      "loss": 0.5953,
      "step": 530
    },
    {
      "epoch": 1.2313390313390313,
      "grad_norm": 0.28814724323263435,
      "learning_rate": 5e-06,
      "loss": 0.6055,
      "step": 540
    },
    {
      "epoch": 1.2541310541310542,
      "grad_norm": 0.33914723903856536,
      "learning_rate": 5e-06,
      "loss": 0.6022,
      "step": 550
    },
    {
      "epoch": 1.2769230769230768,
      "grad_norm": 0.2903868392469319,
      "learning_rate": 5e-06,
      "loss": 0.6037,
      "step": 560
    },
    {
      "epoch": 1.2997150997150997,
      "grad_norm": 0.30247551693702623,
      "learning_rate": 5e-06,
      "loss": 0.5977,
      "step": 570
    },
    {
      "epoch": 1.3225071225071225,
      "grad_norm": 0.269850218574931,
      "learning_rate": 5e-06,
      "loss": 0.5962,
      "step": 580
    },
    {
      "epoch": 1.3452991452991454,
      "grad_norm": 0.297302467684091,
      "learning_rate": 5e-06,
      "loss": 0.5958,
      "step": 590
    },
    {
      "epoch": 1.368091168091168,
      "grad_norm": 0.3510894377240081,
      "learning_rate": 5e-06,
      "loss": 0.6015,
      "step": 600
    },
    {
      "epoch": 1.390883190883191,
      "grad_norm": 0.3251341988125297,
      "learning_rate": 5e-06,
      "loss": 0.6015,
      "step": 610
    },
    {
      "epoch": 1.4136752136752135,
      "grad_norm": 0.2909407195521434,
      "learning_rate": 5e-06,
      "loss": 0.6041,
      "step": 620
    },
    {
      "epoch": 1.4364672364672364,
      "grad_norm": 0.31809945732936756,
      "learning_rate": 5e-06,
      "loss": 0.6021,
      "step": 630
    },
    {
      "epoch": 1.4592592592592593,
      "grad_norm": 0.308053494784738,
      "learning_rate": 5e-06,
      "loss": 0.597,
      "step": 640
    },
    {
      "epoch": 1.4820512820512821,
      "grad_norm": 0.29613688364727164,
      "learning_rate": 5e-06,
      "loss": 0.5949,
      "step": 650
    },
    {
      "epoch": 1.504843304843305,
      "grad_norm": 0.2899379408613246,
      "learning_rate": 5e-06,
      "loss": 0.5946,
      "step": 660
    },
    {
      "epoch": 1.5276353276353276,
      "grad_norm": 0.3137945273300998,
      "learning_rate": 5e-06,
      "loss": 0.6022,
      "step": 670
    },
    {
      "epoch": 1.5504273504273505,
      "grad_norm": 0.2724494831447769,
      "learning_rate": 5e-06,
      "loss": 0.5908,
      "step": 680
    },
    {
      "epoch": 1.573219373219373,
      "grad_norm": 0.32140838815196376,
      "learning_rate": 5e-06,
      "loss": 0.5947,
      "step": 690
    },
    {
      "epoch": 1.596011396011396,
      "grad_norm": 0.2848716261938471,
      "learning_rate": 5e-06,
      "loss": 0.6102,
      "step": 700
    },
    {
      "epoch": 1.6188034188034188,
      "grad_norm": 0.3068892037035822,
      "learning_rate": 5e-06,
      "loss": 0.5973,
      "step": 710
    },
    {
      "epoch": 1.6415954415954417,
      "grad_norm": 0.2913053429545111,
      "learning_rate": 5e-06,
      "loss": 0.6019,
      "step": 720
    },
    {
      "epoch": 1.6643874643874645,
      "grad_norm": 0.32543222896562873,
      "learning_rate": 5e-06,
      "loss": 0.595,
      "step": 730
    },
    {
      "epoch": 1.6871794871794872,
      "grad_norm": 0.326218112009388,
      "learning_rate": 5e-06,
      "loss": 0.5961,
      "step": 740
    },
    {
      "epoch": 1.7099715099715098,
      "grad_norm": 0.3003624805192447,
      "learning_rate": 5e-06,
      "loss": 0.5935,
      "step": 750
    },
    {
      "epoch": 1.7327635327635327,
      "grad_norm": 0.31631458744002094,
      "learning_rate": 5e-06,
      "loss": 0.5974,
      "step": 760
    },
    {
      "epoch": 1.7555555555555555,
      "grad_norm": 0.2957884880110632,
      "learning_rate": 5e-06,
      "loss": 0.5938,
      "step": 770
    },
    {
      "epoch": 1.7783475783475784,
      "grad_norm": 0.30827577948493284,
      "learning_rate": 5e-06,
      "loss": 0.6009,
      "step": 780
    },
    {
      "epoch": 1.8011396011396013,
      "grad_norm": 0.29302997826431615,
      "learning_rate": 5e-06,
      "loss": 0.5985,
      "step": 790
    },
    {
      "epoch": 1.823931623931624,
      "grad_norm": 0.31044034277936555,
      "learning_rate": 5e-06,
      "loss": 0.5853,
      "step": 800
    },
    {
      "epoch": 1.8467236467236468,
      "grad_norm": 0.3167005866324159,
      "learning_rate": 5e-06,
      "loss": 0.5932,
      "step": 810
    },
    {
      "epoch": 1.8695156695156694,
      "grad_norm": 0.30381283586000696,
      "learning_rate": 5e-06,
      "loss": 0.5909,
      "step": 820
    },
    {
      "epoch": 1.8923076923076922,
      "grad_norm": 0.3231388546230539,
      "learning_rate": 5e-06,
      "loss": 0.5947,
      "step": 830
    },
    {
      "epoch": 1.915099715099715,
      "grad_norm": 0.31619140661356115,
      "learning_rate": 5e-06,
      "loss": 0.5992,
      "step": 840
    },
    {
      "epoch": 1.937891737891738,
      "grad_norm": 0.3008849085639024,
      "learning_rate": 5e-06,
      "loss": 0.5973,
      "step": 850
    },
    {
      "epoch": 1.9606837606837608,
      "grad_norm": 0.3110240663396626,
      "learning_rate": 5e-06,
      "loss": 0.5986,
      "step": 860
    },
    {
      "epoch": 1.9834757834757835,
      "grad_norm": 0.3538843685145682,
      "learning_rate": 5e-06,
      "loss": 0.5961,
      "step": 870
    },
    {
      "epoch": 1.9971509971509973,
      "eval_loss": 0.6187613010406494,
      "eval_runtime": 442.0084,
      "eval_samples_per_second": 26.748,
      "eval_steps_per_second": 0.419,
      "step": 876
    },
    {
      "epoch": 1.9971509971509973,
      "step": 876,
      "total_flos": 1836725450047488.0,
      "train_loss": 0.6210468141999963,
      "train_runtime": 47044.2331,
      "train_samples_per_second": 9.549,
      "train_steps_per_second": 0.019
    }
  ],
  "logging_steps": 10,
  "max_steps": 876,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1836725450047488.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}