{ "best_global_step": 1, "best_metric": 0.0719488188624382, "best_model_checkpoint": "outputs_3/checkpoint-1125", "epoch": 2.2681564245810057, "eval_steps": 20, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032310177705977385, "grad_norm": 0.0250613521784544, "learning_rate": 0.0, "loss": 0.1103, "step": 1 }, { "epoch": 0.0446927374301676, "grad_norm": 0.1923094540834427, "learning_rate": 1.5828635851183766e-05, "loss": 0.0672, "step": 2 }, { "epoch": 0.0670391061452514, "grad_norm": 0.16962496936321259, "learning_rate": 0.0, "loss": 0.063, "step": 3 }, { "epoch": 0.0893854748603352, "grad_norm": 0.19785423576831818, "learning_rate": 0.0, "loss": 0.0712, "step": 4 }, { "epoch": 0.11173184357541899, "grad_norm": 0.21228645741939545, "learning_rate": 0.0, "loss": 0.0725, "step": 5 }, { "epoch": 0.1340782122905028, "grad_norm": 0.18974730372428894, "learning_rate": 0.0, "loss": 0.0627, "step": 6 }, { "epoch": 0.1564245810055866, "grad_norm": 0.19599883258342743, "learning_rate": 0.0, "loss": 0.0712, "step": 7 }, { "epoch": 0.1787709497206704, "grad_norm": 0.17850686609745026, "learning_rate": 0.0, "loss": 0.0579, "step": 8 }, { "epoch": 0.2011173184357542, "grad_norm": 0.24011921882629395, "learning_rate": 0.0, "loss": 0.0828, "step": 9 }, { "epoch": 0.22346368715083798, "grad_norm": 0.20377422869205475, "learning_rate": 0.0, "loss": 0.0627, "step": 10 }, { "epoch": 0.24581005586592178, "grad_norm": 0.21203763782978058, "learning_rate": 0.0, "loss": 0.0745, "step": 11 }, { "epoch": 0.2681564245810056, "grad_norm": 0.21539321541786194, "learning_rate": 0.0, "loss": 0.0727, "step": 12 }, { "epoch": 0.2905027932960894, "grad_norm": 0.16341185569763184, "learning_rate": 0.0, "loss": 0.0521, "step": 13 }, { "epoch": 0.3128491620111732, "grad_norm": 0.24054378271102905, "learning_rate": 0.0, "loss": 0.0816, "step": 14 }, { "epoch": 0.33519553072625696, "grad_norm": 0.25560539960861206, "learning_rate": 0.0, "loss": 0.0833, "step": 15 }, { "epoch": 0.3575418994413408, "grad_norm": 0.20083709061145782, "learning_rate": 0.0, "loss": 0.0655, "step": 16 }, { "epoch": 0.37988826815642457, "grad_norm": 0.14186497032642365, "learning_rate": 0.0, "loss": 0.0487, "step": 17 }, { "epoch": 0.4022346368715084, "grad_norm": 0.21731677651405334, "learning_rate": 0.0, "loss": 0.0723, "step": 18 }, { "epoch": 0.4245810055865922, "grad_norm": 0.1948932707309723, "learning_rate": 0.0, "loss": 0.0648, "step": 19 }, { "epoch": 0.44692737430167595, "grad_norm": 0.24341759085655212, "learning_rate": 0.0, "loss": 0.0748, "step": 20 }, { "epoch": 0.4692737430167598, "grad_norm": 0.24712610244750977, "learning_rate": 0.0, "loss": 0.0802, "step": 21 }, { "epoch": 0.49162011173184356, "grad_norm": 0.1936245709657669, "learning_rate": 0.0, "loss": 0.0657, "step": 22 }, { "epoch": 0.5139664804469274, "grad_norm": 0.2752823829650879, "learning_rate": 0.0, "loss": 0.0965, "step": 23 }, { "epoch": 0.5363128491620112, "grad_norm": 0.2546307444572449, "learning_rate": 0.0, "loss": 0.08, "step": 24 }, { "epoch": 0.5586592178770949, "grad_norm": 0.17411033809185028, "learning_rate": 0.0, "loss": 0.0625, "step": 25 }, { "epoch": 0.5810055865921788, "grad_norm": 0.22320827841758728, "learning_rate": 0.0, "loss": 0.075, "step": 26 }, { "epoch": 0.6033519553072626, "grad_norm": 0.19832342863082886, "learning_rate": 0.0, "loss": 0.0672, "step": 27 }, { "epoch": 0.6256983240223464, "grad_norm": 0.2498759627342224, "learning_rate": 0.0, "loss": 0.0825, "step": 28 }, { "epoch": 0.6480446927374302, "grad_norm": 0.20985010266304016, "learning_rate": 0.0, "loss": 0.072, "step": 29 }, { "epoch": 0.6703910614525139, "grad_norm": 0.1565495878458023, "learning_rate": 0.0, "loss": 0.0579, "step": 30 }, { "epoch": 0.6927374301675978, "grad_norm": 0.27120932936668396, "learning_rate": 0.0, "loss": 0.1052, "step": 31 }, { "epoch": 0.7150837988826816, "grad_norm": 0.2077290564775467, "learning_rate": 0.0, "loss": 0.071, "step": 32 }, { "epoch": 0.7374301675977654, "grad_norm": 0.2480878382921219, "learning_rate": 0.0, "loss": 0.0856, "step": 33 }, { "epoch": 0.7597765363128491, "grad_norm": 0.17309145629405975, "learning_rate": 0.0, "loss": 0.0508, "step": 34 }, { "epoch": 0.7821229050279329, "grad_norm": 0.26946860551834106, "learning_rate": 0.0, "loss": 0.0797, "step": 35 }, { "epoch": 0.8044692737430168, "grad_norm": 0.24403966963291168, "learning_rate": 0.0, "loss": 0.0824, "step": 36 }, { "epoch": 0.8268156424581006, "grad_norm": 0.2867369055747986, "learning_rate": 0.0, "loss": 0.0982, "step": 37 }, { "epoch": 0.8491620111731844, "grad_norm": 0.19101133942604065, "learning_rate": 0.0, "loss": 0.0633, "step": 38 }, { "epoch": 0.8715083798882681, "grad_norm": 0.15404891967773438, "learning_rate": 0.0, "loss": 0.0558, "step": 39 }, { "epoch": 0.8938547486033519, "grad_norm": 0.174382746219635, "learning_rate": 0.0, "loss": 0.0553, "step": 40 }, { "epoch": 0.9162011173184358, "grad_norm": 0.18793272972106934, "learning_rate": 0.0, "loss": 0.0655, "step": 41 }, { "epoch": 0.9385474860335196, "grad_norm": 0.1648206263780594, "learning_rate": 0.0, "loss": 0.053, "step": 42 }, { "epoch": 0.9608938547486033, "grad_norm": 0.21228265762329102, "learning_rate": 0.0, "loss": 0.0656, "step": 43 }, { "epoch": 0.9832402234636871, "grad_norm": 0.26687151193618774, "learning_rate": 0.0, "loss": 0.0834, "step": 44 }, { "epoch": 1.0223463687150838, "grad_norm": 0.3744015395641327, "learning_rate": 0.0, "loss": 0.1444, "step": 45 }, { "epoch": 1.0446927374301676, "grad_norm": 0.23247657716274261, "learning_rate": 0.0, "loss": 0.0745, "step": 46 }, { "epoch": 1.0670391061452513, "grad_norm": 0.1918107271194458, "learning_rate": 0.0, "loss": 0.0609, "step": 47 }, { "epoch": 1.089385474860335, "grad_norm": 0.18841643631458282, "learning_rate": 0.0, "loss": 0.0557, "step": 48 }, { "epoch": 1.111731843575419, "grad_norm": 0.23093074560165405, "learning_rate": 0.0, "loss": 0.0779, "step": 49 }, { "epoch": 1.1340782122905029, "grad_norm": 0.1891765594482422, "learning_rate": 0.0, "loss": 0.0539, "step": 50 }, { "epoch": 1.1564245810055866, "grad_norm": 0.17240329086780548, "learning_rate": 0.0, "loss": 0.0626, "step": 51 }, { "epoch": 1.1787709497206704, "grad_norm": 0.1980515420436859, "learning_rate": 0.0, "loss": 0.0635, "step": 52 }, { "epoch": 1.2011173184357542, "grad_norm": 0.14951692521572113, "learning_rate": 0.0, "loss": 0.0528, "step": 53 }, { "epoch": 1.223463687150838, "grad_norm": 0.21029718220233917, "learning_rate": 0.0, "loss": 0.0685, "step": 54 }, { "epoch": 1.2458100558659218, "grad_norm": 0.19979310035705566, "learning_rate": 0.0, "loss": 0.0688, "step": 55 }, { "epoch": 1.2681564245810055, "grad_norm": 0.20193366706371307, "learning_rate": 0.0, "loss": 0.0558, "step": 56 }, { "epoch": 1.2905027932960893, "grad_norm": 0.1936744749546051, "learning_rate": 0.0, "loss": 0.0708, "step": 57 }, { "epoch": 1.3128491620111733, "grad_norm": 0.2125776708126068, "learning_rate": 0.0, "loss": 0.0755, "step": 58 }, { "epoch": 1.3351955307262569, "grad_norm": 0.23341749608516693, "learning_rate": 0.0, "loss": 0.0739, "step": 59 }, { "epoch": 1.3575418994413408, "grad_norm": 0.18041910231113434, "learning_rate": 0.0, "loss": 0.0599, "step": 60 }, { "epoch": 1.3798882681564246, "grad_norm": 0.1614307016134262, "learning_rate": 0.0, "loss": 0.0515, "step": 61 }, { "epoch": 1.4022346368715084, "grad_norm": 0.20286191999912262, "learning_rate": 0.0, "loss": 0.071, "step": 62 }, { "epoch": 1.4245810055865922, "grad_norm": 0.22490696609020233, "learning_rate": 0.0, "loss": 0.0753, "step": 63 }, { "epoch": 1.446927374301676, "grad_norm": 0.19001837074756622, "learning_rate": 0.0, "loss": 0.0621, "step": 64 }, { "epoch": 1.4692737430167597, "grad_norm": 0.23111850023269653, "learning_rate": 0.0, "loss": 0.0719, "step": 65 }, { "epoch": 1.4916201117318435, "grad_norm": 0.1808815598487854, "learning_rate": 0.0, "loss": 0.0605, "step": 66 }, { "epoch": 1.5139664804469275, "grad_norm": 36.47340393066406, "learning_rate": 0.0, "loss": 0.0895, "step": 67 }, { "epoch": 1.536312849162011, "grad_norm": 0.24750609695911407, "learning_rate": 0.0, "loss": 0.0873, "step": 68 }, { "epoch": 1.558659217877095, "grad_norm": 0.26650676131248474, "learning_rate": 0.0, "loss": 0.0875, "step": 69 }, { "epoch": 1.5810055865921788, "grad_norm": 0.24256528913974762, "learning_rate": 0.0, "loss": 0.0858, "step": 70 }, { "epoch": 1.6033519553072626, "grad_norm": 0.1995740830898285, "learning_rate": 0.0, "loss": 0.0699, "step": 71 }, { "epoch": 1.6256983240223464, "grad_norm": 0.22049686312675476, "learning_rate": 0.0, "loss": 0.0783, "step": 72 }, { "epoch": 1.6480446927374302, "grad_norm": 0.3053763508796692, "learning_rate": 0.0, "loss": 0.0975, "step": 73 }, { "epoch": 1.670391061452514, "grad_norm": 0.2576776444911957, "learning_rate": 0.0, "loss": 0.0829, "step": 74 }, { "epoch": 1.6927374301675977, "grad_norm": 0.2177535593509674, "learning_rate": 0.0, "loss": 0.0821, "step": 75 }, { "epoch": 1.7150837988826817, "grad_norm": 0.1927560567855835, "learning_rate": 0.0, "loss": 0.0748, "step": 76 }, { "epoch": 1.7374301675977653, "grad_norm": 0.19672603905200958, "learning_rate": 0.0, "loss": 0.0691, "step": 77 }, { "epoch": 1.7597765363128492, "grad_norm": 0.2172834575176239, "learning_rate": 0.0, "loss": 0.0777, "step": 78 }, { "epoch": 1.7821229050279328, "grad_norm": 548396.8125, "learning_rate": 0.0, "loss": 0.0554, "step": 79 }, { "epoch": 1.8044692737430168, "grad_norm": 0.2138211727142334, "learning_rate": 0.0, "loss": 0.0735, "step": 80 }, { "epoch": 1.8268156424581006, "grad_norm": 0.21841295063495636, "learning_rate": 0.0, "loss": 0.0787, "step": 81 }, { "epoch": 1.8491620111731844, "grad_norm": 0.20514176785945892, "learning_rate": 0.0, "loss": 0.0695, "step": 82 }, { "epoch": 1.8715083798882681, "grad_norm": 0.20208212733268738, "learning_rate": 0.0, "loss": 0.0715, "step": 83 }, { "epoch": 1.893854748603352, "grad_norm": 0.16259855031967163, "learning_rate": 0.0, "loss": 0.0549, "step": 84 }, { "epoch": 1.916201117318436, "grad_norm": 0.18260662257671356, "learning_rate": 0.0, "loss": 0.063, "step": 85 }, { "epoch": 1.9385474860335195, "grad_norm": 0.2165863811969757, "learning_rate": 0.0, "loss": 0.0729, "step": 86 }, { "epoch": 1.9608938547486034, "grad_norm": 0.22433914244174957, "learning_rate": 0.0, "loss": 0.0744, "step": 87 }, { "epoch": 1.983240223463687, "grad_norm": 0.20538468658924103, "learning_rate": 0.0, "loss": 0.0627, "step": 88 }, { "epoch": 2.022346368715084, "grad_norm": 0.5192253589630127, "learning_rate": 0.0, "loss": 0.156, "step": 89 }, { "epoch": 2.0446927374301676, "grad_norm": 0.1697319746017456, "learning_rate": 0.0, "loss": 0.0622, "step": 90 }, { "epoch": 2.0670391061452515, "grad_norm": 0.24133789539337158, "learning_rate": 0.0, "loss": 0.0754, "step": 91 }, { "epoch": 2.089385474860335, "grad_norm": 0.15512314438819885, "learning_rate": 0.0, "loss": 0.0488, "step": 92 }, { "epoch": 2.111731843575419, "grad_norm": 0.20826704800128937, "learning_rate": 0.0, "loss": 0.073, "step": 93 }, { "epoch": 2.1340782122905027, "grad_norm": 0.238108292222023, "learning_rate": 0.0, "loss": 0.0764, "step": 94 }, { "epoch": 2.1564245810055866, "grad_norm": 0.13203485310077667, "learning_rate": 0.0, "loss": 0.0464, "step": 95 }, { "epoch": 2.17877094972067, "grad_norm": 0.20562757551670074, "learning_rate": 0.0, "loss": 0.0694, "step": 96 }, { "epoch": 2.201117318435754, "grad_norm": 0.170828178524971, "learning_rate": 0.0, "loss": 0.0602, "step": 97 }, { "epoch": 2.223463687150838, "grad_norm": 0.28195905685424805, "learning_rate": 0.0, "loss": 0.0744, "step": 98 }, { "epoch": 2.2458100558659218, "grad_norm": 0.18713518977165222, "learning_rate": 0.0, "loss": 0.0654, "step": 99 }, { "epoch": 2.2681564245810057, "grad_norm": 0.1810666173696518, "learning_rate": 0.0, "loss": 0.0624, "step": 100 } ], "logging_steps": 1, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.441939466572827e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }