{
  "best_global_step": 1,
  "best_metric": 0.0719488188624382,
  "best_model_checkpoint": "outputs_3/checkpoint-1125",
  "epoch": 2.2681564245810057,
  "eval_steps": 20,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0032310177705977385,
      "grad_norm": 0.0250613521784544,
      "learning_rate": 0.0,
      "loss": 0.1103,
      "step": 1
    },
    {
      "epoch": 0.0446927374301676,
      "grad_norm": 0.1923094540834427,
      "learning_rate": 1.5828635851183766e-05,
      "loss": 0.0672,
      "step": 2
    },
    {
      "epoch": 0.0670391061452514,
      "grad_norm": 0.16962496936321259,
      "learning_rate": 0.0,
      "loss": 0.063,
      "step": 3
    },
    {
      "epoch": 0.0893854748603352,
      "grad_norm": 0.19785423576831818,
      "learning_rate": 0.0,
      "loss": 0.0712,
      "step": 4
    },
    {
      "epoch": 0.11173184357541899,
      "grad_norm": 0.21228645741939545,
      "learning_rate": 0.0,
      "loss": 0.0725,
      "step": 5
    },
    {
      "epoch": 0.1340782122905028,
      "grad_norm": 0.18974730372428894,
      "learning_rate": 0.0,
      "loss": 0.0627,
      "step": 6
    },
    {
      "epoch": 0.1564245810055866,
      "grad_norm": 0.19599883258342743,
      "learning_rate": 0.0,
      "loss": 0.0712,
      "step": 7
    },
    {
      "epoch": 0.1787709497206704,
      "grad_norm": 0.17850686609745026,
      "learning_rate": 0.0,
      "loss": 0.0579,
      "step": 8
    },
    {
      "epoch": 0.2011173184357542,
      "grad_norm": 0.24011921882629395,
      "learning_rate": 0.0,
      "loss": 0.0828,
      "step": 9
    },
    {
      "epoch": 0.22346368715083798,
      "grad_norm": 0.20377422869205475,
      "learning_rate": 0.0,
      "loss": 0.0627,
      "step": 10
    },
    {
      "epoch": 0.24581005586592178,
      "grad_norm": 0.21203763782978058,
      "learning_rate": 0.0,
      "loss": 0.0745,
      "step": 11
    },
    {
      "epoch": 0.2681564245810056,
      "grad_norm": 0.21539321541786194,
      "learning_rate": 0.0,
      "loss": 0.0727,
      "step": 12
    },
    {
      "epoch": 0.2905027932960894,
      "grad_norm": 0.16341185569763184,
      "learning_rate": 0.0,
      "loss": 0.0521,
      "step": 13
    },
    {
      "epoch": 0.3128491620111732,
      "grad_norm": 0.24054378271102905,
      "learning_rate": 0.0,
      "loss": 0.0816,
      "step": 14
    },
    {
      "epoch": 0.33519553072625696,
      "grad_norm": 0.25560539960861206,
      "learning_rate": 0.0,
      "loss": 0.0833,
      "step": 15
    },
    {
      "epoch": 0.3575418994413408,
      "grad_norm": 0.20083709061145782,
      "learning_rate": 0.0,
      "loss": 0.0655,
      "step": 16
    },
    {
      "epoch": 0.37988826815642457,
      "grad_norm": 0.14186497032642365,
      "learning_rate": 0.0,
      "loss": 0.0487,
      "step": 17
    },
    {
      "epoch": 0.4022346368715084,
      "grad_norm": 0.21731677651405334,
      "learning_rate": 0.0,
      "loss": 0.0723,
      "step": 18
    },
    {
      "epoch": 0.4245810055865922,
      "grad_norm": 0.1948932707309723,
      "learning_rate": 0.0,
      "loss": 0.0648,
      "step": 19
    },
    {
      "epoch": 0.44692737430167595,
      "grad_norm": 0.24341759085655212,
      "learning_rate": 0.0,
      "loss": 0.0748,
      "step": 20
    },
    {
      "epoch": 0.4692737430167598,
      "grad_norm": 0.24712610244750977,
      "learning_rate": 0.0,
      "loss": 0.0802,
      "step": 21
    },
    {
      "epoch": 0.49162011173184356,
      "grad_norm": 0.1936245709657669,
      "learning_rate": 0.0,
      "loss": 0.0657,
      "step": 22
    },
    {
      "epoch": 0.5139664804469274,
      "grad_norm": 0.2752823829650879,
      "learning_rate": 0.0,
      "loss": 0.0965,
      "step": 23
    },
    {
      "epoch": 0.5363128491620112,
      "grad_norm": 0.2546307444572449,
      "learning_rate": 0.0,
      "loss": 0.08,
      "step": 24
    },
    {
      "epoch": 0.5586592178770949,
      "grad_norm": 0.17411033809185028,
      "learning_rate": 0.0,
      "loss": 0.0625,
      "step": 25
    },
    {
      "epoch": 0.5810055865921788,
      "grad_norm": 0.22320827841758728,
      "learning_rate": 0.0,
      "loss": 0.075,
      "step": 26
    },
    {
      "epoch": 0.6033519553072626,
      "grad_norm": 0.19832342863082886,
      "learning_rate": 0.0,
      "loss": 0.0672,
      "step": 27
    },
    {
      "epoch": 0.6256983240223464,
      "grad_norm": 0.2498759627342224,
      "learning_rate": 0.0,
      "loss": 0.0825,
      "step": 28
    },
    {
      "epoch": 0.6480446927374302,
      "grad_norm": 0.20985010266304016,
      "learning_rate": 0.0,
      "loss": 0.072,
      "step": 29
    },
    {
      "epoch": 0.6703910614525139,
      "grad_norm": 0.1565495878458023,
      "learning_rate": 0.0,
      "loss": 0.0579,
      "step": 30
    },
    {
      "epoch": 0.6927374301675978,
      "grad_norm": 0.27120932936668396,
      "learning_rate": 0.0,
      "loss": 0.1052,
      "step": 31
    },
    {
      "epoch": 0.7150837988826816,
      "grad_norm": 0.2077290564775467,
      "learning_rate": 0.0,
      "loss": 0.071,
      "step": 32
    },
    {
      "epoch": 0.7374301675977654,
      "grad_norm": 0.2480878382921219,
      "learning_rate": 0.0,
      "loss": 0.0856,
      "step": 33
    },
    {
      "epoch": 0.7597765363128491,
      "grad_norm": 0.17309145629405975,
      "learning_rate": 0.0,
      "loss": 0.0508,
      "step": 34
    },
    {
      "epoch": 0.7821229050279329,
      "grad_norm": 0.26946860551834106,
      "learning_rate": 0.0,
      "loss": 0.0797,
      "step": 35
    },
    {
      "epoch": 0.8044692737430168,
      "grad_norm": 0.24403966963291168,
      "learning_rate": 0.0,
      "loss": 0.0824,
      "step": 36
    },
    {
      "epoch": 0.8268156424581006,
      "grad_norm": 0.2867369055747986,
      "learning_rate": 0.0,
      "loss": 0.0982,
      "step": 37
    },
    {
      "epoch": 0.8491620111731844,
      "grad_norm": 0.19101133942604065,
      "learning_rate": 0.0,
      "loss": 0.0633,
      "step": 38
    },
    {
      "epoch": 0.8715083798882681,
      "grad_norm": 0.15404891967773438,
      "learning_rate": 0.0,
      "loss": 0.0558,
      "step": 39
    },
    {
      "epoch": 0.8938547486033519,
      "grad_norm": 0.174382746219635,
      "learning_rate": 0.0,
      "loss": 0.0553,
      "step": 40
    },
    {
      "epoch": 0.9162011173184358,
      "grad_norm": 0.18793272972106934,
      "learning_rate": 0.0,
      "loss": 0.0655,
      "step": 41
    },
    {
      "epoch": 0.9385474860335196,
      "grad_norm": 0.1648206263780594,
      "learning_rate": 0.0,
      "loss": 0.053,
      "step": 42
    },
    {
      "epoch": 0.9608938547486033,
      "grad_norm": 0.21228265762329102,
      "learning_rate": 0.0,
      "loss": 0.0656,
      "step": 43
    },
    {
      "epoch": 0.9832402234636871,
      "grad_norm": 0.26687151193618774,
      "learning_rate": 0.0,
      "loss": 0.0834,
      "step": 44
    },
    {
      "epoch": 1.0223463687150838,
      "grad_norm": 0.3744015395641327,
      "learning_rate": 0.0,
      "loss": 0.1444,
      "step": 45
    },
    {
      "epoch": 1.0446927374301676,
      "grad_norm": 0.23247657716274261,
      "learning_rate": 0.0,
      "loss": 0.0745,
      "step": 46
    },
    {
      "epoch": 1.0670391061452513,
      "grad_norm": 0.1918107271194458,
      "learning_rate": 0.0,
      "loss": 0.0609,
      "step": 47
    },
    {
      "epoch": 1.089385474860335,
      "grad_norm": 0.18841643631458282,
      "learning_rate": 0.0,
      "loss": 0.0557,
      "step": 48
    },
    {
      "epoch": 1.111731843575419,
      "grad_norm": 0.23093074560165405,
      "learning_rate": 0.0,
      "loss": 0.0779,
      "step": 49
    },
    {
      "epoch": 1.1340782122905029,
      "grad_norm": 0.1891765594482422,
      "learning_rate": 0.0,
      "loss": 0.0539,
      "step": 50
    },
    {
      "epoch": 1.1564245810055866,
      "grad_norm": 0.17240329086780548,
      "learning_rate": 0.0,
      "loss": 0.0626,
      "step": 51
    },
    {
      "epoch": 1.1787709497206704,
      "grad_norm": 0.1980515420436859,
      "learning_rate": 0.0,
      "loss": 0.0635,
      "step": 52
    },
    {
      "epoch": 1.2011173184357542,
      "grad_norm": 0.14951692521572113,
      "learning_rate": 0.0,
      "loss": 0.0528,
      "step": 53
    },
    {
      "epoch": 1.223463687150838,
      "grad_norm": 0.21029718220233917,
      "learning_rate": 0.0,
      "loss": 0.0685,
      "step": 54
    },
    {
      "epoch": 1.2458100558659218,
      "grad_norm": 0.19979310035705566,
      "learning_rate": 0.0,
      "loss": 0.0688,
      "step": 55
    },
    {
      "epoch": 1.2681564245810055,
      "grad_norm": 0.20193366706371307,
      "learning_rate": 0.0,
      "loss": 0.0558,
      "step": 56
    },
    {
      "epoch": 1.2905027932960893,
      "grad_norm": 0.1936744749546051,
      "learning_rate": 0.0,
      "loss": 0.0708,
      "step": 57
    },
    {
      "epoch": 1.3128491620111733,
      "grad_norm": 0.2125776708126068,
      "learning_rate": 0.0,
      "loss": 0.0755,
      "step": 58
    },
    {
      "epoch": 1.3351955307262569,
      "grad_norm": 0.23341749608516693,
      "learning_rate": 0.0,
      "loss": 0.0739,
      "step": 59
    },
    {
      "epoch": 1.3575418994413408,
      "grad_norm": 0.18041910231113434,
      "learning_rate": 0.0,
      "loss": 0.0599,
      "step": 60
    },
    {
      "epoch": 1.3798882681564246,
      "grad_norm": 0.1614307016134262,
      "learning_rate": 0.0,
      "loss": 0.0515,
      "step": 61
    },
    {
      "epoch": 1.4022346368715084,
      "grad_norm": 0.20286191999912262,
      "learning_rate": 0.0,
      "loss": 0.071,
      "step": 62
    },
    {
      "epoch": 1.4245810055865922,
      "grad_norm": 0.22490696609020233,
      "learning_rate": 0.0,
      "loss": 0.0753,
      "step": 63
    },
    {
      "epoch": 1.446927374301676,
      "grad_norm": 0.19001837074756622,
      "learning_rate": 0.0,
      "loss": 0.0621,
      "step": 64
    },
    {
      "epoch": 1.4692737430167597,
      "grad_norm": 0.23111850023269653,
      "learning_rate": 0.0,
      "loss": 0.0719,
      "step": 65
    },
    {
      "epoch": 1.4916201117318435,
      "grad_norm": 0.1808815598487854,
      "learning_rate": 0.0,
      "loss": 0.0605,
      "step": 66
    },
    {
      "epoch": 1.5139664804469275,
      "grad_norm": 36.47340393066406,
      "learning_rate": 0.0,
      "loss": 0.0895,
      "step": 67
    },
    {
      "epoch": 1.536312849162011,
      "grad_norm": 0.24750609695911407,
      "learning_rate": 0.0,
      "loss": 0.0873,
      "step": 68
    },
    {
      "epoch": 1.558659217877095,
      "grad_norm": 0.26650676131248474,
      "learning_rate": 0.0,
      "loss": 0.0875,
      "step": 69
    },
    {
      "epoch": 1.5810055865921788,
      "grad_norm": 0.24256528913974762,
      "learning_rate": 0.0,
      "loss": 0.0858,
      "step": 70
    },
    {
      "epoch": 1.6033519553072626,
      "grad_norm": 0.1995740830898285,
      "learning_rate": 0.0,
      "loss": 0.0699,
      "step": 71
    },
    {
      "epoch": 1.6256983240223464,
      "grad_norm": 0.22049686312675476,
      "learning_rate": 0.0,
      "loss": 0.0783,
      "step": 72
    },
    {
      "epoch": 1.6480446927374302,
      "grad_norm": 0.3053763508796692,
      "learning_rate": 0.0,
      "loss": 0.0975,
      "step": 73
    },
    {
      "epoch": 1.670391061452514,
      "grad_norm": 0.2576776444911957,
      "learning_rate": 0.0,
      "loss": 0.0829,
      "step": 74
    },
    {
      "epoch": 1.6927374301675977,
      "grad_norm": 0.2177535593509674,
      "learning_rate": 0.0,
      "loss": 0.0821,
      "step": 75
    },
    {
      "epoch": 1.7150837988826817,
      "grad_norm": 0.1927560567855835,
      "learning_rate": 0.0,
      "loss": 0.0748,
      "step": 76
    },
    {
      "epoch": 1.7374301675977653,
      "grad_norm": 0.19672603905200958,
      "learning_rate": 0.0,
      "loss": 0.0691,
      "step": 77
    },
    {
      "epoch": 1.7597765363128492,
      "grad_norm": 0.2172834575176239,
      "learning_rate": 0.0,
      "loss": 0.0777,
      "step": 78
    },
    {
      "epoch": 1.7821229050279328,
      "grad_norm": 548396.8125,
      "learning_rate": 0.0,
      "loss": 0.0554,
      "step": 79
    },
    {
      "epoch": 1.8044692737430168,
      "grad_norm": 0.2138211727142334,
      "learning_rate": 0.0,
      "loss": 0.0735,
      "step": 80
    },
    {
      "epoch": 1.8268156424581006,
      "grad_norm": 0.21841295063495636,
      "learning_rate": 0.0,
      "loss": 0.0787,
      "step": 81
    },
    {
      "epoch": 1.8491620111731844,
      "grad_norm": 0.20514176785945892,
      "learning_rate": 0.0,
      "loss": 0.0695,
      "step": 82
    },
    {
      "epoch": 1.8715083798882681,
      "grad_norm": 0.20208212733268738,
      "learning_rate": 0.0,
      "loss": 0.0715,
      "step": 83
    },
    {
      "epoch": 1.893854748603352,
      "grad_norm": 0.16259855031967163,
      "learning_rate": 0.0,
      "loss": 0.0549,
      "step": 84
    },
    {
      "epoch": 1.916201117318436,
      "grad_norm": 0.18260662257671356,
      "learning_rate": 0.0,
      "loss": 0.063,
      "step": 85
    },
    {
      "epoch": 1.9385474860335195,
      "grad_norm": 0.2165863811969757,
      "learning_rate": 0.0,
      "loss": 0.0729,
      "step": 86
    },
    {
      "epoch": 1.9608938547486034,
      "grad_norm": 0.22433914244174957,
      "learning_rate": 0.0,
      "loss": 0.0744,
      "step": 87
    },
    {
      "epoch": 1.983240223463687,
      "grad_norm": 0.20538468658924103,
      "learning_rate": 0.0,
      "loss": 0.0627,
      "step": 88
    },
    {
      "epoch": 2.022346368715084,
      "grad_norm": 0.5192253589630127,
      "learning_rate": 0.0,
      "loss": 0.156,
      "step": 89
    },
    {
      "epoch": 2.0446927374301676,
      "grad_norm": 0.1697319746017456,
      "learning_rate": 0.0,
      "loss": 0.0622,
      "step": 90
    },
    {
      "epoch": 2.0670391061452515,
      "grad_norm": 0.24133789539337158,
      "learning_rate": 0.0,
      "loss": 0.0754,
      "step": 91
    },
    {
      "epoch": 2.089385474860335,
      "grad_norm": 0.15512314438819885,
      "learning_rate": 0.0,
      "loss": 0.0488,
      "step": 92
    },
    {
      "epoch": 2.111731843575419,
      "grad_norm": 0.20826704800128937,
      "learning_rate": 0.0,
      "loss": 0.073,
      "step": 93
    },
    {
      "epoch": 2.1340782122905027,
      "grad_norm": 0.238108292222023,
      "learning_rate": 0.0,
      "loss": 0.0764,
      "step": 94
    },
    {
      "epoch": 2.1564245810055866,
      "grad_norm": 0.13203485310077667,
      "learning_rate": 0.0,
      "loss": 0.0464,
      "step": 95
    },
    {
      "epoch": 2.17877094972067,
      "grad_norm": 0.20562757551670074,
      "learning_rate": 0.0,
      "loss": 0.0694,
      "step": 96
    },
    {
      "epoch": 2.201117318435754,
      "grad_norm": 0.170828178524971,
      "learning_rate": 0.0,
      "loss": 0.0602,
      "step": 97
    },
    {
      "epoch": 2.223463687150838,
      "grad_norm": 0.28195905685424805,
      "learning_rate": 0.0,
      "loss": 0.0744,
      "step": 98
    },
    {
      "epoch": 2.2458100558659218,
      "grad_norm": 0.18713518977165222,
      "learning_rate": 0.0,
      "loss": 0.0654,
      "step": 99
    },
    {
      "epoch": 2.2681564245810057,
      "grad_norm": 0.1810666173696518,
      "learning_rate": 0.0,
      "loss": 0.0624,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 132,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 20,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.441939466572827e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}