{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 905,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0055248618784530384,
      "grad_norm": 2.2495654788733823,
      "learning_rate": 5.4347826086956525e-06,
      "loss": 0.9695,
      "step": 5
    },
    {
      "epoch": 0.011049723756906077,
      "grad_norm": 1.9461642490164472,
      "learning_rate": 1.0869565217391305e-05,
      "loss": 0.9861,
      "step": 10
    },
    {
      "epoch": 0.016574585635359115,
      "grad_norm": 1.417204705816258,
      "learning_rate": 1.630434782608696e-05,
      "loss": 0.9362,
      "step": 15
    },
    {
      "epoch": 0.022099447513812154,
      "grad_norm": 1.1341587011781606,
      "learning_rate": 2.173913043478261e-05,
      "loss": 0.8601,
      "step": 20
    },
    {
      "epoch": 0.027624309392265192,
      "grad_norm": 1.0602684449366846,
      "learning_rate": 2.7173913043478262e-05,
      "loss": 0.7882,
      "step": 25
    },
    {
      "epoch": 0.03314917127071823,
      "grad_norm": 0.9548854170467792,
      "learning_rate": 3.260869565217392e-05,
      "loss": 0.7866,
      "step": 30
    },
    {
      "epoch": 0.03867403314917127,
      "grad_norm": 0.8472222008228629,
      "learning_rate": 3.804347826086957e-05,
      "loss": 0.7845,
      "step": 35
    },
    {
      "epoch": 0.04419889502762431,
      "grad_norm": 0.9685960177896414,
      "learning_rate": 4.347826086956522e-05,
      "loss": 0.8259,
      "step": 40
    },
    {
      "epoch": 0.049723756906077346,
      "grad_norm": 1.0119153562267356,
      "learning_rate": 4.891304347826087e-05,
      "loss": 0.783,
      "step": 45
    },
    {
      "epoch": 0.055248618784530384,
      "grad_norm": 1.0082850863037842,
      "learning_rate": 4.9997592434132936e-05,
      "loss": 0.8047,
      "step": 50
    },
    {
      "epoch": 0.06077348066298342,
      "grad_norm": 1.078699321374065,
      "learning_rate": 4.9987812580832524e-05,
      "loss": 0.7593,
      "step": 55
    },
    {
      "epoch": 0.06629834254143646,
      "grad_norm": 0.8486834251840161,
      "learning_rate": 4.9970513234911335e-05,
      "loss": 0.7781,
      "step": 60
    },
    {
      "epoch": 0.0718232044198895,
      "grad_norm": 0.8151943193010388,
      "learning_rate": 4.99457001809384e-05,
      "loss": 0.7678,
      "step": 65
    },
    {
      "epoch": 0.07734806629834254,
      "grad_norm": 0.999095249982963,
      "learning_rate": 4.991338171592238e-05,
      "loss": 0.7203,
      "step": 70
    },
    {
      "epoch": 0.08287292817679558,
      "grad_norm": 0.9570443791252086,
      "learning_rate": 4.9873568646537276e-05,
      "loss": 0.8225,
      "step": 75
    },
    {
      "epoch": 0.08839779005524862,
      "grad_norm": 1.0377587505108075,
      "learning_rate": 4.9826274285508836e-05,
      "loss": 0.7606,
      "step": 80
    },
    {
      "epoch": 0.09392265193370165,
      "grad_norm": 0.8916728595351986,
      "learning_rate": 4.977151444716302e-05,
      "loss": 0.7752,
      "step": 85
    },
    {
      "epoch": 0.09944751381215469,
      "grad_norm": 0.7828895660955932,
      "learning_rate": 4.970930744213807e-05,
      "loss": 0.7716,
      "step": 90
    },
    {
      "epoch": 0.10497237569060773,
      "grad_norm": 0.9255391724115476,
      "learning_rate": 4.963967407126167e-05,
      "loss": 0.7904,
      "step": 95
    },
    {
      "epoch": 0.11049723756906077,
      "grad_norm": 0.8282644001387807,
      "learning_rate": 4.956263761859566e-05,
      "loss": 0.7607,
      "step": 100
    },
    {
      "epoch": 0.11602209944751381,
      "grad_norm": 0.8855019219145566,
      "learning_rate": 4.947822384365024e-05,
      "loss": 0.7336,
      "step": 105
    },
    {
      "epoch": 0.12154696132596685,
      "grad_norm": 1.3326630713526908,
      "learning_rate": 4.9386460972770485e-05,
      "loss": 0.756,
      "step": 110
    },
    {
      "epoch": 0.1270718232044199,
      "grad_norm": 0.7623686644469245,
      "learning_rate": 4.9287379689697974e-05,
      "loss": 0.7787,
      "step": 115
    },
    {
      "epoch": 0.13259668508287292,
      "grad_norm": 0.6744035121841259,
      "learning_rate": 4.9181013125310804e-05,
      "loss": 0.7607,
      "step": 120
    },
    {
      "epoch": 0.13812154696132597,
      "grad_norm": 0.8801426064681885,
      "learning_rate": 4.9067396846545205e-05,
      "loss": 0.7879,
      "step": 125
    },
    {
      "epoch": 0.143646408839779,
      "grad_norm": 0.8104653989188151,
      "learning_rate": 4.894656884450265e-05,
      "loss": 0.7172,
      "step": 130
    },
    {
      "epoch": 0.14917127071823205,
      "grad_norm": 0.8442454603941927,
      "learning_rate": 4.881856952174638e-05,
      "loss": 0.8042,
      "step": 135
    },
    {
      "epoch": 0.15469613259668508,
      "grad_norm": 0.8046048783159924,
      "learning_rate": 4.868344167879152e-05,
      "loss": 0.7545,
      "step": 140
    },
    {
      "epoch": 0.16022099447513813,
      "grad_norm": 0.7342440517610522,
      "learning_rate": 4.8541230499793475e-05,
      "loss": 0.7239,
      "step": 145
    },
    {
      "epoch": 0.16574585635359115,
      "grad_norm": 0.8351674082098574,
      "learning_rate": 4.839198353743915e-05,
      "loss": 0.7782,
      "step": 150
    },
    {
      "epoch": 0.1712707182320442,
      "grad_norm": 0.8568746827943232,
      "learning_rate": 4.8235750697046296e-05,
      "loss": 0.7305,
      "step": 155
    },
    {
      "epoch": 0.17679558011049723,
      "grad_norm": 0.799075952862911,
      "learning_rate": 4.8072584219876086e-05,
      "loss": 0.736,
      "step": 160
    },
    {
      "epoch": 0.18232044198895028,
      "grad_norm": 0.7632219335279382,
      "learning_rate": 4.790253866566467e-05,
      "loss": 0.7413,
      "step": 165
    },
    {
      "epoch": 0.1878453038674033,
      "grad_norm": 0.7143875780259061,
      "learning_rate": 4.772567089437945e-05,
      "loss": 0.7112,
      "step": 170
    },
    {
      "epoch": 0.19337016574585636,
      "grad_norm": 0.757498467869332,
      "learning_rate": 4.7542040047206165e-05,
      "loss": 0.79,
      "step": 175
    },
    {
      "epoch": 0.19889502762430938,
      "grad_norm": 0.7359012444695264,
      "learning_rate": 4.735170752677327e-05,
      "loss": 0.7153,
      "step": 180
    },
    {
      "epoch": 0.20441988950276244,
      "grad_norm": 0.7088663879251825,
      "learning_rate": 4.715473697662001e-05,
      "loss": 0.7293,
      "step": 185
    },
    {
      "epoch": 0.20994475138121546,
      "grad_norm": 0.695608085898617,
      "learning_rate": 4.69511942599153e-05,
      "loss": 0.7251,
      "step": 190
    },
    {
      "epoch": 0.2154696132596685,
      "grad_norm": 0.632627534005913,
      "learning_rate": 4.6741147437434323e-05,
      "loss": 0.6947,
      "step": 195
    },
    {
      "epoch": 0.22099447513812154,
      "grad_norm": 0.704743849198389,
      "learning_rate": 4.6524666744800305e-05,
      "loss": 0.7147,
      "step": 200
    },
    {
      "epoch": 0.2265193370165746,
      "grad_norm": 0.7017435419792435,
      "learning_rate": 4.630182456899907e-05,
      "loss": 0.7302,
      "step": 205
    },
    {
      "epoch": 0.23204419889502761,
      "grad_norm": 0.7021833045223487,
      "learning_rate": 4.607269542417427e-05,
      "loss": 0.7304,
      "step": 210
    },
    {
      "epoch": 0.23756906077348067,
      "grad_norm": 0.7070770033781527,
      "learning_rate": 4.58373559267112e-05,
      "loss": 0.7373,
      "step": 215
    },
    {
      "epoch": 0.2430939226519337,
      "grad_norm": 0.6730958248948499,
      "learning_rate": 4.559588476961784e-05,
      "loss": 0.7126,
      "step": 220
    },
    {
      "epoch": 0.24861878453038674,
      "grad_norm": 0.6997491859738907,
      "learning_rate": 4.534836269621137e-05,
      "loss": 0.7402,
      "step": 225
    },
    {
      "epoch": 0.2541436464088398,
      "grad_norm": 0.5977862788008176,
      "learning_rate": 4.509487247311917e-05,
      "loss": 0.7245,
      "step": 230
    },
    {
      "epoch": 0.2596685082872928,
      "grad_norm": 0.6993869464122184,
      "learning_rate": 4.483549886260324e-05,
      "loss": 0.7241,
      "step": 235
    },
    {
      "epoch": 0.26519337016574585,
      "grad_norm": 0.7030894956399786,
      "learning_rate": 4.4570328594217356e-05,
      "loss": 0.7005,
      "step": 240
    },
    {
      "epoch": 0.27071823204419887,
      "grad_norm": 0.7057927516603153,
      "learning_rate": 4.429945033580633e-05,
      "loss": 0.7097,
      "step": 245
    },
    {
      "epoch": 0.27624309392265195,
      "grad_norm": 0.6391012673945178,
      "learning_rate": 4.4022954663857244e-05,
      "loss": 0.7134,
      "step": 250
    },
    {
      "epoch": 0.281767955801105,
      "grad_norm": 0.6744043202485268,
      "learning_rate": 4.374093403321233e-05,
      "loss": 0.7275,
      "step": 255
    },
    {
      "epoch": 0.287292817679558,
      "grad_norm": 0.7400160690377109,
      "learning_rate": 4.345348274615395e-05,
      "loss": 0.7233,
      "step": 260
    },
    {
      "epoch": 0.292817679558011,
      "grad_norm": 0.739209940661344,
      "learning_rate": 4.3160696920871605e-05,
      "loss": 0.7444,
      "step": 265
    },
    {
      "epoch": 0.2983425414364641,
      "grad_norm": 0.6742845794877194,
      "learning_rate": 4.2862674459321994e-05,
      "loss": 0.7316,
      "step": 270
    },
    {
      "epoch": 0.30386740331491713,
      "grad_norm": 0.6651549735204162,
      "learning_rate": 4.255951501449234e-05,
      "loss": 0.7301,
      "step": 275
    },
    {
      "epoch": 0.30939226519337015,
      "grad_norm": 0.6819675840699815,
      "learning_rate": 4.225131995707845e-05,
      "loss": 0.7179,
      "step": 280
    },
    {
      "epoch": 0.3149171270718232,
      "grad_norm": 0.6416784248554884,
      "learning_rate": 4.1938192341588265e-05,
      "loss": 0.729,
      "step": 285
    },
    {
      "epoch": 0.32044198895027626,
      "grad_norm": 0.727007416919034,
      "learning_rate": 4.162023687188238e-05,
      "loss": 0.7368,
      "step": 290
    },
    {
      "epoch": 0.3259668508287293,
      "grad_norm": 0.6219219552088717,
      "learning_rate": 4.129755986616315e-05,
      "loss": 0.7058,
      "step": 295
    },
    {
      "epoch": 0.3314917127071823,
      "grad_norm": 0.6741150513014229,
      "learning_rate": 4.097026922142389e-05,
      "loss": 0.7406,
      "step": 300
    },
    {
      "epoch": 0.3370165745856354,
      "grad_norm": 0.7472100039399484,
      "learning_rate": 4.0638474377370155e-05,
      "loss": 0.7021,
      "step": 305
    },
    {
      "epoch": 0.3425414364640884,
      "grad_norm": 0.627954564572662,
      "learning_rate": 4.0302286279825286e-05,
      "loss": 0.7014,
      "step": 310
    },
    {
      "epoch": 0.34806629834254144,
      "grad_norm": 0.734789301440309,
      "learning_rate": 3.996181734363218e-05,
      "loss": 0.7012,
      "step": 315
    },
    {
      "epoch": 0.35359116022099446,
      "grad_norm": 0.7242805946396662,
      "learning_rate": 3.961718141506398e-05,
      "loss": 0.7191,
      "step": 320
    },
    {
      "epoch": 0.35911602209944754,
      "grad_norm": 0.6457534652885096,
      "learning_rate": 3.926849373375604e-05,
      "loss": 0.7239,
      "step": 325
    },
    {
      "epoch": 0.36464088397790057,
      "grad_norm": 0.6599551130534848,
      "learning_rate": 3.891587089417201e-05,
      "loss": 0.7311,
      "step": 330
    },
    {
      "epoch": 0.3701657458563536,
      "grad_norm": 0.6375311137388923,
      "learning_rate": 3.855943080661688e-05,
      "loss": 0.6725,
      "step": 335
    },
    {
      "epoch": 0.3756906077348066,
      "grad_norm": 0.694826014055163,
      "learning_rate": 3.819929265781007e-05,
      "loss": 0.7053,
      "step": 340
    },
    {
      "epoch": 0.3812154696132597,
      "grad_norm": 0.7063030149524828,
      "learning_rate": 3.783557687103164e-05,
      "loss": 0.688,
      "step": 345
    },
    {
      "epoch": 0.3867403314917127,
      "grad_norm": 0.7229962452752301,
      "learning_rate": 3.7468405065855066e-05,
      "loss": 0.7466,
      "step": 350
    },
    {
      "epoch": 0.39226519337016574,
      "grad_norm": 0.6282377813832024,
      "learning_rate": 3.7097900017480025e-05,
      "loss": 0.6712,
      "step": 355
    },
    {
      "epoch": 0.39779005524861877,
      "grad_norm": 0.6936933940060245,
      "learning_rate": 3.672418561567867e-05,
      "loss": 0.705,
      "step": 360
    },
    {
      "epoch": 0.40331491712707185,
      "grad_norm": 0.6543353136837871,
      "learning_rate": 3.634738682336934e-05,
      "loss": 0.7115,
      "step": 365
    },
    {
      "epoch": 0.4088397790055249,
      "grad_norm": 0.5872861304195733,
      "learning_rate": 3.596762963483127e-05,
      "loss": 0.7152,
      "step": 370
    },
    {
      "epoch": 0.4143646408839779,
      "grad_norm": 0.6287395366045305,
      "learning_rate": 3.5585041033574615e-05,
      "loss": 0.7396,
      "step": 375
    },
    {
      "epoch": 0.4198895027624309,
      "grad_norm": 0.6173655969252787,
      "learning_rate": 3.5199748949879544e-05,
      "loss": 0.6718,
      "step": 380
    },
    {
      "epoch": 0.425414364640884,
      "grad_norm": 0.6115484539768465,
      "learning_rate": 3.4811882218018836e-05,
      "loss": 0.7041,
      "step": 385
    },
    {
      "epoch": 0.430939226519337,
      "grad_norm": 0.6592054769519977,
      "learning_rate": 3.442157053317817e-05,
      "loss": 0.6905,
      "step": 390
    },
    {
      "epoch": 0.43646408839779005,
      "grad_norm": 0.64695169140918,
      "learning_rate": 3.402894440808852e-05,
      "loss": 0.7249,
      "step": 395
    },
    {
      "epoch": 0.4419889502762431,
      "grad_norm": 0.6660351092999496,
      "learning_rate": 3.363413512938527e-05,
      "loss": 0.692,
      "step": 400
    },
    {
      "epoch": 0.44751381215469616,
      "grad_norm": 0.6625512350903074,
      "learning_rate": 3.32372747137084e-05,
      "loss": 0.7009,
      "step": 405
    },
    {
      "epoch": 0.4530386740331492,
      "grad_norm": 0.7130204785542628,
      "learning_rate": 3.2838495863558716e-05,
      "loss": 0.7196,
      "step": 410
    },
    {
      "epoch": 0.4585635359116022,
      "grad_norm": 0.7209339805921852,
      "learning_rate": 3.243793192292468e-05,
      "loss": 0.6964,
      "step": 415
    },
    {
      "epoch": 0.46408839779005523,
      "grad_norm": 0.6556622599922761,
      "learning_rate": 3.2035716832694705e-05,
      "loss": 0.6912,
      "step": 420
    },
    {
      "epoch": 0.4696132596685083,
      "grad_norm": 0.6372838951795533,
      "learning_rate": 3.163198508586993e-05,
      "loss": 0.7146,
      "step": 425
    },
    {
      "epoch": 0.47513812154696133,
      "grad_norm": 0.6188245445254615,
      "learning_rate": 3.122687168259233e-05,
      "loss": 0.6981,
      "step": 430
    },
    {
      "epoch": 0.48066298342541436,
      "grad_norm": 0.6253825885926813,
      "learning_rate": 3.082051208500324e-05,
      "loss": 0.7083,
      "step": 435
    },
    {
      "epoch": 0.4861878453038674,
      "grad_norm": 0.6385317812674182,
      "learning_rate": 3.0413042171947475e-05,
      "loss": 0.7183,
      "step": 440
    },
    {
      "epoch": 0.49171270718232046,
      "grad_norm": 0.5545979834442434,
      "learning_rate": 3.000459819353798e-05,
      "loss": 0.6757,
      "step": 445
    },
    {
      "epoch": 0.4972375690607735,
      "grad_norm": 0.6457896775039127,
      "learning_rate": 2.9595316725596485e-05,
      "loss": 0.7126,
      "step": 450
    },
    {
      "epoch": 0.5027624309392266,
      "grad_norm": 0.6026206696104952,
      "learning_rate": 2.918533462398509e-05,
      "loss": 0.6932,
      "step": 455
    },
    {
      "epoch": 0.5082872928176796,
      "grad_norm": 0.705985656607051,
      "learning_rate": 2.8774788978844374e-05,
      "loss": 0.7165,
      "step": 460
    },
    {
      "epoch": 0.5138121546961326,
      "grad_norm": 0.6822033180016115,
      "learning_rate": 2.8363817068753025e-05,
      "loss": 0.7232,
      "step": 465
    },
    {
      "epoch": 0.5193370165745856,
      "grad_norm": 0.6026435106196351,
      "learning_rate": 2.795255631482457e-05,
      "loss": 0.6923,
      "step": 470
    },
    {
      "epoch": 0.5248618784530387,
      "grad_norm": 0.6136085628375205,
      "learning_rate": 2.7541144234756327e-05,
      "loss": 0.6988,
      "step": 475
    },
    {
      "epoch": 0.5303867403314917,
      "grad_norm": 0.595949445957797,
      "learning_rate": 2.7129718396846216e-05,
      "loss": 0.6793,
      "step": 480
    },
    {
      "epoch": 0.5359116022099447,
      "grad_norm": 0.6198967125875174,
      "learning_rate": 2.671841637399249e-05,
      "loss": 0.6711,
      "step": 485
    },
    {
      "epoch": 0.5414364640883977,
      "grad_norm": 0.6448908365718997,
      "learning_rate": 2.6307375697692016e-05,
      "loss": 0.6963,
      "step": 490
    },
    {
      "epoch": 0.5469613259668509,
      "grad_norm": 0.5952230142125985,
      "learning_rate": 2.5896733812052403e-05,
      "loss": 0.6595,
      "step": 495
    },
    {
      "epoch": 0.5524861878453039,
      "grad_norm": 0.5692444802287681,
      "learning_rate": 2.5486628027833337e-05,
      "loss": 0.705,
      "step": 500
    },
    {
      "epoch": 0.5580110497237569,
      "grad_norm": 0.6583799728791444,
      "learning_rate": 2.5077195476532456e-05,
      "loss": 0.7108,
      "step": 505
    },
    {
      "epoch": 0.56353591160221,
      "grad_norm": 0.6118674538428357,
      "learning_rate": 2.4668573064531275e-05,
      "loss": 0.6779,
      "step": 510
    },
    {
      "epoch": 0.569060773480663,
      "grad_norm": 0.6865305146078687,
      "learning_rate": 2.4260897427316255e-05,
      "loss": 0.7115,
      "step": 515
    },
    {
      "epoch": 0.574585635359116,
      "grad_norm": 0.5634331597251886,
      "learning_rate": 2.3854304883790573e-05,
      "loss": 0.6763,
      "step": 520
    },
    {
      "epoch": 0.580110497237569,
      "grad_norm": 0.603347124007975,
      "learning_rate": 2.344893139069166e-05,
      "loss": 0.6728,
      "step": 525
    },
    {
      "epoch": 0.585635359116022,
      "grad_norm": 0.5780548338990102,
      "learning_rate": 2.304491249712992e-05,
      "loss": 0.6857,
      "step": 530
    },
    {
      "epoch": 0.5911602209944752,
      "grad_norm": 0.5838799744020187,
      "learning_rate": 2.2642383299263674e-05,
      "loss": 0.6961,
      "step": 535
    },
    {
      "epoch": 0.5966850828729282,
      "grad_norm": 0.610184172054768,
      "learning_rate": 2.224147839512562e-05,
      "loss": 0.7036,
      "step": 540
    },
    {
      "epoch": 0.6022099447513812,
      "grad_norm": 0.6249192777191065,
      "learning_rate": 2.184233183961582e-05,
      "loss": 0.6486,
      "step": 545
    },
    {
      "epoch": 0.6077348066298343,
      "grad_norm": 0.637226405901165,
      "learning_rate": 2.1445077099676346e-05,
      "loss": 0.6819,
      "step": 550
    },
    {
      "epoch": 0.6132596685082873,
      "grad_norm": 0.6326954384056243,
      "learning_rate": 2.1049847009662455e-05,
      "loss": 0.6957,
      "step": 555
    },
    {
      "epoch": 0.6187845303867403,
      "grad_norm": 0.621926214738313,
      "learning_rate": 2.065677372692536e-05,
      "loss": 0.6761,
      "step": 560
    },
    {
      "epoch": 0.6243093922651933,
      "grad_norm": 0.5897101308180229,
      "learning_rate": 2.0265988687621363e-05,
      "loss": 0.6872,
      "step": 565
    },
    {
      "epoch": 0.6298342541436464,
      "grad_norm": 0.5718564263286896,
      "learning_rate": 1.9877622562762088e-05,
      "loss": 0.6663,
      "step": 570
    },
    {
      "epoch": 0.6353591160220995,
      "grad_norm": 0.5822438329318962,
      "learning_rate": 1.949180521452064e-05,
      "loss": 0.6952,
      "step": 575
    },
    {
      "epoch": 0.6408839779005525,
      "grad_norm": 0.55538735160568,
      "learning_rate": 1.9108665652808177e-05,
      "loss": 0.6389,
      "step": 580
    },
    {
      "epoch": 0.6464088397790055,
      "grad_norm": 0.624144335344885,
      "learning_rate": 1.8728331992135457e-05,
      "loss": 0.6916,
      "step": 585
    },
    {
      "epoch": 0.6519337016574586,
      "grad_norm": 0.5931319057188856,
      "learning_rate": 1.835093140877383e-05,
      "loss": 0.6737,
      "step": 590
    },
    {
      "epoch": 0.6574585635359116,
      "grad_norm": 0.6049587715850621,
      "learning_rate": 1.7976590098229932e-05,
      "loss": 0.6985,
      "step": 595
    },
    {
      "epoch": 0.6629834254143646,
      "grad_norm": 0.5880589654245556,
      "learning_rate": 1.7605433233048325e-05,
      "loss": 0.6615,
      "step": 600
    },
    {
      "epoch": 0.6685082872928176,
      "grad_norm": 0.5512950866249651,
      "learning_rate": 1.7237584920956195e-05,
      "loss": 0.6676,
      "step": 605
    },
    {
      "epoch": 0.6740331491712708,
      "grad_norm": 0.6279974406839042,
      "learning_rate": 1.6873168163364126e-05,
      "loss": 0.6792,
      "step": 610
    },
    {
      "epoch": 0.6795580110497238,
      "grad_norm": 0.6298057725245987,
      "learning_rate": 1.651230481423677e-05,
      "loss": 0.6611,
      "step": 615
    },
    {
      "epoch": 0.6850828729281768,
      "grad_norm": 0.5801623465755368,
      "learning_rate": 1.615511553934726e-05,
      "loss": 0.6862,
      "step": 620
    },
    {
      "epoch": 0.6906077348066298,
      "grad_norm": 0.5791150492145345,
      "learning_rate": 1.5801719775928858e-05,
      "loss": 0.6777,
      "step": 625
    },
    {
      "epoch": 0.6961325966850829,
      "grad_norm": 0.5365861610338757,
      "learning_rate": 1.545223569273744e-05,
      "loss": 0.6619,
      "step": 630
    },
    {
      "epoch": 0.7016574585635359,
      "grad_norm": 0.5592674957340275,
      "learning_rate": 1.5106780150538164e-05,
      "loss": 0.6725,
      "step": 635
    },
    {
      "epoch": 0.7071823204419889,
      "grad_norm": 0.5874282755671061,
      "learning_rate": 1.4765468663029427e-05,
      "loss": 0.705,
      "step": 640
    },
    {
      "epoch": 0.712707182320442,
      "grad_norm": 0.5736802681033443,
      "learning_rate": 1.4428415358217348e-05,
      "loss": 0.6721,
      "step": 645
    },
    {
      "epoch": 0.7182320441988951,
      "grad_norm": 0.5911352248795241,
      "learning_rate": 1.409573294025354e-05,
      "loss": 0.7002,
      "step": 650
    },
    {
      "epoch": 0.7237569060773481,
      "grad_norm": 0.5731565558773776,
      "learning_rate": 1.3767532651748973e-05,
      "loss": 0.6584,
      "step": 655
    },
    {
      "epoch": 0.7292817679558011,
      "grad_norm": 0.5567247088749864,
      "learning_rate": 1.3443924236576643e-05,
      "loss": 0.7039,
      "step": 660
    },
    {
      "epoch": 0.7348066298342542,
      "grad_norm": 0.5760525309492333,
      "learning_rate": 1.3125015903175292e-05,
      "loss": 0.6675,
      "step": 665
    },
    {
      "epoch": 0.7403314917127072,
      "grad_norm": 0.5707270289177868,
      "learning_rate": 1.2810914288366571e-05,
      "loss": 0.6866,
      "step": 670
    },
    {
      "epoch": 0.7458563535911602,
      "grad_norm": 0.6426256664859457,
      "learning_rate": 1.2501724421697753e-05,
      "loss": 0.6701,
      "step": 675
    },
    {
      "epoch": 0.7513812154696132,
      "grad_norm": 0.5777718022675424,
      "learning_rate": 1.2197549690321886e-05,
      "loss": 0.7014,
      "step": 680
    },
    {
      "epoch": 0.7569060773480663,
      "grad_norm": 0.5477827409060126,
      "learning_rate": 1.1898491804427097e-05,
      "loss": 0.6319,
      "step": 685
    },
    {
      "epoch": 0.7624309392265194,
      "grad_norm": 0.674023091743246,
      "learning_rate": 1.1604650763226643e-05,
      "loss": 0.6525,
      "step": 690
    },
    {
      "epoch": 0.7679558011049724,
      "grad_norm": 0.607590999103845,
      "learning_rate": 1.131612482152113e-05,
      "loss": 0.667,
      "step": 695
    },
    {
      "epoch": 0.7734806629834254,
      "grad_norm": 0.6018921128449826,
      "learning_rate": 1.1033010456843956e-05,
      "loss": 0.6689,
      "step": 700
    },
    {
      "epoch": 0.7790055248618785,
      "grad_norm": 0.6519694089962578,
      "learning_rate": 1.075540233720112e-05,
      "loss": 0.7203,
      "step": 705
    },
    {
      "epoch": 0.7845303867403315,
      "grad_norm": 0.5735806870840707,
      "learning_rate": 1.048339328941601e-05,
      "loss": 0.6472,
      "step": 710
    },
    {
      "epoch": 0.7900552486187845,
      "grad_norm": 0.6397132166838564,
      "learning_rate": 1.0217074268089937e-05,
      "loss": 0.6769,
      "step": 715
    },
    {
      "epoch": 0.7955801104972375,
      "grad_norm": 0.5494095260135633,
      "learning_rate": 9.956534325188664e-06,
      "loss": 0.7047,
      "step": 720
    },
    {
      "epoch": 0.8011049723756906,
      "grad_norm": 0.6304028016016777,
      "learning_rate": 9.701860580265087e-06,
      "loss": 0.6768,
      "step": 725
    },
    {
      "epoch": 0.8066298342541437,
      "grad_norm": 0.5704657105301698,
      "learning_rate": 9.453138191328185e-06,
      "loss": 0.6683,
      "step": 730
    },
    {
      "epoch": 0.8121546961325967,
      "grad_norm": 0.5890416243667709,
      "learning_rate": 9.210450326367803e-06,
      "loss": 0.6728,
      "step": 735
    },
    {
      "epoch": 0.8176795580110497,
      "grad_norm": 0.5607066583139462,
      "learning_rate": 8.973878135544859e-06,
      "loss": 0.683,
      "step": 740
    },
    {
      "epoch": 0.8232044198895028,
      "grad_norm": 0.6271466709020599,
      "learning_rate": 8.743500724056313e-06,
      "loss": 0.6754,
      "step": 745
    },
    {
      "epoch": 0.8287292817679558,
      "grad_norm": 0.6131988976912456,
      "learning_rate": 8.519395125683873e-06,
      "loss": 0.6387,
      "step": 750
    },
    {
      "epoch": 0.8342541436464088,
      "grad_norm": 0.5615837426545646,
      "learning_rate": 8.30163627703541e-06,
      "loss": 0.6717,
      "step": 755
    },
    {
      "epoch": 0.8397790055248618,
      "grad_norm": 0.5158077036018013,
      "learning_rate": 8.090296992487588e-06,
      "loss": 0.6001,
      "step": 760
    },
    {
      "epoch": 0.8453038674033149,
      "grad_norm": 0.5367673789723251,
      "learning_rate": 7.885447939838128e-06,
      "loss": 0.675,
      "step": 765
    },
    {
      "epoch": 0.850828729281768,
      "grad_norm": 0.53629105700699,
      "learning_rate": 7.687157616675851e-06,
      "loss": 0.6333,
      "step": 770
    },
    {
      "epoch": 0.856353591160221,
      "grad_norm": 0.5672788900263016,
      "learning_rate": 7.495492327476418e-06,
      "loss": 0.6704,
      "step": 775
    },
    {
      "epoch": 0.861878453038674,
      "grad_norm": 0.5428010023883922,
      "learning_rate": 7.310516161431368e-06,
      "loss": 0.6473,
      "step": 780
    },
    {
      "epoch": 0.8674033149171271,
      "grad_norm": 0.5921147890300914,
      "learning_rate": 7.132290971017927e-06,
      "loss": 0.6815,
      "step": 785
    },
    {
      "epoch": 0.8729281767955801,
      "grad_norm": 0.5344053242706458,
      "learning_rate": 6.9608763513167336e-06,
      "loss": 0.6564,
      "step": 790
    },
    {
      "epoch": 0.8784530386740331,
      "grad_norm": 0.628627445655646,
      "learning_rate": 6.796329620084385e-06,
      "loss": 0.6537,
      "step": 795
    },
    {
      "epoch": 0.8839779005524862,
      "grad_norm": 0.5410896967839478,
      "learning_rate": 6.63870579858749e-06,
      "loss": 0.6215,
      "step": 800
    },
    {
      "epoch": 0.8895027624309392,
      "grad_norm": 0.553908457484728,
      "learning_rate": 6.488057593204589e-06,
      "loss": 0.6904,
      "step": 805
    },
    {
      "epoch": 0.8950276243093923,
      "grad_norm": 0.5653620781263345,
      "learning_rate": 6.344435377802178e-06,
      "loss": 0.6563,
      "step": 810
    },
    {
      "epoch": 0.9005524861878453,
      "grad_norm": 0.5298053711596861,
      "learning_rate": 6.207887176890645e-06,
      "loss": 0.6585,
      "step": 815
    },
    {
      "epoch": 0.9060773480662984,
      "grad_norm": 0.5934777330589273,
      "learning_rate": 6.0784586495658025e-06,
      "loss": 0.6779,
      "step": 820
    },
    {
      "epoch": 0.9116022099447514,
      "grad_norm": 0.5863688277507009,
      "learning_rate": 5.956193074241348e-06,
      "loss": 0.6686,
      "step": 825
    },
    {
      "epoch": 0.9171270718232044,
      "grad_norm": 0.49942899738158636,
      "learning_rate": 5.841131334177408e-06,
      "loss": 0.6206,
      "step": 830
    },
    {
      "epoch": 0.9226519337016574,
      "grad_norm": 0.542957527300465,
      "learning_rate": 5.733311903809964e-06,
      "loss": 0.6407,
      "step": 835
    },
    {
      "epoch": 0.9281767955801105,
      "grad_norm": 0.5772756241841824,
      "learning_rate": 5.6327708358857185e-06,
      "loss": 0.6838,
      "step": 840
    },
    {
      "epoch": 0.9337016574585635,
      "grad_norm": 0.6803622516332938,
      "learning_rate": 5.5395417494067696e-06,
      "loss": 0.6923,
      "step": 845
    },
    {
      "epoch": 0.9392265193370166,
      "grad_norm": 0.5292098020073351,
      "learning_rate": 5.453655818389058e-06,
      "loss": 0.6497,
      "step": 850
    },
    {
      "epoch": 0.9447513812154696,
      "grad_norm": 0.4934073985759796,
      "learning_rate": 5.37514176143837e-06,
      "loss": 0.6088,
      "step": 855
    },
    {
      "epoch": 0.9502762430939227,
      "grad_norm": 0.593336038543385,
      "learning_rate": 5.304025832147392e-06,
      "loss": 0.6514,
      "step": 860
    },
    {
      "epoch": 0.9558011049723757,
      "grad_norm": 0.5473762015068131,
      "learning_rate": 5.240331810317012e-06,
      "loss": 0.6576,
      "step": 865
    },
    {
      "epoch": 0.9613259668508287,
      "grad_norm": 0.6258926493703553,
      "learning_rate": 5.184080994004797e-06,
      "loss": 0.6733,
      "step": 870
    },
    {
      "epoch": 0.9668508287292817,
      "grad_norm": 0.5461645379290371,
      "learning_rate": 5.135292192403366e-06,
      "loss": 0.6529,
      "step": 875
    },
    {
      "epoch": 0.9723756906077348,
      "grad_norm": 0.6064073284247843,
      "learning_rate": 5.093981719550922e-06,
      "loss": 0.6559,
      "step": 880
    },
    {
      "epoch": 0.9779005524861878,
      "grad_norm": 0.5243053103650271,
      "learning_rate": 5.060163388876165e-06,
      "loss": 0.6214,
      "step": 885
    },
    {
      "epoch": 0.9834254143646409,
      "grad_norm": 0.5755014538442916,
      "learning_rate": 5.033848508579353e-06,
      "loss": 0.6603,
      "step": 890
    },
    {
      "epoch": 0.988950276243094,
      "grad_norm": 0.5488561027700054,
      "learning_rate": 5.015045877851049e-06,
      "loss": 0.6213,
      "step": 895
    },
    {
      "epoch": 0.994475138121547,
      "grad_norm": 0.5042073727528616,
      "learning_rate": 5.003761783929837e-06,
      "loss": 0.6301,
      "step": 900
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.5266929188152082,
      "learning_rate": 5e-06,
      "loss": 0.6544,
      "step": 905
    },
    {
      "epoch": 1.0,
      "step": 905,
      "total_flos": 77344140165120.0,
      "train_loss": 0.7063807840505357,
      "train_runtime": 17902.8015,
      "train_samples_per_second": 0.303,
      "train_steps_per_second": 0.051
    }
  ],
  "logging_steps": 5,
  "max_steps": 905,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 77344140165120.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}