| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.012497188132670149, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 2.4994376265340298e-05, | |
| "grad_norm": 22.25, | |
| "learning_rate": 0.0008, | |
| "loss": 12.4316, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00012497188132670149, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 0.0007999840035991902, | |
| "loss": 13.3525, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.00024994376265340297, | |
| "grad_norm": 4.5, | |
| "learning_rate": 0.000799964008098178, | |
| "loss": 14.8491, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.00037491564398010446, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 0.0007999440125971656, | |
| "loss": 8.8491, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0004998875253068059, | |
| "grad_norm": 3.75, | |
| "learning_rate": 0.0007999240170961534, | |
| "loss": 7.9633, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.0006248594066335074, | |
| "grad_norm": 4.75, | |
| "learning_rate": 0.0007999040215951412, | |
| "loss": 7.787, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0007498312879602089, | |
| "grad_norm": 15.3125, | |
| "learning_rate": 0.0007998840260941289, | |
| "loss": 7.6196, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.0008748031692869104, | |
| "grad_norm": 3.03125, | |
| "learning_rate": 0.0007998640305931166, | |
| "loss": 7.453, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.0009997750506136119, | |
| "grad_norm": 3.921875, | |
| "learning_rate": 0.0007998440350921043, | |
| "loss": 7.3384, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.0011247469319403134, | |
| "grad_norm": 3.203125, | |
| "learning_rate": 0.000799824039591092, | |
| "loss": 7.2074, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.0012497188132670149, | |
| "grad_norm": 3.28125, | |
| "learning_rate": 0.0007998040440900798, | |
| "loss": 7.1685, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0013746906945937163, | |
| "grad_norm": 3.578125, | |
| "learning_rate": 0.0007997840485890674, | |
| "loss": 7.2311, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0014996625759204178, | |
| "grad_norm": 7.28125, | |
| "learning_rate": 0.0007997640530880552, | |
| "loss": 7.1694, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.0016246344572471193, | |
| "grad_norm": 3.0, | |
| "learning_rate": 0.0007997440575870429, | |
| "loss": 7.1179, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.0017496063385738208, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 0.0007997240620860307, | |
| "loss": 7.1143, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.0018745782199005223, | |
| "grad_norm": 3.46875, | |
| "learning_rate": 0.0007997040665850185, | |
| "loss": 7.0387, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.0019995501012272238, | |
| "grad_norm": 3.484375, | |
| "learning_rate": 0.0007996840710840061, | |
| "loss": 6.825, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0021245219825539252, | |
| "grad_norm": 2.859375, | |
| "learning_rate": 0.0007996640755829939, | |
| "loss": 6.7811, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.0022494938638806267, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 0.0007996440800819816, | |
| "loss": 6.821, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.0023744657452073282, | |
| "grad_norm": 3.0, | |
| "learning_rate": 0.0007996240845809693, | |
| "loss": 6.7552, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.0024994376265340297, | |
| "grad_norm": 3.953125, | |
| "learning_rate": 0.000799604089079957, | |
| "loss": 6.7139, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.002624409507860731, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 0.0007995840935789447, | |
| "loss": 6.7687, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.0027493813891874327, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 0.0007995640980779325, | |
| "loss": 6.6156, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.002874353270514134, | |
| "grad_norm": 4.28125, | |
| "learning_rate": 0.0007995441025769203, | |
| "loss": 6.5183, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.0029993251518408356, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 0.000799524107075908, | |
| "loss": 6.5023, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.003124297033167537, | |
| "grad_norm": 3.0, | |
| "learning_rate": 0.0007995041115748957, | |
| "loss": 6.3342, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.0032492689144942386, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 0.0007994841160738834, | |
| "loss": 6.416, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.00337424079582094, | |
| "grad_norm": 3.90625, | |
| "learning_rate": 0.0007994641205728712, | |
| "loss": 6.3348, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.0034992126771476416, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 0.0007994441250718589, | |
| "loss": 6.3827, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.003624184558474343, | |
| "grad_norm": 2.5, | |
| "learning_rate": 0.0007994241295708465, | |
| "loss": 6.4047, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.0037491564398010446, | |
| "grad_norm": 2.78125, | |
| "learning_rate": 0.0007994041340698343, | |
| "loss": 6.1781, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.003874128321127746, | |
| "grad_norm": 3.375, | |
| "learning_rate": 0.000799384138568822, | |
| "loss": 6.2532, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.0039991002024544475, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 0.0007993641430678099, | |
| "loss": 6.2372, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.004124072083781149, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 0.0007993441475667975, | |
| "loss": 6.0455, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.0042490439651078505, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 0.0007993241520657852, | |
| "loss": 6.1694, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.004374015846434552, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 0.000799304156564773, | |
| "loss": 6.045, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.0044989877277612535, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 0.0007992841610637607, | |
| "loss": 6.0104, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.004623959609087955, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 0.0007992641655627485, | |
| "loss": 5.9327, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.0047489314904146564, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 0.0007992441700617361, | |
| "loss": 5.9164, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.004873903371741358, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 0.0007992241745607238, | |
| "loss": 5.9775, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.004998875253068059, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 0.0007992041790597116, | |
| "loss": 5.6958, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.005123847134394761, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 0.0007991841835586994, | |
| "loss": 5.8286, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.005248819015721462, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 0.000799164188057687, | |
| "loss": 5.7053, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.005373790897048164, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 0.0007991441925566748, | |
| "loss": 5.725, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.005498762778374865, | |
| "grad_norm": 1.875, | |
| "learning_rate": 0.0007991241970556625, | |
| "loss": 5.6576, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.005623734659701567, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 0.0007991042015546503, | |
| "loss": 5.7389, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.005748706541028268, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 0.000799084206053638, | |
| "loss": 5.4624, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.00587367842235497, | |
| "grad_norm": 2.625, | |
| "learning_rate": 0.0007990642105526257, | |
| "loss": 5.6441, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.005998650303681671, | |
| "grad_norm": 2.5, | |
| "learning_rate": 0.0007990442150516134, | |
| "loss": 5.552, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.006123622185008373, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 0.0007990242195506011, | |
| "loss": 5.3983, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.006248594066335074, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 0.000799004224049589, | |
| "loss": 5.3757, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.006373565947661776, | |
| "grad_norm": 2.25, | |
| "learning_rate": 0.0007989842285485766, | |
| "loss": 5.4723, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.006498537828988477, | |
| "grad_norm": 2.25, | |
| "learning_rate": 0.0007989642330475643, | |
| "loss": 5.301, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.006623509710315179, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 0.0007989442375465521, | |
| "loss": 5.4499, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.00674848159164188, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 0.0007989242420455398, | |
| "loss": 5.5597, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.006873453472968582, | |
| "grad_norm": 2.25, | |
| "learning_rate": 0.0007989042465445275, | |
| "loss": 5.2123, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.006998425354295283, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 0.0007988842510435152, | |
| "loss": 5.3393, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.007123397235621985, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 0.000798864255542503, | |
| "loss": 5.2593, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.007248369116948686, | |
| "grad_norm": 2.125, | |
| "learning_rate": 0.0007988442600414907, | |
| "loss": 5.2776, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.007373340998275388, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 0.0007988242645404785, | |
| "loss": 5.2529, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.007498312879602089, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 0.0007988042690394662, | |
| "loss": 5.0664, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.007623284760928791, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 0.0007987842735384539, | |
| "loss": 5.3028, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.007748256642255492, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 0.0007987642780374416, | |
| "loss": 5.2152, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.007873228523582194, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 0.0007987442825364294, | |
| "loss": 4.97, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.007998200404908895, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 0.000798724287035417, | |
| "loss": 5.1328, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.008123172286235597, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 0.0007987042915344048, | |
| "loss": 5.0475, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.008248144167562298, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 0.0007986842960333925, | |
| "loss": 5.009, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.008373116048889, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 0.0007986643005323803, | |
| "loss": 5.0827, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.008498087930215701, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 0.000798644305031368, | |
| "loss": 5.0896, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.008623059811542403, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 0.0007986243095303557, | |
| "loss": 4.9185, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.008748031692869104, | |
| "grad_norm": 2.25, | |
| "learning_rate": 0.0007986043140293435, | |
| "loss": 5.0279, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.008873003574195806, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 0.0007985843185283312, | |
| "loss": 4.8811, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.008997975455522507, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 0.0007985643230273188, | |
| "loss": 5.0377, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.00912294733684921, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 0.0007985443275263066, | |
| "loss": 4.9903, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.00924791921817591, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 0.0007985243320252943, | |
| "loss": 4.8109, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.009372891099502612, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 0.0007985043365242821, | |
| "loss": 4.8445, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.009497862980829313, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 0.0007984843410232698, | |
| "loss": 4.887, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.009622834862156015, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 0.0007984643455222575, | |
| "loss": 4.8373, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.009747806743482716, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 0.0007984443500212453, | |
| "loss": 4.8948, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.009872778624809418, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 0.000798424354520233, | |
| "loss": 4.9098, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.009997750506136119, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 0.0007984043590192208, | |
| "loss": 4.6948, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.010122722387462821, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 0.0007983843635182084, | |
| "loss": 4.7561, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.010247694268789522, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 0.0007983643680171961, | |
| "loss": 4.6634, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.010372666150116224, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 0.0007983443725161839, | |
| "loss": 4.6339, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.010497638031442925, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 0.0007983243770151716, | |
| "loss": 4.7292, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.010622609912769627, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 0.0007983043815141594, | |
| "loss": 4.6211, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.010747581794096328, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 0.0007982843860131471, | |
| "loss": 4.6857, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.01087255367542303, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 0.0007982643905121348, | |
| "loss": 4.6335, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.01099752555674973, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 0.0007982443950111226, | |
| "loss": 4.7285, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.011122497438076433, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 0.0007982243995101103, | |
| "loss": 4.7707, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.011247469319403134, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 0.000798204404009098, | |
| "loss": 4.6769, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.011372441200729836, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 0.0007981844085080857, | |
| "loss": 4.7311, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.011497413082056537, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 0.0007981644130070734, | |
| "loss": 4.6677, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.011622384963383239, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 0.0007981444175060612, | |
| "loss": 4.6862, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.01174735684470994, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 0.0007981244220050488, | |
| "loss": 4.5674, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.011872328726036642, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 0.0007981044265040366, | |
| "loss": 4.5283, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.011997300607363343, | |
| "grad_norm": 1.625, | |
| "learning_rate": 0.0007980844310030244, | |
| "loss": 4.5966, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.012122272488690045, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 0.0007980644355020121, | |
| "loss": 4.4785, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.012247244370016746, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 0.0007980444400009999, | |
| "loss": 4.5966, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.012372216251343448, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 0.0007980244444999875, | |
| "loss": 4.6428, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.012497188132670149, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 0.0007980044489989753, | |
| "loss": 4.5225, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 200045, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.2266079346688e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |