{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 640,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0078125,
      "grad_norm": 4487.662174517055,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 14.0537,
      "step": 5
    },
    {
      "epoch": 0.015625,
      "grad_norm": 3265.096963107133,
      "learning_rate": 2.25e-06,
      "loss": 13.5061,
      "step": 10
    },
    {
      "epoch": 0.0234375,
      "grad_norm": 4019.1529344819937,
      "learning_rate": 3.5e-06,
      "loss": 10.2522,
      "step": 15
    },
    {
      "epoch": 0.03125,
      "grad_norm": 338.18628553133266,
      "learning_rate": 4.75e-06,
      "loss": 7.6669,
      "step": 20
    },
    {
      "epoch": 0.0390625,
      "grad_norm": 543.0311725735058,
      "learning_rate": 4.999486510586282e-06,
      "loss": 5.5188,
      "step": 25
    },
    {
      "epoch": 0.046875,
      "grad_norm": 246.23088738753083,
      "learning_rate": 4.9974008213559725e-06,
      "loss": 3.8606,
      "step": 30
    },
    {
      "epoch": 0.0546875,
      "grad_norm": 119.09382078616744,
      "learning_rate": 4.993712176889086e-06,
      "loss": 3.9223,
      "step": 35
    },
    {
      "epoch": 0.0625,
      "grad_norm": 369.14487711524765,
      "learning_rate": 4.988422944739889e-06,
      "loss": 3.7582,
      "step": 40
    },
    {
      "epoch": 0.0703125,
      "grad_norm": 73.68641210467196,
      "learning_rate": 4.981536519798899e-06,
      "loss": 4.3581,
      "step": 45
    },
    {
      "epoch": 0.078125,
      "grad_norm": 143.98864514567893,
      "learning_rate": 4.973057322113883e-06,
      "loss": 4.1023,
      "step": 50
    },
    {
      "epoch": 0.0859375,
      "grad_norm": 129.86219911163266,
      "learning_rate": 4.962990794052847e-06,
      "loss": 3.3248,
      "step": 55
    },
    {
      "epoch": 0.09375,
      "grad_norm": 213.93081161843782,
      "learning_rate": 4.95134339681086e-06,
      "loss": 3.5744,
      "step": 60
    },
    {
      "epoch": 0.1015625,
      "grad_norm": 43.57647693525295,
      "learning_rate": 4.938122606262935e-06,
      "loss": 3.0606,
      "step": 65
    },
    {
      "epoch": 0.109375,
      "grad_norm": 24.136487456562335,
      "learning_rate": 4.923336908165649e-06,
      "loss": 3.2941,
      "step": 70
    },
    {
      "epoch": 0.1171875,
      "grad_norm": 79.3928477087145,
      "learning_rate": 4.906995792710559e-06,
      "loss": 3.466,
      "step": 75
    },
    {
      "epoch": 0.125,
      "grad_norm": 20.979020201181374,
      "learning_rate": 4.889109748432932e-06,
      "loss": 3.0447,
      "step": 80
    },
    {
      "epoch": 0.1328125,
      "grad_norm": 33.10678605914481,
      "learning_rate": 4.8696902554796824e-06,
      "loss": 2.9834,
      "step": 85
    },
    {
      "epoch": 0.140625,
      "grad_norm": 37.842992430043786,
      "learning_rate": 4.84874977824085e-06,
      "loss": 3.3414,
      "step": 90
    },
    {
      "epoch": 0.1484375,
      "grad_norm": 15.354073559162247,
      "learning_rate": 4.826301757349337e-06,
      "loss": 3.4018,
      "step": 95
    },
    {
      "epoch": 0.15625,
      "grad_norm": 41.07210772695517,
      "learning_rate": 4.802360601054042e-06,
      "loss": 3.3653,
      "step": 100
    },
    {
      "epoch": 0.1640625,
      "grad_norm": 34.46675403978081,
      "learning_rate": 4.776941675971941e-06,
      "loss": 2.8566,
      "step": 105
    },
    {
      "epoch": 0.171875,
      "grad_norm": 30.438715655532075,
      "learning_rate": 4.750061297225028e-06,
      "loss": 2.9337,
      "step": 110
    },
    {
      "epoch": 0.1796875,
      "grad_norm": 47.52097975604328,
      "learning_rate": 4.721736717968465e-06,
      "loss": 3.0085,
      "step": 115
    },
    {
      "epoch": 0.1875,
      "grad_norm": 32.91467336279922,
      "learning_rate": 4.691986118316654e-06,
      "loss": 2.7027,
      "step": 120
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 31.817561483162603,
      "learning_rate": 4.660828593674344e-06,
      "loss": 3.0477,
      "step": 125
    },
    {
      "epoch": 0.203125,
      "grad_norm": 68.38396573292292,
      "learning_rate": 4.628284142480256e-06,
      "loss": 3.1287,
      "step": 130
    },
    {
      "epoch": 0.2109375,
      "grad_norm": 48.99060766773029,
      "learning_rate": 4.594373653371095e-06,
      "loss": 3.0499,
      "step": 135
    },
    {
      "epoch": 0.21875,
      "grad_norm": 29.811323317057937,
      "learning_rate": 4.559118891774188e-06,
      "loss": 2.6658,
      "step": 140
    },
    {
      "epoch": 0.2265625,
      "grad_norm": 16.757192674214547,
      "learning_rate": 4.522542485937369e-06,
      "loss": 2.7111,
      "step": 145
    },
    {
      "epoch": 0.234375,
      "grad_norm": 854.2596861814334,
      "learning_rate": 4.484667912405038e-06,
      "loss": 2.7731,
      "step": 150
    },
    {
      "epoch": 0.2421875,
      "grad_norm": 41.03420716766998,
      "learning_rate": 4.445519480949761e-06,
      "loss": 3.0335,
      "step": 155
    },
    {
      "epoch": 0.25,
      "grad_norm": 37.20268617216472,
      "learning_rate": 4.4051223189690585e-06,
      "loss": 2.6551,
      "step": 160
    },
    {
      "epoch": 0.2578125,
      "grad_norm": 35.47333065284907,
      "learning_rate": 4.3635023553574e-06,
      "loss": 2.5314,
      "step": 165
    },
    {
      "epoch": 0.265625,
      "grad_norm": 34.11952544419604,
      "learning_rate": 4.320686303863752e-06,
      "loss": 2.4063,
      "step": 170
    },
    {
      "epoch": 0.2734375,
      "grad_norm": 31.337300080344246,
      "learning_rate": 4.276701645945384e-06,
      "loss": 2.8953,
      "step": 175
    },
    {
      "epoch": 0.28125,
      "grad_norm": 71.0076530183938,
      "learning_rate": 4.231576613128902e-06,
      "loss": 2.7765,
      "step": 180
    },
    {
      "epoch": 0.2890625,
      "grad_norm": 35.89471866430099,
      "learning_rate": 4.185340168889869e-06,
      "loss": 2.6225,
      "step": 185
    },
    {
      "epoch": 0.296875,
      "grad_norm": 34.718088237273705,
      "learning_rate": 4.138021990062606e-06,
      "loss": 2.6321,
      "step": 190
    },
    {
      "epoch": 0.3046875,
      "grad_norm": 21.190370232623685,
      "learning_rate": 4.089652447792141e-06,
      "loss": 2.3654,
      "step": 195
    },
    {
      "epoch": 0.3125,
      "grad_norm": 24.28207185282827,
      "learning_rate": 4.040262588040503e-06,
      "loss": 2.439,
      "step": 200
    },
    {
      "epoch": 0.3203125,
      "grad_norm": 30.270563671656443,
      "learning_rate": 3.989884111659893e-06,
      "loss": 2.6155,
      "step": 205
    },
    {
      "epoch": 0.328125,
      "grad_norm": 30.374998546903175,
      "learning_rate": 3.938549354045508e-06,
      "loss": 2.5646,
      "step": 210
    },
    {
      "epoch": 0.3359375,
      "grad_norm": 28.01955269110465,
      "learning_rate": 3.8862912643810895e-06,
      "loss": 2.1882,
      "step": 215
    },
    {
      "epoch": 0.34375,
      "grad_norm": 43.01312536445347,
      "learning_rate": 3.833143384490506e-06,
      "loss": 2.6895,
      "step": 220
    },
    {
      "epoch": 0.3515625,
      "grad_norm": 34.41155110572982,
      "learning_rate": 3.7791398273089562e-06,
      "loss": 2.5118,
      "step": 225
    },
    {
      "epoch": 0.359375,
      "grad_norm": 112.57991856629606,
      "learning_rate": 3.7243152549875995e-06,
      "loss": 2.223,
      "step": 230
    },
    {
      "epoch": 0.3671875,
      "grad_norm": 76.22091458973803,
      "learning_rate": 3.6687048566456783e-06,
      "loss": 2.5385,
      "step": 235
    },
    {
      "epoch": 0.375,
      "grad_norm": 47.102411590390844,
      "learning_rate": 3.6123443257843985e-06,
      "loss": 2.0943,
      "step": 240
    },
    {
      "epoch": 0.3828125,
      "grad_norm": 73.15900084042869,
      "learning_rate": 3.55526983737708e-06,
      "loss": 2.4384,
      "step": 245
    },
    {
      "epoch": 0.390625,
      "grad_norm": 54.222406085796,
      "learning_rate": 3.4975180246502694e-06,
      "loss": 2.5384,
      "step": 250
    },
    {
      "epoch": 0.3984375,
      "grad_norm": 81.39267333279535,
      "learning_rate": 3.4391259555707258e-06,
      "loss": 2.4972,
      "step": 255
    },
    {
      "epoch": 0.40625,
      "grad_norm": 67.5323257615502,
      "learning_rate": 3.3801311090533713e-06,
      "loss": 1.8014,
      "step": 260
    },
    {
      "epoch": 0.4140625,
      "grad_norm": 26.26948979836602,
      "learning_rate": 3.320571350905466e-06,
      "loss": 1.773,
      "step": 265
    },
    {
      "epoch": 0.421875,
      "grad_norm": 44.03138506176169,
      "learning_rate": 3.2604849095224666e-06,
      "loss": 2.0221,
      "step": 270
    },
    {
      "epoch": 0.4296875,
      "grad_norm": 43.260012330071866,
      "learning_rate": 3.1999103513511528e-06,
      "loss": 2.2129,
      "step": 275
    },
    {
      "epoch": 0.4375,
      "grad_norm": 26.951615470416954,
      "learning_rate": 3.1388865561357727e-06,
      "loss": 2.2301,
      "step": 280
    },
    {
      "epoch": 0.4453125,
      "grad_norm": 46.519629142166174,
      "learning_rate": 3.077452691963109e-06,
      "loss": 2.3289,
      "step": 285
    },
    {
      "epoch": 0.453125,
      "grad_norm": 48.39622358551033,
      "learning_rate": 3.0156481901224573e-06,
      "loss": 1.795,
      "step": 290
    },
    {
      "epoch": 0.4609375,
      "grad_norm": 52.06374998250507,
      "learning_rate": 2.953512719796683e-06,
      "loss": 2.2433,
      "step": 295
    },
    {
      "epoch": 0.46875,
      "grad_norm": 474.92706894375743,
      "learning_rate": 2.8910861626005774e-06,
      "loss": 1.7952,
      "step": 300
    },
    {
      "epoch": 0.4765625,
      "grad_norm": 74.9957849628404,
      "learning_rate": 2.8284085869828664e-06,
      "loss": 2.3712,
      "step": 305
    },
    {
      "epoch": 0.484375,
      "grad_norm": 51.21339523637916,
      "learning_rate": 2.765520222508302e-06,
      "loss": 1.9892,
      "step": 310
    },
    {
      "epoch": 0.4921875,
      "grad_norm": 131.64456956757485,
      "learning_rate": 2.7024614340363365e-06,
      "loss": 1.9972,
      "step": 315
    },
    {
      "epoch": 0.5,
      "grad_norm": 31.52894208660832,
      "learning_rate": 2.6392726958129653e-06,
      "loss": 1.5076,
      "step": 320
    },
    {
      "epoch": 0.5078125,
      "grad_norm": 106.27601023204515,
      "learning_rate": 2.5759945654923575e-06,
      "loss": 2.0369,
      "step": 325
    },
    {
      "epoch": 0.515625,
      "grad_norm": 47.324062545245894,
      "learning_rate": 2.5126676581049413e-06,
      "loss": 1.8094,
      "step": 330
    },
    {
      "epoch": 0.5234375,
      "grad_norm": 76.05234123339977,
      "learning_rate": 2.4493326199886813e-06,
      "loss": 1.9059,
      "step": 335
    },
    {
      "epoch": 0.53125,
      "grad_norm": 115.507138279873,
      "learning_rate": 2.3860301027002432e-06,
      "loss": 1.9663,
      "step": 340
    },
    {
      "epoch": 0.5390625,
      "grad_norm": 81.81607613481519,
      "learning_rate": 2.322800736922818e-06,
      "loss": 2.1141,
      "step": 345
    },
    {
      "epoch": 0.546875,
      "grad_norm": 97.43878259958254,
      "learning_rate": 2.259685106387345e-06,
      "loss": 2.0336,
      "step": 350
    },
    {
      "epoch": 0.5546875,
      "grad_norm": 42.91394725301739,
      "learning_rate": 2.196723721823863e-06,
      "loss": 2.038,
      "step": 355
    },
    {
      "epoch": 0.5625,
      "grad_norm": 37.32603199657443,
      "learning_rate": 2.1339569949597284e-06,
      "loss": 1.7698,
      "step": 360
    },
    {
      "epoch": 0.5703125,
      "grad_norm": 200.45931632492386,
      "learning_rate": 2.0714252125813667e-06,
      "loss": 1.9531,
      "step": 365
    },
    {
      "epoch": 0.578125,
      "grad_norm": 31.963910575975373,
      "learning_rate": 2.0091685106762233e-06,
      "loss": 1.8749,
      "step": 370
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 89.03310789583136,
      "learning_rate": 1.947226848671508e-06,
      "loss": 2.0674,
      "step": 375
    },
    {
      "epoch": 0.59375,
      "grad_norm": 31.937502854280478,
      "learning_rate": 1.8856399837862552e-06,
      "loss": 1.6382,
      "step": 380
    },
    {
      "epoch": 0.6015625,
      "grad_norm": 39.761140346439326,
      "learning_rate": 1.824447445513179e-06,
      "loss": 1.6946,
      "step": 385
    },
    {
      "epoch": 0.609375,
      "grad_norm": 46.64848349398602,
      "learning_rate": 1.7636885102466907e-06,
      "loss": 1.5179,
      "step": 390
    },
    {
      "epoch": 0.6171875,
      "grad_norm": 20.613416865713134,
      "learning_rate": 1.7034021760733712e-06,
      "loss": 1.437,
      "step": 395
    },
    {
      "epoch": 0.625,
      "grad_norm": 70.94522606308973,
      "learning_rate": 1.6436271377410667e-06,
      "loss": 2.2482,
      "step": 400
    },
    {
      "epoch": 0.6328125,
      "grad_norm": 43.35603987077311,
      "learning_rate": 1.5844017618226934e-06,
      "loss": 1.6283,
      "step": 405
    },
    {
      "epoch": 0.640625,
      "grad_norm": 117.64561093792364,
      "learning_rate": 1.525764062090671e-06,
      "loss": 1.5861,
      "step": 410
    },
    {
      "epoch": 0.6484375,
      "grad_norm": 51.13049033408944,
      "learning_rate": 1.46775167511781e-06,
      "loss": 1.7587,
      "step": 415
    },
    {
      "epoch": 0.65625,
      "grad_norm": 23.67179139062178,
      "learning_rate": 1.4104018361202947e-06,
      "loss": 1.6897,
      "step": 420
    },
    {
      "epoch": 0.6640625,
      "grad_norm": 29.890157693144154,
      "learning_rate": 1.3537513550582853e-06,
      "loss": 1.8858,
      "step": 425
    },
    {
      "epoch": 0.671875,
      "grad_norm": 120.12084561754483,
      "learning_rate": 1.2978365930094645e-06,
      "loss": 1.6889,
      "step": 430
    },
    {
      "epoch": 0.6796875,
      "grad_norm": 29.265397717275942,
      "learning_rate": 1.2426934388307059e-06,
      "loss": 1.6589,
      "step": 435
    },
    {
      "epoch": 0.6875,
      "grad_norm": 147.8296220293303,
      "learning_rate": 1.1883572861228255e-06,
      "loss": 2.3553,
      "step": 440
    },
    {
      "epoch": 0.6953125,
      "grad_norm": 75.42228430231394,
      "learning_rate": 1.1348630105132253e-06,
      "loss": 1.7002,
      "step": 445
    },
    {
      "epoch": 0.703125,
      "grad_norm": 50.39104505021309,
      "learning_rate": 1.0822449472709907e-06,
      "loss": 1.9122,
      "step": 450
    },
    {
      "epoch": 0.7109375,
      "grad_norm": 61.50703160486361,
      "learning_rate": 1.0305368692688175e-06,
      "loss": 1.536,
      "step": 455
    },
    {
      "epoch": 0.71875,
      "grad_norm": 73.8503299397507,
      "learning_rate": 9.797719653059176e-07,
      "loss": 1.7232,
      "step": 460
    },
    {
      "epoch": 0.7265625,
      "grad_norm": 67.88315535620379,
      "learning_rate": 9.299828188058013e-07,
      "loss": 1.7666,
      "step": 465
    },
    {
      "epoch": 0.734375,
      "grad_norm": 83.76797348985389,
      "learning_rate": 8.812013869026334e-07,
      "loss": 1.6567,
      "step": 470
    },
    {
      "epoch": 0.7421875,
      "grad_norm": 136.16919333944256,
      "learning_rate": 8.334589799295592e-07,
      "loss": 1.6399,
      "step": 475
    },
    {
      "epoch": 0.75,
      "grad_norm": 50.54545692566125,
      "learning_rate": 7.867862413221894e-07,
      "loss": 1.5422,
      "step": 480
    },
    {
      "epoch": 0.7578125,
      "grad_norm": 243.07205145310937,
      "learning_rate": 7.412131279501297e-07,
      "loss": 1.4785,
      "step": 485
    },
    {
      "epoch": 0.765625,
      "grad_norm": 45.010451945401705,
      "learning_rate": 6.967688908891793e-07,
      "loss": 1.884,
      "step": 490
    },
    {
      "epoch": 0.7734375,
      "grad_norm": 48.23699940972905,
      "learning_rate": 6.534820566465464e-07,
      "loss": 1.7596,
      "step": 495
    },
    {
      "epoch": 0.78125,
      "grad_norm": 90.74839489958582,
      "learning_rate": 6.113804088511261e-07,
      "loss": 1.6985,
      "step": 500
    },
    {
      "epoch": 0.7890625,
      "grad_norm": 26.31053939307125,
      "learning_rate": 5.704909704205949e-07,
      "loss": 1.7385,
      "step": 505
    },
    {
      "epoch": 0.796875,
      "grad_norm": 58.47989097161682,
      "learning_rate": 5.308399862167693e-07,
      "loss": 1.9481,
      "step": 510
    },
    {
      "epoch": 0.8046875,
      "grad_norm": 796.9462628456254,
      "learning_rate": 4.924529062003522e-07,
      "loss": 1.762,
      "step": 515
    },
    {
      "epoch": 0.8125,
      "grad_norm": 39.897170110050865,
      "learning_rate": 4.553543690958939e-07,
      "loss": 1.2992,
      "step": 520
    },
    {
      "epoch": 0.8203125,
      "grad_norm": 53.31172241007692,
      "learning_rate": 4.1956818657744065e-07,
      "loss": 1.8682,
      "step": 525
    },
    {
      "epoch": 0.828125,
      "grad_norm": 35.918165546768364,
      "learning_rate": 3.851173279850251e-07,
      "loss": 1.6464,
      "step": 530
    },
    {
      "epoch": 0.8359375,
      "grad_norm": 47.70548947871064,
      "learning_rate": 3.5202390558181145e-07,
      "loss": 2.157,
      "step": 535
    },
    {
      "epoch": 0.84375,
      "grad_norm": 81.91605468065885,
      "learning_rate": 3.2030916036134866e-07,
      "loss": 1.6421,
      "step": 540
    },
    {
      "epoch": 0.8515625,
      "grad_norm": 53.06973167494431,
      "learning_rate": 2.8999344841405377e-07,
      "loss": 1.6688,
      "step": 545
    },
    {
      "epoch": 0.859375,
      "grad_norm": 88.49158154638214,
      "learning_rate": 2.61096227861668e-07,
      "loss": 1.702,
      "step": 550
    },
    {
      "epoch": 0.8671875,
      "grad_norm": 120.59801382869998,
      "learning_rate": 2.3363604636807065e-07,
      "loss": 1.3829,
      "step": 555
    },
    {
      "epoch": 0.875,
      "grad_norm": 139.01600571943374,
      "learning_rate": 2.0763052923447214e-07,
      "loss": 1.6871,
      "step": 560
    },
    {
      "epoch": 0.8828125,
      "grad_norm": 45.50934842650282,
      "learning_rate": 1.830963680866285e-07,
      "loss": 1.4887,
      "step": 565
    },
    {
      "epoch": 0.890625,
      "grad_norm": 35.76953954334381,
      "learning_rate": 1.600493101613268e-07,
      "loss": 1.6375,
      "step": 570
    },
    {
      "epoch": 0.8984375,
      "grad_norm": 37.27719885211302,
      "learning_rate": 1.3850414819903235e-07,
      "loss": 1.7941,
      "step": 575
    },
    {
      "epoch": 0.90625,
      "grad_norm": 39.534848671404404,
      "learning_rate": 1.1847471094917711e-07,
      "loss": 1.7665,
      "step": 580
    },
    {
      "epoch": 0.9140625,
      "grad_norm": 50.50325344773253,
      "learning_rate": 9.997385429418555e-08,
      "loss": 1.8857,
      "step": 585
    },
    {
      "epoch": 0.921875,
      "grad_norm": 31.729549388411133,
      "learning_rate": 8.301345299793374e-08,
      "loss": 1.5837,
      "step": 590
    },
    {
      "epoch": 0.9296875,
      "grad_norm": 29.796478369791654,
      "learning_rate": 6.760439308393763e-08,
      "loss": 1.963,
      "step": 595
    },
    {
      "epoch": 0.9375,
      "grad_norm": 37.3072954627531,
      "learning_rate": 5.3756564848168325e-08,
      "loss": 1.507,
      "step": 600
    },
    {
      "epoch": 0.9453125,
      "grad_norm": 32.75001709716989,
      "learning_rate": 4.147885651096861e-08,
      "loss": 1.6399,
      "step": 605
    },
    {
      "epoch": 0.953125,
      "grad_norm": 53.42892228692126,
      "learning_rate": 3.077914851215585e-08,
      "loss": 1.7654,
      "step": 610
    },
    {
      "epoch": 0.9609375,
      "grad_norm": 91.78268048604703,
      "learning_rate": 2.1664308452965798e-08,
      "loss": 1.4882,
      "step": 615
    },
    {
      "epoch": 0.96875,
      "grad_norm": 52.95012166005466,
      "learning_rate": 1.4140186688086365e-08,
      "loss": 1.5379,
      "step": 620
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 114.57655882499603,
      "learning_rate": 8.211612570611927e-09,
      "loss": 1.5018,
      "step": 625
    },
    {
      "epoch": 0.984375,
      "grad_norm": 96.85859485649613,
      "learning_rate": 3.882391352324766e-09,
      "loss": 1.4683,
      "step": 630
    },
    {
      "epoch": 0.9921875,
      "grad_norm": 37.14188481057337,
      "learning_rate": 1.1553017412971323e-09,
      "loss": 1.8316,
      "step": 635
    },
    {
      "epoch": 1.0,
      "grad_norm": 34.77706091063119,
      "learning_rate": 3.2094118379288885e-11,
      "loss": 1.6444,
      "step": 640
    },
    {
      "epoch": 1.0,
      "step": 640,
      "total_flos": 61131755814912.0,
      "train_loss": 2.5290436543524266,
      "train_runtime": 1576.5021,
      "train_samples_per_second": 3.245,
      "train_steps_per_second": 0.406
    }
  ],
  "logging_steps": 5,
  "max_steps": 640,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 640.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 61131755814912.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}