{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 640, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0078125, "grad_norm": 4487.662174517055, "learning_rate": 1.0000000000000002e-06, "loss": 14.0537, "step": 5 }, { "epoch": 0.015625, "grad_norm": 3265.096963107133, "learning_rate": 2.25e-06, "loss": 13.5061, "step": 10 }, { "epoch": 0.0234375, "grad_norm": 4019.1529344819937, "learning_rate": 3.5e-06, "loss": 10.2522, "step": 15 }, { "epoch": 0.03125, "grad_norm": 338.18628553133266, "learning_rate": 4.75e-06, "loss": 7.6669, "step": 20 }, { "epoch": 0.0390625, "grad_norm": 543.0311725735058, "learning_rate": 4.999486510586282e-06, "loss": 5.5188, "step": 25 }, { "epoch": 0.046875, "grad_norm": 246.23088738753083, "learning_rate": 4.9974008213559725e-06, "loss": 3.8606, "step": 30 }, { "epoch": 0.0546875, "grad_norm": 119.09382078616744, "learning_rate": 4.993712176889086e-06, "loss": 3.9223, "step": 35 }, { "epoch": 0.0625, "grad_norm": 369.14487711524765, "learning_rate": 4.988422944739889e-06, "loss": 3.7582, "step": 40 }, { "epoch": 0.0703125, "grad_norm": 73.68641210467196, "learning_rate": 4.981536519798899e-06, "loss": 4.3581, "step": 45 }, { "epoch": 0.078125, "grad_norm": 143.98864514567893, "learning_rate": 4.973057322113883e-06, "loss": 4.1023, "step": 50 }, { "epoch": 0.0859375, "grad_norm": 129.86219911163266, "learning_rate": 4.962990794052847e-06, "loss": 3.3248, "step": 55 }, { "epoch": 0.09375, "grad_norm": 213.93081161843782, "learning_rate": 4.95134339681086e-06, "loss": 3.5744, "step": 60 }, { "epoch": 0.1015625, "grad_norm": 43.57647693525295, "learning_rate": 4.938122606262935e-06, "loss": 3.0606, "step": 65 }, { "epoch": 0.109375, "grad_norm": 24.136487456562335, "learning_rate": 4.923336908165649e-06, "loss": 3.2941, "step": 70 }, { "epoch": 0.1171875, "grad_norm": 79.3928477087145, "learning_rate": 4.906995792710559e-06, "loss": 3.466, "step": 75 }, { "epoch": 0.125, "grad_norm": 20.979020201181374, "learning_rate": 4.889109748432932e-06, "loss": 3.0447, "step": 80 }, { "epoch": 0.1328125, "grad_norm": 33.10678605914481, "learning_rate": 4.8696902554796824e-06, "loss": 2.9834, "step": 85 }, { "epoch": 0.140625, "grad_norm": 37.842992430043786, "learning_rate": 4.84874977824085e-06, "loss": 3.3414, "step": 90 }, { "epoch": 0.1484375, "grad_norm": 15.354073559162247, "learning_rate": 4.826301757349337e-06, "loss": 3.4018, "step": 95 }, { "epoch": 0.15625, "grad_norm": 41.07210772695517, "learning_rate": 4.802360601054042e-06, "loss": 3.3653, "step": 100 }, { "epoch": 0.1640625, "grad_norm": 34.46675403978081, "learning_rate": 4.776941675971941e-06, "loss": 2.8566, "step": 105 }, { "epoch": 0.171875, "grad_norm": 30.438715655532075, "learning_rate": 4.750061297225028e-06, "loss": 2.9337, "step": 110 }, { "epoch": 0.1796875, "grad_norm": 47.52097975604328, "learning_rate": 4.721736717968465e-06, "loss": 3.0085, "step": 115 }, { "epoch": 0.1875, "grad_norm": 32.91467336279922, "learning_rate": 4.691986118316654e-06, "loss": 2.7027, "step": 120 }, { "epoch": 0.1953125, "grad_norm": 31.817561483162603, "learning_rate": 4.660828593674344e-06, "loss": 3.0477, "step": 125 }, { "epoch": 0.203125, "grad_norm": 68.38396573292292, "learning_rate": 4.628284142480256e-06, "loss": 3.1287, "step": 130 }, { "epoch": 0.2109375, "grad_norm": 48.99060766773029, "learning_rate": 4.594373653371095e-06, "loss": 3.0499, "step": 135 }, { "epoch": 0.21875, "grad_norm": 29.811323317057937, "learning_rate": 4.559118891774188e-06, "loss": 2.6658, "step": 140 }, { "epoch": 0.2265625, "grad_norm": 16.757192674214547, "learning_rate": 4.522542485937369e-06, "loss": 2.7111, "step": 145 }, { "epoch": 0.234375, "grad_norm": 854.2596861814334, "learning_rate": 4.484667912405038e-06, "loss": 2.7731, "step": 150 }, { "epoch": 0.2421875, "grad_norm": 41.03420716766998, "learning_rate": 4.445519480949761e-06, "loss": 3.0335, "step": 155 }, { "epoch": 0.25, "grad_norm": 37.20268617216472, "learning_rate": 4.4051223189690585e-06, "loss": 2.6551, "step": 160 }, { "epoch": 0.2578125, "grad_norm": 35.47333065284907, "learning_rate": 4.3635023553574e-06, "loss": 2.5314, "step": 165 }, { "epoch": 0.265625, "grad_norm": 34.11952544419604, "learning_rate": 4.320686303863752e-06, "loss": 2.4063, "step": 170 }, { "epoch": 0.2734375, "grad_norm": 31.337300080344246, "learning_rate": 4.276701645945384e-06, "loss": 2.8953, "step": 175 }, { "epoch": 0.28125, "grad_norm": 71.0076530183938, "learning_rate": 4.231576613128902e-06, "loss": 2.7765, "step": 180 }, { "epoch": 0.2890625, "grad_norm": 35.89471866430099, "learning_rate": 4.185340168889869e-06, "loss": 2.6225, "step": 185 }, { "epoch": 0.296875, "grad_norm": 34.718088237273705, "learning_rate": 4.138021990062606e-06, "loss": 2.6321, "step": 190 }, { "epoch": 0.3046875, "grad_norm": 21.190370232623685, "learning_rate": 4.089652447792141e-06, "loss": 2.3654, "step": 195 }, { "epoch": 0.3125, "grad_norm": 24.28207185282827, "learning_rate": 4.040262588040503e-06, "loss": 2.439, "step": 200 }, { "epoch": 0.3203125, "grad_norm": 30.270563671656443, "learning_rate": 3.989884111659893e-06, "loss": 2.6155, "step": 205 }, { "epoch": 0.328125, "grad_norm": 30.374998546903175, "learning_rate": 3.938549354045508e-06, "loss": 2.5646, "step": 210 }, { "epoch": 0.3359375, "grad_norm": 28.01955269110465, "learning_rate": 3.8862912643810895e-06, "loss": 2.1882, "step": 215 }, { "epoch": 0.34375, "grad_norm": 43.01312536445347, "learning_rate": 3.833143384490506e-06, "loss": 2.6895, "step": 220 }, { "epoch": 0.3515625, "grad_norm": 34.41155110572982, "learning_rate": 3.7791398273089562e-06, "loss": 2.5118, "step": 225 }, { "epoch": 0.359375, "grad_norm": 112.57991856629606, "learning_rate": 3.7243152549875995e-06, "loss": 2.223, "step": 230 }, { "epoch": 0.3671875, "grad_norm": 76.22091458973803, "learning_rate": 3.6687048566456783e-06, "loss": 2.5385, "step": 235 }, { "epoch": 0.375, "grad_norm": 47.102411590390844, "learning_rate": 3.6123443257843985e-06, "loss": 2.0943, "step": 240 }, { "epoch": 0.3828125, "grad_norm": 73.15900084042869, "learning_rate": 3.55526983737708e-06, "loss": 2.4384, "step": 245 }, { "epoch": 0.390625, "grad_norm": 54.222406085796, "learning_rate": 3.4975180246502694e-06, "loss": 2.5384, "step": 250 }, { "epoch": 0.3984375, "grad_norm": 81.39267333279535, "learning_rate": 3.4391259555707258e-06, "loss": 2.4972, "step": 255 }, { "epoch": 0.40625, "grad_norm": 67.5323257615502, "learning_rate": 3.3801311090533713e-06, "loss": 1.8014, "step": 260 }, { "epoch": 0.4140625, "grad_norm": 26.26948979836602, "learning_rate": 3.320571350905466e-06, "loss": 1.773, "step": 265 }, { "epoch": 0.421875, "grad_norm": 44.03138506176169, "learning_rate": 3.2604849095224666e-06, "loss": 2.0221, "step": 270 }, { "epoch": 0.4296875, "grad_norm": 43.260012330071866, "learning_rate": 3.1999103513511528e-06, "loss": 2.2129, "step": 275 }, { "epoch": 0.4375, "grad_norm": 26.951615470416954, "learning_rate": 3.1388865561357727e-06, "loss": 2.2301, "step": 280 }, { "epoch": 0.4453125, "grad_norm": 46.519629142166174, "learning_rate": 3.077452691963109e-06, "loss": 2.3289, "step": 285 }, { "epoch": 0.453125, "grad_norm": 48.39622358551033, "learning_rate": 3.0156481901224573e-06, "loss": 1.795, "step": 290 }, { "epoch": 0.4609375, "grad_norm": 52.06374998250507, "learning_rate": 2.953512719796683e-06, "loss": 2.2433, "step": 295 }, { "epoch": 0.46875, "grad_norm": 474.92706894375743, "learning_rate": 2.8910861626005774e-06, "loss": 1.7952, "step": 300 }, { "epoch": 0.4765625, "grad_norm": 74.9957849628404, "learning_rate": 2.8284085869828664e-06, "loss": 2.3712, "step": 305 }, { "epoch": 0.484375, "grad_norm": 51.21339523637916, "learning_rate": 2.765520222508302e-06, "loss": 1.9892, "step": 310 }, { "epoch": 0.4921875, "grad_norm": 131.64456956757485, "learning_rate": 2.7024614340363365e-06, "loss": 1.9972, "step": 315 }, { "epoch": 0.5, "grad_norm": 31.52894208660832, "learning_rate": 2.6392726958129653e-06, "loss": 1.5076, "step": 320 }, { "epoch": 0.5078125, "grad_norm": 106.27601023204515, "learning_rate": 2.5759945654923575e-06, "loss": 2.0369, "step": 325 }, { "epoch": 0.515625, "grad_norm": 47.324062545245894, "learning_rate": 2.5126676581049413e-06, "loss": 1.8094, "step": 330 }, { "epoch": 0.5234375, "grad_norm": 76.05234123339977, "learning_rate": 2.4493326199886813e-06, "loss": 1.9059, "step": 335 }, { "epoch": 0.53125, "grad_norm": 115.507138279873, "learning_rate": 2.3860301027002432e-06, "loss": 1.9663, "step": 340 }, { "epoch": 0.5390625, "grad_norm": 81.81607613481519, "learning_rate": 2.322800736922818e-06, "loss": 2.1141, "step": 345 }, { "epoch": 0.546875, "grad_norm": 97.43878259958254, "learning_rate": 2.259685106387345e-06, "loss": 2.0336, "step": 350 }, { "epoch": 0.5546875, "grad_norm": 42.91394725301739, "learning_rate": 2.196723721823863e-06, "loss": 2.038, "step": 355 }, { "epoch": 0.5625, "grad_norm": 37.32603199657443, "learning_rate": 2.1339569949597284e-06, "loss": 1.7698, "step": 360 }, { "epoch": 0.5703125, "grad_norm": 200.45931632492386, "learning_rate": 2.0714252125813667e-06, "loss": 1.9531, "step": 365 }, { "epoch": 0.578125, "grad_norm": 31.963910575975373, "learning_rate": 2.0091685106762233e-06, "loss": 1.8749, "step": 370 }, { "epoch": 0.5859375, "grad_norm": 89.03310789583136, "learning_rate": 1.947226848671508e-06, "loss": 2.0674, "step": 375 }, { "epoch": 0.59375, "grad_norm": 31.937502854280478, "learning_rate": 1.8856399837862552e-06, "loss": 1.6382, "step": 380 }, { "epoch": 0.6015625, "grad_norm": 39.761140346439326, "learning_rate": 1.824447445513179e-06, "loss": 1.6946, "step": 385 }, { "epoch": 0.609375, "grad_norm": 46.64848349398602, "learning_rate": 1.7636885102466907e-06, "loss": 1.5179, "step": 390 }, { "epoch": 0.6171875, "grad_norm": 20.613416865713134, "learning_rate": 1.7034021760733712e-06, "loss": 1.437, "step": 395 }, { "epoch": 0.625, "grad_norm": 70.94522606308973, "learning_rate": 1.6436271377410667e-06, "loss": 2.2482, "step": 400 }, { "epoch": 0.6328125, "grad_norm": 43.35603987077311, "learning_rate": 1.5844017618226934e-06, "loss": 1.6283, "step": 405 }, { "epoch": 0.640625, "grad_norm": 117.64561093792364, "learning_rate": 1.525764062090671e-06, "loss": 1.5861, "step": 410 }, { "epoch": 0.6484375, "grad_norm": 51.13049033408944, "learning_rate": 1.46775167511781e-06, "loss": 1.7587, "step": 415 }, { "epoch": 0.65625, "grad_norm": 23.67179139062178, "learning_rate": 1.4104018361202947e-06, "loss": 1.6897, "step": 420 }, { "epoch": 0.6640625, "grad_norm": 29.890157693144154, "learning_rate": 1.3537513550582853e-06, "loss": 1.8858, "step": 425 }, { "epoch": 0.671875, "grad_norm": 120.12084561754483, "learning_rate": 1.2978365930094645e-06, "loss": 1.6889, "step": 430 }, { "epoch": 0.6796875, "grad_norm": 29.265397717275942, "learning_rate": 1.2426934388307059e-06, "loss": 1.6589, "step": 435 }, { "epoch": 0.6875, "grad_norm": 147.8296220293303, "learning_rate": 1.1883572861228255e-06, "loss": 2.3553, "step": 440 }, { "epoch": 0.6953125, "grad_norm": 75.42228430231394, "learning_rate": 1.1348630105132253e-06, "loss": 1.7002, "step": 445 }, { "epoch": 0.703125, "grad_norm": 50.39104505021309, "learning_rate": 1.0822449472709907e-06, "loss": 1.9122, "step": 450 }, { "epoch": 0.7109375, "grad_norm": 61.50703160486361, "learning_rate": 1.0305368692688175e-06, "loss": 1.536, "step": 455 }, { "epoch": 0.71875, "grad_norm": 73.8503299397507, "learning_rate": 9.797719653059176e-07, "loss": 1.7232, "step": 460 }, { "epoch": 0.7265625, "grad_norm": 67.88315535620379, "learning_rate": 9.299828188058013e-07, "loss": 1.7666, "step": 465 }, { "epoch": 0.734375, "grad_norm": 83.76797348985389, "learning_rate": 8.812013869026334e-07, "loss": 1.6567, "step": 470 }, { "epoch": 0.7421875, "grad_norm": 136.16919333944256, "learning_rate": 8.334589799295592e-07, "loss": 1.6399, "step": 475 }, { "epoch": 0.75, "grad_norm": 50.54545692566125, "learning_rate": 7.867862413221894e-07, "loss": 1.5422, "step": 480 }, { "epoch": 0.7578125, "grad_norm": 243.07205145310937, "learning_rate": 7.412131279501297e-07, "loss": 1.4785, "step": 485 }, { "epoch": 0.765625, "grad_norm": 45.010451945401705, "learning_rate": 6.967688908891793e-07, "loss": 1.884, "step": 490 }, { "epoch": 0.7734375, "grad_norm": 48.23699940972905, "learning_rate": 6.534820566465464e-07, "loss": 1.7596, "step": 495 }, { "epoch": 0.78125, "grad_norm": 90.74839489958582, "learning_rate": 6.113804088511261e-07, "loss": 1.6985, "step": 500 }, { "epoch": 0.7890625, "grad_norm": 26.31053939307125, "learning_rate": 5.704909704205949e-07, "loss": 1.7385, "step": 505 }, { "epoch": 0.796875, "grad_norm": 58.47989097161682, "learning_rate": 5.308399862167693e-07, "loss": 1.9481, "step": 510 }, { "epoch": 0.8046875, "grad_norm": 796.9462628456254, "learning_rate": 4.924529062003522e-07, "loss": 1.762, "step": 515 }, { "epoch": 0.8125, "grad_norm": 39.897170110050865, "learning_rate": 4.553543690958939e-07, "loss": 1.2992, "step": 520 }, { "epoch": 0.8203125, "grad_norm": 53.31172241007692, "learning_rate": 4.1956818657744065e-07, "loss": 1.8682, "step": 525 }, { "epoch": 0.828125, "grad_norm": 35.918165546768364, "learning_rate": 3.851173279850251e-07, "loss": 1.6464, "step": 530 }, { "epoch": 0.8359375, "grad_norm": 47.70548947871064, "learning_rate": 3.5202390558181145e-07, "loss": 2.157, "step": 535 }, { "epoch": 0.84375, "grad_norm": 81.91605468065885, "learning_rate": 3.2030916036134866e-07, "loss": 1.6421, "step": 540 }, { "epoch": 0.8515625, "grad_norm": 53.06973167494431, "learning_rate": 2.8999344841405377e-07, "loss": 1.6688, "step": 545 }, { "epoch": 0.859375, "grad_norm": 88.49158154638214, "learning_rate": 2.61096227861668e-07, "loss": 1.702, "step": 550 }, { "epoch": 0.8671875, "grad_norm": 120.59801382869998, "learning_rate": 2.3363604636807065e-07, "loss": 1.3829, "step": 555 }, { "epoch": 0.875, "grad_norm": 139.01600571943374, "learning_rate": 2.0763052923447214e-07, "loss": 1.6871, "step": 560 }, { "epoch": 0.8828125, "grad_norm": 45.50934842650282, "learning_rate": 1.830963680866285e-07, "loss": 1.4887, "step": 565 }, { "epoch": 0.890625, "grad_norm": 35.76953954334381, "learning_rate": 1.600493101613268e-07, "loss": 1.6375, "step": 570 }, { "epoch": 0.8984375, "grad_norm": 37.27719885211302, "learning_rate": 1.3850414819903235e-07, "loss": 1.7941, "step": 575 }, { "epoch": 0.90625, "grad_norm": 39.534848671404404, "learning_rate": 1.1847471094917711e-07, "loss": 1.7665, "step": 580 }, { "epoch": 0.9140625, "grad_norm": 50.50325344773253, "learning_rate": 9.997385429418555e-08, "loss": 1.8857, "step": 585 }, { "epoch": 0.921875, "grad_norm": 31.729549388411133, "learning_rate": 8.301345299793374e-08, "loss": 1.5837, "step": 590 }, { "epoch": 0.9296875, "grad_norm": 29.796478369791654, "learning_rate": 6.760439308393763e-08, "loss": 1.963, "step": 595 }, { "epoch": 0.9375, "grad_norm": 37.3072954627531, "learning_rate": 5.3756564848168325e-08, "loss": 1.507, "step": 600 }, { "epoch": 0.9453125, "grad_norm": 32.75001709716989, "learning_rate": 4.147885651096861e-08, "loss": 1.6399, "step": 605 }, { "epoch": 0.953125, "grad_norm": 53.42892228692126, "learning_rate": 3.077914851215585e-08, "loss": 1.7654, "step": 610 }, { "epoch": 0.9609375, "grad_norm": 91.78268048604703, "learning_rate": 2.1664308452965798e-08, "loss": 1.4882, "step": 615 }, { "epoch": 0.96875, "grad_norm": 52.95012166005466, "learning_rate": 1.4140186688086365e-08, "loss": 1.5379, "step": 620 }, { "epoch": 0.9765625, "grad_norm": 114.57655882499603, "learning_rate": 8.211612570611927e-09, "loss": 1.5018, "step": 625 }, { "epoch": 0.984375, "grad_norm": 96.85859485649613, "learning_rate": 3.882391352324766e-09, "loss": 1.4683, "step": 630 }, { "epoch": 0.9921875, "grad_norm": 37.14188481057337, "learning_rate": 1.1553017412971323e-09, "loss": 1.8316, "step": 635 }, { "epoch": 1.0, "grad_norm": 34.77706091063119, "learning_rate": 3.2094118379288885e-11, "loss": 1.6444, "step": 640 }, { "epoch": 1.0, "step": 640, "total_flos": 61131755814912.0, "train_loss": 2.5290436543524266, "train_runtime": 1576.5021, "train_samples_per_second": 3.245, "train_steps_per_second": 0.406 } ], "logging_steps": 5, "max_steps": 640, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 640.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 61131755814912.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }