{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 594, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003367003367003367, "grad_norm": 4.149549460211877, "learning_rate": 1.6666666666666668e-07, "loss": 2.1262, "step": 1 }, { "epoch": 0.016835016835016835, "grad_norm": 4.010411470158707, "learning_rate": 8.333333333333333e-07, "loss": 2.0953, "step": 5 }, { "epoch": 0.03367003367003367, "grad_norm": 2.095167650626825, "learning_rate": 1.6666666666666667e-06, "loss": 1.8934, "step": 10 }, { "epoch": 0.050505050505050504, "grad_norm": 3.1062734406420245, "learning_rate": 2.5e-06, "loss": 1.4715, "step": 15 }, { "epoch": 0.06734006734006734, "grad_norm": 0.5160710231319299, "learning_rate": 3.3333333333333333e-06, "loss": 0.9113, "step": 20 }, { "epoch": 0.08417508417508418, "grad_norm": 0.42528861790572803, "learning_rate": 4.166666666666667e-06, "loss": 0.7253, "step": 25 }, { "epoch": 0.10101010101010101, "grad_norm": 0.27763208880764356, "learning_rate": 5e-06, "loss": 0.664, "step": 30 }, { "epoch": 0.11784511784511785, "grad_norm": 0.22702639326519608, "learning_rate": 5.833333333333334e-06, "loss": 0.5673, "step": 35 }, { "epoch": 0.13468013468013468, "grad_norm": 0.20027531882369276, "learning_rate": 6.666666666666667e-06, "loss": 0.5462, "step": 40 }, { "epoch": 0.15151515151515152, "grad_norm": 0.22331242368395698, "learning_rate": 7.500000000000001e-06, "loss": 0.5208, "step": 45 }, { "epoch": 0.16835016835016836, "grad_norm": 0.2495290013519025, "learning_rate": 8.333333333333334e-06, "loss": 0.5063, "step": 50 }, { "epoch": 0.18518518518518517, "grad_norm": 0.22997291660332314, "learning_rate": 9.166666666666666e-06, "loss": 0.4763, "step": 55 }, { "epoch": 0.20202020202020202, "grad_norm": 0.1854483270058641, "learning_rate": 1e-05, "loss": 0.4797, "step": 60 }, { "epoch": 0.21885521885521886, "grad_norm": 0.19584468158412918, "learning_rate": 9.997836953115927e-06, "loss": 0.4705, "step": 65 }, { "epoch": 0.2356902356902357, "grad_norm": 0.20152991740174855, "learning_rate": 9.991349683972435e-06, "loss": 0.4539, "step": 70 }, { "epoch": 0.25252525252525254, "grad_norm": 0.16770527024092638, "learning_rate": 9.980543805476447e-06, "loss": 0.4611, "step": 75 }, { "epoch": 0.26936026936026936, "grad_norm": 0.1345303934796462, "learning_rate": 9.965428667076687e-06, "loss": 0.4208, "step": 80 }, { "epoch": 0.28619528619528617, "grad_norm": 0.14467263254757398, "learning_rate": 9.946017346674362e-06, "loss": 0.4291, "step": 85 }, { "epoch": 0.30303030303030304, "grad_norm": 0.11675624115717066, "learning_rate": 9.922326639307918e-06, "loss": 0.4118, "step": 90 }, { "epoch": 0.31986531986531985, "grad_norm": 0.12419897003228939, "learning_rate": 9.894377042621654e-06, "loss": 0.4169, "step": 95 }, { "epoch": 0.3367003367003367, "grad_norm": 0.1411171642205521, "learning_rate": 9.86219273913078e-06, "loss": 0.4336, "step": 100 }, { "epoch": 0.35353535353535354, "grad_norm": 0.10846956134773478, "learning_rate": 9.825801575298248e-06, "loss": 0.3873, "step": 105 }, { "epoch": 0.37037037037037035, "grad_norm": 0.12002423610117473, "learning_rate": 9.785235037441473e-06, "loss": 0.3619, "step": 110 }, { "epoch": 0.3872053872053872, "grad_norm": 0.11343783522484456, "learning_rate": 9.74052822448978e-06, "loss": 0.3993, "step": 115 }, { "epoch": 0.40404040404040403, "grad_norm": 0.11192347591145078, "learning_rate": 9.691719817616148e-06, "loss": 0.4268, "step": 120 }, { "epoch": 0.4208754208754209, "grad_norm": 0.1139058653749672, "learning_rate": 9.63885204676954e-06, "loss": 0.3725, "step": 125 }, { "epoch": 0.4377104377104377, "grad_norm": 0.1254583279635675, "learning_rate": 9.581970654136752e-06, "loss": 0.3645, "step": 130 }, { "epoch": 0.45454545454545453, "grad_norm": 0.11137112662931582, "learning_rate": 9.521124854565425e-06, "loss": 0.3796, "step": 135 }, { "epoch": 0.4713804713804714, "grad_norm": 0.12134068097374133, "learning_rate": 9.45636729298243e-06, "loss": 0.4135, "step": 140 }, { "epoch": 0.4882154882154882, "grad_norm": 0.10390787212244738, "learning_rate": 9.387753998844482e-06, "loss": 0.4136, "step": 145 }, { "epoch": 0.5050505050505051, "grad_norm": 0.10067346916725926, "learning_rate": 9.315344337660422e-06, "loss": 0.3814, "step": 150 }, { "epoch": 0.5218855218855218, "grad_norm": 0.12175671218216305, "learning_rate": 9.239200959627048e-06, "loss": 0.4118, "step": 155 }, { "epoch": 0.5387205387205387, "grad_norm": 0.10919894991390798, "learning_rate": 9.159389745423003e-06, "loss": 0.4231, "step": 160 }, { "epoch": 0.5555555555555556, "grad_norm": 0.10398902410046555, "learning_rate": 9.07597974920756e-06, "loss": 0.4062, "step": 165 }, { "epoch": 0.5723905723905723, "grad_norm": 0.100265725065611, "learning_rate": 8.98904313887369e-06, "loss": 0.3677, "step": 170 }, { "epoch": 0.5892255892255892, "grad_norm": 0.099307367621639, "learning_rate": 8.89865513360703e-06, "loss": 0.3758, "step": 175 }, { "epoch": 0.6060606060606061, "grad_norm": 0.09507905089725636, "learning_rate": 8.804893938804839e-06, "loss": 0.3703, "step": 180 }, { "epoch": 0.622895622895623, "grad_norm": 0.11024852756485287, "learning_rate": 8.707840678411223e-06, "loss": 0.4052, "step": 185 }, { "epoch": 0.6397306397306397, "grad_norm": 0.10365136825762392, "learning_rate": 8.607579324727175e-06, "loss": 0.3955, "step": 190 }, { "epoch": 0.6565656565656566, "grad_norm": 0.11091339739436984, "learning_rate": 8.504196625756166e-06, "loss": 0.3884, "step": 195 }, { "epoch": 0.6734006734006734, "grad_norm": 0.10685048356906915, "learning_rate": 8.397782030148147e-06, "loss": 0.3702, "step": 200 }, { "epoch": 0.6902356902356902, "grad_norm": 0.1246306899349896, "learning_rate": 8.288427609806899e-06, "loss": 0.4024, "step": 205 }, { "epoch": 0.7070707070707071, "grad_norm": 0.1066849900460222, "learning_rate": 8.176227980227693e-06, "loss": 0.4052, "step": 210 }, { "epoch": 0.7239057239057239, "grad_norm": 0.11875691065211648, "learning_rate": 8.061280218634192e-06, "loss": 0.3896, "step": 215 }, { "epoch": 0.7407407407407407, "grad_norm": 0.09762435583125909, "learning_rate": 7.943683779985412e-06, "loss": 0.3868, "step": 220 }, { "epoch": 0.7575757575757576, "grad_norm": 0.10054336444833502, "learning_rate": 7.823540410925434e-06, "loss": 0.3674, "step": 225 }, { "epoch": 0.7744107744107744, "grad_norm": 0.10842059524327032, "learning_rate": 7.700954061750295e-06, "loss": 0.3918, "step": 230 }, { "epoch": 0.7912457912457912, "grad_norm": 0.09745155947498828, "learning_rate": 7.576030796468233e-06, "loss": 0.3679, "step": 235 }, { "epoch": 0.8080808080808081, "grad_norm": 0.10372602136364313, "learning_rate": 7.4488787010311425e-06, "loss": 0.386, "step": 240 }, { "epoch": 0.8249158249158249, "grad_norm": 0.10925288274096943, "learning_rate": 7.319607789816555e-06, "loss": 0.3822, "step": 245 }, { "epoch": 0.8417508417508418, "grad_norm": 0.10360616006701923, "learning_rate": 7.188329910441154e-06, "loss": 0.3656, "step": 250 }, { "epoch": 0.8585858585858586, "grad_norm": 0.09128323758171268, "learning_rate": 7.05515864698811e-06, "loss": 0.3801, "step": 255 }, { "epoch": 0.8754208754208754, "grad_norm": 0.09321485064307464, "learning_rate": 6.920209221732007e-06, "loss": 0.3453, "step": 260 }, { "epoch": 0.8922558922558923, "grad_norm": 0.11568566960803582, "learning_rate": 6.783598395446371e-06, "loss": 0.4071, "step": 265 }, { "epoch": 0.9090909090909091, "grad_norm": 0.09087682600845605, "learning_rate": 6.64544436638005e-06, "loss": 0.3685, "step": 270 }, { "epoch": 0.9259259259259259, "grad_norm": 0.08403968322909508, "learning_rate": 6.505866667989884e-06, "loss": 0.3535, "step": 275 }, { "epoch": 0.9427609427609428, "grad_norm": 0.09338895709806748, "learning_rate": 6.364986065518106e-06, "loss": 0.3425, "step": 280 }, { "epoch": 0.9595959595959596, "grad_norm": 0.11009895402833714, "learning_rate": 6.222924451504001e-06, "loss": 0.3903, "step": 285 }, { "epoch": 0.9764309764309764, "grad_norm": 0.0962540505203609, "learning_rate": 6.079804740320181e-06, "loss": 0.3743, "step": 290 }, { "epoch": 0.9932659932659933, "grad_norm": 0.11859376799293515, "learning_rate": 5.935750761824777e-06, "loss": 0.3911, "step": 295 }, { "epoch": 1.0, "eval_loss": 0.3753235638141632, "eval_runtime": 28.7233, "eval_samples_per_second": 20.506, "eval_steps_per_second": 5.153, "step": 297 }, { "epoch": 1.0101010101010102, "grad_norm": 0.09638846774704153, "learning_rate": 5.790887154221521e-06, "loss": 0.3592, "step": 300 }, { "epoch": 1.026936026936027, "grad_norm": 0.11218182778453092, "learning_rate": 5.645339256220427e-06, "loss": 0.3346, "step": 305 }, { "epoch": 1.0437710437710437, "grad_norm": 0.0894851822596739, "learning_rate": 5.499232998592399e-06, "loss": 0.3261, "step": 310 }, { "epoch": 1.0606060606060606, "grad_norm": 0.11093920612376884, "learning_rate": 5.352694795211555e-06, "loss": 0.3812, "step": 315 }, { "epoch": 1.0774410774410774, "grad_norm": 0.10664119581590888, "learning_rate": 5.20585143367959e-06, "loss": 0.3603, "step": 320 }, { "epoch": 1.0942760942760943, "grad_norm": 0.09997813791060199, "learning_rate": 5.058829965626742e-06, "loss": 0.3335, "step": 325 }, { "epoch": 1.1111111111111112, "grad_norm": 0.09409979391826565, "learning_rate": 4.911757596784358e-06, "loss": 0.3611, "step": 330 }, { "epoch": 1.127946127946128, "grad_norm": 0.10458387692519078, "learning_rate": 4.7647615769241e-06, "loss": 0.3335, "step": 335 }, { "epoch": 1.144781144781145, "grad_norm": 0.09111079710967974, "learning_rate": 4.617969089759066e-06, "loss": 0.3428, "step": 340 }, { "epoch": 1.1616161616161615, "grad_norm": 0.08443699111109511, "learning_rate": 4.471507142902036e-06, "loss": 0.3223, "step": 345 }, { "epoch": 1.1784511784511784, "grad_norm": 0.09133102611827909, "learning_rate": 4.325502457976126e-06, "loss": 0.3435, "step": 350 }, { "epoch": 1.1952861952861953, "grad_norm": 0.09066367143881884, "learning_rate": 4.180081360972852e-06, "loss": 0.3651, "step": 355 }, { "epoch": 1.2121212121212122, "grad_norm": 0.10084499213383416, "learning_rate": 4.035369672952516e-06, "loss": 0.3956, "step": 360 }, { "epoch": 1.228956228956229, "grad_norm": 0.09491527150886968, "learning_rate": 3.891492601181462e-06, "loss": 0.3453, "step": 365 }, { "epoch": 1.2457912457912457, "grad_norm": 0.1019704740198795, "learning_rate": 3.7485746308004013e-06, "loss": 0.3479, "step": 370 }, { "epoch": 1.2626262626262625, "grad_norm": 0.0968882851124518, "learning_rate": 3.6067394171175397e-06, "loss": 0.3303, "step": 375 }, { "epoch": 1.2794612794612794, "grad_norm": 0.08550746429570652, "learning_rate": 3.466109678619681e-06, "loss": 0.3269, "step": 380 }, { "epoch": 1.2962962962962963, "grad_norm": 0.10176082406674497, "learning_rate": 3.3268070907938915e-06, "loss": 0.3401, "step": 385 }, { "epoch": 1.3131313131313131, "grad_norm": 0.09276368484156704, "learning_rate": 3.1889521808515888e-06, "loss": 0.3308, "step": 390 }, { "epoch": 1.32996632996633, "grad_norm": 0.10581834870886321, "learning_rate": 3.0526642234461313e-06, "loss": 0.3238, "step": 395 }, { "epoch": 1.3468013468013469, "grad_norm": 0.10428102445335462, "learning_rate": 2.9180611374741623e-06, "loss": 0.3595, "step": 400 }, { "epoch": 1.3636363636363638, "grad_norm": 0.0899593621588282, "learning_rate": 2.785259384049959e-06, "loss": 0.3298, "step": 405 }, { "epoch": 1.3804713804713804, "grad_norm": 0.08797336350474357, "learning_rate": 2.6543738657411033e-06, "loss": 0.354, "step": 410 }, { "epoch": 1.3973063973063973, "grad_norm": 0.09021559950024842, "learning_rate": 2.525517827152614e-06, "loss": 0.3298, "step": 415 }, { "epoch": 1.4141414141414141, "grad_norm": 0.10219971409103493, "learning_rate": 2.3988027569455895e-06, "loss": 0.3468, "step": 420 }, { "epoch": 1.430976430976431, "grad_norm": 0.10290620157032898, "learning_rate": 2.274338291375147e-06, "loss": 0.3577, "step": 425 }, { "epoch": 1.4478114478114479, "grad_norm": 0.08826604210110597, "learning_rate": 2.1522321194310577e-06, "loss": 0.3473, "step": 430 }, { "epoch": 1.4646464646464645, "grad_norm": 0.08903591800358818, "learning_rate": 2.0325898896632178e-06, "loss": 0.3339, "step": 435 }, { "epoch": 1.4814814814814814, "grad_norm": 0.08848973658266114, "learning_rate": 1.915515118772555e-06, "loss": 0.3215, "step": 440 }, { "epoch": 1.4983164983164983, "grad_norm": 0.09450509559743611, "learning_rate": 1.8011091020464138e-06, "loss": 0.3519, "step": 445 }, { "epoch": 1.5151515151515151, "grad_norm": 0.08781947425372004, "learning_rate": 1.689470825715998e-06, "loss": 0.3344, "step": 450 }, { "epoch": 1.531986531986532, "grad_norm": 0.09293777777558654, "learning_rate": 1.580696881311611e-06, "loss": 0.3429, "step": 455 }, { "epoch": 1.5488215488215489, "grad_norm": 0.10333592478818118, "learning_rate": 1.4748813820898554e-06, "loss": 0.3583, "step": 460 }, { "epoch": 1.5656565656565657, "grad_norm": 0.10312555456577105, "learning_rate": 1.3721158816050872e-06, "loss": 0.3245, "step": 465 }, { "epoch": 1.5824915824915826, "grad_norm": 0.09235484309875117, "learning_rate": 1.272489294495548e-06, "loss": 0.3203, "step": 470 }, { "epoch": 1.5993265993265995, "grad_norm": 0.09652201115483773, "learning_rate": 1.1760878195527642e-06, "loss": 0.3346, "step": 475 }, { "epoch": 1.6161616161616161, "grad_norm": 0.10091941077870167, "learning_rate": 1.0829948651407374e-06, "loss": 0.3212, "step": 480 }, { "epoch": 1.632996632996633, "grad_norm": 0.08814459685515268, "learning_rate": 9.932909770294542e-07, "loss": 0.3278, "step": 485 }, { "epoch": 1.6498316498316499, "grad_norm": 0.09440301955370818, "learning_rate": 9.070537687051817e-07, "loss": 0.3194, "step": 490 }, { "epoch": 1.6666666666666665, "grad_norm": 0.08616003726483284, "learning_rate": 8.243578542178227e-07, "loss": 0.3066, "step": 495 }, { "epoch": 1.6835016835016834, "grad_norm": 0.09770744203946605, "learning_rate": 7.452747836234392e-07, "loss": 0.319, "step": 500 }, { "epoch": 1.7003367003367003, "grad_norm": 0.10679236845336801, "learning_rate": 6.698729810778065e-07, "loss": 0.3533, "step": 505 }, { "epoch": 1.7171717171717171, "grad_norm": 0.08984439281172861, "learning_rate": 5.982176856345445e-07, "loss": 0.3408, "step": 510 }, { "epoch": 1.734006734006734, "grad_norm": 0.0992116905666168, "learning_rate": 5.303708947990638e-07, "loss": 0.3397, "step": 515 }, { "epoch": 1.7508417508417509, "grad_norm": 0.08492852347673017, "learning_rate": 4.663913108871726e-07, "loss": 0.3661, "step": 520 }, { "epoch": 1.7676767676767677, "grad_norm": 0.08919997641532293, "learning_rate": 4.0633429023472004e-07, "loss": 0.3507, "step": 525 }, { "epoch": 1.7845117845117846, "grad_norm": 0.08884595301820054, "learning_rate": 3.5025179530225995e-07, "loss": 0.3475, "step": 530 }, { "epoch": 1.8013468013468015, "grad_norm": 0.10375781310984283, "learning_rate": 2.9819234971616154e-07, "loss": 0.3453, "step": 535 }, { "epoch": 1.8181818181818183, "grad_norm": 0.08617062897445908, "learning_rate": 2.5020099628504603e-07, "loss": 0.3225, "step": 540 }, { "epoch": 1.835016835016835, "grad_norm": 0.08459363441220882, "learning_rate": 2.0631925802791608e-07, "loss": 0.343, "step": 545 }, { "epoch": 1.8518518518518519, "grad_norm": 0.08738001505560129, "learning_rate": 1.6658510224765333e-07, "loss": 0.346, "step": 550 }, { "epoch": 1.8686868686868687, "grad_norm": 0.14091077594058707, "learning_rate": 1.3103290768099796e-07, "loss": 0.3525, "step": 555 }, { "epoch": 1.8855218855218854, "grad_norm": 0.10724350369660748, "learning_rate": 9.969343475342285e-08, "loss": 0.3415, "step": 560 }, { "epoch": 1.9023569023569022, "grad_norm": 0.10272663637676055, "learning_rate": 7.259379896463248e-08, "loss": 0.3283, "step": 565 }, { "epoch": 1.9191919191919191, "grad_norm": 0.10531925815392336, "learning_rate": 4.975744742772848e-08, "loss": 0.3437, "step": 570 }, { "epoch": 1.936026936026936, "grad_norm": 0.09578215650108557, "learning_rate": 3.120413858232474e-08, "loss": 0.3387, "step": 575 }, { "epoch": 1.9528619528619529, "grad_norm": 0.09866364512320212, "learning_rate": 1.69499250991767e-08, "loss": 0.327, "step": 580 }, { "epoch": 1.9696969696969697, "grad_norm": 0.09454492383929458, "learning_rate": 7.007139991108136e-09, "loss": 0.3122, "step": 585 }, { "epoch": 1.9865319865319866, "grad_norm": 0.10021678758376606, "learning_rate": 1.3843859422574269e-09, "loss": 0.3852, "step": 590 }, { "epoch": 2.0, "eval_loss": 0.37048205733299255, "eval_runtime": 28.2507, "eval_samples_per_second": 20.849, "eval_steps_per_second": 5.239, "step": 594 }, { "epoch": 2.0, "step": 594, "total_flos": 1.5456743568084828e+18, "train_loss": 0.4185141025970279, "train_runtime": 4834.4452, "train_samples_per_second": 5.896, "train_steps_per_second": 0.123 } ], "logging_steps": 5, "max_steps": 594, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5456743568084828e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }