| { | |
| "best_global_step": 2375, | |
| "best_metric": 0.3474566638469696, | |
| "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_1756729607/checkpoint-2375", | |
| "epoch": 10.0, | |
| "eval_steps": 125, | |
| "global_step": 2490, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.020080321285140562, | |
| "grad_norm": 208.77610778808594, | |
| "learning_rate": 8.032128514056225e-07, | |
| "loss": 11.8262, | |
| "num_input_tokens_seen": 832, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.040160642570281124, | |
| "grad_norm": 188.124267578125, | |
| "learning_rate": 1.8072289156626506e-06, | |
| "loss": 10.3109, | |
| "num_input_tokens_seen": 1760, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.060240963855421686, | |
| "grad_norm": 118.94561767578125, | |
| "learning_rate": 2.811244979919679e-06, | |
| "loss": 8.5244, | |
| "num_input_tokens_seen": 2608, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08032128514056225, | |
| "grad_norm": 93.1993637084961, | |
| "learning_rate": 3.8152610441767074e-06, | |
| "loss": 6.312, | |
| "num_input_tokens_seen": 3536, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.10040160642570281, | |
| "grad_norm": 80.68607330322266, | |
| "learning_rate": 4.819277108433735e-06, | |
| "loss": 4.854, | |
| "num_input_tokens_seen": 4496, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.12048192771084337, | |
| "grad_norm": 53.525691986083984, | |
| "learning_rate": 5.823293172690764e-06, | |
| "loss": 2.9588, | |
| "num_input_tokens_seen": 5424, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.14056224899598393, | |
| "grad_norm": 74.08545684814453, | |
| "learning_rate": 6.827309236947792e-06, | |
| "loss": 1.8179, | |
| "num_input_tokens_seen": 6304, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.1606425702811245, | |
| "grad_norm": 39.353179931640625, | |
| "learning_rate": 7.83132530120482e-06, | |
| "loss": 0.9304, | |
| "num_input_tokens_seen": 7072, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.18072289156626506, | |
| "grad_norm": 34.21826934814453, | |
| "learning_rate": 8.835341365461847e-06, | |
| "loss": 0.7447, | |
| "num_input_tokens_seen": 7856, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.20080321285140562, | |
| "grad_norm": 46.137203216552734, | |
| "learning_rate": 9.839357429718876e-06, | |
| "loss": 0.4663, | |
| "num_input_tokens_seen": 8880, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22088353413654618, | |
| "grad_norm": 21.007869720458984, | |
| "learning_rate": 1.0843373493975904e-05, | |
| "loss": 0.419, | |
| "num_input_tokens_seen": 9680, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.24096385542168675, | |
| "grad_norm": 32.040428161621094, | |
| "learning_rate": 1.1847389558232933e-05, | |
| "loss": 0.3886, | |
| "num_input_tokens_seen": 10576, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.26104417670682734, | |
| "grad_norm": 33.35049819946289, | |
| "learning_rate": 1.285140562248996e-05, | |
| "loss": 0.3719, | |
| "num_input_tokens_seen": 11424, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.28112449799196787, | |
| "grad_norm": 27.942962646484375, | |
| "learning_rate": 1.3855421686746989e-05, | |
| "loss": 0.4068, | |
| "num_input_tokens_seen": 12224, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.30120481927710846, | |
| "grad_norm": 48.38943862915039, | |
| "learning_rate": 1.4859437751004016e-05, | |
| "loss": 0.4348, | |
| "num_input_tokens_seen": 13168, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.321285140562249, | |
| "grad_norm": 54.91352462768555, | |
| "learning_rate": 1.5863453815261046e-05, | |
| "loss": 0.4399, | |
| "num_input_tokens_seen": 14080, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3413654618473896, | |
| "grad_norm": 100.44646453857422, | |
| "learning_rate": 1.6867469879518073e-05, | |
| "loss": 0.873, | |
| "num_input_tokens_seen": 15056, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.3614457831325301, | |
| "grad_norm": 26.445947647094727, | |
| "learning_rate": 1.78714859437751e-05, | |
| "loss": 0.5697, | |
| "num_input_tokens_seen": 15904, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3815261044176707, | |
| "grad_norm": 8.536154747009277, | |
| "learning_rate": 1.8875502008032127e-05, | |
| "loss": 0.318, | |
| "num_input_tokens_seen": 16688, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.40160642570281124, | |
| "grad_norm": 0.5681027173995972, | |
| "learning_rate": 1.9879518072289157e-05, | |
| "loss": 0.0212, | |
| "num_input_tokens_seen": 17552, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.42168674698795183, | |
| "grad_norm": 43.08428955078125, | |
| "learning_rate": 2.0883534136546184e-05, | |
| "loss": 1.5137, | |
| "num_input_tokens_seen": 18400, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.44176706827309237, | |
| "grad_norm": 32.574432373046875, | |
| "learning_rate": 2.1887550200803214e-05, | |
| "loss": 0.9558, | |
| "num_input_tokens_seen": 19456, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.46184738955823296, | |
| "grad_norm": 12.149736404418945, | |
| "learning_rate": 2.289156626506024e-05, | |
| "loss": 0.2867, | |
| "num_input_tokens_seen": 20288, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.4819277108433735, | |
| "grad_norm": 59.057838439941406, | |
| "learning_rate": 2.389558232931727e-05, | |
| "loss": 0.7216, | |
| "num_input_tokens_seen": 21328, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.5020080321285141, | |
| "grad_norm": 38.10276794433594, | |
| "learning_rate": 2.48995983935743e-05, | |
| "loss": 0.5349, | |
| "num_input_tokens_seen": 22304, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5020080321285141, | |
| "eval_loss": 1.2113524675369263, | |
| "eval_runtime": 1.2329, | |
| "eval_samples_per_second": 45.42, | |
| "eval_steps_per_second": 22.71, | |
| "num_input_tokens_seen": 22304, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.5220883534136547, | |
| "grad_norm": 51.98005676269531, | |
| "learning_rate": 2.5903614457831325e-05, | |
| "loss": 1.3225, | |
| "num_input_tokens_seen": 23056, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5421686746987951, | |
| "grad_norm": 16.78879165649414, | |
| "learning_rate": 2.6907630522088356e-05, | |
| "loss": 0.3651, | |
| "num_input_tokens_seen": 23840, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.5622489959839357, | |
| "grad_norm": 7.14707088470459, | |
| "learning_rate": 2.791164658634538e-05, | |
| "loss": 0.5608, | |
| "num_input_tokens_seen": 24832, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5823293172690763, | |
| "grad_norm": 2.345181465148926, | |
| "learning_rate": 2.891566265060241e-05, | |
| "loss": 0.4016, | |
| "num_input_tokens_seen": 25648, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.6024096385542169, | |
| "grad_norm": 13.92953109741211, | |
| "learning_rate": 2.991967871485944e-05, | |
| "loss": 0.4123, | |
| "num_input_tokens_seen": 26496, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6224899598393574, | |
| "grad_norm": 13.532402038574219, | |
| "learning_rate": 3.092369477911647e-05, | |
| "loss": 0.538, | |
| "num_input_tokens_seen": 27392, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.642570281124498, | |
| "grad_norm": 11.975309371948242, | |
| "learning_rate": 3.192771084337349e-05, | |
| "loss": 0.3054, | |
| "num_input_tokens_seen": 28272, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6626506024096386, | |
| "grad_norm": 10.431339263916016, | |
| "learning_rate": 3.2931726907630524e-05, | |
| "loss": 0.4957, | |
| "num_input_tokens_seen": 29184, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.6827309236947792, | |
| "grad_norm": 2.0895514488220215, | |
| "learning_rate": 3.393574297188755e-05, | |
| "loss": 0.3453, | |
| "num_input_tokens_seen": 30128, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.7028112449799196, | |
| "grad_norm": 3.5245909690856934, | |
| "learning_rate": 3.4939759036144585e-05, | |
| "loss": 0.4688, | |
| "num_input_tokens_seen": 30976, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.7228915662650602, | |
| "grad_norm": 5.512556552886963, | |
| "learning_rate": 3.5943775100401605e-05, | |
| "loss": 0.4289, | |
| "num_input_tokens_seen": 31776, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7429718875502008, | |
| "grad_norm": 3.1914124488830566, | |
| "learning_rate": 3.694779116465863e-05, | |
| "loss": 0.3237, | |
| "num_input_tokens_seen": 32608, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.7630522088353414, | |
| "grad_norm": 12.990925788879395, | |
| "learning_rate": 3.7951807228915666e-05, | |
| "loss": 0.5221, | |
| "num_input_tokens_seen": 33360, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7831325301204819, | |
| "grad_norm": 0.20752322673797607, | |
| "learning_rate": 3.895582329317269e-05, | |
| "loss": 0.2575, | |
| "num_input_tokens_seen": 34176, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.8032128514056225, | |
| "grad_norm": 3.7412514686584473, | |
| "learning_rate": 3.995983935742972e-05, | |
| "loss": 1.23, | |
| "num_input_tokens_seen": 34992, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8232931726907631, | |
| "grad_norm": 2.3068435192108154, | |
| "learning_rate": 4.0963855421686746e-05, | |
| "loss": 0.4066, | |
| "num_input_tokens_seen": 35888, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.8433734939759037, | |
| "grad_norm": 2.8328449726104736, | |
| "learning_rate": 4.196787148594378e-05, | |
| "loss": 0.3846, | |
| "num_input_tokens_seen": 36848, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8634538152610441, | |
| "grad_norm": 2.105517625808716, | |
| "learning_rate": 4.297188755020081e-05, | |
| "loss": 0.3303, | |
| "num_input_tokens_seen": 37888, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.8835341365461847, | |
| "grad_norm": 2.6428239345550537, | |
| "learning_rate": 4.3975903614457834e-05, | |
| "loss": 0.4356, | |
| "num_input_tokens_seen": 38768, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.9036144578313253, | |
| "grad_norm": 6.220267295837402, | |
| "learning_rate": 4.497991967871486e-05, | |
| "loss": 0.4328, | |
| "num_input_tokens_seen": 39488, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.9236947791164659, | |
| "grad_norm": 4.502399444580078, | |
| "learning_rate": 4.598393574297189e-05, | |
| "loss": 0.6742, | |
| "num_input_tokens_seen": 40336, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9437751004016064, | |
| "grad_norm": 4.788579940795898, | |
| "learning_rate": 4.698795180722892e-05, | |
| "loss": 0.3987, | |
| "num_input_tokens_seen": 41328, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.963855421686747, | |
| "grad_norm": 3.492311954498291, | |
| "learning_rate": 4.799196787148594e-05, | |
| "loss": 0.3388, | |
| "num_input_tokens_seen": 42176, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.9839357429718876, | |
| "grad_norm": 2.018967628479004, | |
| "learning_rate": 4.8995983935742975e-05, | |
| "loss": 0.8551, | |
| "num_input_tokens_seen": 43312, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.0040160642570282, | |
| "grad_norm": 1.773940086364746, | |
| "learning_rate": 5e-05, | |
| "loss": 0.4306, | |
| "num_input_tokens_seen": 44064, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0040160642570282, | |
| "eval_loss": 0.4651975631713867, | |
| "eval_runtime": 1.2232, | |
| "eval_samples_per_second": 45.781, | |
| "eval_steps_per_second": 22.891, | |
| "num_input_tokens_seen": 44064, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.0240963855421688, | |
| "grad_norm": 13.694576263427734, | |
| "learning_rate": 4.9999385864396127e-05, | |
| "loss": 0.7501, | |
| "num_input_tokens_seen": 44816, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.0441767068273093, | |
| "grad_norm": 7.658875942230225, | |
| "learning_rate": 4.99975434877575e-05, | |
| "loss": 1.1828, | |
| "num_input_tokens_seen": 45776, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.0642570281124497, | |
| "grad_norm": 1.0638893842697144, | |
| "learning_rate": 4.999447296060165e-05, | |
| "loss": 0.7021, | |
| "num_input_tokens_seen": 46592, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.0843373493975903, | |
| "grad_norm": 4.289158344268799, | |
| "learning_rate": 4.999017443378618e-05, | |
| "loss": 0.37, | |
| "num_input_tokens_seen": 47536, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.104417670682731, | |
| "grad_norm": 3.189358711242676, | |
| "learning_rate": 4.998464811850137e-05, | |
| "loss": 0.3415, | |
| "num_input_tokens_seen": 48320, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.1244979919678715, | |
| "grad_norm": 2.700920581817627, | |
| "learning_rate": 4.997789428625975e-05, | |
| "loss": 0.381, | |
| "num_input_tokens_seen": 49216, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.144578313253012, | |
| "grad_norm": 2.0674901008605957, | |
| "learning_rate": 4.996991326888286e-05, | |
| "loss": 0.3487, | |
| "num_input_tokens_seen": 50048, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.1646586345381527, | |
| "grad_norm": 2.0477442741394043, | |
| "learning_rate": 4.996070545848484e-05, | |
| "loss": 0.346, | |
| "num_input_tokens_seen": 50832, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.1847389558232932, | |
| "grad_norm": 2.814277410507202, | |
| "learning_rate": 4.995027130745321e-05, | |
| "loss": 0.3439, | |
| "num_input_tokens_seen": 51824, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.2048192771084336, | |
| "grad_norm": 1.5188792943954468, | |
| "learning_rate": 4.9938611328426685e-05, | |
| "loss": 0.5375, | |
| "num_input_tokens_seen": 52608, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.2248995983935742, | |
| "grad_norm": 0.27745991945266724, | |
| "learning_rate": 4.992572609426992e-05, | |
| "loss": 0.3537, | |
| "num_input_tokens_seen": 53440, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.2449799196787148, | |
| "grad_norm": 0.40650656819343567, | |
| "learning_rate": 4.99116162380454e-05, | |
| "loss": 0.3549, | |
| "num_input_tokens_seen": 54320, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.2650602409638554, | |
| "grad_norm": 1.8287845849990845, | |
| "learning_rate": 4.989628245298233e-05, | |
| "loss": 0.3352, | |
| "num_input_tokens_seen": 55072, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.285140562248996, | |
| "grad_norm": 0.5225471258163452, | |
| "learning_rate": 4.987972549244257e-05, | |
| "loss": 0.3695, | |
| "num_input_tokens_seen": 56224, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.3052208835341366, | |
| "grad_norm": 1.361476182937622, | |
| "learning_rate": 4.986194616988364e-05, | |
| "loss": 0.281, | |
| "num_input_tokens_seen": 56912, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.3253012048192772, | |
| "grad_norm": 1.0196197032928467, | |
| "learning_rate": 4.984294535881875e-05, | |
| "loss": 0.488, | |
| "num_input_tokens_seen": 57648, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.3453815261044177, | |
| "grad_norm": 0.7983678579330444, | |
| "learning_rate": 4.982272399277386e-05, | |
| "loss": 0.3598, | |
| "num_input_tokens_seen": 58608, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.3654618473895583, | |
| "grad_norm": 0.21441864967346191, | |
| "learning_rate": 4.980128306524183e-05, | |
| "loss": 0.3973, | |
| "num_input_tokens_seen": 59424, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.3855421686746987, | |
| "grad_norm": 1.7802695035934448, | |
| "learning_rate": 4.9778623629633635e-05, | |
| "loss": 0.3078, | |
| "num_input_tokens_seen": 60272, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.4056224899598393, | |
| "grad_norm": 2.1653060913085938, | |
| "learning_rate": 4.975474679922655e-05, | |
| "loss": 0.4871, | |
| "num_input_tokens_seen": 61056, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.4257028112449799, | |
| "grad_norm": 1.056137204170227, | |
| "learning_rate": 4.972965374710952e-05, | |
| "loss": 0.283, | |
| "num_input_tokens_seen": 61968, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.4457831325301205, | |
| "grad_norm": 0.806865394115448, | |
| "learning_rate": 4.9703345706125485e-05, | |
| "loss": 0.3467, | |
| "num_input_tokens_seen": 62800, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.465863453815261, | |
| "grad_norm": 0.4291660189628601, | |
| "learning_rate": 4.96758239688108e-05, | |
| "loss": 0.4493, | |
| "num_input_tokens_seen": 63824, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 1.4859437751004017, | |
| "grad_norm": 0.9207323789596558, | |
| "learning_rate": 4.964708988733178e-05, | |
| "loss": 0.3217, | |
| "num_input_tokens_seen": 64800, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.5060240963855422, | |
| "grad_norm": 0.7873408198356628, | |
| "learning_rate": 4.961714487341822e-05, | |
| "loss": 0.3766, | |
| "num_input_tokens_seen": 65808, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.5060240963855422, | |
| "eval_loss": 0.38677313923835754, | |
| "eval_runtime": 1.2517, | |
| "eval_samples_per_second": 44.74, | |
| "eval_steps_per_second": 22.37, | |
| "num_input_tokens_seen": 65808, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 1.5261044176706826, | |
| "grad_norm": 0.23284302651882172, | |
| "learning_rate": 4.9585990398294043e-05, | |
| "loss": 0.4091, | |
| "num_input_tokens_seen": 66752, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.5461847389558234, | |
| "grad_norm": 0.18355536460876465, | |
| "learning_rate": 4.9553627992605066e-05, | |
| "loss": 0.3531, | |
| "num_input_tokens_seen": 67632, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 1.5662650602409638, | |
| "grad_norm": 0.12290728837251663, | |
| "learning_rate": 4.952005924634372e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 68400, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.5863453815261044, | |
| "grad_norm": 0.8354266881942749, | |
| "learning_rate": 4.948528580877099e-05, | |
| "loss": 0.3255, | |
| "num_input_tokens_seen": 69408, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 1.606425702811245, | |
| "grad_norm": 0.3600679337978363, | |
| "learning_rate": 4.944930938833535e-05, | |
| "loss": 0.3689, | |
| "num_input_tokens_seen": 70352, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.6265060240963856, | |
| "grad_norm": 0.2819594442844391, | |
| "learning_rate": 4.9412131752588874e-05, | |
| "loss": 0.374, | |
| "num_input_tokens_seen": 71184, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 1.6465863453815262, | |
| "grad_norm": 0.9768486618995667, | |
| "learning_rate": 4.937375472810033e-05, | |
| "loss": 0.3785, | |
| "num_input_tokens_seen": 72272, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.6666666666666665, | |
| "grad_norm": 0.11346471309661865, | |
| "learning_rate": 4.9334180200365486e-05, | |
| "loss": 0.3645, | |
| "num_input_tokens_seen": 73136, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 1.6867469879518073, | |
| "grad_norm": 0.15687525272369385, | |
| "learning_rate": 4.929341011371448e-05, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 73872, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.7068273092369477, | |
| "grad_norm": 0.31356531381607056, | |
| "learning_rate": 4.9251446471216226e-05, | |
| "loss": 0.3495, | |
| "num_input_tokens_seen": 74784, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 1.7269076305220885, | |
| "grad_norm": 0.2755882441997528, | |
| "learning_rate": 4.9208291334580104e-05, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 75664, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.7469879518072289, | |
| "grad_norm": 1.0474437475204468, | |
| "learning_rate": 4.9163946824054574e-05, | |
| "loss": 0.4005, | |
| "num_input_tokens_seen": 76592, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 1.7670682730923695, | |
| "grad_norm": 0.9340880513191223, | |
| "learning_rate": 4.911841511832305e-05, | |
| "loss": 0.3454, | |
| "num_input_tokens_seen": 77408, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.78714859437751, | |
| "grad_norm": 0.7397641539573669, | |
| "learning_rate": 4.907169845439688e-05, | |
| "loss": 0.3494, | |
| "num_input_tokens_seen": 78272, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 1.8072289156626506, | |
| "grad_norm": 0.32739967107772827, | |
| "learning_rate": 4.902379912750537e-05, | |
| "loss": 0.3211, | |
| "num_input_tokens_seen": 79200, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.8273092369477912, | |
| "grad_norm": 0.6281508207321167, | |
| "learning_rate": 4.897471949098309e-05, | |
| "loss": 0.3843, | |
| "num_input_tokens_seen": 80112, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 1.8473895582329316, | |
| "grad_norm": 0.6183443069458008, | |
| "learning_rate": 4.892446195615423e-05, | |
| "loss": 0.3143, | |
| "num_input_tokens_seen": 81168, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.8674698795180724, | |
| "grad_norm": 0.6530241966247559, | |
| "learning_rate": 4.88730289922141e-05, | |
| "loss": 0.3845, | |
| "num_input_tokens_seen": 82112, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 1.8875502008032128, | |
| "grad_norm": 0.2493213266134262, | |
| "learning_rate": 4.8820423126107845e-05, | |
| "loss": 0.3467, | |
| "num_input_tokens_seen": 83072, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.9076305220883534, | |
| "grad_norm": 0.7678804993629456, | |
| "learning_rate": 4.87666469424063e-05, | |
| "loss": 0.3683, | |
| "num_input_tokens_seen": 83920, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 1.927710843373494, | |
| "grad_norm": 0.8148710131645203, | |
| "learning_rate": 4.8711703083178986e-05, | |
| "loss": 0.3512, | |
| "num_input_tokens_seen": 84768, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.9477911646586346, | |
| "grad_norm": 0.8058724999427795, | |
| "learning_rate": 4.865559424786432e-05, | |
| "loss": 0.3478, | |
| "num_input_tokens_seen": 85616, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 1.9678714859437751, | |
| "grad_norm": 0.1543155312538147, | |
| "learning_rate": 4.859832319313697e-05, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 86400, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.9879518072289155, | |
| "grad_norm": 1.0387096405029297, | |
| "learning_rate": 4.8539892732772455e-05, | |
| "loss": 0.3753, | |
| "num_input_tokens_seen": 87216, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 2.0080321285140563, | |
| "grad_norm": 0.2593827545642853, | |
| "learning_rate": 4.848030573750885e-05, | |
| "loss": 0.3159, | |
| "num_input_tokens_seen": 88048, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.0080321285140563, | |
| "eval_loss": 0.3829512596130371, | |
| "eval_runtime": 1.2272, | |
| "eval_samples_per_second": 45.632, | |
| "eval_steps_per_second": 22.816, | |
| "num_input_tokens_seen": 88048, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.0281124497991967, | |
| "grad_norm": 0.6328459978103638, | |
| "learning_rate": 4.841956513490577e-05, | |
| "loss": 0.3501, | |
| "num_input_tokens_seen": 88896, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 2.0481927710843375, | |
| "grad_norm": 0.17319746315479279, | |
| "learning_rate": 4.8357673909200563e-05, | |
| "loss": 0.3452, | |
| "num_input_tokens_seen": 89744, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.068273092369478, | |
| "grad_norm": 0.13001570105552673, | |
| "learning_rate": 4.8294635101161645e-05, | |
| "loss": 0.3738, | |
| "num_input_tokens_seen": 90528, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 2.0883534136546187, | |
| "grad_norm": 0.1256828010082245, | |
| "learning_rate": 4.8230451807939135e-05, | |
| "loss": 0.3347, | |
| "num_input_tokens_seen": 91360, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.108433734939759, | |
| "grad_norm": 0.7363554835319519, | |
| "learning_rate": 4.816512718291267e-05, | |
| "loss": 0.346, | |
| "num_input_tokens_seen": 92176, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 2.1285140562248994, | |
| "grad_norm": 0.19434113800525665, | |
| "learning_rate": 4.80986644355365e-05, | |
| "loss": 0.341, | |
| "num_input_tokens_seen": 93104, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.1485943775100402, | |
| "grad_norm": 0.1456967294216156, | |
| "learning_rate": 4.803106683118177e-05, | |
| "loss": 0.3588, | |
| "num_input_tokens_seen": 93984, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 2.1686746987951806, | |
| "grad_norm": 0.9451452493667603, | |
| "learning_rate": 4.796233769097615e-05, | |
| "loss": 0.3438, | |
| "num_input_tokens_seen": 94896, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.1887550200803214, | |
| "grad_norm": 0.1497451663017273, | |
| "learning_rate": 4.789248039164058e-05, | |
| "loss": 0.375, | |
| "num_input_tokens_seen": 95824, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 2.208835341365462, | |
| "grad_norm": 0.8181028366088867, | |
| "learning_rate": 4.782149836532345e-05, | |
| "loss": 0.3607, | |
| "num_input_tokens_seen": 96688, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.2289156626506026, | |
| "grad_norm": 0.6427181959152222, | |
| "learning_rate": 4.7749395099431924e-05, | |
| "loss": 0.3312, | |
| "num_input_tokens_seen": 97488, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 2.248995983935743, | |
| "grad_norm": 1.3496628999710083, | |
| "learning_rate": 4.7676174136460625e-05, | |
| "loss": 0.4083, | |
| "num_input_tokens_seen": 98288, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.2690763052208833, | |
| "grad_norm": 0.2799893915653229, | |
| "learning_rate": 4.760183907381757e-05, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 99200, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 2.289156626506024, | |
| "grad_norm": 0.22829043865203857, | |
| "learning_rate": 4.752639356364744e-05, | |
| "loss": 0.3228, | |
| "num_input_tokens_seen": 99984, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.3092369477911645, | |
| "grad_norm": 0.17503726482391357, | |
| "learning_rate": 4.7449841312652166e-05, | |
| "loss": 0.3781, | |
| "num_input_tokens_seen": 100784, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 2.3293172690763053, | |
| "grad_norm": 0.8941397666931152, | |
| "learning_rate": 4.737218608190878e-05, | |
| "loss": 0.367, | |
| "num_input_tokens_seen": 101584, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.3493975903614457, | |
| "grad_norm": 0.7553220987319946, | |
| "learning_rate": 4.729343168668463e-05, | |
| "loss": 0.3603, | |
| "num_input_tokens_seen": 102480, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 2.3694779116465865, | |
| "grad_norm": 0.2362431138753891, | |
| "learning_rate": 4.721358199624997e-05, | |
| "loss": 0.3631, | |
| "num_input_tokens_seen": 103408, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.389558232931727, | |
| "grad_norm": 0.2296190857887268, | |
| "learning_rate": 4.713264093368783e-05, | |
| "loss": 0.3911, | |
| "num_input_tokens_seen": 104160, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 2.4096385542168672, | |
| "grad_norm": 0.16303616762161255, | |
| "learning_rate": 4.705061247570128e-05, | |
| "loss": 0.3406, | |
| "num_input_tokens_seen": 105040, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.429718875502008, | |
| "grad_norm": 0.15550056099891663, | |
| "learning_rate": 4.6967500652418034e-05, | |
| "loss": 0.3582, | |
| "num_input_tokens_seen": 105856, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 2.4497991967871484, | |
| "grad_norm": 0.9154328107833862, | |
| "learning_rate": 4.6883309547192476e-05, | |
| "loss": 0.3701, | |
| "num_input_tokens_seen": 106928, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.4698795180722892, | |
| "grad_norm": 0.5483267903327942, | |
| "learning_rate": 4.679804329640505e-05, | |
| "loss": 0.3423, | |
| "num_input_tokens_seen": 107808, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 2.4899598393574296, | |
| "grad_norm": 0.5274845957756042, | |
| "learning_rate": 4.6711706089258955e-05, | |
| "loss": 0.3104, | |
| "num_input_tokens_seen": 108656, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.5100401606425704, | |
| "grad_norm": 0.23020295798778534, | |
| "learning_rate": 4.6624302167574436e-05, | |
| "loss": 0.3958, | |
| "num_input_tokens_seen": 109696, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.5100401606425704, | |
| "eval_loss": 0.3675794303417206, | |
| "eval_runtime": 1.213, | |
| "eval_samples_per_second": 46.168, | |
| "eval_steps_per_second": 23.084, | |
| "num_input_tokens_seen": 109696, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 2.5301204819277108, | |
| "grad_norm": 0.13925790786743164, | |
| "learning_rate": 4.653583582558031e-05, | |
| "loss": 0.3587, | |
| "num_input_tokens_seen": 110576, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.550200803212851, | |
| "grad_norm": 0.9564950466156006, | |
| "learning_rate": 4.6446311409703006e-05, | |
| "loss": 0.365, | |
| "num_input_tokens_seen": 111440, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 2.570281124497992, | |
| "grad_norm": 0.2103191763162613, | |
| "learning_rate": 4.635573331835302e-05, | |
| "loss": 0.3339, | |
| "num_input_tokens_seen": 112192, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.5903614457831328, | |
| "grad_norm": 0.230157271027565, | |
| "learning_rate": 4.6264106001708824e-05, | |
| "loss": 0.3631, | |
| "num_input_tokens_seen": 113024, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 2.610441767068273, | |
| "grad_norm": 0.19389480352401733, | |
| "learning_rate": 4.61714339614982e-05, | |
| "loss": 0.3795, | |
| "num_input_tokens_seen": 113952, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.6305220883534135, | |
| "grad_norm": 0.14889311790466309, | |
| "learning_rate": 4.607772175077711e-05, | |
| "loss": 0.3586, | |
| "num_input_tokens_seen": 114928, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 2.6506024096385543, | |
| "grad_norm": 0.10515565425157547, | |
| "learning_rate": 4.598297397370596e-05, | |
| "loss": 0.3726, | |
| "num_input_tokens_seen": 115728, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.6706827309236947, | |
| "grad_norm": 0.6394887566566467, | |
| "learning_rate": 4.588719528532342e-05, | |
| "loss": 0.3549, | |
| "num_input_tokens_seen": 116544, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 2.6907630522088355, | |
| "grad_norm": 0.14106032252311707, | |
| "learning_rate": 4.5790390391317675e-05, | |
| "loss": 0.3379, | |
| "num_input_tokens_seen": 117568, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.710843373493976, | |
| "grad_norm": 0.08746648579835892, | |
| "learning_rate": 4.5692564047795316e-05, | |
| "loss": 0.3688, | |
| "num_input_tokens_seen": 118368, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 2.7309236947791167, | |
| "grad_norm": 0.5902343392372131, | |
| "learning_rate": 4.5593721061047576e-05, | |
| "loss": 0.3455, | |
| "num_input_tokens_seen": 119120, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.751004016064257, | |
| "grad_norm": 0.57841557264328, | |
| "learning_rate": 4.549386628731425e-05, | |
| "loss": 0.3575, | |
| "num_input_tokens_seen": 120064, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 2.7710843373493974, | |
| "grad_norm": 0.10715785622596741, | |
| "learning_rate": 4.5393004632545064e-05, | |
| "loss": 0.3721, | |
| "num_input_tokens_seen": 120960, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.791164658634538, | |
| "grad_norm": 0.09298089146614075, | |
| "learning_rate": 4.529114105215869e-05, | |
| "loss": 0.3545, | |
| "num_input_tokens_seen": 121760, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 2.8112449799196786, | |
| "grad_norm": 0.659833550453186, | |
| "learning_rate": 4.518828055079925e-05, | |
| "loss": 0.3675, | |
| "num_input_tokens_seen": 122720, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.8313253012048194, | |
| "grad_norm": 0.600629985332489, | |
| "learning_rate": 4.508442818209042e-05, | |
| "loss": 0.3543, | |
| "num_input_tokens_seen": 123712, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 2.8514056224899598, | |
| "grad_norm": 0.1273653358221054, | |
| "learning_rate": 4.4979589048387186e-05, | |
| "loss": 0.3561, | |
| "num_input_tokens_seen": 124624, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.8714859437751006, | |
| "grad_norm": 0.5190867185592651, | |
| "learning_rate": 4.487376830052511e-05, | |
| "loss": 0.3474, | |
| "num_input_tokens_seen": 125696, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 2.891566265060241, | |
| "grad_norm": 0.5262147784233093, | |
| "learning_rate": 4.476697113756731e-05, | |
| "loss": 0.2977, | |
| "num_input_tokens_seen": 126480, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.9116465863453813, | |
| "grad_norm": 0.9929115772247314, | |
| "learning_rate": 4.465920280654901e-05, | |
| "loss": 0.3658, | |
| "num_input_tokens_seen": 127312, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 2.931726907630522, | |
| "grad_norm": 0.26060280203819275, | |
| "learning_rate": 4.4550468602219716e-05, | |
| "loss": 0.3475, | |
| "num_input_tokens_seen": 128352, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.9518072289156625, | |
| "grad_norm": 0.2218063920736313, | |
| "learning_rate": 4.4440773866783136e-05, | |
| "loss": 0.4262, | |
| "num_input_tokens_seen": 129232, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 2.9718875502008033, | |
| "grad_norm": 0.6310736536979675, | |
| "learning_rate": 4.433012398963468e-05, | |
| "loss": 0.4037, | |
| "num_input_tokens_seen": 130080, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.9919678714859437, | |
| "grad_norm": 0.5500450134277344, | |
| "learning_rate": 4.421852440709666e-05, | |
| "loss": 0.3459, | |
| "num_input_tokens_seen": 130880, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 3.0120481927710845, | |
| "grad_norm": 0.5436108112335205, | |
| "learning_rate": 4.4105980602151256e-05, | |
| "loss": 0.3521, | |
| "num_input_tokens_seen": 131872, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.0120481927710845, | |
| "eval_loss": 0.35073402523994446, | |
| "eval_runtime": 1.2195, | |
| "eval_samples_per_second": 45.921, | |
| "eval_steps_per_second": 22.961, | |
| "num_input_tokens_seen": 131872, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.032128514056225, | |
| "grad_norm": 0.5504011511802673, | |
| "learning_rate": 4.399249810417108e-05, | |
| "loss": 0.354, | |
| "num_input_tokens_seen": 132656, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 3.0522088353413657, | |
| "grad_norm": 0.09489892423152924, | |
| "learning_rate": 4.387808248864751e-05, | |
| "loss": 0.3708, | |
| "num_input_tokens_seen": 133472, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 3.072289156626506, | |
| "grad_norm": 0.1529596596956253, | |
| "learning_rate": 4.376273937691681e-05, | |
| "loss": 0.3463, | |
| "num_input_tokens_seen": 134416, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 3.0923694779116464, | |
| "grad_norm": 0.08475496619939804, | |
| "learning_rate": 4.364647443588389e-05, | |
| "loss": 0.3485, | |
| "num_input_tokens_seen": 135344, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 3.112449799196787, | |
| "grad_norm": 0.12930436432361603, | |
| "learning_rate": 4.352929337774395e-05, | |
| "loss": 0.3382, | |
| "num_input_tokens_seen": 136240, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 3.1325301204819276, | |
| "grad_norm": 1.1815729141235352, | |
| "learning_rate": 4.341120195970178e-05, | |
| "loss": 0.3559, | |
| "num_input_tokens_seen": 137120, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 3.1526104417670684, | |
| "grad_norm": 0.3863106071949005, | |
| "learning_rate": 4.3292205983688905e-05, | |
| "loss": 0.36, | |
| "num_input_tokens_seen": 138112, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 3.1726907630522088, | |
| "grad_norm": 0.4967614412307739, | |
| "learning_rate": 4.3172311296078595e-05, | |
| "loss": 0.3472, | |
| "num_input_tokens_seen": 138960, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 3.1927710843373496, | |
| "grad_norm": 0.13387857377529144, | |
| "learning_rate": 4.305152378739855e-05, | |
| "loss": 0.3646, | |
| "num_input_tokens_seen": 140016, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 3.21285140562249, | |
| "grad_norm": 0.11857640743255615, | |
| "learning_rate": 4.292984939204155e-05, | |
| "loss": 0.3357, | |
| "num_input_tokens_seen": 140768, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 3.2329317269076308, | |
| "grad_norm": 0.12806545197963715, | |
| "learning_rate": 4.2807294087973834e-05, | |
| "loss": 0.3444, | |
| "num_input_tokens_seen": 141664, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 3.253012048192771, | |
| "grad_norm": 0.11283908039331436, | |
| "learning_rate": 4.2683863896441475e-05, | |
| "loss": 0.3541, | |
| "num_input_tokens_seen": 142448, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 3.2730923694779115, | |
| "grad_norm": 0.06589579582214355, | |
| "learning_rate": 4.255956488167449e-05, | |
| "loss": 0.3619, | |
| "num_input_tokens_seen": 143408, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 3.2931726907630523, | |
| "grad_norm": 0.6529943346977234, | |
| "learning_rate": 4.2434403150588895e-05, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 144256, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 3.3132530120481927, | |
| "grad_norm": 0.618222177028656, | |
| "learning_rate": 4.230838485248674e-05, | |
| "loss": 0.3504, | |
| "num_input_tokens_seen": 145120, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.566098690032959, | |
| "learning_rate": 4.21815161787539e-05, | |
| "loss": 0.3445, | |
| "num_input_tokens_seen": 146080, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 3.353413654618474, | |
| "grad_norm": 0.13646817207336426, | |
| "learning_rate": 4.205380336255594e-05, | |
| "loss": 0.3418, | |
| "num_input_tokens_seen": 146912, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 3.3734939759036147, | |
| "grad_norm": 0.21328403055667877, | |
| "learning_rate": 4.192525267853188e-05, | |
| "loss": 0.2934, | |
| "num_input_tokens_seen": 147776, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 3.393574297188755, | |
| "grad_norm": 0.3965797424316406, | |
| "learning_rate": 4.179587044248585e-05, | |
| "loss": 0.2829, | |
| "num_input_tokens_seen": 148768, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 3.4136546184738954, | |
| "grad_norm": 0.24930249154567719, | |
| "learning_rate": 4.166566301107687e-05, | |
| "loss": 0.5387, | |
| "num_input_tokens_seen": 149728, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 3.433734939759036, | |
| "grad_norm": 0.7255300879478455, | |
| "learning_rate": 4.153463678150651e-05, | |
| "loss": 0.3639, | |
| "num_input_tokens_seen": 150784, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 3.4538152610441766, | |
| "grad_norm": 0.11454490572214127, | |
| "learning_rate": 4.140279819120457e-05, | |
| "loss": 0.3721, | |
| "num_input_tokens_seen": 151728, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 3.4738955823293174, | |
| "grad_norm": 0.7094687819480896, | |
| "learning_rate": 4.127015371751284e-05, | |
| "loss": 0.3656, | |
| "num_input_tokens_seen": 152640, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 3.4939759036144578, | |
| "grad_norm": 0.5258346796035767, | |
| "learning_rate": 4.1136709877366844e-05, | |
| "loss": 0.3193, | |
| "num_input_tokens_seen": 153424, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 3.5140562248995986, | |
| "grad_norm": 0.5166998505592346, | |
| "learning_rate": 4.100247322697562e-05, | |
| "loss": 0.3677, | |
| "num_input_tokens_seen": 154416, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 3.5140562248995986, | |
| "eval_loss": 0.3535325825214386, | |
| "eval_runtime": 1.2211, | |
| "eval_samples_per_second": 45.861, | |
| "eval_steps_per_second": 22.931, | |
| "num_input_tokens_seen": 154416, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 3.534136546184739, | |
| "grad_norm": 0.49516451358795166, | |
| "learning_rate": 4.08674503614997e-05, | |
| "loss": 0.3907, | |
| "num_input_tokens_seen": 155184, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 3.5542168674698793, | |
| "grad_norm": 0.0980529636144638, | |
| "learning_rate": 4.0731647914727004e-05, | |
| "loss": 0.3941, | |
| "num_input_tokens_seen": 156000, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 3.57429718875502, | |
| "grad_norm": 0.5644952058792114, | |
| "learning_rate": 4.059507255874694e-05, | |
| "loss": 0.345, | |
| "num_input_tokens_seen": 156976, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 3.5943775100401605, | |
| "grad_norm": 0.5101115703582764, | |
| "learning_rate": 4.0457731003622606e-05, | |
| "loss": 0.3331, | |
| "num_input_tokens_seen": 157904, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 3.6144578313253013, | |
| "grad_norm": 0.4910569190979004, | |
| "learning_rate": 4.0319629997061116e-05, | |
| "loss": 0.3339, | |
| "num_input_tokens_seen": 158864, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 3.6345381526104417, | |
| "grad_norm": 0.48415863513946533, | |
| "learning_rate": 4.018077632408207e-05, | |
| "loss": 0.2827, | |
| "num_input_tokens_seen": 159744, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 3.6546184738955825, | |
| "grad_norm": 0.4711949825286865, | |
| "learning_rate": 4.004117680668422e-05, | |
| "loss": 0.3838, | |
| "num_input_tokens_seen": 160608, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 3.674698795180723, | |
| "grad_norm": 0.935171902179718, | |
| "learning_rate": 3.990083830351027e-05, | |
| "loss": 0.3816, | |
| "num_input_tokens_seen": 161488, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 3.694779116465863, | |
| "grad_norm": 0.48552215099334717, | |
| "learning_rate": 3.975976770950994e-05, | |
| "loss": 0.4066, | |
| "num_input_tokens_seen": 162224, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 3.714859437751004, | |
| "grad_norm": 0.5080327391624451, | |
| "learning_rate": 3.961797195560118e-05, | |
| "loss": 0.3183, | |
| "num_input_tokens_seen": 163056, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 3.734939759036145, | |
| "grad_norm": 0.606795072555542, | |
| "learning_rate": 3.947545800832967e-05, | |
| "loss": 0.3641, | |
| "num_input_tokens_seen": 163856, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 3.755020080321285, | |
| "grad_norm": 0.5324833989143372, | |
| "learning_rate": 3.9332232869526534e-05, | |
| "loss": 0.3394, | |
| "num_input_tokens_seen": 164768, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 3.7751004016064256, | |
| "grad_norm": 0.10697121173143387, | |
| "learning_rate": 3.918830357596434e-05, | |
| "loss": 0.3368, | |
| "num_input_tokens_seen": 165600, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 3.7951807228915664, | |
| "grad_norm": 0.13268576562404633, | |
| "learning_rate": 3.9043677199011364e-05, | |
| "loss": 0.3511, | |
| "num_input_tokens_seen": 166400, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 3.8152610441767068, | |
| "grad_norm": 0.12882153689861298, | |
| "learning_rate": 3.889836084428422e-05, | |
| "loss": 0.328, | |
| "num_input_tokens_seen": 167296, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 3.835341365461847, | |
| "grad_norm": 0.14181359112262726, | |
| "learning_rate": 3.8752361651298675e-05, | |
| "loss": 0.369, | |
| "num_input_tokens_seen": 168208, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 3.855421686746988, | |
| "grad_norm": 0.4742559492588043, | |
| "learning_rate": 3.860568679311893e-05, | |
| "loss": 0.3657, | |
| "num_input_tokens_seen": 169056, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 3.8755020080321287, | |
| "grad_norm": 0.1299924999475479, | |
| "learning_rate": 3.8458343476005196e-05, | |
| "loss": 0.3849, | |
| "num_input_tokens_seen": 169888, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 3.895582329317269, | |
| "grad_norm": 0.08048601448535919, | |
| "learning_rate": 3.8310338939059644e-05, | |
| "loss": 0.3541, | |
| "num_input_tokens_seen": 170704, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 3.9156626506024095, | |
| "grad_norm": 0.6629543304443359, | |
| "learning_rate": 3.8161680453870715e-05, | |
| "loss": 0.3558, | |
| "num_input_tokens_seen": 171600, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 3.9357429718875503, | |
| "grad_norm": 0.15418274700641632, | |
| "learning_rate": 3.8012375324155904e-05, | |
| "loss": 0.3131, | |
| "num_input_tokens_seen": 172480, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 3.9558232931726907, | |
| "grad_norm": 0.4927317202091217, | |
| "learning_rate": 3.7862430885402876e-05, | |
| "loss": 0.3661, | |
| "num_input_tokens_seen": 173504, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 3.9759036144578315, | |
| "grad_norm": 0.4768475890159607, | |
| "learning_rate": 3.7711854504509135e-05, | |
| "loss": 0.3373, | |
| "num_input_tokens_seen": 174288, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 3.995983935742972, | |
| "grad_norm": 0.7225349545478821, | |
| "learning_rate": 3.756065357941999e-05, | |
| "loss": 0.3623, | |
| "num_input_tokens_seen": 175104, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 4.016064257028113, | |
| "grad_norm": 0.7921448945999146, | |
| "learning_rate": 3.740883553876515e-05, | |
| "loss": 0.3426, | |
| "num_input_tokens_seen": 176048, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 4.016064257028113, | |
| "eval_loss": 0.35071006417274475, | |
| "eval_runtime": 1.2197, | |
| "eval_samples_per_second": 45.913, | |
| "eval_steps_per_second": 22.957, | |
| "num_input_tokens_seen": 176048, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 4.036144578313253, | |
| "grad_norm": 0.11272845417261124, | |
| "learning_rate": 3.725640784149375e-05, | |
| "loss": 0.4204, | |
| "num_input_tokens_seen": 176880, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 4.056224899598393, | |
| "grad_norm": 0.08953544497489929, | |
| "learning_rate": 3.710337797650787e-05, | |
| "loss": 0.339, | |
| "num_input_tokens_seen": 177680, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 4.076305220883534, | |
| "grad_norm": 0.4361952543258667, | |
| "learning_rate": 3.694975346229458e-05, | |
| "loss": 0.3311, | |
| "num_input_tokens_seen": 178608, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 4.096385542168675, | |
| "grad_norm": 0.09541574120521545, | |
| "learning_rate": 3.679554184655659e-05, | |
| "loss": 0.3611, | |
| "num_input_tokens_seen": 179600, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 4.116465863453815, | |
| "grad_norm": 0.5854984521865845, | |
| "learning_rate": 3.6640750705841405e-05, | |
| "loss": 0.3403, | |
| "num_input_tokens_seen": 180464, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 4.136546184738956, | |
| "grad_norm": 0.10651904344558716, | |
| "learning_rate": 3.6485387645169064e-05, | |
| "loss": 0.3243, | |
| "num_input_tokens_seen": 181344, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 4.156626506024097, | |
| "grad_norm": 0.5942978262901306, | |
| "learning_rate": 3.632946029765856e-05, | |
| "loss": 0.3965, | |
| "num_input_tokens_seen": 182080, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 4.176706827309237, | |
| "grad_norm": 0.07312840223312378, | |
| "learning_rate": 3.617297632415273e-05, | |
| "loss": 0.3719, | |
| "num_input_tokens_seen": 182848, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 4.196787148594377, | |
| "grad_norm": 0.5075451135635376, | |
| "learning_rate": 3.601594341284195e-05, | |
| "loss": 0.3512, | |
| "num_input_tokens_seen": 183840, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 4.216867469879518, | |
| "grad_norm": 0.047960445284843445, | |
| "learning_rate": 3.5858369278886354e-05, | |
| "loss": 0.3388, | |
| "num_input_tokens_seen": 184720, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 4.236947791164659, | |
| "grad_norm": 0.08333683758974075, | |
| "learning_rate": 3.5700261664036827e-05, | |
| "loss": 0.3457, | |
| "num_input_tokens_seen": 185504, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 4.257028112449799, | |
| "grad_norm": 0.0653541311621666, | |
| "learning_rate": 3.55416283362546e-05, | |
| "loss": 0.3588, | |
| "num_input_tokens_seen": 186272, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 4.27710843373494, | |
| "grad_norm": 0.5113236308097839, | |
| "learning_rate": 3.5382477089329646e-05, | |
| "loss": 0.3579, | |
| "num_input_tokens_seen": 187296, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 4.2971887550200805, | |
| "grad_norm": 0.07462375611066818, | |
| "learning_rate": 3.522281574249774e-05, | |
| "loss": 0.348, | |
| "num_input_tokens_seen": 188320, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 4.317269076305221, | |
| "grad_norm": 0.11710739135742188, | |
| "learning_rate": 3.5062652140056275e-05, | |
| "loss": 0.3282, | |
| "num_input_tokens_seen": 189248, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 4.337349397590361, | |
| "grad_norm": 0.15031148493289948, | |
| "learning_rate": 3.490199415097892e-05, | |
| "loss": 0.3005, | |
| "num_input_tokens_seen": 190432, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 4.357429718875502, | |
| "grad_norm": 0.6503745913505554, | |
| "learning_rate": 3.474084966852897e-05, | |
| "loss": 0.4539, | |
| "num_input_tokens_seen": 191296, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 4.377510040160643, | |
| "grad_norm": 0.14889173209667206, | |
| "learning_rate": 3.457922660987155e-05, | |
| "loss": 0.3682, | |
| "num_input_tokens_seen": 192368, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 4.397590361445783, | |
| "grad_norm": 0.4457005560398102, | |
| "learning_rate": 3.441713291568462e-05, | |
| "loss": 0.3338, | |
| "num_input_tokens_seen": 193232, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 4.417670682730924, | |
| "grad_norm": 0.5409120321273804, | |
| "learning_rate": 3.42545765497689e-05, | |
| "loss": 0.3587, | |
| "num_input_tokens_seen": 194128, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 4.437751004016064, | |
| "grad_norm": 0.085002101957798, | |
| "learning_rate": 3.409156549865654e-05, | |
| "loss": 0.3609, | |
| "num_input_tokens_seen": 194944, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 4.457831325301205, | |
| "grad_norm": 0.49231743812561035, | |
| "learning_rate": 3.392810777121876e-05, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 195840, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 4.477911646586345, | |
| "grad_norm": 0.5549922585487366, | |
| "learning_rate": 3.376421139827237e-05, | |
| "loss": 0.3871, | |
| "num_input_tokens_seen": 196640, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 4.497991967871486, | |
| "grad_norm": 0.06657743453979492, | |
| "learning_rate": 3.3599884432185225e-05, | |
| "loss": 0.3481, | |
| "num_input_tokens_seen": 197440, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 4.518072289156627, | |
| "grad_norm": 0.13579045236110687, | |
| "learning_rate": 3.343513494648055e-05, | |
| "loss": 0.3393, | |
| "num_input_tokens_seen": 198432, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 4.518072289156627, | |
| "eval_loss": 0.3545505702495575, | |
| "eval_runtime": 1.4272, | |
| "eval_samples_per_second": 39.238, | |
| "eval_steps_per_second": 19.619, | |
| "num_input_tokens_seen": 198432, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 4.538152610441767, | |
| "grad_norm": 0.4401510953903198, | |
| "learning_rate": 3.326997103544035e-05, | |
| "loss": 0.3349, | |
| "num_input_tokens_seen": 199232, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 4.5582329317269075, | |
| "grad_norm": 0.14849136769771576, | |
| "learning_rate": 3.310440081370767e-05, | |
| "loss": 0.3373, | |
| "num_input_tokens_seen": 200144, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 4.578313253012048, | |
| "grad_norm": 0.621387243270874, | |
| "learning_rate": 3.2938432415887984e-05, | |
| "loss": 0.3213, | |
| "num_input_tokens_seen": 200896, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 4.598393574297189, | |
| "grad_norm": 0.7517121434211731, | |
| "learning_rate": 3.2772073996149435e-05, | |
| "loss": 0.3475, | |
| "num_input_tokens_seen": 201760, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 4.618473895582329, | |
| "grad_norm": 0.41686856746673584, | |
| "learning_rate": 3.260533372782234e-05, | |
| "loss": 0.4032, | |
| "num_input_tokens_seen": 202688, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 4.63855421686747, | |
| "grad_norm": 0.6020703315734863, | |
| "learning_rate": 3.24382198029975e-05, | |
| "loss": 0.3564, | |
| "num_input_tokens_seen": 203392, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 4.658634538152611, | |
| "grad_norm": 0.40914788842201233, | |
| "learning_rate": 3.227074043212383e-05, | |
| "loss": 0.322, | |
| "num_input_tokens_seen": 204080, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 4.678714859437751, | |
| "grad_norm": 0.09926328808069229, | |
| "learning_rate": 3.2102903843604885e-05, | |
| "loss": 0.373, | |
| "num_input_tokens_seen": 204816, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 4.698795180722891, | |
| "grad_norm": 0.08548900485038757, | |
| "learning_rate": 3.1934718283394646e-05, | |
| "loss": 0.3587, | |
| "num_input_tokens_seen": 205616, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 4.718875502008032, | |
| "grad_norm": 0.11746017634868622, | |
| "learning_rate": 3.1766192014592344e-05, | |
| "loss": 0.3571, | |
| "num_input_tokens_seen": 206512, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 4.738955823293173, | |
| "grad_norm": 0.4761631190776825, | |
| "learning_rate": 3.1597333317036545e-05, | |
| "loss": 0.3507, | |
| "num_input_tokens_seen": 207424, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 4.759036144578313, | |
| "grad_norm": 0.5010347366333008, | |
| "learning_rate": 3.142815048689828e-05, | |
| "loss": 0.3575, | |
| "num_input_tokens_seen": 208464, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 4.779116465863454, | |
| "grad_norm": 0.07341606169939041, | |
| "learning_rate": 3.125865183627354e-05, | |
| "loss": 0.3579, | |
| "num_input_tokens_seen": 209280, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 4.7991967871485945, | |
| "grad_norm": 0.43029800057411194, | |
| "learning_rate": 3.10888456927748e-05, | |
| "loss": 0.3327, | |
| "num_input_tokens_seen": 210080, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 4.8192771084337345, | |
| "grad_norm": 0.49401140213012695, | |
| "learning_rate": 3.091874039912195e-05, | |
| "loss": 0.3619, | |
| "num_input_tokens_seen": 210960, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 4.839357429718875, | |
| "grad_norm": 0.07773241400718689, | |
| "learning_rate": 3.074834431273236e-05, | |
| "loss": 0.3488, | |
| "num_input_tokens_seen": 211776, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 4.859437751004016, | |
| "grad_norm": 0.4646616280078888, | |
| "learning_rate": 3.057766580531031e-05, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 212576, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 4.879518072289157, | |
| "grad_norm": 0.4323027431964874, | |
| "learning_rate": 3.0406713262435656e-05, | |
| "loss": 0.3362, | |
| "num_input_tokens_seen": 213360, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 4.899598393574297, | |
| "grad_norm": 0.502923846244812, | |
| "learning_rate": 3.0235495083151844e-05, | |
| "loss": 0.3814, | |
| "num_input_tokens_seen": 214304, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 4.919678714859438, | |
| "grad_norm": 0.43908852338790894, | |
| "learning_rate": 3.0064019679553274e-05, | |
| "loss": 0.3492, | |
| "num_input_tokens_seen": 215072, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 4.9397590361445785, | |
| "grad_norm": 0.07500998675823212, | |
| "learning_rate": 2.9892295476371988e-05, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 215904, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 4.959839357429718, | |
| "grad_norm": 0.10780856758356094, | |
| "learning_rate": 2.9720330910563772e-05, | |
| "loss": 0.3543, | |
| "num_input_tokens_seen": 216864, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 4.979919678714859, | |
| "grad_norm": 0.0808030292391777, | |
| "learning_rate": 2.9548134430893604e-05, | |
| "loss": 0.3387, | |
| "num_input_tokens_seen": 217856, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.0589243620634079, | |
| "learning_rate": 2.9375714497520623e-05, | |
| "loss": 0.339, | |
| "num_input_tokens_seen": 218864, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 5.020080321285141, | |
| "grad_norm": 0.05416898429393768, | |
| "learning_rate": 2.920307958158241e-05, | |
| "loss": 0.3601, | |
| "num_input_tokens_seen": 219680, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 5.020080321285141, | |
| "eval_loss": 0.3591695725917816, | |
| "eval_runtime": 1.2186, | |
| "eval_samples_per_second": 45.953, | |
| "eval_steps_per_second": 22.976, | |
| "num_input_tokens_seen": 219680, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 5.040160642570281, | |
| "grad_norm": 0.40490707755088806, | |
| "learning_rate": 2.903023816477885e-05, | |
| "loss": 0.3239, | |
| "num_input_tokens_seen": 220560, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 5.0602409638554215, | |
| "grad_norm": 0.12411481887102127, | |
| "learning_rate": 2.885719873895536e-05, | |
| "loss": 0.3419, | |
| "num_input_tokens_seen": 221440, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 5.080321285140562, | |
| "grad_norm": 0.5238391160964966, | |
| "learning_rate": 2.868396980568572e-05, | |
| "loss": 0.348, | |
| "num_input_tokens_seen": 222304, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 5.100401606425703, | |
| "grad_norm": 0.358173131942749, | |
| "learning_rate": 2.8510559875854377e-05, | |
| "loss": 0.2762, | |
| "num_input_tokens_seen": 223248, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 5.120481927710843, | |
| "grad_norm": 0.3846758306026459, | |
| "learning_rate": 2.833697746923829e-05, | |
| "loss": 0.2662, | |
| "num_input_tokens_seen": 224000, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 5.140562248995984, | |
| "grad_norm": 0.233295738697052, | |
| "learning_rate": 2.816323111408835e-05, | |
| "loss": 0.3421, | |
| "num_input_tokens_seen": 224880, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 5.160642570281125, | |
| "grad_norm": 0.7878521084785461, | |
| "learning_rate": 2.7989329346710375e-05, | |
| "loss": 0.4232, | |
| "num_input_tokens_seen": 225776, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 5.180722891566265, | |
| "grad_norm": 0.3618873655796051, | |
| "learning_rate": 2.7815280711045717e-05, | |
| "loss": 0.3838, | |
| "num_input_tokens_seen": 226576, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 5.2008032128514055, | |
| "grad_norm": 0.14807634055614471, | |
| "learning_rate": 2.7641093758251497e-05, | |
| "loss": 0.3104, | |
| "num_input_tokens_seen": 227360, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 5.220883534136546, | |
| "grad_norm": 0.3595626950263977, | |
| "learning_rate": 2.7466777046280457e-05, | |
| "loss": 0.3105, | |
| "num_input_tokens_seen": 228112, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 5.240963855421687, | |
| "grad_norm": 0.6282910108566284, | |
| "learning_rate": 2.7292339139460556e-05, | |
| "loss": 0.3474, | |
| "num_input_tokens_seen": 228992, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 5.261044176706827, | |
| "grad_norm": 0.357793390750885, | |
| "learning_rate": 2.71177886080741e-05, | |
| "loss": 0.3076, | |
| "num_input_tokens_seen": 229872, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 5.281124497991968, | |
| "grad_norm": 0.1209518164396286, | |
| "learning_rate": 2.69431340279368e-05, | |
| "loss": 0.4231, | |
| "num_input_tokens_seen": 230720, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 5.301204819277109, | |
| "grad_norm": 0.09653452038764954, | |
| "learning_rate": 2.676838397997633e-05, | |
| "loss": 0.3725, | |
| "num_input_tokens_seen": 231568, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 5.321285140562249, | |
| "grad_norm": 0.4242056608200073, | |
| "learning_rate": 2.659354704981078e-05, | |
| "loss": 0.3237, | |
| "num_input_tokens_seen": 232368, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 5.341365461847389, | |
| "grad_norm": 0.10253465920686722, | |
| "learning_rate": 2.6418631827326857e-05, | |
| "loss": 0.3534, | |
| "num_input_tokens_seen": 233184, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 5.36144578313253, | |
| "grad_norm": 0.08719359338283539, | |
| "learning_rate": 2.6243646906257806e-05, | |
| "loss": 0.338, | |
| "num_input_tokens_seen": 233984, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 5.381526104417671, | |
| "grad_norm": 0.5312795639038086, | |
| "learning_rate": 2.606860088376126e-05, | |
| "loss": 0.3687, | |
| "num_input_tokens_seen": 234848, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 5.401606425702811, | |
| "grad_norm": 0.47353821992874146, | |
| "learning_rate": 2.5893502359996786e-05, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 235760, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 5.421686746987952, | |
| "grad_norm": 0.49804431200027466, | |
| "learning_rate": 2.5718359937703408e-05, | |
| "loss": 0.3504, | |
| "num_input_tokens_seen": 236640, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 5.4417670682730925, | |
| "grad_norm": 0.5166244506835938, | |
| "learning_rate": 2.554318222177689e-05, | |
| "loss": 0.3538, | |
| "num_input_tokens_seen": 237616, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 5.461847389558233, | |
| "grad_norm": 0.1073295921087265, | |
| "learning_rate": 2.5367977818847034e-05, | |
| "loss": 0.3354, | |
| "num_input_tokens_seen": 238528, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 5.481927710843373, | |
| "grad_norm": 0.49680086970329285, | |
| "learning_rate": 2.519275533685477e-05, | |
| "loss": 0.3354, | |
| "num_input_tokens_seen": 239424, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 5.502008032128514, | |
| "grad_norm": 0.11628666520118713, | |
| "learning_rate": 2.5017523384629298e-05, | |
| "loss": 0.354, | |
| "num_input_tokens_seen": 240272, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 5.522088353413655, | |
| "grad_norm": 0.11434385180473328, | |
| "learning_rate": 2.484229057146507e-05, | |
| "loss": 0.3422, | |
| "num_input_tokens_seen": 241136, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 5.522088353413655, | |
| "eval_loss": 0.35063984990119934, | |
| "eval_runtime": 1.4257, | |
| "eval_samples_per_second": 39.28, | |
| "eval_steps_per_second": 19.64, | |
| "num_input_tokens_seen": 241136, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 5.542168674698795, | |
| "grad_norm": 0.5793833136558533, | |
| "learning_rate": 2.466706550669886e-05, | |
| "loss": 0.3574, | |
| "num_input_tokens_seen": 241936, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 5.562248995983936, | |
| "grad_norm": 0.5316616296768188, | |
| "learning_rate": 2.449185679928672e-05, | |
| "loss": 0.3748, | |
| "num_input_tokens_seen": 242672, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 5.582329317269076, | |
| "grad_norm": 0.10326164960861206, | |
| "learning_rate": 2.431667305738112e-05, | |
| "loss": 0.3507, | |
| "num_input_tokens_seen": 243808, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 5.602409638554217, | |
| "grad_norm": 0.46083107590675354, | |
| "learning_rate": 2.414152288790787e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 244688, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 5.622489959839357, | |
| "grad_norm": 0.42732155323028564, | |
| "learning_rate": 2.3966414896143385e-05, | |
| "loss": 0.3386, | |
| "num_input_tokens_seen": 245696, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 5.642570281124498, | |
| "grad_norm": 0.4096117317676544, | |
| "learning_rate": 2.3791357685291863e-05, | |
| "loss": 0.3298, | |
| "num_input_tokens_seen": 246544, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 5.662650602409639, | |
| "grad_norm": 0.39851030707359314, | |
| "learning_rate": 2.361635985606256e-05, | |
| "loss": 0.3413, | |
| "num_input_tokens_seen": 247744, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 5.682730923694779, | |
| "grad_norm": 0.385628879070282, | |
| "learning_rate": 2.344143000624729e-05, | |
| "loss": 0.3623, | |
| "num_input_tokens_seen": 248480, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 5.7028112449799195, | |
| "grad_norm": 0.38799935579299927, | |
| "learning_rate": 2.3266576730297956e-05, | |
| "loss": 0.3284, | |
| "num_input_tokens_seen": 249312, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 5.72289156626506, | |
| "grad_norm": 0.11895764619112015, | |
| "learning_rate": 2.3091808618904352e-05, | |
| "loss": 0.3679, | |
| "num_input_tokens_seen": 250304, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 5.742971887550201, | |
| "grad_norm": 0.3756169080734253, | |
| "learning_rate": 2.2917134258572038e-05, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 251216, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 5.763052208835341, | |
| "grad_norm": 0.5232722163200378, | |
| "learning_rate": 2.274256223120051e-05, | |
| "loss": 0.3512, | |
| "num_input_tokens_seen": 251952, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 5.783132530120482, | |
| "grad_norm": 0.06907851248979568, | |
| "learning_rate": 2.2568101113661577e-05, | |
| "loss": 0.3292, | |
| "num_input_tokens_seen": 253072, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 5.803212851405623, | |
| "grad_norm": 0.08446797728538513, | |
| "learning_rate": 2.239375947737793e-05, | |
| "loss": 0.3499, | |
| "num_input_tokens_seen": 253840, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 5.823293172690763, | |
| "grad_norm": 0.06622636318206787, | |
| "learning_rate": 2.221954588790206e-05, | |
| "loss": 0.3647, | |
| "num_input_tokens_seen": 254640, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 5.843373493975903, | |
| "grad_norm": 0.03167250007390976, | |
| "learning_rate": 2.2045468904495415e-05, | |
| "loss": 0.3518, | |
| "num_input_tokens_seen": 255456, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 5.863453815261044, | |
| "grad_norm": 0.07110590487718582, | |
| "learning_rate": 2.1871537079707833e-05, | |
| "loss": 0.354, | |
| "num_input_tokens_seen": 256304, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 5.883534136546185, | |
| "grad_norm": 0.5281655788421631, | |
| "learning_rate": 2.1697758958957448e-05, | |
| "loss": 0.3385, | |
| "num_input_tokens_seen": 257104, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 5.903614457831325, | |
| "grad_norm": 0.5472500920295715, | |
| "learning_rate": 2.1524143080110716e-05, | |
| "loss": 0.3532, | |
| "num_input_tokens_seen": 258080, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 5.923694779116466, | |
| "grad_norm": 0.5764107704162598, | |
| "learning_rate": 2.135069797306308e-05, | |
| "loss": 0.3701, | |
| "num_input_tokens_seen": 259056, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 5.943775100401607, | |
| "grad_norm": 0.5297293663024902, | |
| "learning_rate": 2.1177432159319754e-05, | |
| "loss": 0.3721, | |
| "num_input_tokens_seen": 260000, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 5.9638554216867465, | |
| "grad_norm": 0.03520062938332558, | |
| "learning_rate": 2.100435415157718e-05, | |
| "loss": 0.3517, | |
| "num_input_tokens_seen": 260768, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 5.983935742971887, | |
| "grad_norm": 0.451326847076416, | |
| "learning_rate": 2.083147245330468e-05, | |
| "loss": 0.3572, | |
| "num_input_tokens_seen": 261760, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 6.004016064257028, | |
| "grad_norm": 0.4301218092441559, | |
| "learning_rate": 2.0658795558326743e-05, | |
| "loss": 0.3476, | |
| "num_input_tokens_seen": 262752, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 6.024096385542169, | |
| "grad_norm": 0.09133653342723846, | |
| "learning_rate": 2.048633195040572e-05, | |
| "loss": 0.3609, | |
| "num_input_tokens_seen": 263616, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 6.024096385542169, | |
| "eval_loss": 0.3502369225025177, | |
| "eval_runtime": 1.2111, | |
| "eval_samples_per_second": 46.238, | |
| "eval_steps_per_second": 23.119, | |
| "num_input_tokens_seen": 263616, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 6.044176706827309, | |
| "grad_norm": 0.4608069062232971, | |
| "learning_rate": 2.0314090102824963e-05, | |
| "loss": 0.3669, | |
| "num_input_tokens_seen": 264432, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 6.06425702811245, | |
| "grad_norm": 0.08459752053022385, | |
| "learning_rate": 2.014207847797256e-05, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 265184, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 6.0843373493975905, | |
| "grad_norm": 0.04717332869768143, | |
| "learning_rate": 1.997030552692556e-05, | |
| "loss": 0.3509, | |
| "num_input_tokens_seen": 266064, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 6.104417670682731, | |
| "grad_norm": 0.07227271795272827, | |
| "learning_rate": 1.9798779689034757e-05, | |
| "loss": 0.3483, | |
| "num_input_tokens_seen": 266928, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 6.124497991967871, | |
| "grad_norm": 0.4734782874584198, | |
| "learning_rate": 1.9627509391510086e-05, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 267824, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 6.144578313253012, | |
| "grad_norm": 0.430178701877594, | |
| "learning_rate": 1.9456503049006542e-05, | |
| "loss": 0.3479, | |
| "num_input_tokens_seen": 268608, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 6.164658634538153, | |
| "grad_norm": 0.4436280131340027, | |
| "learning_rate": 1.9285769063210812e-05, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 269696, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 6.184738955823293, | |
| "grad_norm": 0.44335198402404785, | |
| "learning_rate": 1.9115315822428437e-05, | |
| "loss": 0.351, | |
| "num_input_tokens_seen": 270704, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 6.204819277108434, | |
| "grad_norm": 0.0994335189461708, | |
| "learning_rate": 1.8945151701171755e-05, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 271568, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 6.224899598393574, | |
| "grad_norm": 0.42669767141342163, | |
| "learning_rate": 1.877528505974838e-05, | |
| "loss": 0.3386, | |
| "num_input_tokens_seen": 272304, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 6.244979919678715, | |
| "grad_norm": 0.091577909886837, | |
| "learning_rate": 1.8605724243850502e-05, | |
| "loss": 0.3302, | |
| "num_input_tokens_seen": 273152, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 6.265060240963855, | |
| "grad_norm": 0.5046164393424988, | |
| "learning_rate": 1.8436477584144863e-05, | |
| "loss": 0.3962, | |
| "num_input_tokens_seen": 274112, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 6.285140562248996, | |
| "grad_norm": 0.1121777817606926, | |
| "learning_rate": 1.826755339586341e-05, | |
| "loss": 0.3337, | |
| "num_input_tokens_seen": 274944, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 6.305220883534137, | |
| "grad_norm": 0.40517058968544006, | |
| "learning_rate": 1.809895997839482e-05, | |
| "loss": 0.3484, | |
| "num_input_tokens_seen": 275712, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 6.325301204819277, | |
| "grad_norm": 0.09261249750852585, | |
| "learning_rate": 1.793070561487672e-05, | |
| "loss": 0.3391, | |
| "num_input_tokens_seen": 276560, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 6.3453815261044175, | |
| "grad_norm": 0.49878165125846863, | |
| "learning_rate": 1.7762798571788707e-05, | |
| "loss": 0.3948, | |
| "num_input_tokens_seen": 277456, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 6.365461847389558, | |
| "grad_norm": 0.415039598941803, | |
| "learning_rate": 1.759524709854626e-05, | |
| "loss": 0.3246, | |
| "num_input_tokens_seen": 278352, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 6.385542168674699, | |
| "grad_norm": 0.11119506508111954, | |
| "learning_rate": 1.742805942709538e-05, | |
| "loss": 0.3468, | |
| "num_input_tokens_seen": 279264, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 6.405622489959839, | |
| "grad_norm": 0.10621926933526993, | |
| "learning_rate": 1.7261243771508208e-05, | |
| "loss": 0.3428, | |
| "num_input_tokens_seen": 280144, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 6.42570281124498, | |
| "grad_norm": 0.10251349955797195, | |
| "learning_rate": 1.70948083275794e-05, | |
| "loss": 0.3439, | |
| "num_input_tokens_seen": 281008, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 6.445783132530121, | |
| "grad_norm": 0.4213389456272125, | |
| "learning_rate": 1.6928761272423522e-05, | |
| "loss": 0.3717, | |
| "num_input_tokens_seen": 281792, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 6.4658634538152615, | |
| "grad_norm": 0.08103923499584198, | |
| "learning_rate": 1.6763110764073235e-05, | |
| "loss": 0.3517, | |
| "num_input_tokens_seen": 282560, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 6.485943775100401, | |
| "grad_norm": 0.46979421377182007, | |
| "learning_rate": 1.6597864941078552e-05, | |
| "loss": 0.3423, | |
| "num_input_tokens_seen": 283440, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 6.506024096385542, | |
| "grad_norm": 0.12765252590179443, | |
| "learning_rate": 1.643303192210693e-05, | |
| "loss": 0.358, | |
| "num_input_tokens_seen": 284592, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 6.526104417670683, | |
| "grad_norm": 0.06467333436012268, | |
| "learning_rate": 1.626861980554441e-05, | |
| "loss": 0.3457, | |
| "num_input_tokens_seen": 285424, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 6.526104417670683, | |
| "eval_loss": 0.3553968071937561, | |
| "eval_runtime": 1.2158, | |
| "eval_samples_per_second": 46.06, | |
| "eval_steps_per_second": 23.03, | |
| "num_input_tokens_seen": 285424, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 6.546184738955823, | |
| "grad_norm": 0.41862520575523376, | |
| "learning_rate": 1.6104636669097776e-05, | |
| "loss": 0.3518, | |
| "num_input_tokens_seen": 286272, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 6.566265060240964, | |
| "grad_norm": 0.0668744370341301, | |
| "learning_rate": 1.5941090569397616e-05, | |
| "loss": 0.3512, | |
| "num_input_tokens_seen": 287200, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 6.586345381526105, | |
| "grad_norm": 0.08563435077667236, | |
| "learning_rate": 1.5777989541602533e-05, | |
| "loss": 0.348, | |
| "num_input_tokens_seen": 288224, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 6.606425702811245, | |
| "grad_norm": 0.44376978278160095, | |
| "learning_rate": 1.561534159900441e-05, | |
| "loss": 0.3353, | |
| "num_input_tokens_seen": 289216, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 6.626506024096385, | |
| "grad_norm": 0.12750263512134552, | |
| "learning_rate": 1.5453154732634616e-05, | |
| "loss": 0.3476, | |
| "num_input_tokens_seen": 290080, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 6.646586345381526, | |
| "grad_norm": 0.4618399739265442, | |
| "learning_rate": 1.52914369108715e-05, | |
| "loss": 0.351, | |
| "num_input_tokens_seen": 290880, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.07694806158542633, | |
| "learning_rate": 1.513019607904882e-05, | |
| "loss": 0.3607, | |
| "num_input_tokens_seen": 291728, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 6.686746987951807, | |
| "grad_norm": 0.09036281704902649, | |
| "learning_rate": 1.4969440159065439e-05, | |
| "loss": 0.3573, | |
| "num_input_tokens_seen": 292624, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 6.706827309236948, | |
| "grad_norm": 0.08298249542713165, | |
| "learning_rate": 1.4809177048996064e-05, | |
| "loss": 0.3476, | |
| "num_input_tokens_seen": 293488, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 6.7269076305220885, | |
| "grad_norm": 0.4784527122974396, | |
| "learning_rate": 1.464941462270325e-05, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 294400, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 6.746987951807229, | |
| "grad_norm": 0.45142361521720886, | |
| "learning_rate": 1.449016072945053e-05, | |
| "loss": 0.357, | |
| "num_input_tokens_seen": 295184, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 6.767068273092369, | |
| "grad_norm": 0.4861927330493927, | |
| "learning_rate": 1.4331423193516768e-05, | |
| "loss": 0.3575, | |
| "num_input_tokens_seen": 296176, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 6.78714859437751, | |
| "grad_norm": 0.41130194067955017, | |
| "learning_rate": 1.4173209813811788e-05, | |
| "loss": 0.3358, | |
| "num_input_tokens_seen": 297072, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 6.807228915662651, | |
| "grad_norm": 0.40937137603759766, | |
| "learning_rate": 1.4015528363493125e-05, | |
| "loss": 0.3491, | |
| "num_input_tokens_seen": 297856, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 6.827309236947791, | |
| "grad_norm": 0.40058666467666626, | |
| "learning_rate": 1.3858386589584187e-05, | |
| "loss": 0.3253, | |
| "num_input_tokens_seen": 298896, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 6.847389558232932, | |
| "grad_norm": 0.11114007234573364, | |
| "learning_rate": 1.3701792212593662e-05, | |
| "loss": 0.3302, | |
| "num_input_tokens_seen": 299712, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 6.867469879518072, | |
| "grad_norm": 0.11977384239435196, | |
| "learning_rate": 1.354575292613611e-05, | |
| "loss": 0.3882, | |
| "num_input_tokens_seen": 300720, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 6.887550200803213, | |
| "grad_norm": 0.5076762437820435, | |
| "learning_rate": 1.3390276396554052e-05, | |
| "loss": 0.3658, | |
| "num_input_tokens_seen": 301552, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 6.907630522088353, | |
| "grad_norm": 0.07989758253097534, | |
| "learning_rate": 1.3235370262541272e-05, | |
| "loss": 0.3388, | |
| "num_input_tokens_seen": 302352, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 6.927710843373494, | |
| "grad_norm": 0.38454264402389526, | |
| "learning_rate": 1.3081042134767554e-05, | |
| "loss": 0.3335, | |
| "num_input_tokens_seen": 303232, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 6.947791164658635, | |
| "grad_norm": 0.07989054918289185, | |
| "learning_rate": 1.292729959550473e-05, | |
| "loss": 0.3262, | |
| "num_input_tokens_seen": 304016, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 6.967871485943775, | |
| "grad_norm": 0.3905481994152069, | |
| "learning_rate": 1.277415019825417e-05, | |
| "loss": 0.3396, | |
| "num_input_tokens_seen": 304944, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 6.9879518072289155, | |
| "grad_norm": 0.39736074209213257, | |
| "learning_rate": 1.2621601467375684e-05, | |
| "loss": 0.3422, | |
| "num_input_tokens_seen": 305984, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 7.008032128514056, | |
| "grad_norm": 0.1431533843278885, | |
| "learning_rate": 1.2469660897717816e-05, | |
| "loss": 0.3182, | |
| "num_input_tokens_seen": 306992, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 7.028112449799197, | |
| "grad_norm": 0.39490222930908203, | |
| "learning_rate": 1.2318335954249669e-05, | |
| "loss": 0.315, | |
| "num_input_tokens_seen": 307792, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 7.028112449799197, | |
| "eval_loss": 0.36506387591362, | |
| "eval_runtime": 1.2174, | |
| "eval_samples_per_second": 45.998, | |
| "eval_steps_per_second": 22.999, | |
| "num_input_tokens_seen": 307792, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 7.048192771084337, | |
| "grad_norm": 0.13177447021007538, | |
| "learning_rate": 1.2167634071694081e-05, | |
| "loss": 0.3174, | |
| "num_input_tokens_seen": 308624, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 7.068273092369478, | |
| "grad_norm": 0.38232582807540894, | |
| "learning_rate": 1.2017562654162357e-05, | |
| "loss": 0.2887, | |
| "num_input_tokens_seen": 309680, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 7.088353413654619, | |
| "grad_norm": 0.19812007248401642, | |
| "learning_rate": 1.1868129074790577e-05, | |
| "loss": 0.3394, | |
| "num_input_tokens_seen": 310544, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 7.108433734939759, | |
| "grad_norm": 0.2020581066608429, | |
| "learning_rate": 1.1719340675377252e-05, | |
| "loss": 0.3113, | |
| "num_input_tokens_seen": 311568, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 7.128514056224899, | |
| "grad_norm": 0.16722743213176727, | |
| "learning_rate": 1.1571204766022665e-05, | |
| "loss": 0.4907, | |
| "num_input_tokens_seen": 312432, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 7.14859437751004, | |
| "grad_norm": 0.14364704489707947, | |
| "learning_rate": 1.1423728624769695e-05, | |
| "loss": 0.3627, | |
| "num_input_tokens_seen": 313168, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 7.168674698795181, | |
| "grad_norm": 0.11236248165369034, | |
| "learning_rate": 1.1276919497246288e-05, | |
| "loss": 0.3648, | |
| "num_input_tokens_seen": 313968, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 7.188755020080321, | |
| "grad_norm": 0.11342489719390869, | |
| "learning_rate": 1.1130784596309409e-05, | |
| "loss": 0.3585, | |
| "num_input_tokens_seen": 314736, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 7.208835341365462, | |
| "grad_norm": 0.09296204149723053, | |
| "learning_rate": 1.098533110169071e-05, | |
| "loss": 0.3485, | |
| "num_input_tokens_seen": 315664, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 7.228915662650603, | |
| "grad_norm": 0.1172434464097023, | |
| "learning_rate": 1.084056615964377e-05, | |
| "loss": 0.3442, | |
| "num_input_tokens_seen": 316704, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 7.2489959839357425, | |
| "grad_norm": 0.0936344638466835, | |
| "learning_rate": 1.069649688259299e-05, | |
| "loss": 0.388, | |
| "num_input_tokens_seen": 317520, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 7.269076305220883, | |
| "grad_norm": 0.4171283543109894, | |
| "learning_rate": 1.0553130348784182e-05, | |
| "loss": 0.3306, | |
| "num_input_tokens_seen": 318496, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 7.289156626506024, | |
| "grad_norm": 0.403551310300827, | |
| "learning_rate": 1.0410473601936765e-05, | |
| "loss": 0.3181, | |
| "num_input_tokens_seen": 319344, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 7.309236947791165, | |
| "grad_norm": 0.08753710985183716, | |
| "learning_rate": 1.026853365089773e-05, | |
| "loss": 0.3494, | |
| "num_input_tokens_seen": 320224, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 7.329317269076305, | |
| "grad_norm": 0.393200546503067, | |
| "learning_rate": 1.0127317469297277e-05, | |
| "loss": 0.3193, | |
| "num_input_tokens_seen": 320976, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 7.349397590361446, | |
| "grad_norm": 0.1304997056722641, | |
| "learning_rate": 9.986831995206195e-06, | |
| "loss": 0.3271, | |
| "num_input_tokens_seen": 321808, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 7.3694779116465865, | |
| "grad_norm": 0.37172961235046387, | |
| "learning_rate": 9.847084130795028e-06, | |
| "loss": 0.3504, | |
| "num_input_tokens_seen": 322624, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 7.389558232931727, | |
| "grad_norm": 0.3792823255062103, | |
| "learning_rate": 9.708080741994868e-06, | |
| "loss": 0.3165, | |
| "num_input_tokens_seen": 323696, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 7.409638554216867, | |
| "grad_norm": 0.365556925535202, | |
| "learning_rate": 9.569828658160158e-06, | |
| "loss": 0.3184, | |
| "num_input_tokens_seen": 324496, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 7.429718875502008, | |
| "grad_norm": 0.10406666994094849, | |
| "learning_rate": 9.432334671733039e-06, | |
| "loss": 0.3824, | |
| "num_input_tokens_seen": 325328, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 7.449799196787149, | |
| "grad_norm": 0.3579881191253662, | |
| "learning_rate": 9.295605537909708e-06, | |
| "loss": 0.3336, | |
| "num_input_tokens_seen": 326304, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 7.469879518072289, | |
| "grad_norm": 0.171662375330925, | |
| "learning_rate": 9.159647974308494e-06, | |
| "loss": 0.3148, | |
| "num_input_tokens_seen": 327120, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 7.48995983935743, | |
| "grad_norm": 0.5380930304527283, | |
| "learning_rate": 9.024468660639826e-06, | |
| "loss": 0.3811, | |
| "num_input_tokens_seen": 328128, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 7.51004016064257, | |
| "grad_norm": 0.11611666530370712, | |
| "learning_rate": 8.890074238378074e-06, | |
| "loss": 0.351, | |
| "num_input_tokens_seen": 329008, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 7.530120481927711, | |
| "grad_norm": 0.10266629606485367, | |
| "learning_rate": 8.756471310435204e-06, | |
| "loss": 0.3149, | |
| "num_input_tokens_seen": 329840, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 7.530120481927711, | |
| "eval_loss": 0.3625907897949219, | |
| "eval_runtime": 1.2148, | |
| "eval_samples_per_second": 46.1, | |
| "eval_steps_per_second": 23.05, | |
| "num_input_tokens_seen": 329840, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 7.550200803212851, | |
| "grad_norm": 0.15210554003715515, | |
| "learning_rate": 8.623666440836404e-06, | |
| "loss": 0.3623, | |
| "num_input_tokens_seen": 330624, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 7.570281124497992, | |
| "grad_norm": 0.09614621847867966, | |
| "learning_rate": 8.491666154397573e-06, | |
| "loss": 0.3149, | |
| "num_input_tokens_seen": 331440, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 7.590361445783133, | |
| "grad_norm": 0.11491771787405014, | |
| "learning_rate": 8.360476936404754e-06, | |
| "loss": 0.3897, | |
| "num_input_tokens_seen": 332192, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 7.610441767068274, | |
| "grad_norm": 0.09532292187213898, | |
| "learning_rate": 8.230105232295538e-06, | |
| "loss": 0.3736, | |
| "num_input_tokens_seen": 333168, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 7.6305220883534135, | |
| "grad_norm": 0.5067830681800842, | |
| "learning_rate": 8.100557447342327e-06, | |
| "loss": 0.3618, | |
| "num_input_tokens_seen": 334080, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 7.650602409638554, | |
| "grad_norm": 0.13660845160484314, | |
| "learning_rate": 7.971839946337698e-06, | |
| "loss": 0.3533, | |
| "num_input_tokens_seen": 335040, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 7.670682730923695, | |
| "grad_norm": 0.41254377365112305, | |
| "learning_rate": 7.843959053281663e-06, | |
| "loss": 0.3532, | |
| "num_input_tokens_seen": 335824, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 7.690763052208835, | |
| "grad_norm": 0.07652189582586288, | |
| "learning_rate": 7.71692105107098e-06, | |
| "loss": 0.3362, | |
| "num_input_tokens_seen": 336656, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 7.710843373493976, | |
| "grad_norm": 0.0759321078658104, | |
| "learning_rate": 7.590732181190482e-06, | |
| "loss": 0.3608, | |
| "num_input_tokens_seen": 337488, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 7.730923694779117, | |
| "grad_norm": 0.0837617963552475, | |
| "learning_rate": 7.465398643406366e-06, | |
| "loss": 0.342, | |
| "num_input_tokens_seen": 338400, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 7.7510040160642575, | |
| "grad_norm": 0.09123997390270233, | |
| "learning_rate": 7.340926595461687e-06, | |
| "loss": 0.3573, | |
| "num_input_tokens_seen": 339248, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 7.771084337349397, | |
| "grad_norm": 0.09809573739767075, | |
| "learning_rate": 7.217322152773742e-06, | |
| "loss": 0.3539, | |
| "num_input_tokens_seen": 340112, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 7.791164658634538, | |
| "grad_norm": 0.4673003554344177, | |
| "learning_rate": 7.094591388133659e-06, | |
| "loss": 0.3471, | |
| "num_input_tokens_seen": 340896, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 7.811244979919679, | |
| "grad_norm": 0.46651169657707214, | |
| "learning_rate": 6.972740331408015e-06, | |
| "loss": 0.3599, | |
| "num_input_tokens_seen": 341760, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 7.831325301204819, | |
| "grad_norm": 0.09857732057571411, | |
| "learning_rate": 6.851774969242589e-06, | |
| "loss": 0.3382, | |
| "num_input_tokens_seen": 342608, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 7.85140562248996, | |
| "grad_norm": 0.14156574010849, | |
| "learning_rate": 6.731701244768254e-06, | |
| "loss": 0.338, | |
| "num_input_tokens_seen": 343632, | |
| "step": 1955 | |
| }, | |
| { | |
| "epoch": 7.871485943775101, | |
| "grad_norm": 0.08120275288820267, | |
| "learning_rate": 6.612525057308949e-06, | |
| "loss": 0.3473, | |
| "num_input_tokens_seen": 344528, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 7.891566265060241, | |
| "grad_norm": 0.4519568085670471, | |
| "learning_rate": 6.494252262091857e-06, | |
| "loss": 0.3505, | |
| "num_input_tokens_seen": 345568, | |
| "step": 1965 | |
| }, | |
| { | |
| "epoch": 7.911646586345381, | |
| "grad_norm": 0.10098995268344879, | |
| "learning_rate": 6.3768886699597436e-06, | |
| "loss": 0.3443, | |
| "num_input_tokens_seen": 346496, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 7.931726907630522, | |
| "grad_norm": 0.44386422634124756, | |
| "learning_rate": 6.260440047085439e-06, | |
| "loss": 0.3473, | |
| "num_input_tokens_seen": 347360, | |
| "step": 1975 | |
| }, | |
| { | |
| "epoch": 7.951807228915663, | |
| "grad_norm": 0.06753702461719513, | |
| "learning_rate": 6.1449121146885894e-06, | |
| "loss": 0.3445, | |
| "num_input_tokens_seen": 348128, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 7.971887550200803, | |
| "grad_norm": 0.42976370453834534, | |
| "learning_rate": 6.030310548754506e-06, | |
| "loss": 0.3509, | |
| "num_input_tokens_seen": 348960, | |
| "step": 1985 | |
| }, | |
| { | |
| "epoch": 7.991967871485944, | |
| "grad_norm": 0.4281879961490631, | |
| "learning_rate": 5.9166409797553415e-06, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 349856, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 8.012048192771084, | |
| "grad_norm": 0.4621274471282959, | |
| "learning_rate": 5.803908992373449e-06, | |
| "loss": 0.338, | |
| "num_input_tokens_seen": 350784, | |
| "step": 1995 | |
| }, | |
| { | |
| "epoch": 8.032128514056225, | |
| "grad_norm": 0.08658602088689804, | |
| "learning_rate": 5.692120125226993e-06, | |
| "loss": 0.3441, | |
| "num_input_tokens_seen": 351552, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 8.032128514056225, | |
| "eval_loss": 0.3484961986541748, | |
| "eval_runtime": 1.2167, | |
| "eval_samples_per_second": 46.027, | |
| "eval_steps_per_second": 23.014, | |
| "num_input_tokens_seen": 351552, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 8.052208835341366, | |
| "grad_norm": 0.0772676169872284, | |
| "learning_rate": 5.581279870597867e-06, | |
| "loss": 0.3537, | |
| "num_input_tokens_seen": 352592, | |
| "step": 2005 | |
| }, | |
| { | |
| "epoch": 8.072289156626505, | |
| "grad_norm": 0.08750884979963303, | |
| "learning_rate": 5.4713936741617845e-06, | |
| "loss": 0.3441, | |
| "num_input_tokens_seen": 353392, | |
| "step": 2010 | |
| }, | |
| { | |
| "epoch": 8.092369477911646, | |
| "grad_norm": 0.4653143882751465, | |
| "learning_rate": 5.3624669347208085e-06, | |
| "loss": 0.3473, | |
| "num_input_tokens_seen": 354176, | |
| "step": 2015 | |
| }, | |
| { | |
| "epoch": 8.112449799196787, | |
| "grad_norm": 0.07544849812984467, | |
| "learning_rate": 5.254505003938043e-06, | |
| "loss": 0.335, | |
| "num_input_tokens_seen": 355040, | |
| "step": 2020 | |
| }, | |
| { | |
| "epoch": 8.132530120481928, | |
| "grad_norm": 0.4536716043949127, | |
| "learning_rate": 5.147513186074751e-06, | |
| "loss": 0.3445, | |
| "num_input_tokens_seen": 355984, | |
| "step": 2025 | |
| }, | |
| { | |
| "epoch": 8.152610441767068, | |
| "grad_norm": 0.06866519898176193, | |
| "learning_rate": 5.041496737729687e-06, | |
| "loss": 0.3443, | |
| "num_input_tokens_seen": 356768, | |
| "step": 2030 | |
| }, | |
| { | |
| "epoch": 8.17269076305221, | |
| "grad_norm": 0.07859829813241959, | |
| "learning_rate": 4.936460867580889e-06, | |
| "loss": 0.3604, | |
| "num_input_tokens_seen": 357648, | |
| "step": 2035 | |
| }, | |
| { | |
| "epoch": 8.19277108433735, | |
| "grad_norm": 0.48313626646995544, | |
| "learning_rate": 4.832410736129778e-06, | |
| "loss": 0.3509, | |
| "num_input_tokens_seen": 358464, | |
| "step": 2040 | |
| }, | |
| { | |
| "epoch": 8.21285140562249, | |
| "grad_norm": 0.4338780343532562, | |
| "learning_rate": 4.729351455447573e-06, | |
| "loss": 0.3421, | |
| "num_input_tokens_seen": 359520, | |
| "step": 2045 | |
| }, | |
| { | |
| "epoch": 8.23293172690763, | |
| "grad_norm": 0.4279603660106659, | |
| "learning_rate": 4.627288088924156e-06, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 360256, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 8.25301204819277, | |
| "grad_norm": 0.09719569236040115, | |
| "learning_rate": 4.526225651019309e-06, | |
| "loss": 0.3479, | |
| "num_input_tokens_seen": 361184, | |
| "step": 2055 | |
| }, | |
| { | |
| "epoch": 8.273092369477911, | |
| "grad_norm": 0.11000480502843857, | |
| "learning_rate": 4.4261691070163316e-06, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 362064, | |
| "step": 2060 | |
| }, | |
| { | |
| "epoch": 8.293172690763052, | |
| "grad_norm": 0.11856409162282944, | |
| "learning_rate": 4.327123372778122e-06, | |
| "loss": 0.3415, | |
| "num_input_tokens_seen": 362928, | |
| "step": 2065 | |
| }, | |
| { | |
| "epoch": 8.313253012048193, | |
| "grad_norm": 0.06177099421620369, | |
| "learning_rate": 4.229093314505619e-06, | |
| "loss": 0.336, | |
| "num_input_tokens_seen": 363888, | |
| "step": 2070 | |
| }, | |
| { | |
| "epoch": 8.333333333333334, | |
| "grad_norm": 0.07357407361268997, | |
| "learning_rate": 4.132083748498744e-06, | |
| "loss": 0.3572, | |
| "num_input_tokens_seen": 364800, | |
| "step": 2075 | |
| }, | |
| { | |
| "epoch": 8.353413654618475, | |
| "grad_norm": 0.1254434585571289, | |
| "learning_rate": 4.036099440919763e-06, | |
| "loss": 0.3479, | |
| "num_input_tokens_seen": 365680, | |
| "step": 2080 | |
| }, | |
| { | |
| "epoch": 8.373493975903614, | |
| "grad_norm": 0.12399672716856003, | |
| "learning_rate": 3.9411451075591464e-06, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 366560, | |
| "step": 2085 | |
| }, | |
| { | |
| "epoch": 8.393574297188755, | |
| "grad_norm": 0.4643438458442688, | |
| "learning_rate": 3.847225413603839e-06, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 367424, | |
| "step": 2090 | |
| }, | |
| { | |
| "epoch": 8.413654618473895, | |
| "grad_norm": 0.09002748876810074, | |
| "learning_rate": 3.754344973408064e-06, | |
| "loss": 0.3538, | |
| "num_input_tokens_seen": 368272, | |
| "step": 2095 | |
| }, | |
| { | |
| "epoch": 8.433734939759036, | |
| "grad_norm": 0.06885527074337006, | |
| "learning_rate": 3.6625083502666554e-06, | |
| "loss": 0.3472, | |
| "num_input_tokens_seen": 369040, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 8.453815261044177, | |
| "grad_norm": 0.4609389007091522, | |
| "learning_rate": 3.5717200561908026e-06, | |
| "loss": 0.3411, | |
| "num_input_tokens_seen": 369808, | |
| "step": 2105 | |
| }, | |
| { | |
| "epoch": 8.473895582329318, | |
| "grad_norm": 0.09589619934558868, | |
| "learning_rate": 3.481984551686429e-06, | |
| "loss": 0.3383, | |
| "num_input_tokens_seen": 370672, | |
| "step": 2110 | |
| }, | |
| { | |
| "epoch": 8.493975903614459, | |
| "grad_norm": 0.11223804205656052, | |
| "learning_rate": 3.3933062455349744e-06, | |
| "loss": 0.3417, | |
| "num_input_tokens_seen": 371520, | |
| "step": 2115 | |
| }, | |
| { | |
| "epoch": 8.514056224899598, | |
| "grad_norm": 0.47606217861175537, | |
| "learning_rate": 3.305689494576847e-06, | |
| "loss": 0.36, | |
| "num_input_tokens_seen": 372368, | |
| "step": 2120 | |
| }, | |
| { | |
| "epoch": 8.534136546184738, | |
| "grad_norm": 0.5172960162162781, | |
| "learning_rate": 3.2191386034973627e-06, | |
| "loss": 0.3574, | |
| "num_input_tokens_seen": 373424, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 8.534136546184738, | |
| "eval_loss": 0.3515996038913727, | |
| "eval_runtime": 1.2192, | |
| "eval_samples_per_second": 45.93, | |
| "eval_steps_per_second": 22.965, | |
| "num_input_tokens_seen": 373424, | |
| "step": 2125 | |
| }, | |
| { | |
| "epoch": 8.55421686746988, | |
| "grad_norm": 0.4503559172153473, | |
| "learning_rate": 3.1336578246152103e-06, | |
| "loss": 0.3443, | |
| "num_input_tokens_seen": 374240, | |
| "step": 2130 | |
| }, | |
| { | |
| "epoch": 8.57429718875502, | |
| "grad_norm": 0.11173038929700851, | |
| "learning_rate": 3.049251357673577e-06, | |
| "loss": 0.3383, | |
| "num_input_tokens_seen": 375104, | |
| "step": 2135 | |
| }, | |
| { | |
| "epoch": 8.594377510040161, | |
| "grad_norm": 0.09678234905004501, | |
| "learning_rate": 2.9659233496337786e-06, | |
| "loss": 0.3476, | |
| "num_input_tokens_seen": 376080, | |
| "step": 2140 | |
| }, | |
| { | |
| "epoch": 8.614457831325302, | |
| "grad_norm": 0.08732342720031738, | |
| "learning_rate": 2.8836778944715454e-06, | |
| "loss": 0.3415, | |
| "num_input_tokens_seen": 376928, | |
| "step": 2145 | |
| }, | |
| { | |
| "epoch": 8.634538152610443, | |
| "grad_norm": 0.12217875570058823, | |
| "learning_rate": 2.802519032975859e-06, | |
| "loss": 0.351, | |
| "num_input_tokens_seen": 377856, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 8.654618473895582, | |
| "grad_norm": 0.4865402579307556, | |
| "learning_rate": 2.722450752550429e-06, | |
| "loss": 0.3417, | |
| "num_input_tokens_seen": 378784, | |
| "step": 2155 | |
| }, | |
| { | |
| "epoch": 8.674698795180722, | |
| "grad_norm": 0.4950112998485565, | |
| "learning_rate": 2.6434769870177985e-06, | |
| "loss": 0.3604, | |
| "num_input_tokens_seen": 379696, | |
| "step": 2160 | |
| }, | |
| { | |
| "epoch": 8.694779116465863, | |
| "grad_norm": 0.07212843000888824, | |
| "learning_rate": 2.5656016164260554e-06, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 380512, | |
| "step": 2165 | |
| }, | |
| { | |
| "epoch": 8.714859437751004, | |
| "grad_norm": 0.09282089024782181, | |
| "learning_rate": 2.4888284668582285e-06, | |
| "loss": 0.3445, | |
| "num_input_tokens_seen": 381520, | |
| "step": 2170 | |
| }, | |
| { | |
| "epoch": 8.734939759036145, | |
| "grad_norm": 0.08465081453323364, | |
| "learning_rate": 2.4131613102442857e-06, | |
| "loss": 0.3354, | |
| "num_input_tokens_seen": 382480, | |
| "step": 2175 | |
| }, | |
| { | |
| "epoch": 8.755020080321286, | |
| "grad_norm": 0.45736274123191833, | |
| "learning_rate": 2.3386038641758063e-06, | |
| "loss": 0.3383, | |
| "num_input_tokens_seen": 383440, | |
| "step": 2180 | |
| }, | |
| { | |
| "epoch": 8.775100401606426, | |
| "grad_norm": 0.46960991621017456, | |
| "learning_rate": 2.265159791723373e-06, | |
| "loss": 0.3508, | |
| "num_input_tokens_seen": 384240, | |
| "step": 2185 | |
| }, | |
| { | |
| "epoch": 8.795180722891565, | |
| "grad_norm": 0.4292510449886322, | |
| "learning_rate": 2.1928327012565696e-06, | |
| "loss": 0.3483, | |
| "num_input_tokens_seen": 385120, | |
| "step": 2190 | |
| }, | |
| { | |
| "epoch": 8.815261044176706, | |
| "grad_norm": 0.07858365774154663, | |
| "learning_rate": 2.121626146266706e-06, | |
| "loss": 0.3546, | |
| "num_input_tokens_seen": 385984, | |
| "step": 2195 | |
| }, | |
| { | |
| "epoch": 8.835341365461847, | |
| "grad_norm": 0.5046152472496033, | |
| "learning_rate": 2.051543625192226e-06, | |
| "loss": 0.3609, | |
| "num_input_tokens_seen": 386896, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 8.855421686746988, | |
| "grad_norm": 0.43499457836151123, | |
| "learning_rate": 1.9825885812468524e-06, | |
| "loss": 0.33, | |
| "num_input_tokens_seen": 387776, | |
| "step": 2205 | |
| }, | |
| { | |
| "epoch": 8.875502008032129, | |
| "grad_norm": 0.4260156750679016, | |
| "learning_rate": 1.914764402250385e-06, | |
| "loss": 0.3487, | |
| "num_input_tokens_seen": 388704, | |
| "step": 2210 | |
| }, | |
| { | |
| "epoch": 8.89558232931727, | |
| "grad_norm": 0.43153613805770874, | |
| "learning_rate": 1.8480744204622757e-06, | |
| "loss": 0.3512, | |
| "num_input_tokens_seen": 389456, | |
| "step": 2215 | |
| }, | |
| { | |
| "epoch": 8.91566265060241, | |
| "grad_norm": 0.4854128956794739, | |
| "learning_rate": 1.7825219124179004e-06, | |
| "loss": 0.3522, | |
| "num_input_tokens_seen": 390304, | |
| "step": 2220 | |
| }, | |
| { | |
| "epoch": 8.93574297188755, | |
| "grad_norm": 0.08903124928474426, | |
| "learning_rate": 1.7181100987675862e-06, | |
| "loss": 0.3356, | |
| "num_input_tokens_seen": 391104, | |
| "step": 2225 | |
| }, | |
| { | |
| "epoch": 8.95582329317269, | |
| "grad_norm": 0.49092555046081543, | |
| "learning_rate": 1.6548421441183875e-06, | |
| "loss": 0.3516, | |
| "num_input_tokens_seen": 392112, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 8.975903614457831, | |
| "grad_norm": 0.08050279319286346, | |
| "learning_rate": 1.5927211568785878e-06, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 392880, | |
| "step": 2235 | |
| }, | |
| { | |
| "epoch": 8.995983935742972, | |
| "grad_norm": 0.4535515308380127, | |
| "learning_rate": 1.5317501891049719e-06, | |
| "loss": 0.3302, | |
| "num_input_tokens_seen": 393728, | |
| "step": 2240 | |
| }, | |
| { | |
| "epoch": 9.016064257028113, | |
| "grad_norm": 0.48401492834091187, | |
| "learning_rate": 1.4719322363529242e-06, | |
| "loss": 0.3487, | |
| "num_input_tokens_seen": 394688, | |
| "step": 2245 | |
| }, | |
| { | |
| "epoch": 9.036144578313253, | |
| "grad_norm": 0.4877987205982208, | |
| "learning_rate": 1.4132702375291989e-06, | |
| "loss": 0.3673, | |
| "num_input_tokens_seen": 395616, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 9.036144578313253, | |
| "eval_loss": 0.35450634360313416, | |
| "eval_runtime": 1.2106, | |
| "eval_samples_per_second": 46.258, | |
| "eval_steps_per_second": 23.129, | |
| "num_input_tokens_seen": 395616, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 9.056224899598394, | |
| "grad_norm": 0.4849400520324707, | |
| "learning_rate": 1.3557670747475714e-06, | |
| "loss": 0.3455, | |
| "num_input_tokens_seen": 396560, | |
| "step": 2255 | |
| }, | |
| { | |
| "epoch": 9.076305220883533, | |
| "grad_norm": 0.4335307478904724, | |
| "learning_rate": 1.2994255731871963e-06, | |
| "loss": 0.3489, | |
| "num_input_tokens_seen": 397456, | |
| "step": 2260 | |
| }, | |
| { | |
| "epoch": 9.096385542168674, | |
| "grad_norm": 0.10791157931089401, | |
| "learning_rate": 1.244248500953854e-06, | |
| "loss": 0.3648, | |
| "num_input_tokens_seen": 398448, | |
| "step": 2265 | |
| }, | |
| { | |
| "epoch": 9.116465863453815, | |
| "grad_norm": 0.4895195960998535, | |
| "learning_rate": 1.1902385689439022e-06, | |
| "loss": 0.3544, | |
| "num_input_tokens_seen": 399248, | |
| "step": 2270 | |
| }, | |
| { | |
| "epoch": 9.136546184738956, | |
| "grad_norm": 0.45951637625694275, | |
| "learning_rate": 1.137398430711123e-06, | |
| "loss": 0.3574, | |
| "num_input_tokens_seen": 400096, | |
| "step": 2275 | |
| }, | |
| { | |
| "epoch": 9.156626506024097, | |
| "grad_norm": 0.44542670249938965, | |
| "learning_rate": 1.085730682336325e-06, | |
| "loss": 0.3477, | |
| "num_input_tokens_seen": 401024, | |
| "step": 2280 | |
| }, | |
| { | |
| "epoch": 9.176706827309237, | |
| "grad_norm": 0.10208380967378616, | |
| "learning_rate": 1.0352378622998204e-06, | |
| "loss": 0.3506, | |
| "num_input_tokens_seen": 401856, | |
| "step": 2285 | |
| }, | |
| { | |
| "epoch": 9.196787148594378, | |
| "grad_norm": 0.43247994780540466, | |
| "learning_rate": 9.85922451356694e-07, | |
| "loss": 0.3388, | |
| "num_input_tokens_seen": 402736, | |
| "step": 2290 | |
| }, | |
| { | |
| "epoch": 9.216867469879517, | |
| "grad_norm": 0.13525407016277313, | |
| "learning_rate": 9.377868724149197e-07, | |
| "loss": 0.3413, | |
| "num_input_tokens_seen": 403696, | |
| "step": 2295 | |
| }, | |
| { | |
| "epoch": 9.236947791164658, | |
| "grad_norm": 0.4610027074813843, | |
| "learning_rate": 8.908334904163207e-07, | |
| "loss": 0.3445, | |
| "num_input_tokens_seen": 404480, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 9.257028112449799, | |
| "grad_norm": 0.4158003330230713, | |
| "learning_rate": 8.450646122203865e-07, | |
| "loss": 0.3233, | |
| "num_input_tokens_seen": 405536, | |
| "step": 2305 | |
| }, | |
| { | |
| "epoch": 9.27710843373494, | |
| "grad_norm": 0.4854465425014496, | |
| "learning_rate": 8.004824864909277e-07, | |
| "loss": 0.3513, | |
| "num_input_tokens_seen": 406368, | |
| "step": 2310 | |
| }, | |
| { | |
| "epoch": 9.29718875502008, | |
| "grad_norm": 0.42681699991226196, | |
| "learning_rate": 7.570893035856091e-07, | |
| "loss": 0.3417, | |
| "num_input_tokens_seen": 407184, | |
| "step": 2315 | |
| }, | |
| { | |
| "epoch": 9.317269076305221, | |
| "grad_norm": 0.48750439286231995, | |
| "learning_rate": 7.148871954483105e-07, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 407904, | |
| "step": 2320 | |
| }, | |
| { | |
| "epoch": 9.337349397590362, | |
| "grad_norm": 0.11087116599082947, | |
| "learning_rate": 6.738782355044049e-07, | |
| "loss": 0.3387, | |
| "num_input_tokens_seen": 408736, | |
| "step": 2325 | |
| }, | |
| { | |
| "epoch": 9.357429718875501, | |
| "grad_norm": 0.4181234538555145, | |
| "learning_rate": 6.340644385588846e-07, | |
| "loss": 0.33, | |
| "num_input_tokens_seen": 409664, | |
| "step": 2330 | |
| }, | |
| { | |
| "epoch": 9.377510040160642, | |
| "grad_norm": 0.1247372180223465, | |
| "learning_rate": 5.954477606973679e-07, | |
| "loss": 0.3643, | |
| "num_input_tokens_seen": 410736, | |
| "step": 2335 | |
| }, | |
| { | |
| "epoch": 9.397590361445783, | |
| "grad_norm": 0.09054780006408691, | |
| "learning_rate": 5.580300991899989e-07, | |
| "loss": 0.3612, | |
| "num_input_tokens_seen": 411680, | |
| "step": 2340 | |
| }, | |
| { | |
| "epoch": 9.417670682730924, | |
| "grad_norm": 0.44483572244644165, | |
| "learning_rate": 5.218132923982267e-07, | |
| "loss": 0.3417, | |
| "num_input_tokens_seen": 412480, | |
| "step": 2345 | |
| }, | |
| { | |
| "epoch": 9.437751004016064, | |
| "grad_norm": 0.07839091122150421, | |
| "learning_rate": 4.867991196844918e-07, | |
| "loss": 0.3352, | |
| "num_input_tokens_seen": 413248, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 9.457831325301205, | |
| "grad_norm": 0.12925294041633606, | |
| "learning_rate": 4.5298930132480213e-07, | |
| "loss": 0.3606, | |
| "num_input_tokens_seen": 414080, | |
| "step": 2355 | |
| }, | |
| { | |
| "epoch": 9.477911646586346, | |
| "grad_norm": 0.15520651638507843, | |
| "learning_rate": 4.203854984242195e-07, | |
| "loss": 0.3481, | |
| "num_input_tokens_seen": 414928, | |
| "step": 2360 | |
| }, | |
| { | |
| "epoch": 9.497991967871485, | |
| "grad_norm": 0.4360347092151642, | |
| "learning_rate": 3.8898931283523344e-07, | |
| "loss": 0.364, | |
| "num_input_tokens_seen": 415728, | |
| "step": 2365 | |
| }, | |
| { | |
| "epoch": 9.518072289156626, | |
| "grad_norm": 0.4386354088783264, | |
| "learning_rate": 3.5880228707907417e-07, | |
| "loss": 0.3336, | |
| "num_input_tokens_seen": 416672, | |
| "step": 2370 | |
| }, | |
| { | |
| "epoch": 9.538152610441767, | |
| "grad_norm": 0.48464497923851013, | |
| "learning_rate": 3.2982590426993145e-07, | |
| "loss": 0.3419, | |
| "num_input_tokens_seen": 417520, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 9.538152610441767, | |
| "eval_loss": 0.3474566638469696, | |
| "eval_runtime": 1.2124, | |
| "eval_samples_per_second": 46.19, | |
| "eval_steps_per_second": 23.095, | |
| "num_input_tokens_seen": 417520, | |
| "step": 2375 | |
| }, | |
| { | |
| "epoch": 9.558232931726907, | |
| "grad_norm": 0.43182554841041565, | |
| "learning_rate": 3.020615880420713e-07, | |
| "loss": 0.3326, | |
| "num_input_tokens_seen": 418400, | |
| "step": 2380 | |
| }, | |
| { | |
| "epoch": 9.578313253012048, | |
| "grad_norm": 0.09252151101827621, | |
| "learning_rate": 2.7551070247990305e-07, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 419248, | |
| "step": 2385 | |
| }, | |
| { | |
| "epoch": 9.598393574297189, | |
| "grad_norm": 0.44276589155197144, | |
| "learning_rate": 2.501745520509552e-07, | |
| "loss": 0.3481, | |
| "num_input_tokens_seen": 420096, | |
| "step": 2390 | |
| }, | |
| { | |
| "epoch": 9.61847389558233, | |
| "grad_norm": 0.07001478224992752, | |
| "learning_rate": 2.2605438154179038e-07, | |
| "loss": 0.3386, | |
| "num_input_tokens_seen": 420848, | |
| "step": 2395 | |
| }, | |
| { | |
| "epoch": 9.638554216867469, | |
| "grad_norm": 0.11785867065191269, | |
| "learning_rate": 2.0315137599685174e-07, | |
| "loss": 0.3294, | |
| "num_input_tokens_seen": 421728, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 9.65863453815261, | |
| "grad_norm": 0.08471374958753586, | |
| "learning_rate": 1.814666606602261e-07, | |
| "loss": 0.3604, | |
| "num_input_tokens_seen": 422656, | |
| "step": 2405 | |
| }, | |
| { | |
| "epoch": 9.67871485943775, | |
| "grad_norm": 0.4377971589565277, | |
| "learning_rate": 1.6100130092037703e-07, | |
| "loss": 0.3457, | |
| "num_input_tokens_seen": 423600, | |
| "step": 2410 | |
| }, | |
| { | |
| "epoch": 9.698795180722891, | |
| "grad_norm": 0.12307439744472504, | |
| "learning_rate": 1.4175630225778947e-07, | |
| "loss": 0.3447, | |
| "num_input_tokens_seen": 424448, | |
| "step": 2415 | |
| }, | |
| { | |
| "epoch": 9.718875502008032, | |
| "grad_norm": 0.11983578652143478, | |
| "learning_rate": 1.237326101955677e-07, | |
| "loss": 0.3544, | |
| "num_input_tokens_seen": 425632, | |
| "step": 2420 | |
| }, | |
| { | |
| "epoch": 9.738955823293173, | |
| "grad_norm": 0.1070198193192482, | |
| "learning_rate": 1.0693111025300017e-07, | |
| "loss": 0.3385, | |
| "num_input_tokens_seen": 426576, | |
| "step": 2425 | |
| }, | |
| { | |
| "epoch": 9.759036144578314, | |
| "grad_norm": 0.44310370087623596, | |
| "learning_rate": 9.13526279020277e-08, | |
| "loss": 0.3296, | |
| "num_input_tokens_seen": 427376, | |
| "step": 2430 | |
| }, | |
| { | |
| "epoch": 9.779116465863455, | |
| "grad_norm": 0.4734679162502289, | |
| "learning_rate": 7.699792852670362e-08, | |
| "loss": 0.3514, | |
| "num_input_tokens_seen": 428256, | |
| "step": 2435 | |
| }, | |
| { | |
| "epoch": 9.799196787148594, | |
| "grad_norm": 0.13541147112846375, | |
| "learning_rate": 6.386771738558506e-08, | |
| "loss": 0.3389, | |
| "num_input_tokens_seen": 429216, | |
| "step": 2440 | |
| }, | |
| { | |
| "epoch": 9.819277108433734, | |
| "grad_norm": 0.48132413625717163, | |
| "learning_rate": 5.196263957708836e-08, | |
| "loss": 0.3542, | |
| "num_input_tokens_seen": 430208, | |
| "step": 2445 | |
| }, | |
| { | |
| "epoch": 9.839357429718875, | |
| "grad_norm": 0.4347268044948578, | |
| "learning_rate": 4.1283280007778366e-08, | |
| "loss": 0.3292, | |
| "num_input_tokens_seen": 430960, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 9.859437751004016, | |
| "grad_norm": 0.43290168046951294, | |
| "learning_rate": 3.1830163363655296e-08, | |
| "loss": 0.355, | |
| "num_input_tokens_seen": 431936, | |
| "step": 2455 | |
| }, | |
| { | |
| "epoch": 9.879518072289157, | |
| "grad_norm": 0.10507988184690475, | |
| "learning_rate": 2.3603754084358663e-08, | |
| "loss": 0.3425, | |
| "num_input_tokens_seen": 432912, | |
| "step": 2460 | |
| }, | |
| { | |
| "epoch": 9.899598393574298, | |
| "grad_norm": 0.4859294593334198, | |
| "learning_rate": 1.6604456340352235e-08, | |
| "loss": 0.3449, | |
| "num_input_tokens_seen": 433696, | |
| "step": 2465 | |
| }, | |
| { | |
| "epoch": 9.919678714859439, | |
| "grad_norm": 0.1426219940185547, | |
| "learning_rate": 1.0832614013073228e-08, | |
| "loss": 0.3513, | |
| "num_input_tokens_seen": 434528, | |
| "step": 2470 | |
| }, | |
| { | |
| "epoch": 9.939759036144578, | |
| "grad_norm": 0.08813058584928513, | |
| "learning_rate": 6.288510678031934e-09, | |
| "loss": 0.3633, | |
| "num_input_tokens_seen": 435280, | |
| "step": 2475 | |
| }, | |
| { | |
| "epoch": 9.959839357429718, | |
| "grad_norm": 0.4306187033653259, | |
| "learning_rate": 2.972369590878432e-09, | |
| "loss": 0.3483, | |
| "num_input_tokens_seen": 436096, | |
| "step": 2480 | |
| }, | |
| { | |
| "epoch": 9.97991967871486, | |
| "grad_norm": 0.07171786576509476, | |
| "learning_rate": 8.843536764419069e-10, | |
| "loss": 0.3521, | |
| "num_input_tokens_seen": 437008, | |
| "step": 2485 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 0.0965154618024826, | |
| "learning_rate": 2.4565520709285417e-11, | |
| "loss": 0.3294, | |
| "num_input_tokens_seen": 437760, | |
| "step": 2490 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "num_input_tokens_seen": 437760, | |
| "step": 2490, | |
| "total_flos": 1.971213494648832e+16, | |
| "train_loss": 0.4594599806580199, | |
| "train_runtime": 258.0589, | |
| "train_samples_per_second": 19.298, | |
| "train_steps_per_second": 9.649 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 2490, | |
| "num_input_tokens_seen": 437760, | |
| "num_train_epochs": 10, | |
| "save_steps": 125, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.971213494648832e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |