{ "best_global_step": 490, "best_metric": 1.0370746850967407, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 10, "global_step": 600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 4.007787227630615, "learning_rate": 5e-05, "loss": 5.2601, "step": 10 }, { "epoch": 0.1, "eval_loss": 4.682629585266113, "eval_mean_token_accuracy": 0.37830855041742323, "eval_num_tokens": 6333.0, "eval_runtime": 97.2723, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 10 }, { "epoch": 0.2, "grad_norm": 8.059301376342773, "learning_rate": 9.999927156177032e-05, "loss": 3.7743, "step": 20 }, { "epoch": 0.2, "eval_loss": 2.5042078495025635, "eval_mean_token_accuracy": 0.6304361110925675, "eval_num_tokens": 12685.0, "eval_runtime": 97.2264, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 20 }, { "epoch": 0.3, "grad_norm": 2.4571871757507324, "learning_rate": 9.991188465340766e-05, "loss": 2.0475, "step": 30 }, { "epoch": 0.3, "eval_loss": 1.8001786470413208, "eval_mean_token_accuracy": 0.6825134682655335, "eval_num_tokens": 19121.0, "eval_runtime": 97.26, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 30 }, { "epoch": 0.4, "grad_norm": 1.8676475286483765, "learning_rate": 9.967910180154889e-05, "loss": 1.6727, "step": 40 }, { "epoch": 0.4, "eval_loss": 1.611749529838562, "eval_mean_token_accuracy": 0.6887511330842971, "eval_num_tokens": 25515.0, "eval_runtime": 97.2714, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 40 }, { "epoch": 0.5, "grad_norm": 1.9121489524841309, "learning_rate": 9.930160111487716e-05, "loss": 1.5602, "step": 50 }, { "epoch": 0.5, "eval_loss": 1.519342303276062, "eval_mean_token_accuracy": 0.6936497485637665, "eval_num_tokens": 31902.0, "eval_runtime": 97.2536, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 50 }, { "epoch": 0.6, "grad_norm": 1.4662973880767822, "learning_rate": 9.87804822727352e-05, "loss": 1.4916, "step": 60 }, { "epoch": 0.6, "eval_loss": 1.4659887552261353, "eval_mean_token_accuracy": 0.6948034042119979, "eval_num_tokens": 38297.0, "eval_runtime": 97.2339, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 60 }, { "epoch": 0.7, "grad_norm": 2.132735252380371, "learning_rate": 9.811726332170153e-05, "loss": 1.4291, "step": 70 }, { "epoch": 0.7, "eval_loss": 1.422730565071106, "eval_mean_token_accuracy": 0.6967671060562134, "eval_num_tokens": 44694.0, "eval_runtime": 97.2271, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 70 }, { "epoch": 0.8, "grad_norm": 1.3734078407287598, "learning_rate": 9.731387625344104e-05, "loss": 1.3832, "step": 80 }, { "epoch": 0.8, "eval_loss": 1.3986200094223022, "eval_mean_token_accuracy": 0.6979268860816955, "eval_num_tokens": 51069.0, "eval_runtime": 97.2072, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 80 }, { "epoch": 0.9, "grad_norm": 1.4604731798171997, "learning_rate": 9.637266137671177e-05, "loss": 1.3693, "step": 90 }, { "epoch": 0.9, "eval_loss": 1.3532851934432983, "eval_mean_token_accuracy": 0.7013956385850907, "eval_num_tokens": 57445.0, "eval_runtime": 97.2573, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 90 }, { "epoch": 1.0, "grad_norm": 1.3173726797103882, "learning_rate": 9.529636049992234e-05, "loss": 1.3143, "step": 100 }, { "epoch": 1.0, "eval_loss": 1.2985877990722656, "eval_mean_token_accuracy": 0.7077123075723648, "eval_num_tokens": 63853.0, "eval_runtime": 97.2409, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 100 }, { "epoch": 1.1, "grad_norm": 1.3028841018676758, "learning_rate": 9.408810894410009e-05, "loss": 1.2416, "step": 110 }, { "epoch": 1.1, "eval_loss": 1.2666221857070923, "eval_mean_token_accuracy": 0.7159780770540237, "eval_num_tokens": 70269.0, "eval_runtime": 97.2354, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 110 }, { "epoch": 1.2, "grad_norm": 1.479728102684021, "learning_rate": 9.2751426409536e-05, "loss": 1.208, "step": 120 }, { "epoch": 1.2, "eval_loss": 1.2293524742126465, "eval_mean_token_accuracy": 0.7204430556297302, "eval_num_tokens": 76691.0, "eval_runtime": 97.2278, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 120 }, { "epoch": 1.3, "grad_norm": 1.6283237934112549, "learning_rate": 9.129020672271283e-05, "loss": 1.1881, "step": 130 }, { "epoch": 1.3, "eval_loss": 1.2158491611480713, "eval_mean_token_accuracy": 0.7250880861282348, "eval_num_tokens": 83083.0, "eval_runtime": 97.221, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 130 }, { "epoch": 1.4, "grad_norm": 1.2515714168548584, "learning_rate": 8.970870649338387e-05, "loss": 1.1893, "step": 140 }, { "epoch": 1.4, "eval_loss": 1.2067173719406128, "eval_mean_token_accuracy": 0.7283873379230499, "eval_num_tokens": 89470.0, "eval_runtime": 97.217, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 140 }, { "epoch": 1.5, "grad_norm": 1.7679073810577393, "learning_rate": 8.801153271484502e-05, "loss": 1.1742, "step": 150 }, { "epoch": 1.5, "eval_loss": 1.2018429040908813, "eval_mean_token_accuracy": 0.7298205083608628, "eval_num_tokens": 95793.0, "eval_runtime": 97.2454, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 150 }, { "epoch": 1.6, "grad_norm": 1.4671496152877808, "learning_rate": 8.620362934352109e-05, "loss": 1.1713, "step": 160 }, { "epoch": 1.6, "eval_loss": 1.2024760246276855, "eval_mean_token_accuracy": 0.7248463779687881, "eval_num_tokens": 102197.0, "eval_runtime": 97.2434, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 160 }, { "epoch": 1.7, "grad_norm": 1.3356602191925049, "learning_rate": 8.429026289696091e-05, "loss": 1.1616, "step": 170 }, { "epoch": 1.7, "eval_loss": 1.1988317966461182, "eval_mean_token_accuracy": 0.7268992912769318, "eval_num_tokens": 108568.0, "eval_runtime": 97.2357, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 170 }, { "epoch": 1.8, "grad_norm": 1.8510276079177856, "learning_rate": 8.227700711219493e-05, "loss": 1.1541, "step": 180 }, { "epoch": 1.8, "eval_loss": 1.1827600002288818, "eval_mean_token_accuracy": 0.7310593771934509, "eval_num_tokens": 114964.0, "eval_runtime": 97.2104, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 180 }, { "epoch": 1.9, "grad_norm": 1.4240201711654663, "learning_rate": 8.016972670914624e-05, "loss": 1.1708, "step": 190 }, { "epoch": 1.9, "eval_loss": 1.1701805591583252, "eval_mean_token_accuracy": 0.7321947473287582, "eval_num_tokens": 121333.0, "eval_runtime": 97.2429, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 190 }, { "epoch": 2.0, "grad_norm": 1.8542824983596802, "learning_rate": 7.797456030639313e-05, "loss": 1.1509, "step": 200 }, { "epoch": 2.0, "eval_loss": 1.157992959022522, "eval_mean_token_accuracy": 0.7394243097305297, "eval_num_tokens": 127706.0, "eval_runtime": 97.2191, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 200 }, { "epoch": 2.1, "grad_norm": 1.5795457363128662, "learning_rate": 7.569790253905059e-05, "loss": 1.0941, "step": 210 }, { "epoch": 2.1, "eval_loss": 1.148593544960022, "eval_mean_token_accuracy": 0.7429650634527206, "eval_num_tokens": 134095.0, "eval_runtime": 97.2383, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 210 }, { "epoch": 2.2, "grad_norm": 1.4866969585418701, "learning_rate": 7.334638543086203e-05, "loss": 1.0931, "step": 220 }, { "epoch": 2.2, "eval_loss": 1.1364799737930298, "eval_mean_token_accuracy": 0.7470852738618851, "eval_num_tokens": 140527.0, "eval_runtime": 97.2225, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 220 }, { "epoch": 2.3, "grad_norm": 1.651310920715332, "learning_rate": 7.092685907476558e-05, "loss": 1.0897, "step": 230 }, { "epoch": 2.3, "eval_loss": 1.1412357091903687, "eval_mean_token_accuracy": 0.7455077153444291, "eval_num_tokens": 146939.0, "eval_runtime": 97.2224, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 230 }, { "epoch": 2.4, "grad_norm": 1.3703193664550781, "learning_rate": 6.844637167821326e-05, "loss": 1.0945, "step": 240 }, { "epoch": 2.4, "eval_loss": 1.1335656642913818, "eval_mean_token_accuracy": 0.7461127752065658, "eval_num_tokens": 153360.0, "eval_runtime": 97.2102, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 240 }, { "epoch": 2.5, "grad_norm": 1.7046772241592407, "learning_rate": 6.59121490313722e-05, "loss": 1.0803, "step": 250 }, { "epoch": 2.5, "eval_loss": 1.1254699230194092, "eval_mean_token_accuracy": 0.7490965259075165, "eval_num_tokens": 159731.0, "eval_runtime": 97.2222, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 250 }, { "epoch": 2.6, "grad_norm": 1.6481680870056152, "learning_rate": 6.333157345801809e-05, "loss": 1.0859, "step": 260 }, { "epoch": 2.6, "eval_loss": 1.1334922313690186, "eval_mean_token_accuracy": 0.7479477733373642, "eval_num_tokens": 166113.0, "eval_runtime": 97.2341, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 260 }, { "epoch": 2.7, "grad_norm": 1.7207422256469727, "learning_rate": 6.071216231043799e-05, "loss": 1.0934, "step": 270 }, { "epoch": 2.7, "eval_loss": 1.1130690574645996, "eval_mean_token_accuracy": 0.7543548595905304, "eval_num_tokens": 172481.0, "eval_runtime": 97.2173, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 270 }, { "epoch": 2.8, "grad_norm": 1.7942646741867065, "learning_rate": 5.8061546070987994e-05, "loss": 1.0733, "step": 280 }, { "epoch": 2.8, "eval_loss": 1.1070621013641357, "eval_mean_token_accuracy": 0.7557503712177277, "eval_num_tokens": 178813.0, "eval_runtime": 97.2529, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 280 }, { "epoch": 2.9, "grad_norm": 1.5024192333221436, "learning_rate": 5.538744612409701e-05, "loss": 1.0767, "step": 290 }, { "epoch": 2.9, "eval_loss": 1.109175205230713, "eval_mean_token_accuracy": 0.7545353853702546, "eval_num_tokens": 185220.0, "eval_runtime": 97.25, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 290 }, { "epoch": 3.0, "grad_norm": 1.7613085508346558, "learning_rate": 5.2697652263468125e-05, "loss": 1.0574, "step": 300 }, { "epoch": 3.0, "eval_loss": 1.0976698398590088, "eval_mean_token_accuracy": 0.759365046620369, "eval_num_tokens": 191559.0, "eval_runtime": 97.2105, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 300 }, { "epoch": 3.1, "grad_norm": 1.6670249700546265, "learning_rate": 5e-05, "loss": 1.0182, "step": 310 }, { "epoch": 3.1, "eval_loss": 1.094779372215271, "eval_mean_token_accuracy": 0.76161092877388, "eval_num_tokens": 197915.0, "eval_runtime": 97.2406, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 310 }, { "epoch": 3.2, "grad_norm": 1.7392572164535522, "learning_rate": 4.730234773653188e-05, "loss": 1.007, "step": 320 }, { "epoch": 3.2, "eval_loss": 1.0834596157073975, "eval_mean_token_accuracy": 0.7656834137439728, "eval_num_tokens": 204252.0, "eval_runtime": 97.2394, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 320 }, { "epoch": 3.3, "grad_norm": 2.0105206966400146, "learning_rate": 4.461255387590299e-05, "loss": 1.0119, "step": 330 }, { "epoch": 3.3, "eval_loss": 1.0809874534606934, "eval_mean_token_accuracy": 0.7655541002750397, "eval_num_tokens": 210574.0, "eval_runtime": 97.2409, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 330 }, { "epoch": 3.4, "grad_norm": 1.759490966796875, "learning_rate": 4.193845392901201e-05, "loss": 0.9974, "step": 340 }, { "epoch": 3.4, "eval_loss": 1.0805025100708008, "eval_mean_token_accuracy": 0.7660670423507691, "eval_num_tokens": 216977.0, "eval_runtime": 97.206, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 340 }, { "epoch": 3.5, "grad_norm": 1.8894627094268799, "learning_rate": 3.9287837689562016e-05, "loss": 1.0107, "step": 350 }, { "epoch": 3.5, "eval_loss": 1.0740913152694702, "eval_mean_token_accuracy": 0.7694036465883255, "eval_num_tokens": 223412.0, "eval_runtime": 97.2057, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 350 }, { "epoch": 3.6, "grad_norm": 2.0457475185394287, "learning_rate": 3.666842654198191e-05, "loss": 1.0004, "step": 360 }, { "epoch": 3.6, "eval_loss": 1.0669925212860107, "eval_mean_token_accuracy": 0.7717740494012832, "eval_num_tokens": 229731.0, "eval_runtime": 97.1971, "eval_samples_per_second": 2.058, "eval_steps_per_second": 1.029, "step": 360 }, { "epoch": 3.7, "grad_norm": 1.5154274702072144, "learning_rate": 3.408785096862782e-05, "loss": 0.9902, "step": 370 }, { "epoch": 3.7, "eval_loss": 1.0629109144210815, "eval_mean_token_accuracy": 0.7715724587440491, "eval_num_tokens": 236188.0, "eval_runtime": 97.203, "eval_samples_per_second": 2.058, "eval_steps_per_second": 1.029, "step": 370 }, { "epoch": 3.8, "grad_norm": 2.06585693359375, "learning_rate": 3.1553628321786745e-05, "loss": 1.0053, "step": 380 }, { "epoch": 3.8, "eval_loss": 1.0563435554504395, "eval_mean_token_accuracy": 0.7732643353939056, "eval_num_tokens": 242558.0, "eval_runtime": 97.2143, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 380 }, { "epoch": 3.9, "grad_norm": 1.730677843093872, "learning_rate": 2.907314092523442e-05, "loss": 0.9813, "step": 390 }, { "epoch": 3.9, "eval_loss": 1.0525062084197998, "eval_mean_token_accuracy": 0.7765388804674148, "eval_num_tokens": 249000.0, "eval_runtime": 97.2525, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 390 }, { "epoch": 4.0, "grad_norm": 1.5716614723205566, "learning_rate": 2.6653614569137968e-05, "loss": 0.9852, "step": 400 }, { "epoch": 4.0, "eval_loss": 1.0519495010375977, "eval_mean_token_accuracy": 0.7759864777326584, "eval_num_tokens": 255412.0, "eval_runtime": 97.2138, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 400 }, { "epoch": 4.1, "grad_norm": 1.607945203781128, "learning_rate": 2.430209746094943e-05, "loss": 0.9515, "step": 410 }, { "epoch": 4.1, "eval_loss": 1.0613422393798828, "eval_mean_token_accuracy": 0.7760749870538711, "eval_num_tokens": 261847.0, "eval_runtime": 97.2595, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 410 }, { "epoch": 4.2, "grad_norm": 1.7474143505096436, "learning_rate": 2.2025439693606882e-05, "loss": 0.9453, "step": 420 }, { "epoch": 4.2, "eval_loss": 1.0487031936645508, "eval_mean_token_accuracy": 0.7783306258916854, "eval_num_tokens": 268220.0, "eval_runtime": 97.2251, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.029, "step": 420 }, { "epoch": 4.3, "grad_norm": 1.3773257732391357, "learning_rate": 1.983027329085377e-05, "loss": 0.9398, "step": 430 }, { "epoch": 4.3, "eval_loss": 1.0485966205596924, "eval_mean_token_accuracy": 0.7771646714210511, "eval_num_tokens": 274612.0, "eval_runtime": 97.2492, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 430 }, { "epoch": 4.4, "grad_norm": 1.5962660312652588, "learning_rate": 1.772299288780508e-05, "loss": 0.9283, "step": 440 }, { "epoch": 4.4, "eval_loss": 1.049791693687439, "eval_mean_token_accuracy": 0.778401963710785, "eval_num_tokens": 281009.0, "eval_runtime": 97.2759, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 440 }, { "epoch": 4.5, "grad_norm": 2.201416015625, "learning_rate": 1.5709737103039103e-05, "loss": 0.944, "step": 450 }, { "epoch": 4.5, "eval_loss": 1.0448309183120728, "eval_mean_token_accuracy": 0.7783755934238434, "eval_num_tokens": 287341.0, "eval_runtime": 97.2585, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 450 }, { "epoch": 4.6, "grad_norm": 1.6953043937683105, "learning_rate": 1.3796370656478935e-05, "loss": 0.9357, "step": 460 }, { "epoch": 4.6, "eval_loss": 1.0441021919250488, "eval_mean_token_accuracy": 0.7798552727699279, "eval_num_tokens": 293667.0, "eval_runtime": 97.2712, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 460 }, { "epoch": 4.7, "grad_norm": 2.495961904525757, "learning_rate": 1.1988467285154987e-05, "loss": 0.9543, "step": 470 }, { "epoch": 4.7, "eval_loss": 1.0410436391830444, "eval_mean_token_accuracy": 0.7808067119121551, "eval_num_tokens": 300075.0, "eval_runtime": 97.273, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 470 }, { "epoch": 4.8, "grad_norm": 2.314516067504883, "learning_rate": 1.0291293506616133e-05, "loss": 0.9453, "step": 480 }, { "epoch": 4.8, "eval_loss": 1.040062427520752, "eval_mean_token_accuracy": 0.7821026688814163, "eval_num_tokens": 306434.0, "eval_runtime": 97.2666, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 480 }, { "epoch": 4.9, "grad_norm": 2.006592035293579, "learning_rate": 8.70979327728718e-06, "loss": 0.9376, "step": 490 }, { "epoch": 4.9, "eval_loss": 1.0370746850967407, "eval_mean_token_accuracy": 0.7834529572725296, "eval_num_tokens": 312881.0, "eval_runtime": 97.2721, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 490 }, { "epoch": 5.0, "grad_norm": 1.7751625776290894, "learning_rate": 7.248573590464014e-06, "loss": 0.9288, "step": 500 }, { "epoch": 5.0, "eval_loss": 1.0378228425979614, "eval_mean_token_accuracy": 0.7834442704916, "eval_num_tokens": 319265.0, "eval_runtime": 97.2917, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 500 }, { "epoch": 5.1, "grad_norm": 1.7418688535690308, "learning_rate": 5.91189105589992e-06, "loss": 0.9145, "step": 510 }, { "epoch": 5.1, "eval_loss": 1.0420323610305786, "eval_mean_token_accuracy": 0.7838906270265579, "eval_num_tokens": 325626.0, "eval_runtime": 97.2737, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 510 }, { "epoch": 5.2, "grad_norm": 1.826409101486206, "learning_rate": 4.703639500077656e-06, "loss": 0.9237, "step": 520 }, { "epoch": 5.2, "eval_loss": 1.0423357486724854, "eval_mean_token_accuracy": 0.7844956815242767, "eval_num_tokens": 331976.0, "eval_runtime": 97.2685, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 520 }, { "epoch": 5.3, "grad_norm": 1.7615423202514648, "learning_rate": 3.6273386232882343e-06, "loss": 0.9201, "step": 530 }, { "epoch": 5.3, "eval_loss": 1.0414623022079468, "eval_mean_token_accuracy": 0.7839177978038788, "eval_num_tokens": 338413.0, "eval_runtime": 97.2327, "eval_samples_per_second": 2.057, "eval_steps_per_second": 1.028, "step": 530 }, { "epoch": 5.4, "grad_norm": 1.5898215770721436, "learning_rate": 2.686123746558961e-06, "loss": 0.9002, "step": 540 }, { "epoch": 5.4, "eval_loss": 1.0424439907073975, "eval_mean_token_accuracy": 0.7836556518077851, "eval_num_tokens": 344800.0, "eval_runtime": 97.2686, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 540 }, { "epoch": 5.5, "grad_norm": 1.721126675605774, "learning_rate": 1.8827366782984913e-06, "loss": 0.9025, "step": 550 }, { "epoch": 5.5, "eval_loss": 1.043124794960022, "eval_mean_token_accuracy": 0.7836492872238159, "eval_num_tokens": 351242.0, "eval_runtime": 97.3002, "eval_samples_per_second": 2.055, "eval_steps_per_second": 1.028, "step": 550 }, { "epoch": 5.6, "grad_norm": 1.763380527496338, "learning_rate": 1.2195177272648127e-06, "loss": 0.9187, "step": 560 }, { "epoch": 5.6, "eval_loss": 1.0419065952301025, "eval_mean_token_accuracy": 0.7836943608522415, "eval_num_tokens": 357626.0, "eval_runtime": 97.2575, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 560 }, { "epoch": 5.7, "grad_norm": 1.7301427125930786, "learning_rate": 6.983988851228473e-07, "loss": 0.9192, "step": 570 }, { "epoch": 5.7, "eval_loss": 1.0414124727249146, "eval_mean_token_accuracy": 0.784173795580864, "eval_num_tokens": 363989.0, "eval_runtime": 97.2552, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 570 }, { "epoch": 5.8, "grad_norm": 1.9148461818695068, "learning_rate": 3.208981984511195e-07, "loss": 0.9075, "step": 580 }, { "epoch": 5.8, "eval_loss": 1.0412297248840332, "eval_mean_token_accuracy": 0.7838809263706207, "eval_num_tokens": 370407.0, "eval_runtime": 97.2933, "eval_samples_per_second": 2.056, "eval_steps_per_second": 1.028, "step": 580 }, { "epoch": 5.9, "grad_norm": 1.3000129461288452, "learning_rate": 8.811534659234899e-08, "loss": 0.9122, "step": 590 }, { "epoch": 5.9, "eval_loss": 1.041398048400879, "eval_mean_token_accuracy": 0.7838524436950683, "eval_num_tokens": 376689.0, "eval_runtime": 97.3246, "eval_samples_per_second": 2.055, "eval_steps_per_second": 1.027, "step": 590 }, { "epoch": 6.0, "grad_norm": 1.879518747329712, "learning_rate": 7.284382296801617e-10, "loss": 0.9134, "step": 600 }, { "epoch": 6.0, "eval_loss": 1.0410884618759155, "eval_mean_token_accuracy": 0.7832156884670257, "eval_num_tokens": 383118.0, "eval_runtime": 97.3129, "eval_samples_per_second": 2.055, "eval_steps_per_second": 1.028, "step": 600 } ], "logging_steps": 10, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.777512987967488e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }