{ "best_global_step": 700, "best_metric": 2.8837223052978516, "best_model_checkpoint": "./qwen-chat-finetuned/checkpoint-700", "epoch": 2.094426229508197, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.7374715983867646, "epoch": 0.02622950819672131, "grad_norm": 36.25, "learning_rate": 1.565217391304348e-06, "loss": 4.3802, "mean_token_accuracy": 0.352320809289813, "num_tokens": 53401.0, "step": 10 }, { "entropy": 1.7163084208965302, "epoch": 0.05245901639344262, "grad_norm": 76.0, "learning_rate": 3.3043478260869567e-06, "loss": 3.9169, "mean_token_accuracy": 0.3523058544844389, "num_tokens": 110972.0, "step": 20 }, { "entropy": 1.9017599046230316, "epoch": 0.07868852459016394, "grad_norm": 20.125, "learning_rate": 5.043478260869565e-06, "loss": 3.9013, "mean_token_accuracy": 0.3457519307732582, "num_tokens": 165938.0, "step": 30 }, { "entropy": 2.472129786014557, "epoch": 0.10491803278688525, "grad_norm": 9.625, "learning_rate": 6.782608695652174e-06, "loss": 3.3305, "mean_token_accuracy": 0.3779371060431004, "num_tokens": 222674.0, "step": 40 }, { "entropy": 2.775526815652847, "epoch": 0.13114754098360656, "grad_norm": 11.5625, "learning_rate": 8.521739130434783e-06, "loss": 3.2128, "mean_token_accuracy": 0.3930466562509537, "num_tokens": 278701.0, "step": 50 }, { "entropy": 2.78606663942337, "epoch": 0.15737704918032788, "grad_norm": 10.6875, "learning_rate": 1.0260869565217393e-05, "loss": 3.1361, "mean_token_accuracy": 0.38859146609902384, "num_tokens": 334399.0, "step": 60 }, { "entropy": 2.7956872820854186, "epoch": 0.18360655737704917, "grad_norm": 12.5625, "learning_rate": 1.2e-05, "loss": 2.9741, "mean_token_accuracy": 0.40764485821127894, "num_tokens": 388307.0, "step": 70 }, { "entropy": 2.6394418656826018, "epoch": 0.2098360655737705, "grad_norm": 8.5625, "learning_rate": 1.373913043478261e-05, "loss": 3.18, "mean_token_accuracy": 0.4003832370042801, "num_tokens": 440854.0, "step": 80 }, { "entropy": 2.5192583084106444, "epoch": 0.2360655737704918, "grad_norm": 7.90625, "learning_rate": 1.5478260869565217e-05, "loss": 2.8277, "mean_token_accuracy": 0.42245824187994, "num_tokens": 496198.0, "step": 90 }, { "entropy": 2.4694125235080717, "epoch": 0.26229508196721313, "grad_norm": 7.40625, "learning_rate": 1.721739130434783e-05, "loss": 2.9568, "mean_token_accuracy": 0.4054948791861534, "num_tokens": 549533.0, "step": 100 }, { "epoch": 0.26229508196721313, "eval_entropy": 2.3915325473336613, "eval_loss": 3.1522939205169678, "eval_mean_token_accuracy": 0.41849513439571157, "eval_num_tokens": 549533.0, "eval_runtime": 5.9294, "eval_samples_per_second": 228.521, "eval_steps_per_second": 28.67, "step": 100 }, { "entropy": 2.3309587478637694, "epoch": 0.28852459016393445, "grad_norm": 5.65625, "learning_rate": 1.8956521739130434e-05, "loss": 3.0738, "mean_token_accuracy": 0.4145698197185993, "num_tokens": 604712.0, "step": 110 }, { "entropy": 2.3101665914058684, "epoch": 0.31475409836065577, "grad_norm": 8.9375, "learning_rate": 1.9999257208329125e-05, "loss": 2.9566, "mean_token_accuracy": 0.4204997166991234, "num_tokens": 660327.0, "step": 120 }, { "entropy": 2.2639360249042513, "epoch": 0.34098360655737703, "grad_norm": 12.125, "learning_rate": 1.9990902069256562e-05, "loss": 3.0436, "mean_token_accuracy": 0.4053013317286968, "num_tokens": 712709.0, "step": 130 }, { "entropy": 2.2741274297237397, "epoch": 0.36721311475409835, "grad_norm": 8.625, "learning_rate": 1.9973271084541296e-05, "loss": 2.8725, "mean_token_accuracy": 0.43222680017352105, "num_tokens": 762841.0, "step": 140 }, { "entropy": 2.245914125442505, "epoch": 0.39344262295081966, "grad_norm": 9.6875, "learning_rate": 1.9946380623305116e-05, "loss": 2.924, "mean_token_accuracy": 0.42352988123893737, "num_tokens": 813780.0, "step": 150 }, { "entropy": 2.248101031780243, "epoch": 0.419672131147541, "grad_norm": 7.75, "learning_rate": 1.991025565143858e-05, "loss": 2.9713, "mean_token_accuracy": 0.41958039328455926, "num_tokens": 866574.0, "step": 160 }, { "entropy": 2.2917182624340056, "epoch": 0.4459016393442623, "grad_norm": 10.375, "learning_rate": 1.986492970842197e-05, "loss": 2.9419, "mean_token_accuracy": 0.41459466964006425, "num_tokens": 918240.0, "step": 170 }, { "entropy": 2.278603363037109, "epoch": 0.4721311475409836, "grad_norm": 6.59375, "learning_rate": 1.9810444876186235e-05, "loss": 2.6958, "mean_token_accuracy": 0.4311013951897621, "num_tokens": 977542.0, "step": 180 }, { "entropy": 2.246600490808487, "epoch": 0.49836065573770494, "grad_norm": 8.125, "learning_rate": 1.9746851740042906e-05, "loss": 2.8167, "mean_token_accuracy": 0.4296098329126835, "num_tokens": 1033813.0, "step": 190 }, { "entropy": 2.154702216386795, "epoch": 0.5245901639344263, "grad_norm": 15.125, "learning_rate": 1.9674209341719224e-05, "loss": 2.7415, "mean_token_accuracy": 0.4451119631528854, "num_tokens": 1085702.0, "step": 200 }, { "epoch": 0.5245901639344263, "eval_entropy": 2.186812935155981, "eval_loss": 3.0109927654266357, "eval_mean_token_accuracy": 0.43699205885915193, "eval_num_tokens": 1085702.0, "eval_runtime": 5.0574, "eval_samples_per_second": 267.925, "eval_steps_per_second": 33.614, "step": 200 }, { "entropy": 2.162086147069931, "epoch": 0.5508196721311476, "grad_norm": 7.03125, "learning_rate": 1.9592585124542055e-05, "loss": 2.9784, "mean_token_accuracy": 0.4247356228530407, "num_tokens": 1137871.0, "step": 210 }, { "entropy": 2.1591184198856355, "epoch": 0.5770491803278689, "grad_norm": 9.375, "learning_rate": 1.950205487082154e-05, "loss": 2.7191, "mean_token_accuracy": 0.4592340663075447, "num_tokens": 1195487.0, "step": 220 }, { "entropy": 2.1033347219228746, "epoch": 0.6032786885245902, "grad_norm": 6.5625, "learning_rate": 1.9402702631492595e-05, "loss": 2.8622, "mean_token_accuracy": 0.4423126816749573, "num_tokens": 1245528.0, "step": 230 }, { "entropy": 2.141283091902733, "epoch": 0.6295081967213115, "grad_norm": 6.96875, "learning_rate": 1.9294620648079543e-05, "loss": 2.8314, "mean_token_accuracy": 0.4316069222986698, "num_tokens": 1300795.0, "step": 240 }, { "entropy": 2.1625925838947295, "epoch": 0.6557377049180327, "grad_norm": 10.875, "learning_rate": 1.9177909267056403e-05, "loss": 3.0047, "mean_token_accuracy": 0.42482461109757425, "num_tokens": 1356910.0, "step": 250 }, { "entropy": 2.1598483502864836, "epoch": 0.6819672131147541, "grad_norm": 10.8125, "learning_rate": 1.9052676846682278e-05, "loss": 2.7993, "mean_token_accuracy": 0.43536330983042715, "num_tokens": 1412641.0, "step": 260 }, { "entropy": 2.128132125735283, "epoch": 0.7081967213114754, "grad_norm": 6.1875, "learning_rate": 1.8919039656398387e-05, "loss": 2.8191, "mean_token_accuracy": 0.4389350414276123, "num_tokens": 1470575.0, "step": 270 }, { "entropy": 2.1286911875009538, "epoch": 0.7344262295081967, "grad_norm": 11.125, "learning_rate": 1.8777121768880108e-05, "loss": 2.7836, "mean_token_accuracy": 0.4473900467157364, "num_tokens": 1529970.0, "step": 280 }, { "entropy": 2.1200373440980913, "epoch": 0.760655737704918, "grad_norm": 8.9375, "learning_rate": 1.862705494484429e-05, "loss": 2.9483, "mean_token_accuracy": 0.4232171691954136, "num_tokens": 1582507.0, "step": 290 }, { "entropy": 2.1294125616550446, "epoch": 0.7868852459016393, "grad_norm": 8.625, "learning_rate": 1.8468978510718734e-05, "loss": 2.9155, "mean_token_accuracy": 0.4285292446613312, "num_tokens": 1633961.0, "step": 300 }, { "epoch": 0.7868852459016393, "eval_entropy": 2.1470566707498886, "eval_loss": 2.9455931186676025, "eval_mean_token_accuracy": 0.4428922993295333, "eval_num_tokens": 1633961.0, "eval_runtime": 5.0322, "eval_samples_per_second": 269.267, "eval_steps_per_second": 33.783, "step": 300 }, { "entropy": 2.105194115638733, "epoch": 0.8131147540983606, "grad_norm": 10.0625, "learning_rate": 1.8303039229287476e-05, "loss": 2.804, "mean_token_accuracy": 0.4374969556927681, "num_tokens": 1691177.0, "step": 310 }, { "entropy": 2.056723091006279, "epoch": 0.839344262295082, "grad_norm": 12.9375, "learning_rate": 1.8129391163431912e-05, "loss": 2.6799, "mean_token_accuracy": 0.459722201526165, "num_tokens": 1746899.0, "step": 320 }, { "entropy": 2.0097413182258608, "epoch": 0.8655737704918033, "grad_norm": 11.0625, "learning_rate": 1.7948195533094343e-05, "loss": 2.6355, "mean_token_accuracy": 0.4685002237558365, "num_tokens": 1797548.0, "step": 330 }, { "entropy": 2.033376136422157, "epoch": 0.8918032786885246, "grad_norm": 13.875, "learning_rate": 1.7759620565596628e-05, "loss": 2.8326, "mean_token_accuracy": 0.4427059732377529, "num_tokens": 1850940.0, "step": 340 }, { "entropy": 2.1023750483989714, "epoch": 0.9180327868852459, "grad_norm": 7.71875, "learning_rate": 1.7563841339453053e-05, "loss": 2.7896, "mean_token_accuracy": 0.4486307971179485, "num_tokens": 1906789.0, "step": 350 }, { "entropy": 2.150878208875656, "epoch": 0.9442622950819672, "grad_norm": 11.3125, "learning_rate": 1.7361039621822288e-05, "loss": 2.9686, "mean_token_accuracy": 0.4197625443339348, "num_tokens": 1960496.0, "step": 360 }, { "entropy": 2.115389880537987, "epoch": 0.9704918032786886, "grad_norm": 12.75, "learning_rate": 1.7151403699749467e-05, "loss": 2.7354, "mean_token_accuracy": 0.4496216416358948, "num_tokens": 2018312.0, "step": 370 }, { "entropy": 2.0364568412303923, "epoch": 0.9967213114754099, "grad_norm": 15.25, "learning_rate": 1.693512820535498e-05, "loss": 2.7294, "mean_token_accuracy": 0.4530358798801899, "num_tokens": 2075123.0, "step": 380 }, { "entropy": 2.071015902467676, "epoch": 1.020983606557377, "grad_norm": 11.6875, "learning_rate": 1.6712413935132307e-05, "loss": 2.785, "mean_token_accuracy": 0.4375705743158186, "num_tokens": 2127638.0, "step": 390 }, { "entropy": 1.9601984202861786, "epoch": 1.0472131147540984, "grad_norm": 8.125, "learning_rate": 1.648346766352272e-05, "loss": 2.5404, "mean_token_accuracy": 0.4696895979344845, "num_tokens": 2182937.0, "step": 400 }, { "epoch": 1.0472131147540984, "eval_entropy": 1.962728219172534, "eval_loss": 2.9205315113067627, "eval_mean_token_accuracy": 0.44614641999497134, "eval_num_tokens": 2182937.0, "eval_runtime": 5.0426, "eval_samples_per_second": 268.709, "eval_steps_per_second": 33.713, "step": 400 }, { "entropy": 1.9237431466579438, "epoch": 1.0734426229508196, "grad_norm": 14.0625, "learning_rate": 1.6248501950939788e-05, "loss": 2.6068, "mean_token_accuracy": 0.4642325811088085, "num_tokens": 2238757.0, "step": 410 }, { "entropy": 1.9247209161520005, "epoch": 1.099672131147541, "grad_norm": 23.25, "learning_rate": 1.6007734946422125e-05, "loss": 2.5779, "mean_token_accuracy": 0.48758235424757, "num_tokens": 2293678.0, "step": 420 }, { "entropy": 1.9393797785043716, "epoch": 1.1259016393442622, "grad_norm": 5.9375, "learning_rate": 1.576139018509742e-05, "loss": 2.4135, "mean_token_accuracy": 0.4751130722463131, "num_tokens": 2345349.0, "step": 430 }, { "entropy": 1.8991259425878524, "epoch": 1.1521311475409837, "grad_norm": 11.375, "learning_rate": 1.550969638064589e-05, "loss": 2.5987, "mean_token_accuracy": 0.4642434611916542, "num_tokens": 2401578.0, "step": 440 }, { "entropy": 1.9015632569789886, "epoch": 1.1783606557377049, "grad_norm": 8.1875, "learning_rate": 1.525288721295581e-05, "loss": 2.4481, "mean_token_accuracy": 0.49190146625041964, "num_tokens": 2457689.0, "step": 450 }, { "entropy": 1.8667599350214004, "epoch": 1.2045901639344263, "grad_norm": 7.1875, "learning_rate": 1.4991201111168272e-05, "loss": 2.458, "mean_token_accuracy": 0.48113037571310996, "num_tokens": 2512267.0, "step": 460 }, { "entropy": 1.8301506459712982, "epoch": 1.2308196721311475, "grad_norm": 12.25, "learning_rate": 1.4724881032312603e-05, "loss": 2.3436, "mean_token_accuracy": 0.48804719001054764, "num_tokens": 2569873.0, "step": 470 }, { "entropy": 1.8779793590307237, "epoch": 1.257049180327869, "grad_norm": 6.84375, "learning_rate": 1.4454174235737943e-05, "loss": 2.5532, "mean_token_accuracy": 0.4793685719370842, "num_tokens": 2625844.0, "step": 480 }, { "entropy": 1.8586382299661637, "epoch": 1.2832786885245901, "grad_norm": 8.375, "learning_rate": 1.4179332053550452e-05, "loss": 2.4023, "mean_token_accuracy": 0.4916908532381058, "num_tokens": 2684182.0, "step": 490 }, { "entropy": 1.864977639913559, "epoch": 1.3095081967213114, "grad_norm": 10.8125, "learning_rate": 1.3900609657269222e-05, "loss": 2.4986, "mean_token_accuracy": 0.47516297921538353, "num_tokens": 2737200.0, "step": 500 }, { "epoch": 1.3095081967213114, "eval_entropy": 1.9044717620400822, "eval_loss": 2.9017913341522217, "eval_mean_token_accuracy": 0.45116734504699707, "eval_num_tokens": 2737200.0, "eval_runtime": 5.0643, "eval_samples_per_second": 267.557, "eval_steps_per_second": 33.568, "step": 500 }, { "entropy": 1.8558655947446823, "epoch": 1.3357377049180328, "grad_norm": 10.5625, "learning_rate": 1.3618265820917565e-05, "loss": 2.5359, "mean_token_accuracy": 0.46635654121637343, "num_tokens": 2788464.0, "step": 510 }, { "entropy": 1.8646740198135376, "epoch": 1.3619672131147542, "grad_norm": 11.875, "learning_rate": 1.333256268076967e-05, "loss": 2.5993, "mean_token_accuracy": 0.47755324840545654, "num_tokens": 2840606.0, "step": 520 }, { "entropy": 1.8370872467756272, "epoch": 1.3881967213114754, "grad_norm": 9.1875, "learning_rate": 1.3043765491975602e-05, "loss": 2.5856, "mean_token_accuracy": 0.47580934390425683, "num_tokens": 2889736.0, "step": 530 }, { "entropy": 1.8797135382890702, "epoch": 1.4144262295081966, "grad_norm": 11.375, "learning_rate": 1.275214238229066e-05, "loss": 2.6422, "mean_token_accuracy": 0.4679927438497543, "num_tokens": 2938632.0, "step": 540 }, { "entropy": 1.8948182821273805, "epoch": 1.440655737704918, "grad_norm": 8.8125, "learning_rate": 1.2457964103137747e-05, "loss": 2.5687, "mean_token_accuracy": 0.46481930911540986, "num_tokens": 2992166.0, "step": 550 }, { "entropy": 1.884895321726799, "epoch": 1.4668852459016393, "grad_norm": 4.46875, "learning_rate": 1.2161503778233813e-05, "loss": 2.4362, "mean_token_accuracy": 0.49427496194839476, "num_tokens": 3047092.0, "step": 560 }, { "entropy": 1.8731955915689469, "epoch": 1.4931147540983607, "grad_norm": 5.8125, "learning_rate": 1.1863036650013817e-05, "loss": 2.5261, "mean_token_accuracy": 0.48092877194285394, "num_tokens": 3103514.0, "step": 570 }, { "entropy": 1.8721953690052033, "epoch": 1.519344262295082, "grad_norm": 12.8125, "learning_rate": 1.1562839824087604e-05, "loss": 2.6489, "mean_token_accuracy": 0.46040653586387636, "num_tokens": 3154223.0, "step": 580 }, { "entropy": 1.8559091806411743, "epoch": 1.5455737704918033, "grad_norm": 9.0625, "learning_rate": 1.1261192011966952e-05, "loss": 2.5391, "mean_token_accuracy": 0.49064989015460014, "num_tokens": 3208440.0, "step": 590 }, { "entropy": 1.8500412583351136, "epoch": 1.5718032786885245, "grad_norm": 11.375, "learning_rate": 1.0958373272301647e-05, "loss": 2.667, "mean_token_accuracy": 0.45823334604501725, "num_tokens": 3261342.0, "step": 600 }, { "epoch": 1.5718032786885245, "eval_entropy": 1.8744515650412616, "eval_loss": 2.891632556915283, "eval_mean_token_accuracy": 0.4510051518678665, "eval_num_tokens": 3261342.0, "eval_runtime": 5.0441, "eval_samples_per_second": 268.629, "eval_steps_per_second": 33.703, "step": 600 }, { "entropy": 1.8289855808019637, "epoch": 1.5980327868852457, "grad_norm": 8.1875, "learning_rate": 1.0654664750864851e-05, "loss": 2.6261, "mean_token_accuracy": 0.4699882663786411, "num_tokens": 3312799.0, "step": 610 }, { "entropy": 1.8798219680786132, "epoch": 1.6242622950819672, "grad_norm": 9.875, "learning_rate": 1.0350348419529143e-05, "loss": 2.7662, "mean_token_accuracy": 0.4601142071187496, "num_tokens": 3365971.0, "step": 620 }, { "entropy": 1.8897109836339951, "epoch": 1.6504918032786886, "grad_norm": 5.15625, "learning_rate": 1.004570681447559e-05, "loss": 2.4977, "mean_token_accuracy": 0.48899385556578634, "num_tokens": 3422509.0, "step": 630 }, { "entropy": 1.8842475891113282, "epoch": 1.6767213114754098, "grad_norm": 5.40625, "learning_rate": 9.741022773878885e-06, "loss": 2.4265, "mean_token_accuracy": 0.48201805353164673, "num_tokens": 3482220.0, "step": 640 }, { "entropy": 1.8733120083808898, "epoch": 1.702950819672131, "grad_norm": 11.4375, "learning_rate": 9.436579175312121e-06, "loss": 2.5942, "mean_token_accuracy": 0.4757945977151394, "num_tokens": 3535417.0, "step": 650 }, { "entropy": 1.8299070119857788, "epoch": 1.7291803278688525, "grad_norm": 8.0625, "learning_rate": 9.132658673114977e-06, "loss": 2.5514, "mean_token_accuracy": 0.4865307964384556, "num_tokens": 3591470.0, "step": 660 }, { "entropy": 1.836677971482277, "epoch": 1.7554098360655739, "grad_norm": 11.5, "learning_rate": 8.829543435969176e-06, "loss": 2.3891, "mean_token_accuracy": 0.5033678129315377, "num_tokens": 3645508.0, "step": 670 }, { "entropy": 1.8698895812034606, "epoch": 1.781639344262295, "grad_norm": 5.59375, "learning_rate": 8.527514884924806e-06, "loss": 2.2491, "mean_token_accuracy": 0.5068600162863731, "num_tokens": 3708816.0, "step": 680 }, { "entropy": 1.7854702413082122, "epoch": 1.8078688524590163, "grad_norm": 8.75, "learning_rate": 8.226853432120808e-06, "loss": 2.5441, "mean_token_accuracy": 0.48928667306900026, "num_tokens": 3760527.0, "step": 690 }, { "entropy": 1.8351004660129546, "epoch": 1.8340983606557377, "grad_norm": 14.0, "learning_rate": 7.927838220442138e-06, "loss": 2.6377, "mean_token_accuracy": 0.46876697689294816, "num_tokens": 3815286.0, "step": 700 }, { "epoch": 1.8340983606557377, "eval_entropy": 1.8660439694628996, "eval_loss": 2.8837223052978516, "eval_mean_token_accuracy": 0.4532979618100559, "eval_num_tokens": 3815286.0, "eval_runtime": 5.0517, "eval_samples_per_second": 268.228, "eval_steps_per_second": 33.652, "step": 700 }, { "entropy": 1.8734222501516342, "epoch": 1.8603278688524592, "grad_norm": 11.3125, "learning_rate": 7.630746864355353e-06, "loss": 2.6619, "mean_token_accuracy": 0.46061802059412005, "num_tokens": 3867525.0, "step": 710 }, { "entropy": 1.8854580372571945, "epoch": 1.8865573770491804, "grad_norm": 6.25, "learning_rate": 7.335855192163215e-06, "loss": 2.5745, "mean_token_accuracy": 0.47656749188899994, "num_tokens": 3923519.0, "step": 720 }, { "entropy": 1.879018184542656, "epoch": 1.9127868852459016, "grad_norm": 5.625, "learning_rate": 7.043436989917631e-06, "loss": 2.528, "mean_token_accuracy": 0.4868933133780956, "num_tokens": 3981456.0, "step": 730 }, { "entropy": 1.8539914727210998, "epoch": 1.939016393442623, "grad_norm": 10.375, "learning_rate": 6.753763747228687e-06, "loss": 2.3218, "mean_token_accuracy": 0.5035036608576775, "num_tokens": 4036987.0, "step": 740 }, { "entropy": 1.8262196511030198, "epoch": 1.9652459016393444, "grad_norm": 6.1875, "learning_rate": 6.467104405205744e-06, "loss": 2.5221, "mean_token_accuracy": 0.4888892278075218, "num_tokens": 4091343.0, "step": 750 }, { "entropy": 1.8170676976442337, "epoch": 1.9914754098360654, "grad_norm": 6.4375, "learning_rate": 6.183725106764643e-06, "loss": 2.3265, "mean_token_accuracy": 0.512597742676735, "num_tokens": 4146393.0, "step": 760 }, { "entropy": 1.8072978258132935, "epoch": 2.015737704918033, "grad_norm": 8.9375, "learning_rate": 5.903888949532854e-06, "loss": 2.4277, "mean_token_accuracy": 0.49353572726249695, "num_tokens": 4193522.0, "step": 770 }, { "entropy": 1.8114411652088165, "epoch": 2.041967213114754, "grad_norm": 12.375, "learning_rate": 5.627855741581935e-06, "loss": 2.4582, "mean_token_accuracy": 0.4922221273183823, "num_tokens": 4248189.0, "step": 780 }, { "entropy": 1.7977081060409545, "epoch": 2.0681967213114754, "grad_norm": 4.5625, "learning_rate": 5.355881760214121e-06, "loss": 2.3459, "mean_token_accuracy": 0.49450727328658106, "num_tokens": 4300966.0, "step": 790 }, { "entropy": 1.797827911376953, "epoch": 2.094426229508197, "grad_norm": 7.25, "learning_rate": 5.088219514026984e-06, "loss": 2.384, "mean_token_accuracy": 0.4919115357100964, "num_tokens": 4354844.0, "step": 800 }, { "epoch": 2.094426229508197, "eval_entropy": 1.8060384897624746, "eval_loss": 2.8908982276916504, "eval_mean_token_accuracy": 0.4530702908249462, "eval_num_tokens": 4354844.0, "eval_runtime": 5.0625, "eval_samples_per_second": 267.652, "eval_steps_per_second": 33.58, "step": 800 } ], "logging_steps": 10, "max_steps": 1146, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.254342993392435e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }