| { | |
| "best_global_step": 700, | |
| "best_metric": 2.8837223052978516, | |
| "best_model_checkpoint": "./qwen-chat-finetuned/checkpoint-700", | |
| "epoch": 2.094426229508197, | |
| "eval_steps": 100, | |
| "global_step": 800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 1.7374715983867646, | |
| "epoch": 0.02622950819672131, | |
| "grad_norm": 36.25, | |
| "learning_rate": 1.565217391304348e-06, | |
| "loss": 4.3802, | |
| "mean_token_accuracy": 0.352320809289813, | |
| "num_tokens": 53401.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 1.7163084208965302, | |
| "epoch": 0.05245901639344262, | |
| "grad_norm": 76.0, | |
| "learning_rate": 3.3043478260869567e-06, | |
| "loss": 3.9169, | |
| "mean_token_accuracy": 0.3523058544844389, | |
| "num_tokens": 110972.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 1.9017599046230316, | |
| "epoch": 0.07868852459016394, | |
| "grad_norm": 20.125, | |
| "learning_rate": 5.043478260869565e-06, | |
| "loss": 3.9013, | |
| "mean_token_accuracy": 0.3457519307732582, | |
| "num_tokens": 165938.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.472129786014557, | |
| "epoch": 0.10491803278688525, | |
| "grad_norm": 9.625, | |
| "learning_rate": 6.782608695652174e-06, | |
| "loss": 3.3305, | |
| "mean_token_accuracy": 0.3779371060431004, | |
| "num_tokens": 222674.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.775526815652847, | |
| "epoch": 0.13114754098360656, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 8.521739130434783e-06, | |
| "loss": 3.2128, | |
| "mean_token_accuracy": 0.3930466562509537, | |
| "num_tokens": 278701.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.78606663942337, | |
| "epoch": 0.15737704918032788, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 1.0260869565217393e-05, | |
| "loss": 3.1361, | |
| "mean_token_accuracy": 0.38859146609902384, | |
| "num_tokens": 334399.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 2.7956872820854186, | |
| "epoch": 0.18360655737704917, | |
| "grad_norm": 12.5625, | |
| "learning_rate": 1.2e-05, | |
| "loss": 2.9741, | |
| "mean_token_accuracy": 0.40764485821127894, | |
| "num_tokens": 388307.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 2.6394418656826018, | |
| "epoch": 0.2098360655737705, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 1.373913043478261e-05, | |
| "loss": 3.18, | |
| "mean_token_accuracy": 0.4003832370042801, | |
| "num_tokens": 440854.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 2.5192583084106444, | |
| "epoch": 0.2360655737704918, | |
| "grad_norm": 7.90625, | |
| "learning_rate": 1.5478260869565217e-05, | |
| "loss": 2.8277, | |
| "mean_token_accuracy": 0.42245824187994, | |
| "num_tokens": 496198.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 2.4694125235080717, | |
| "epoch": 0.26229508196721313, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 1.721739130434783e-05, | |
| "loss": 2.9568, | |
| "mean_token_accuracy": 0.4054948791861534, | |
| "num_tokens": 549533.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.26229508196721313, | |
| "eval_entropy": 2.3915325473336613, | |
| "eval_loss": 3.1522939205169678, | |
| "eval_mean_token_accuracy": 0.41849513439571157, | |
| "eval_num_tokens": 549533.0, | |
| "eval_runtime": 5.9294, | |
| "eval_samples_per_second": 228.521, | |
| "eval_steps_per_second": 28.67, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 2.3309587478637694, | |
| "epoch": 0.28852459016393445, | |
| "grad_norm": 5.65625, | |
| "learning_rate": 1.8956521739130434e-05, | |
| "loss": 3.0738, | |
| "mean_token_accuracy": 0.4145698197185993, | |
| "num_tokens": 604712.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 2.3101665914058684, | |
| "epoch": 0.31475409836065577, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 1.9999257208329125e-05, | |
| "loss": 2.9566, | |
| "mean_token_accuracy": 0.4204997166991234, | |
| "num_tokens": 660327.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 2.2639360249042513, | |
| "epoch": 0.34098360655737703, | |
| "grad_norm": 12.125, | |
| "learning_rate": 1.9990902069256562e-05, | |
| "loss": 3.0436, | |
| "mean_token_accuracy": 0.4053013317286968, | |
| "num_tokens": 712709.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 2.2741274297237397, | |
| "epoch": 0.36721311475409835, | |
| "grad_norm": 8.625, | |
| "learning_rate": 1.9973271084541296e-05, | |
| "loss": 2.8725, | |
| "mean_token_accuracy": 0.43222680017352105, | |
| "num_tokens": 762841.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 2.245914125442505, | |
| "epoch": 0.39344262295081966, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 1.9946380623305116e-05, | |
| "loss": 2.924, | |
| "mean_token_accuracy": 0.42352988123893737, | |
| "num_tokens": 813780.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 2.248101031780243, | |
| "epoch": 0.419672131147541, | |
| "grad_norm": 7.75, | |
| "learning_rate": 1.991025565143858e-05, | |
| "loss": 2.9713, | |
| "mean_token_accuracy": 0.41958039328455926, | |
| "num_tokens": 866574.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 2.2917182624340056, | |
| "epoch": 0.4459016393442623, | |
| "grad_norm": 10.375, | |
| "learning_rate": 1.986492970842197e-05, | |
| "loss": 2.9419, | |
| "mean_token_accuracy": 0.41459466964006425, | |
| "num_tokens": 918240.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 2.278603363037109, | |
| "epoch": 0.4721311475409836, | |
| "grad_norm": 6.59375, | |
| "learning_rate": 1.9810444876186235e-05, | |
| "loss": 2.6958, | |
| "mean_token_accuracy": 0.4311013951897621, | |
| "num_tokens": 977542.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 2.246600490808487, | |
| "epoch": 0.49836065573770494, | |
| "grad_norm": 8.125, | |
| "learning_rate": 1.9746851740042906e-05, | |
| "loss": 2.8167, | |
| "mean_token_accuracy": 0.4296098329126835, | |
| "num_tokens": 1033813.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 2.154702216386795, | |
| "epoch": 0.5245901639344263, | |
| "grad_norm": 15.125, | |
| "learning_rate": 1.9674209341719224e-05, | |
| "loss": 2.7415, | |
| "mean_token_accuracy": 0.4451119631528854, | |
| "num_tokens": 1085702.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5245901639344263, | |
| "eval_entropy": 2.186812935155981, | |
| "eval_loss": 3.0109927654266357, | |
| "eval_mean_token_accuracy": 0.43699205885915193, | |
| "eval_num_tokens": 1085702.0, | |
| "eval_runtime": 5.0574, | |
| "eval_samples_per_second": 267.925, | |
| "eval_steps_per_second": 33.614, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 2.162086147069931, | |
| "epoch": 0.5508196721311476, | |
| "grad_norm": 7.03125, | |
| "learning_rate": 1.9592585124542055e-05, | |
| "loss": 2.9784, | |
| "mean_token_accuracy": 0.4247356228530407, | |
| "num_tokens": 1137871.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 2.1591184198856355, | |
| "epoch": 0.5770491803278689, | |
| "grad_norm": 9.375, | |
| "learning_rate": 1.950205487082154e-05, | |
| "loss": 2.7191, | |
| "mean_token_accuracy": 0.4592340663075447, | |
| "num_tokens": 1195487.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 2.1033347219228746, | |
| "epoch": 0.6032786885245902, | |
| "grad_norm": 6.5625, | |
| "learning_rate": 1.9402702631492595e-05, | |
| "loss": 2.8622, | |
| "mean_token_accuracy": 0.4423126816749573, | |
| "num_tokens": 1245528.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 2.141283091902733, | |
| "epoch": 0.6295081967213115, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 1.9294620648079543e-05, | |
| "loss": 2.8314, | |
| "mean_token_accuracy": 0.4316069222986698, | |
| "num_tokens": 1300795.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 2.1625925838947295, | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 10.875, | |
| "learning_rate": 1.9177909267056403e-05, | |
| "loss": 3.0047, | |
| "mean_token_accuracy": 0.42482461109757425, | |
| "num_tokens": 1356910.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 2.1598483502864836, | |
| "epoch": 0.6819672131147541, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.9052676846682278e-05, | |
| "loss": 2.7993, | |
| "mean_token_accuracy": 0.43536330983042715, | |
| "num_tokens": 1412641.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 2.128132125735283, | |
| "epoch": 0.7081967213114754, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 1.8919039656398387e-05, | |
| "loss": 2.8191, | |
| "mean_token_accuracy": 0.4389350414276123, | |
| "num_tokens": 1470575.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 2.1286911875009538, | |
| "epoch": 0.7344262295081967, | |
| "grad_norm": 11.125, | |
| "learning_rate": 1.8777121768880108e-05, | |
| "loss": 2.7836, | |
| "mean_token_accuracy": 0.4473900467157364, | |
| "num_tokens": 1529970.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 2.1200373440980913, | |
| "epoch": 0.760655737704918, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 1.862705494484429e-05, | |
| "loss": 2.9483, | |
| "mean_token_accuracy": 0.4232171691954136, | |
| "num_tokens": 1582507.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 2.1294125616550446, | |
| "epoch": 0.7868852459016393, | |
| "grad_norm": 8.625, | |
| "learning_rate": 1.8468978510718734e-05, | |
| "loss": 2.9155, | |
| "mean_token_accuracy": 0.4285292446613312, | |
| "num_tokens": 1633961.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7868852459016393, | |
| "eval_entropy": 2.1470566707498886, | |
| "eval_loss": 2.9455931186676025, | |
| "eval_mean_token_accuracy": 0.4428922993295333, | |
| "eval_num_tokens": 1633961.0, | |
| "eval_runtime": 5.0322, | |
| "eval_samples_per_second": 269.267, | |
| "eval_steps_per_second": 33.783, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 2.105194115638733, | |
| "epoch": 0.8131147540983606, | |
| "grad_norm": 10.0625, | |
| "learning_rate": 1.8303039229287476e-05, | |
| "loss": 2.804, | |
| "mean_token_accuracy": 0.4374969556927681, | |
| "num_tokens": 1691177.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 2.056723091006279, | |
| "epoch": 0.839344262295082, | |
| "grad_norm": 12.9375, | |
| "learning_rate": 1.8129391163431912e-05, | |
| "loss": 2.6799, | |
| "mean_token_accuracy": 0.459722201526165, | |
| "num_tokens": 1746899.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 2.0097413182258608, | |
| "epoch": 0.8655737704918033, | |
| "grad_norm": 11.0625, | |
| "learning_rate": 1.7948195533094343e-05, | |
| "loss": 2.6355, | |
| "mean_token_accuracy": 0.4685002237558365, | |
| "num_tokens": 1797548.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 2.033376136422157, | |
| "epoch": 0.8918032786885246, | |
| "grad_norm": 13.875, | |
| "learning_rate": 1.7759620565596628e-05, | |
| "loss": 2.8326, | |
| "mean_token_accuracy": 0.4427059732377529, | |
| "num_tokens": 1850940.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 2.1023750483989714, | |
| "epoch": 0.9180327868852459, | |
| "grad_norm": 7.71875, | |
| "learning_rate": 1.7563841339453053e-05, | |
| "loss": 2.7896, | |
| "mean_token_accuracy": 0.4486307971179485, | |
| "num_tokens": 1906789.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 2.150878208875656, | |
| "epoch": 0.9442622950819672, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 1.7361039621822288e-05, | |
| "loss": 2.9686, | |
| "mean_token_accuracy": 0.4197625443339348, | |
| "num_tokens": 1960496.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 2.115389880537987, | |
| "epoch": 0.9704918032786886, | |
| "grad_norm": 12.75, | |
| "learning_rate": 1.7151403699749467e-05, | |
| "loss": 2.7354, | |
| "mean_token_accuracy": 0.4496216416358948, | |
| "num_tokens": 2018312.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 2.0364568412303923, | |
| "epoch": 0.9967213114754099, | |
| "grad_norm": 15.25, | |
| "learning_rate": 1.693512820535498e-05, | |
| "loss": 2.7294, | |
| "mean_token_accuracy": 0.4530358798801899, | |
| "num_tokens": 2075123.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 2.071015902467676, | |
| "epoch": 1.020983606557377, | |
| "grad_norm": 11.6875, | |
| "learning_rate": 1.6712413935132307e-05, | |
| "loss": 2.785, | |
| "mean_token_accuracy": 0.4375705743158186, | |
| "num_tokens": 2127638.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 1.9601984202861786, | |
| "epoch": 1.0472131147540984, | |
| "grad_norm": 8.125, | |
| "learning_rate": 1.648346766352272e-05, | |
| "loss": 2.5404, | |
| "mean_token_accuracy": 0.4696895979344845, | |
| "num_tokens": 2182937.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0472131147540984, | |
| "eval_entropy": 1.962728219172534, | |
| "eval_loss": 2.9205315113067627, | |
| "eval_mean_token_accuracy": 0.44614641999497134, | |
| "eval_num_tokens": 2182937.0, | |
| "eval_runtime": 5.0426, | |
| "eval_samples_per_second": 268.709, | |
| "eval_steps_per_second": 33.713, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 1.9237431466579438, | |
| "epoch": 1.0734426229508196, | |
| "grad_norm": 14.0625, | |
| "learning_rate": 1.6248501950939788e-05, | |
| "loss": 2.6068, | |
| "mean_token_accuracy": 0.4642325811088085, | |
| "num_tokens": 2238757.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.9247209161520005, | |
| "epoch": 1.099672131147541, | |
| "grad_norm": 23.25, | |
| "learning_rate": 1.6007734946422125e-05, | |
| "loss": 2.5779, | |
| "mean_token_accuracy": 0.48758235424757, | |
| "num_tokens": 2293678.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 1.9393797785043716, | |
| "epoch": 1.1259016393442622, | |
| "grad_norm": 5.9375, | |
| "learning_rate": 1.576139018509742e-05, | |
| "loss": 2.4135, | |
| "mean_token_accuracy": 0.4751130722463131, | |
| "num_tokens": 2345349.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 1.8991259425878524, | |
| "epoch": 1.1521311475409837, | |
| "grad_norm": 11.375, | |
| "learning_rate": 1.550969638064589e-05, | |
| "loss": 2.5987, | |
| "mean_token_accuracy": 0.4642434611916542, | |
| "num_tokens": 2401578.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 1.9015632569789886, | |
| "epoch": 1.1783606557377049, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 1.525288721295581e-05, | |
| "loss": 2.4481, | |
| "mean_token_accuracy": 0.49190146625041964, | |
| "num_tokens": 2457689.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 1.8667599350214004, | |
| "epoch": 1.2045901639344263, | |
| "grad_norm": 7.1875, | |
| "learning_rate": 1.4991201111168272e-05, | |
| "loss": 2.458, | |
| "mean_token_accuracy": 0.48113037571310996, | |
| "num_tokens": 2512267.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 1.8301506459712982, | |
| "epoch": 1.2308196721311475, | |
| "grad_norm": 12.25, | |
| "learning_rate": 1.4724881032312603e-05, | |
| "loss": 2.3436, | |
| "mean_token_accuracy": 0.48804719001054764, | |
| "num_tokens": 2569873.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 1.8779793590307237, | |
| "epoch": 1.257049180327869, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 1.4454174235737943e-05, | |
| "loss": 2.5532, | |
| "mean_token_accuracy": 0.4793685719370842, | |
| "num_tokens": 2625844.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 1.8586382299661637, | |
| "epoch": 1.2832786885245901, | |
| "grad_norm": 8.375, | |
| "learning_rate": 1.4179332053550452e-05, | |
| "loss": 2.4023, | |
| "mean_token_accuracy": 0.4916908532381058, | |
| "num_tokens": 2684182.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 1.864977639913559, | |
| "epoch": 1.3095081967213114, | |
| "grad_norm": 10.8125, | |
| "learning_rate": 1.3900609657269222e-05, | |
| "loss": 2.4986, | |
| "mean_token_accuracy": 0.47516297921538353, | |
| "num_tokens": 2737200.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.3095081967213114, | |
| "eval_entropy": 1.9044717620400822, | |
| "eval_loss": 2.9017913341522217, | |
| "eval_mean_token_accuracy": 0.45116734504699707, | |
| "eval_num_tokens": 2737200.0, | |
| "eval_runtime": 5.0643, | |
| "eval_samples_per_second": 267.557, | |
| "eval_steps_per_second": 33.568, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 1.8558655947446823, | |
| "epoch": 1.3357377049180328, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.3618265820917565e-05, | |
| "loss": 2.5359, | |
| "mean_token_accuracy": 0.46635654121637343, | |
| "num_tokens": 2788464.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 1.8646740198135376, | |
| "epoch": 1.3619672131147542, | |
| "grad_norm": 11.875, | |
| "learning_rate": 1.333256268076967e-05, | |
| "loss": 2.5993, | |
| "mean_token_accuracy": 0.47755324840545654, | |
| "num_tokens": 2840606.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 1.8370872467756272, | |
| "epoch": 1.3881967213114754, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 1.3043765491975602e-05, | |
| "loss": 2.5856, | |
| "mean_token_accuracy": 0.47580934390425683, | |
| "num_tokens": 2889736.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 1.8797135382890702, | |
| "epoch": 1.4144262295081966, | |
| "grad_norm": 11.375, | |
| "learning_rate": 1.275214238229066e-05, | |
| "loss": 2.6422, | |
| "mean_token_accuracy": 0.4679927438497543, | |
| "num_tokens": 2938632.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 1.8948182821273805, | |
| "epoch": 1.440655737704918, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 1.2457964103137747e-05, | |
| "loss": 2.5687, | |
| "mean_token_accuracy": 0.46481930911540986, | |
| "num_tokens": 2992166.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 1.884895321726799, | |
| "epoch": 1.4668852459016393, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 1.2161503778233813e-05, | |
| "loss": 2.4362, | |
| "mean_token_accuracy": 0.49427496194839476, | |
| "num_tokens": 3047092.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 1.8731955915689469, | |
| "epoch": 1.4931147540983607, | |
| "grad_norm": 5.8125, | |
| "learning_rate": 1.1863036650013817e-05, | |
| "loss": 2.5261, | |
| "mean_token_accuracy": 0.48092877194285394, | |
| "num_tokens": 3103514.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 1.8721953690052033, | |
| "epoch": 1.519344262295082, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 1.1562839824087604e-05, | |
| "loss": 2.6489, | |
| "mean_token_accuracy": 0.46040653586387636, | |
| "num_tokens": 3154223.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 1.8559091806411743, | |
| "epoch": 1.5455737704918033, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 1.1261192011966952e-05, | |
| "loss": 2.5391, | |
| "mean_token_accuracy": 0.49064989015460014, | |
| "num_tokens": 3208440.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 1.8500412583351136, | |
| "epoch": 1.5718032786885245, | |
| "grad_norm": 11.375, | |
| "learning_rate": 1.0958373272301647e-05, | |
| "loss": 2.667, | |
| "mean_token_accuracy": 0.45823334604501725, | |
| "num_tokens": 3261342.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.5718032786885245, | |
| "eval_entropy": 1.8744515650412616, | |
| "eval_loss": 2.891632556915283, | |
| "eval_mean_token_accuracy": 0.4510051518678665, | |
| "eval_num_tokens": 3261342.0, | |
| "eval_runtime": 5.0441, | |
| "eval_samples_per_second": 268.629, | |
| "eval_steps_per_second": 33.703, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 1.8289855808019637, | |
| "epoch": 1.5980327868852457, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 1.0654664750864851e-05, | |
| "loss": 2.6261, | |
| "mean_token_accuracy": 0.4699882663786411, | |
| "num_tokens": 3312799.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 1.8798219680786132, | |
| "epoch": 1.6242622950819672, | |
| "grad_norm": 9.875, | |
| "learning_rate": 1.0350348419529143e-05, | |
| "loss": 2.7662, | |
| "mean_token_accuracy": 0.4601142071187496, | |
| "num_tokens": 3365971.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 1.8897109836339951, | |
| "epoch": 1.6504918032786886, | |
| "grad_norm": 5.15625, | |
| "learning_rate": 1.004570681447559e-05, | |
| "loss": 2.4977, | |
| "mean_token_accuracy": 0.48899385556578634, | |
| "num_tokens": 3422509.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 1.8842475891113282, | |
| "epoch": 1.6767213114754098, | |
| "grad_norm": 5.40625, | |
| "learning_rate": 9.741022773878885e-06, | |
| "loss": 2.4265, | |
| "mean_token_accuracy": 0.48201805353164673, | |
| "num_tokens": 3482220.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 1.8733120083808898, | |
| "epoch": 1.702950819672131, | |
| "grad_norm": 11.4375, | |
| "learning_rate": 9.436579175312121e-06, | |
| "loss": 2.5942, | |
| "mean_token_accuracy": 0.4757945977151394, | |
| "num_tokens": 3535417.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 1.8299070119857788, | |
| "epoch": 1.7291803278688525, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 9.132658673114977e-06, | |
| "loss": 2.5514, | |
| "mean_token_accuracy": 0.4865307964384556, | |
| "num_tokens": 3591470.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 1.836677971482277, | |
| "epoch": 1.7554098360655739, | |
| "grad_norm": 11.5, | |
| "learning_rate": 8.829543435969176e-06, | |
| "loss": 2.3891, | |
| "mean_token_accuracy": 0.5033678129315377, | |
| "num_tokens": 3645508.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 1.8698895812034606, | |
| "epoch": 1.781639344262295, | |
| "grad_norm": 5.59375, | |
| "learning_rate": 8.527514884924806e-06, | |
| "loss": 2.2491, | |
| "mean_token_accuracy": 0.5068600162863731, | |
| "num_tokens": 3708816.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 1.7854702413082122, | |
| "epoch": 1.8078688524590163, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8.226853432120808e-06, | |
| "loss": 2.5441, | |
| "mean_token_accuracy": 0.48928667306900026, | |
| "num_tokens": 3760527.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 1.8351004660129546, | |
| "epoch": 1.8340983606557377, | |
| "grad_norm": 14.0, | |
| "learning_rate": 7.927838220442138e-06, | |
| "loss": 2.6377, | |
| "mean_token_accuracy": 0.46876697689294816, | |
| "num_tokens": 3815286.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.8340983606557377, | |
| "eval_entropy": 1.8660439694628996, | |
| "eval_loss": 2.8837223052978516, | |
| "eval_mean_token_accuracy": 0.4532979618100559, | |
| "eval_num_tokens": 3815286.0, | |
| "eval_runtime": 5.0517, | |
| "eval_samples_per_second": 268.228, | |
| "eval_steps_per_second": 33.652, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 1.8734222501516342, | |
| "epoch": 1.8603278688524592, | |
| "grad_norm": 11.3125, | |
| "learning_rate": 7.630746864355353e-06, | |
| "loss": 2.6619, | |
| "mean_token_accuracy": 0.46061802059412005, | |
| "num_tokens": 3867525.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 1.8854580372571945, | |
| "epoch": 1.8865573770491804, | |
| "grad_norm": 6.25, | |
| "learning_rate": 7.335855192163215e-06, | |
| "loss": 2.5745, | |
| "mean_token_accuracy": 0.47656749188899994, | |
| "num_tokens": 3923519.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 1.879018184542656, | |
| "epoch": 1.9127868852459016, | |
| "grad_norm": 5.625, | |
| "learning_rate": 7.043436989917631e-06, | |
| "loss": 2.528, | |
| "mean_token_accuracy": 0.4868933133780956, | |
| "num_tokens": 3981456.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 1.8539914727210998, | |
| "epoch": 1.939016393442623, | |
| "grad_norm": 10.375, | |
| "learning_rate": 6.753763747228687e-06, | |
| "loss": 2.3218, | |
| "mean_token_accuracy": 0.5035036608576775, | |
| "num_tokens": 4036987.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 1.8262196511030198, | |
| "epoch": 1.9652459016393444, | |
| "grad_norm": 6.1875, | |
| "learning_rate": 6.467104405205744e-06, | |
| "loss": 2.5221, | |
| "mean_token_accuracy": 0.4888892278075218, | |
| "num_tokens": 4091343.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 1.8170676976442337, | |
| "epoch": 1.9914754098360654, | |
| "grad_norm": 6.4375, | |
| "learning_rate": 6.183725106764643e-06, | |
| "loss": 2.3265, | |
| "mean_token_accuracy": 0.512597742676735, | |
| "num_tokens": 4146393.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 1.8072978258132935, | |
| "epoch": 2.015737704918033, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 5.903888949532854e-06, | |
| "loss": 2.4277, | |
| "mean_token_accuracy": 0.49353572726249695, | |
| "num_tokens": 4193522.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 1.8114411652088165, | |
| "epoch": 2.041967213114754, | |
| "grad_norm": 12.375, | |
| "learning_rate": 5.627855741581935e-06, | |
| "loss": 2.4582, | |
| "mean_token_accuracy": 0.4922221273183823, | |
| "num_tokens": 4248189.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 1.7977081060409545, | |
| "epoch": 2.0681967213114754, | |
| "grad_norm": 4.5625, | |
| "learning_rate": 5.355881760214121e-06, | |
| "loss": 2.3459, | |
| "mean_token_accuracy": 0.49450727328658106, | |
| "num_tokens": 4300966.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 1.797827911376953, | |
| "epoch": 2.094426229508197, | |
| "grad_norm": 7.25, | |
| "learning_rate": 5.088219514026984e-06, | |
| "loss": 2.384, | |
| "mean_token_accuracy": 0.4919115357100964, | |
| "num_tokens": 4354844.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.094426229508197, | |
| "eval_entropy": 1.8060384897624746, | |
| "eval_loss": 2.8908982276916504, | |
| "eval_mean_token_accuracy": 0.4530702908249462, | |
| "eval_num_tokens": 4354844.0, | |
| "eval_runtime": 5.0625, | |
| "eval_samples_per_second": 267.652, | |
| "eval_steps_per_second": 33.58, | |
| "step": 800 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1146, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.254342993392435e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |