{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.4624954262714964, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.5018481526523828, "epoch": 0.029271862422246615, "grad_norm": 53.5, "learning_rate": 1.8000000000000001e-06, "loss": 0.9524, "mean_token_accuracy": 0.8672725282609463, "num_tokens": 178219.0, "step": 10 }, { "entropy": 0.4691743772476912, "epoch": 0.05854372484449323, "grad_norm": 213.0, "learning_rate": 3.8000000000000005e-06, "loss": 0.7404, "mean_token_accuracy": 0.8850074164569378, "num_tokens": 364483.0, "step": 20 }, { "entropy": 0.4770399186760187, "epoch": 0.08781558726673985, "grad_norm": 15.5, "learning_rate": 5.8e-06, "loss": 0.4118, "mean_token_accuracy": 0.9136587515473366, "num_tokens": 544552.0, "step": 30 }, { "entropy": 0.4692947888746858, "epoch": 0.11708744968898646, "grad_norm": 11.9375, "learning_rate": 7.800000000000002e-06, "loss": 0.2463, "mean_token_accuracy": 0.9470756828784943, "num_tokens": 717853.0, "step": 40 }, { "entropy": 0.4310881284996867, "epoch": 0.14635931211123307, "grad_norm": 37.75, "learning_rate": 9.800000000000001e-06, "loss": 0.1406, "mean_token_accuracy": 0.967316172271967, "num_tokens": 892769.0, "step": 50 }, { "entropy": 0.47044283971190454, "epoch": 0.1756311745334797, "grad_norm": 5.90625, "learning_rate": 9.997902051783373e-06, "loss": 0.1637, "mean_token_accuracy": 0.9594391070306301, "num_tokens": 1062544.0, "step": 60 }, { "entropy": 0.4564293924719095, "epoch": 0.20490303695572631, "grad_norm": 8.8125, "learning_rate": 9.990652145366113e-06, "loss": 0.1078, "mean_token_accuracy": 0.9732326805591583, "num_tokens": 1238635.0, "step": 70 }, { "entropy": 0.48633960634469986, "epoch": 0.23417489937797292, "grad_norm": 3.296875, "learning_rate": 9.978231889316302e-06, "loss": 0.1079, "mean_token_accuracy": 0.9725632824003696, "num_tokens": 1422527.0, "step": 80 }, { "entropy": 0.5045951271429658, "epoch": 0.26344676180021953, "grad_norm": 4.96875, "learning_rate": 9.960654151103846e-06, "loss": 0.1129, "mean_token_accuracy": 0.971237526088953, "num_tokens": 1604412.0, "step": 90 }, { "entropy": 0.5754191160202027, "epoch": 0.29271862422246614, "grad_norm": 3.546875, "learning_rate": 9.937937141385323e-06, "loss": 0.1192, "mean_token_accuracy": 0.9681814014911652, "num_tokens": 1785808.0, "step": 100 }, { "entropy": 0.6233104987069964, "epoch": 0.32199048664471275, "grad_norm": 16.875, "learning_rate": 9.91010439513761e-06, "loss": 0.1532, "mean_token_accuracy": 0.9626872226595878, "num_tokens": 1963662.0, "step": 110 }, { "entropy": 0.592149405926466, "epoch": 0.3512623490669594, "grad_norm": 7.3125, "learning_rate": 9.87718474727549e-06, "loss": 0.1171, "mean_token_accuracy": 0.9693892918527126, "num_tokens": 2138861.0, "step": 120 }, { "entropy": 0.5508204206824303, "epoch": 0.380534211489206, "grad_norm": 11.875, "learning_rate": 9.839212302778493e-06, "loss": 0.0928, "mean_token_accuracy": 0.9756750777363777, "num_tokens": 2314376.0, "step": 130 }, { "entropy": 0.5581824988126755, "epoch": 0.40980607391145263, "grad_norm": 5.4375, "learning_rate": 9.796226401357884e-06, "loss": 0.1213, "mean_token_accuracy": 0.9697595350444317, "num_tokens": 2494288.0, "step": 140 }, { "entropy": 0.5680775640532374, "epoch": 0.43907793633369924, "grad_norm": 3.390625, "learning_rate": 9.748271576700476e-06, "loss": 0.1081, "mean_token_accuracy": 0.9728541418910026, "num_tokens": 2682554.0, "step": 150 }, { "entropy": 0.5555528217926622, "epoch": 0.46834979875594585, "grad_norm": 28.875, "learning_rate": 9.69539751033141e-06, "loss": 0.1001, "mean_token_accuracy": 0.972498145699501, "num_tokens": 2855437.0, "step": 160 }, { "entropy": 0.5888866983354092, "epoch": 0.49762166117819245, "grad_norm": 6.0625, "learning_rate": 9.637658980143771e-06, "loss": 0.091, "mean_token_accuracy": 0.9761380776762962, "num_tokens": 3034970.0, "step": 170 }, { "entropy": 0.6018309678882361, "epoch": 0.5268935236004391, "grad_norm": 5.09375, "learning_rate": 9.575115803648303e-06, "loss": 0.0946, "mean_token_accuracy": 0.9742907010018825, "num_tokens": 3208412.0, "step": 180 }, { "entropy": 0.6276524242013692, "epoch": 0.5561653860226857, "grad_norm": 2.75, "learning_rate": 9.507832776002069e-06, "loss": 0.0982, "mean_token_accuracy": 0.9730873227119445, "num_tokens": 3395494.0, "step": 190 }, { "entropy": 0.6615390798076988, "epoch": 0.5854372484449323, "grad_norm": 4.46875, "learning_rate": 9.43587960288023e-06, "loss": 0.1392, "mean_token_accuracy": 0.965959795564413, "num_tokens": 3573674.0, "step": 200 }, { "entropy": 0.6626946024596692, "epoch": 0.6147091108671789, "grad_norm": 20.125, "learning_rate": 9.359330828260477e-06, "loss": 0.1367, "mean_token_accuracy": 0.9644782721996308, "num_tokens": 3749797.0, "step": 210 }, { "entropy": 0.6417636029422283, "epoch": 0.6439809732894255, "grad_norm": 7.71875, "learning_rate": 9.278265757194983e-06, "loss": 0.0971, "mean_token_accuracy": 0.9736645758152008, "num_tokens": 3927226.0, "step": 220 }, { "entropy": 0.6675109906122089, "epoch": 0.6732528357116722, "grad_norm": 20.375, "learning_rate": 9.1927683736498e-06, "loss": 0.1292, "mean_token_accuracy": 0.9648571148514747, "num_tokens": 4107295.0, "step": 230 }, { "entropy": 0.6541570518165827, "epoch": 0.7025246981339188, "grad_norm": 11.875, "learning_rate": 9.102927253496926e-06, "loss": 0.1156, "mean_token_accuracy": 0.9699708484113216, "num_tokens": 4286908.0, "step": 240 }, { "entropy": 0.6058449555188418, "epoch": 0.7317965605561654, "grad_norm": 6.78125, "learning_rate": 9.008835472749085e-06, "loss": 0.0923, "mean_token_accuracy": 0.9739408694207669, "num_tokens": 4467461.0, "step": 250 }, { "entropy": 0.6178297221660614, "epoch": 0.761068422978412, "grad_norm": 5.25, "learning_rate": 8.910590511132339e-06, "loss": 0.0817, "mean_token_accuracy": 0.9773662097752094, "num_tokens": 4646737.0, "step": 260 }, { "entropy": 0.6829405203461647, "epoch": 0.7903402854006586, "grad_norm": 3.40625, "learning_rate": 8.808294151096436e-06, "loss": 0.092, "mean_token_accuracy": 0.9755907997488975, "num_tokens": 4821010.0, "step": 270 }, { "entropy": 0.6392896398901939, "epoch": 0.8196121478229053, "grad_norm": 4.65625, "learning_rate": 8.702052372367496e-06, "loss": 0.0871, "mean_token_accuracy": 0.9771530278027057, "num_tokens": 4999440.0, "step": 280 }, { "entropy": 0.7133271377533674, "epoch": 0.8488840102451518, "grad_norm": 12.3125, "learning_rate": 8.591975242152293e-06, "loss": 0.1063, "mean_token_accuracy": 0.9715632744133472, "num_tokens": 5171265.0, "step": 290 }, { "entropy": 0.6667846977710724, "epoch": 0.8781558726673985, "grad_norm": 3.71875, "learning_rate": 8.478176801107872e-06, "loss": 0.1216, "mean_token_accuracy": 0.9661677174270153, "num_tokens": 5338761.0, "step": 300 }, { "entropy": 0.7178533673286438, "epoch": 0.907427735089645, "grad_norm": 8.6875, "learning_rate": 8.360774945194666e-06, "loss": 0.1041, "mean_token_accuracy": 0.9715993881225586, "num_tokens": 5515062.0, "step": 310 }, { "entropy": 0.7344689719378948, "epoch": 0.9366995975118917, "grad_norm": 3.5, "learning_rate": 8.239891303535457e-06, "loss": 0.0974, "mean_token_accuracy": 0.9735251195728779, "num_tokens": 5696911.0, "step": 320 }, { "entropy": 0.7144436400383711, "epoch": 0.9659714599341384, "grad_norm": 6.15625, "learning_rate": 8.1156511124068e-06, "loss": 0.0817, "mean_token_accuracy": 0.9768537104129791, "num_tokens": 5869301.0, "step": 330 }, { "entropy": 0.6643564581871033, "epoch": 0.9952433223563849, "grad_norm": 7.25, "learning_rate": 7.988183085493362e-06, "loss": 0.1041, "mean_token_accuracy": 0.971296526491642, "num_tokens": 6044424.0, "step": 340 }, { "entropy": 0.6510233766846842, "epoch": 1.0234174899377972, "grad_norm": 9.375, "learning_rate": 7.85761928053969e-06, "loss": 0.0824, "mean_token_accuracy": 0.9768819661883564, "num_tokens": 6217116.0, "step": 350 }, { "entropy": 0.6857636030763388, "epoch": 1.0526893523600438, "grad_norm": 5.0625, "learning_rate": 7.72409496253747e-06, "loss": 0.0949, "mean_token_accuracy": 0.9749896325170994, "num_tokens": 6387512.0, "step": 360 }, { "entropy": 0.6679304147139191, "epoch": 1.0819612147822906, "grad_norm": 2.8125, "learning_rate": 7.5877484635900876e-06, "loss": 0.0958, "mean_token_accuracy": 0.9742133662104606, "num_tokens": 6573367.0, "step": 370 }, { "entropy": 0.6774611625820398, "epoch": 1.1112330772045371, "grad_norm": 12.9375, "learning_rate": 7.448721039599616e-06, "loss": 0.1029, "mean_token_accuracy": 0.972662803530693, "num_tokens": 6751810.0, "step": 380 }, { "entropy": 0.6481307320296764, "epoch": 1.1405049396267837, "grad_norm": 19.375, "learning_rate": 7.307156723924742e-06, "loss": 0.0842, "mean_token_accuracy": 0.9763316512107849, "num_tokens": 6940710.0, "step": 390 }, { "entropy": 0.6626342430710792, "epoch": 1.1697768020490305, "grad_norm": 3.640625, "learning_rate": 7.1632021781612305e-06, "loss": 0.0834, "mean_token_accuracy": 0.9770378485321999, "num_tokens": 7120616.0, "step": 400 }, { "entropy": 0.689614738151431, "epoch": 1.199048664471277, "grad_norm": 7.59375, "learning_rate": 7.017006540199501e-06, "loss": 0.0962, "mean_token_accuracy": 0.9729363858699799, "num_tokens": 7295346.0, "step": 410 }, { "entropy": 0.731178673915565, "epoch": 1.2283205268935236, "grad_norm": 56.25, "learning_rate": 6.8687212697167685e-06, "loss": 0.1229, "mean_token_accuracy": 0.9676964573562146, "num_tokens": 7473599.0, "step": 420 }, { "entropy": 0.6449350535869598, "epoch": 1.25759238931577, "grad_norm": 15.5, "learning_rate": 6.718499991263776e-06, "loss": 0.0794, "mean_token_accuracy": 0.9775209553539753, "num_tokens": 7666324.0, "step": 430 }, { "entropy": 0.6695770636200905, "epoch": 1.2868642517380169, "grad_norm": 10.4375, "learning_rate": 6.566498335108719e-06, "loss": 0.1376, "mean_token_accuracy": 0.9635490447282791, "num_tokens": 7842308.0, "step": 440 }, { "entropy": 0.6306964188814164, "epoch": 1.3161361141602634, "grad_norm": 13.4375, "learning_rate": 6.412873776003224e-06, "loss": 0.0804, "mean_token_accuracy": 0.9786465026438236, "num_tokens": 8027201.0, "step": 450 }, { "entropy": 0.6525185916572809, "epoch": 1.34540797658251, "grad_norm": 9.4375, "learning_rate": 6.2577854700374326e-06, "loss": 0.073, "mean_token_accuracy": 0.9785589501261711, "num_tokens": 8198350.0, "step": 460 }, { "entropy": 0.6055996913462878, "epoch": 1.3746798390047568, "grad_norm": 4.375, "learning_rate": 6.101394089753215e-06, "loss": 0.091, "mean_token_accuracy": 0.9752737417817116, "num_tokens": 8384460.0, "step": 470 }, { "entropy": 0.6333212267607451, "epoch": 1.4039517014270033, "grad_norm": 4.25, "learning_rate": 5.9438616576863085e-06, "loss": 0.0817, "mean_token_accuracy": 0.9772769220173358, "num_tokens": 8555391.0, "step": 480 }, { "entropy": 0.6566504757851362, "epoch": 1.4332235638492499, "grad_norm": 6.34375, "learning_rate": 5.785351378509875e-06, "loss": 0.0844, "mean_token_accuracy": 0.9760506093502045, "num_tokens": 8734253.0, "step": 490 }, { "entropy": 0.6695024535059929, "epoch": 1.4624954262714964, "grad_norm": 11.9375, "learning_rate": 5.626027469953345e-06, "loss": 0.1066, "mean_token_accuracy": 0.9704300530254841, "num_tokens": 8910777.0, "step": 500 } ], "logging_steps": 10, "max_steps": 1026, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.353167433816843e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }