| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.4624954262714964, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.5018481526523828, |
| "epoch": 0.029271862422246615, |
| "grad_norm": 53.5, |
| "learning_rate": 1.8000000000000001e-06, |
| "loss": 0.9524, |
| "mean_token_accuracy": 0.8672725282609463, |
| "num_tokens": 178219.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.4691743772476912, |
| "epoch": 0.05854372484449323, |
| "grad_norm": 213.0, |
| "learning_rate": 3.8000000000000005e-06, |
| "loss": 0.7404, |
| "mean_token_accuracy": 0.8850074164569378, |
| "num_tokens": 364483.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.4770399186760187, |
| "epoch": 0.08781558726673985, |
| "grad_norm": 15.5, |
| "learning_rate": 5.8e-06, |
| "loss": 0.4118, |
| "mean_token_accuracy": 0.9136587515473366, |
| "num_tokens": 544552.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.4692947888746858, |
| "epoch": 0.11708744968898646, |
| "grad_norm": 11.9375, |
| "learning_rate": 7.800000000000002e-06, |
| "loss": 0.2463, |
| "mean_token_accuracy": 0.9470756828784943, |
| "num_tokens": 717853.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.4310881284996867, |
| "epoch": 0.14635931211123307, |
| "grad_norm": 37.75, |
| "learning_rate": 9.800000000000001e-06, |
| "loss": 0.1406, |
| "mean_token_accuracy": 0.967316172271967, |
| "num_tokens": 892769.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.47044283971190454, |
| "epoch": 0.1756311745334797, |
| "grad_norm": 5.90625, |
| "learning_rate": 9.997902051783373e-06, |
| "loss": 0.1637, |
| "mean_token_accuracy": 0.9594391070306301, |
| "num_tokens": 1062544.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.4564293924719095, |
| "epoch": 0.20490303695572631, |
| "grad_norm": 8.8125, |
| "learning_rate": 9.990652145366113e-06, |
| "loss": 0.1078, |
| "mean_token_accuracy": 0.9732326805591583, |
| "num_tokens": 1238635.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.48633960634469986, |
| "epoch": 0.23417489937797292, |
| "grad_norm": 3.296875, |
| "learning_rate": 9.978231889316302e-06, |
| "loss": 0.1079, |
| "mean_token_accuracy": 0.9725632824003696, |
| "num_tokens": 1422527.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.5045951271429658, |
| "epoch": 0.26344676180021953, |
| "grad_norm": 4.96875, |
| "learning_rate": 9.960654151103846e-06, |
| "loss": 0.1129, |
| "mean_token_accuracy": 0.971237526088953, |
| "num_tokens": 1604412.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.5754191160202027, |
| "epoch": 0.29271862422246614, |
| "grad_norm": 3.546875, |
| "learning_rate": 9.937937141385323e-06, |
| "loss": 0.1192, |
| "mean_token_accuracy": 0.9681814014911652, |
| "num_tokens": 1785808.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.6233104987069964, |
| "epoch": 0.32199048664471275, |
| "grad_norm": 16.875, |
| "learning_rate": 9.91010439513761e-06, |
| "loss": 0.1532, |
| "mean_token_accuracy": 0.9626872226595878, |
| "num_tokens": 1963662.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.592149405926466, |
| "epoch": 0.3512623490669594, |
| "grad_norm": 7.3125, |
| "learning_rate": 9.87718474727549e-06, |
| "loss": 0.1171, |
| "mean_token_accuracy": 0.9693892918527126, |
| "num_tokens": 2138861.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.5508204206824303, |
| "epoch": 0.380534211489206, |
| "grad_norm": 11.875, |
| "learning_rate": 9.839212302778493e-06, |
| "loss": 0.0928, |
| "mean_token_accuracy": 0.9756750777363777, |
| "num_tokens": 2314376.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.5581824988126755, |
| "epoch": 0.40980607391145263, |
| "grad_norm": 5.4375, |
| "learning_rate": 9.796226401357884e-06, |
| "loss": 0.1213, |
| "mean_token_accuracy": 0.9697595350444317, |
| "num_tokens": 2494288.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.5680775640532374, |
| "epoch": 0.43907793633369924, |
| "grad_norm": 3.390625, |
| "learning_rate": 9.748271576700476e-06, |
| "loss": 0.1081, |
| "mean_token_accuracy": 0.9728541418910026, |
| "num_tokens": 2682554.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.5555528217926622, |
| "epoch": 0.46834979875594585, |
| "grad_norm": 28.875, |
| "learning_rate": 9.69539751033141e-06, |
| "loss": 0.1001, |
| "mean_token_accuracy": 0.972498145699501, |
| "num_tokens": 2855437.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.5888866983354092, |
| "epoch": 0.49762166117819245, |
| "grad_norm": 6.0625, |
| "learning_rate": 9.637658980143771e-06, |
| "loss": 0.091, |
| "mean_token_accuracy": 0.9761380776762962, |
| "num_tokens": 3034970.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.6018309678882361, |
| "epoch": 0.5268935236004391, |
| "grad_norm": 5.09375, |
| "learning_rate": 9.575115803648303e-06, |
| "loss": 0.0946, |
| "mean_token_accuracy": 0.9742907010018825, |
| "num_tokens": 3208412.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.6276524242013692, |
| "epoch": 0.5561653860226857, |
| "grad_norm": 2.75, |
| "learning_rate": 9.507832776002069e-06, |
| "loss": 0.0982, |
| "mean_token_accuracy": 0.9730873227119445, |
| "num_tokens": 3395494.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.6615390798076988, |
| "epoch": 0.5854372484449323, |
| "grad_norm": 4.46875, |
| "learning_rate": 9.43587960288023e-06, |
| "loss": 0.1392, |
| "mean_token_accuracy": 0.965959795564413, |
| "num_tokens": 3573674.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.6626946024596692, |
| "epoch": 0.6147091108671789, |
| "grad_norm": 20.125, |
| "learning_rate": 9.359330828260477e-06, |
| "loss": 0.1367, |
| "mean_token_accuracy": 0.9644782721996308, |
| "num_tokens": 3749797.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.6417636029422283, |
| "epoch": 0.6439809732894255, |
| "grad_norm": 7.71875, |
| "learning_rate": 9.278265757194983e-06, |
| "loss": 0.0971, |
| "mean_token_accuracy": 0.9736645758152008, |
| "num_tokens": 3927226.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.6675109906122089, |
| "epoch": 0.6732528357116722, |
| "grad_norm": 20.375, |
| "learning_rate": 9.1927683736498e-06, |
| "loss": 0.1292, |
| "mean_token_accuracy": 0.9648571148514747, |
| "num_tokens": 4107295.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.6541570518165827, |
| "epoch": 0.7025246981339188, |
| "grad_norm": 11.875, |
| "learning_rate": 9.102927253496926e-06, |
| "loss": 0.1156, |
| "mean_token_accuracy": 0.9699708484113216, |
| "num_tokens": 4286908.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.6058449555188418, |
| "epoch": 0.7317965605561654, |
| "grad_norm": 6.78125, |
| "learning_rate": 9.008835472749085e-06, |
| "loss": 0.0923, |
| "mean_token_accuracy": 0.9739408694207669, |
| "num_tokens": 4467461.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.6178297221660614, |
| "epoch": 0.761068422978412, |
| "grad_norm": 5.25, |
| "learning_rate": 8.910590511132339e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9773662097752094, |
| "num_tokens": 4646737.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.6829405203461647, |
| "epoch": 0.7903402854006586, |
| "grad_norm": 3.40625, |
| "learning_rate": 8.808294151096436e-06, |
| "loss": 0.092, |
| "mean_token_accuracy": 0.9755907997488975, |
| "num_tokens": 4821010.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.6392896398901939, |
| "epoch": 0.8196121478229053, |
| "grad_norm": 4.65625, |
| "learning_rate": 8.702052372367496e-06, |
| "loss": 0.0871, |
| "mean_token_accuracy": 0.9771530278027057, |
| "num_tokens": 4999440.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.7133271377533674, |
| "epoch": 0.8488840102451518, |
| "grad_norm": 12.3125, |
| "learning_rate": 8.591975242152293e-06, |
| "loss": 0.1063, |
| "mean_token_accuracy": 0.9715632744133472, |
| "num_tokens": 5171265.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.6667846977710724, |
| "epoch": 0.8781558726673985, |
| "grad_norm": 3.71875, |
| "learning_rate": 8.478176801107872e-06, |
| "loss": 0.1216, |
| "mean_token_accuracy": 0.9661677174270153, |
| "num_tokens": 5338761.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.7178533673286438, |
| "epoch": 0.907427735089645, |
| "grad_norm": 8.6875, |
| "learning_rate": 8.360774945194666e-06, |
| "loss": 0.1041, |
| "mean_token_accuracy": 0.9715993881225586, |
| "num_tokens": 5515062.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.7344689719378948, |
| "epoch": 0.9366995975118917, |
| "grad_norm": 3.5, |
| "learning_rate": 8.239891303535457e-06, |
| "loss": 0.0974, |
| "mean_token_accuracy": 0.9735251195728779, |
| "num_tokens": 5696911.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.7144436400383711, |
| "epoch": 0.9659714599341384, |
| "grad_norm": 6.15625, |
| "learning_rate": 8.1156511124068e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9768537104129791, |
| "num_tokens": 5869301.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.6643564581871033, |
| "epoch": 0.9952433223563849, |
| "grad_norm": 7.25, |
| "learning_rate": 7.988183085493362e-06, |
| "loss": 0.1041, |
| "mean_token_accuracy": 0.971296526491642, |
| "num_tokens": 6044424.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.6510233766846842, |
| "epoch": 1.0234174899377972, |
| "grad_norm": 9.375, |
| "learning_rate": 7.85761928053969e-06, |
| "loss": 0.0824, |
| "mean_token_accuracy": 0.9768819661883564, |
| "num_tokens": 6217116.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.6857636030763388, |
| "epoch": 1.0526893523600438, |
| "grad_norm": 5.0625, |
| "learning_rate": 7.72409496253747e-06, |
| "loss": 0.0949, |
| "mean_token_accuracy": 0.9749896325170994, |
| "num_tokens": 6387512.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.6679304147139191, |
| "epoch": 1.0819612147822906, |
| "grad_norm": 2.8125, |
| "learning_rate": 7.5877484635900876e-06, |
| "loss": 0.0958, |
| "mean_token_accuracy": 0.9742133662104606, |
| "num_tokens": 6573367.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.6774611625820398, |
| "epoch": 1.1112330772045371, |
| "grad_norm": 12.9375, |
| "learning_rate": 7.448721039599616e-06, |
| "loss": 0.1029, |
| "mean_token_accuracy": 0.972662803530693, |
| "num_tokens": 6751810.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.6481307320296764, |
| "epoch": 1.1405049396267837, |
| "grad_norm": 19.375, |
| "learning_rate": 7.307156723924742e-06, |
| "loss": 0.0842, |
| "mean_token_accuracy": 0.9763316512107849, |
| "num_tokens": 6940710.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.6626342430710792, |
| "epoch": 1.1697768020490305, |
| "grad_norm": 3.640625, |
| "learning_rate": 7.1632021781612305e-06, |
| "loss": 0.0834, |
| "mean_token_accuracy": 0.9770378485321999, |
| "num_tokens": 7120616.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.689614738151431, |
| "epoch": 1.199048664471277, |
| "grad_norm": 7.59375, |
| "learning_rate": 7.017006540199501e-06, |
| "loss": 0.0962, |
| "mean_token_accuracy": 0.9729363858699799, |
| "num_tokens": 7295346.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.731178673915565, |
| "epoch": 1.2283205268935236, |
| "grad_norm": 56.25, |
| "learning_rate": 6.8687212697167685e-06, |
| "loss": 0.1229, |
| "mean_token_accuracy": 0.9676964573562146, |
| "num_tokens": 7473599.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.6449350535869598, |
| "epoch": 1.25759238931577, |
| "grad_norm": 15.5, |
| "learning_rate": 6.718499991263776e-06, |
| "loss": 0.0794, |
| "mean_token_accuracy": 0.9775209553539753, |
| "num_tokens": 7666324.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.6695770636200905, |
| "epoch": 1.2868642517380169, |
| "grad_norm": 10.4375, |
| "learning_rate": 6.566498335108719e-06, |
| "loss": 0.1376, |
| "mean_token_accuracy": 0.9635490447282791, |
| "num_tokens": 7842308.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.6306964188814164, |
| "epoch": 1.3161361141602634, |
| "grad_norm": 13.4375, |
| "learning_rate": 6.412873776003224e-06, |
| "loss": 0.0804, |
| "mean_token_accuracy": 0.9786465026438236, |
| "num_tokens": 8027201.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.6525185916572809, |
| "epoch": 1.34540797658251, |
| "grad_norm": 9.4375, |
| "learning_rate": 6.2577854700374326e-06, |
| "loss": 0.073, |
| "mean_token_accuracy": 0.9785589501261711, |
| "num_tokens": 8198350.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.6055996913462878, |
| "epoch": 1.3746798390047568, |
| "grad_norm": 4.375, |
| "learning_rate": 6.101394089753215e-06, |
| "loss": 0.091, |
| "mean_token_accuracy": 0.9752737417817116, |
| "num_tokens": 8384460.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.6333212267607451, |
| "epoch": 1.4039517014270033, |
| "grad_norm": 4.25, |
| "learning_rate": 5.9438616576863085e-06, |
| "loss": 0.0817, |
| "mean_token_accuracy": 0.9772769220173358, |
| "num_tokens": 8555391.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.6566504757851362, |
| "epoch": 1.4332235638492499, |
| "grad_norm": 6.34375, |
| "learning_rate": 5.785351378509875e-06, |
| "loss": 0.0844, |
| "mean_token_accuracy": 0.9760506093502045, |
| "num_tokens": 8734253.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.6695024535059929, |
| "epoch": 1.4624954262714964, |
| "grad_norm": 11.9375, |
| "learning_rate": 5.626027469953345e-06, |
| "loss": 0.1066, |
| "mean_token_accuracy": 0.9704300530254841, |
| "num_tokens": 8910777.0, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1026, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.353167433816843e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|