{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 625, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.5365671901141895, "epoch": 0.0016, "grad_norm": 4.21875, "learning_rate": 0.0, "loss": 5.494313716888428, "mean_token_accuracy": 0.6573571685482474, "num_tokens": 348701.0, "step": 1 }, { "entropy": 1.348903712299135, "epoch": 0.016, "grad_norm": 2.59375, "learning_rate": 3.157894736842105e-05, "loss": 5.544558631049262, "mean_token_accuracy": 0.6651431231035126, "num_tokens": 371496.0, "step": 10 }, { "entropy": 1.360134594142437, "epoch": 0.032, "grad_norm": 3.296875, "learning_rate": 6.666666666666667e-05, "loss": 5.742576599121094, "mean_token_accuracy": 0.6662841200828552, "num_tokens": 400839.0, "step": 20 }, { "entropy": 1.375771728157997, "epoch": 0.048, "grad_norm": 3.140625, "learning_rate": 0.0001017543859649123, "loss": 5.714028167724609, "mean_token_accuracy": 0.6566085889935493, "num_tokens": 431025.0, "step": 30 }, { "entropy": 1.2748979568481444, "epoch": 0.064, "grad_norm": 2.828125, "learning_rate": 0.0001368421052631579, "loss": 5.11612548828125, "mean_token_accuracy": 0.6816287323832512, "num_tokens": 459089.0, "step": 40 }, { "entropy": 1.3155348658561707, "epoch": 0.08, "grad_norm": 3.15625, "learning_rate": 0.00017192982456140353, "loss": 5.35040512084961, "mean_token_accuracy": 0.6721501812338829, "num_tokens": 484648.0, "step": 50 }, { "entropy": 1.1791150525212288, "epoch": 0.096, "grad_norm": 2.375, "learning_rate": 0.00019999940277008808, "loss": 4.642951583862304, "mean_token_accuracy": 0.7028968930244446, "num_tokens": 511463.0, "step": 60 }, { "entropy": 1.1177749201655387, "epoch": 0.112, "grad_norm": 2.421875, "learning_rate": 0.0001999785004721968, "loss": 4.523546981811523, "mean_token_accuracy": 0.7057401552796364, "num_tokens": 538438.0, "step": 70 }, { "entropy": 1.1784485399723053, "epoch": 0.128, "grad_norm": 2.25, "learning_rate": 0.00019992774381199778, "loss": 4.712202072143555, "mean_token_accuracy": 0.7004493281245232, "num_tokens": 569421.0, "step": 80 }, { "entropy": 1.0666756927967072, "epoch": 0.144, "grad_norm": 4.25, "learning_rate": 0.00019984714794582683, "loss": 4.181539154052734, "mean_token_accuracy": 0.7218294009566307, "num_tokens": 595806.0, "step": 90 }, { "entropy": 1.1410984337329864, "epoch": 0.16, "grad_norm": 2.59375, "learning_rate": 0.00019973673694024, "loss": 4.578577041625977, "mean_token_accuracy": 0.70914496332407, "num_tokens": 626238.0, "step": 100 }, { "entropy": 1.058976437151432, "epoch": 0.176, "grad_norm": 4.34375, "learning_rate": 0.0001995965437648273, "loss": 4.2924964904785154, "mean_token_accuracy": 0.7180401891469955, "num_tokens": 653441.0, "step": 110 }, { "entropy": 1.2249037250876427, "epoch": 0.192, "grad_norm": 3.96875, "learning_rate": 0.00019942661028236745, "loss": 5.084912490844727, "mean_token_accuracy": 0.6847136527299881, "num_tokens": 682582.0, "step": 120 }, { "entropy": 1.1726351112127305, "epoch": 0.208, "grad_norm": 3.296875, "learning_rate": 0.00019922698723632767, "loss": 5.018433380126953, "mean_token_accuracy": 0.7055773109197616, "num_tokens": 710468.0, "step": 130 }, { "entropy": 1.3469643473625184, "epoch": 0.224, "grad_norm": 2.875, "learning_rate": 0.000198997734235711, "loss": 5.7476848602294925, "mean_token_accuracy": 0.6790369123220443, "num_tokens": 740979.0, "step": 140 }, { "entropy": 1.187085197865963, "epoch": 0.24, "grad_norm": 2.359375, "learning_rate": 0.0001987389197372567, "loss": 5.085608673095703, "mean_token_accuracy": 0.7057169727981091, "num_tokens": 768427.0, "step": 150 }, { "entropy": 1.2938067495822907, "epoch": 0.256, "grad_norm": 2.359375, "learning_rate": 0.0001984506210249986, "loss": 5.570721054077149, "mean_token_accuracy": 0.6840770006179809, "num_tokens": 796744.0, "step": 160 }, { "entropy": 1.1161220327019692, "epoch": 0.272, "grad_norm": 2.265625, "learning_rate": 0.00019813292418718732, "loss": 4.821302795410157, "mean_token_accuracy": 0.7168306574225426, "num_tokens": 822225.0, "step": 170 }, { "entropy": 1.2217196941375732, "epoch": 0.288, "grad_norm": 2.484375, "learning_rate": 0.00019778592409058378, "loss": 5.171672821044922, "mean_token_accuracy": 0.6998881995677948, "num_tokens": 848049.0, "step": 180 }, { "entropy": 1.1091715931892394, "epoch": 0.304, "grad_norm": 3.109375, "learning_rate": 0.00019740972435213115, "loss": 4.743664169311524, "mean_token_accuracy": 0.7203769966959953, "num_tokens": 873172.0, "step": 190 }, { "entropy": 1.2154748886823654, "epoch": 0.32, "grad_norm": 2.21875, "learning_rate": 0.00019700443730801413, "loss": 5.212123107910156, "mean_token_accuracy": 0.7076091229915619, "num_tokens": 901716.0, "step": 200 }, { "entropy": 1.161145742237568, "epoch": 0.336, "grad_norm": 2.078125, "learning_rate": 0.00019657018398011434, "loss": 4.889891052246094, "mean_token_accuracy": 0.7145821407437325, "num_tokens": 928506.0, "step": 210 }, { "entropy": 1.1209837168455123, "epoch": 0.352, "grad_norm": 2.09375, "learning_rate": 0.00019610709403987246, "loss": 4.862021636962891, "mean_token_accuracy": 0.7152845054864884, "num_tokens": 958723.0, "step": 220 }, { "entropy": 1.1322738587856294, "epoch": 0.368, "grad_norm": 2.203125, "learning_rate": 0.00019561530576956703, "loss": 4.911846923828125, "mean_token_accuracy": 0.7134342208504677, "num_tokens": 987751.0, "step": 230 }, { "entropy": 1.1780440196394921, "epoch": 0.384, "grad_norm": 2.421875, "learning_rate": 0.00019509496602102252, "loss": 5.0671440124511715, "mean_token_accuracy": 0.7104542285203934, "num_tokens": 1018315.0, "step": 240 }, { "entropy": 1.1776200592517854, "epoch": 0.4, "grad_norm": 2.359375, "learning_rate": 0.00019454623017175812, "loss": 5.145317459106446, "mean_token_accuracy": 0.7086253471672534, "num_tokens": 1046627.0, "step": 250 }, { "entropy": 1.1513659209012985, "epoch": 0.416, "grad_norm": 2.140625, "learning_rate": 0.00019396926207859084, "loss": 4.8750255584716795, "mean_token_accuracy": 0.7203947469592095, "num_tokens": 1075027.0, "step": 260 }, { "entropy": 1.1364729449152946, "epoch": 0.432, "grad_norm": 2.265625, "learning_rate": 0.00019336423402870653, "loss": 4.866281890869141, "mean_token_accuracy": 0.7150357633829116, "num_tokens": 1102334.0, "step": 270 }, { "entropy": 1.108440762758255, "epoch": 0.448, "grad_norm": 1.9453125, "learning_rate": 0.00019273132668821364, "loss": 4.697020721435547, "mean_token_accuracy": 0.7265092357993126, "num_tokens": 1131415.0, "step": 280 }, { "entropy": 1.1001321360468865, "epoch": 0.464, "grad_norm": 2.125, "learning_rate": 0.00019207072904819486, "loss": 4.747102355957031, "mean_token_accuracy": 0.7231316924095154, "num_tokens": 1158280.0, "step": 290 }, { "entropy": 1.1048941344022751, "epoch": 0.48, "grad_norm": 2.234375, "learning_rate": 0.00019138263836827288, "loss": 4.644550323486328, "mean_token_accuracy": 0.7270571634173393, "num_tokens": 1188651.0, "step": 300 }, { "entropy": 1.0945911705493927, "epoch": 0.496, "grad_norm": 1.984375, "learning_rate": 0.00019066726011770726, "loss": 4.7493640899658205, "mean_token_accuracy": 0.7258608743548394, "num_tokens": 1216504.0, "step": 310 }, { "entropy": 1.0356923416256905, "epoch": 0.512, "grad_norm": 2.1875, "learning_rate": 0.00018992480791403958, "loss": 4.3843944549560545, "mean_token_accuracy": 0.7372462660074234, "num_tokens": 1243732.0, "step": 320 }, { "entropy": 1.0773125097155571, "epoch": 0.528, "grad_norm": 2.328125, "learning_rate": 0.0001891555034593055, "loss": 4.6180767059326175, "mean_token_accuracy": 0.7283408164978027, "num_tokens": 1269272.0, "step": 330 }, { "entropy": 1.050987295806408, "epoch": 0.544, "grad_norm": 1.640625, "learning_rate": 0.00018835957647383303, "loss": 4.4863533020019535, "mean_token_accuracy": 0.7359548002481461, "num_tokens": 1296001.0, "step": 340 }, { "entropy": 1.0461898490786552, "epoch": 0.56, "grad_norm": 2.453125, "learning_rate": 0.000187537264627646, "loss": 4.434906005859375, "mean_token_accuracy": 0.7349108412861824, "num_tokens": 1322180.0, "step": 350 }, { "entropy": 1.0046504065394402, "epoch": 0.576, "grad_norm": 2.125, "learning_rate": 0.00018668881346949417, "loss": 4.309226989746094, "mean_token_accuracy": 0.73991359770298, "num_tokens": 1347532.0, "step": 360 }, { "entropy": 1.0861794739961623, "epoch": 0.592, "grad_norm": 1.890625, "learning_rate": 0.0001858144763535302, "loss": 4.599692535400391, "mean_token_accuracy": 0.7285229310393333, "num_tokens": 1373584.0, "step": 370 }, { "entropy": 1.0600647926330566, "epoch": 0.608, "grad_norm": 2.203125, "learning_rate": 0.00018491451436365627, "loss": 4.5491493225097654, "mean_token_accuracy": 0.7357145607471466, "num_tokens": 1399594.0, "step": 380 }, { "entropy": 1.098224511742592, "epoch": 0.624, "grad_norm": 1.9296875, "learning_rate": 0.00018398919623556238, "loss": 4.8250572204589846, "mean_token_accuracy": 0.7251888766884804, "num_tokens": 1428792.0, "step": 390 }, { "entropy": 1.1232757449150086, "epoch": 0.64, "grad_norm": 2.03125, "learning_rate": 0.00018303879827647975, "loss": 4.712010192871094, "mean_token_accuracy": 0.7287421196699142, "num_tokens": 1457889.0, "step": 400 }, { "entropy": 1.0411028936505318, "epoch": 0.656, "grad_norm": 1.859375, "learning_rate": 0.00018206360428267332, "loss": 4.405958938598633, "mean_token_accuracy": 0.7350577011704444, "num_tokens": 1484479.0, "step": 410 }, { "entropy": 1.1146624743938447, "epoch": 0.672, "grad_norm": 1.734375, "learning_rate": 0.00018106390545469795, "loss": 4.742184448242187, "mean_token_accuracy": 0.7282734125852585, "num_tokens": 1514268.0, "step": 420 }, { "entropy": 1.0480840012431145, "epoch": 0.688, "grad_norm": 1.78125, "learning_rate": 0.0001800400003104436, "loss": 4.4207916259765625, "mean_token_accuracy": 0.7365467861294747, "num_tokens": 1540474.0, "step": 430 }, { "entropy": 1.0067586719989776, "epoch": 0.704, "grad_norm": 1.921875, "learning_rate": 0.0001789921945959958, "loss": 4.31513671875, "mean_token_accuracy": 0.7437789484858512, "num_tokens": 1568557.0, "step": 440 }, { "entropy": 1.0166094586253167, "epoch": 0.72, "grad_norm": 1.6171875, "learning_rate": 0.0001779208011943371, "loss": 4.292739486694336, "mean_token_accuracy": 0.7471857726573944, "num_tokens": 1594403.0, "step": 450 }, { "entropy": 1.0385540708899499, "epoch": 0.736, "grad_norm": 2.75, "learning_rate": 0.00017682614003191807, "loss": 4.415458679199219, "mean_token_accuracy": 0.7400641202926636, "num_tokens": 1624111.0, "step": 460 }, { "entropy": 1.0706985503435136, "epoch": 0.752, "grad_norm": 2.265625, "learning_rate": 0.0001757085379831246, "loss": 4.554729080200195, "mean_token_accuracy": 0.7342943042516709, "num_tokens": 1652609.0, "step": 470 }, { "entropy": 0.9689434483647347, "epoch": 0.768, "grad_norm": 2.03125, "learning_rate": 0.00017456832877267084, "loss": 4.2303211212158205, "mean_token_accuracy": 0.7474748462438583, "num_tokens": 1678589.0, "step": 480 }, { "entropy": 1.1837652444839477, "epoch": 0.784, "grad_norm": 2.078125, "learning_rate": 0.00017340585287594604, "loss": 4.968061447143555, "mean_token_accuracy": 0.7166377156972885, "num_tokens": 1708513.0, "step": 490 }, { "entropy": 1.042286379635334, "epoch": 0.8, "grad_norm": 2.515625, "learning_rate": 0.00017222145741734626, "loss": 4.422880172729492, "mean_token_accuracy": 0.7405225187540054, "num_tokens": 1736283.0, "step": 500 }, { "entropy": 0.9832519263029098, "epoch": 0.816, "grad_norm": 1.6171875, "learning_rate": 0.00017101549606662024, "loss": 4.119276428222657, "mean_token_accuracy": 0.7495438739657402, "num_tokens": 1765017.0, "step": 510 }, { "entropy": 0.9872105062007904, "epoch": 0.832, "grad_norm": 1.71875, "learning_rate": 0.00016978832893326074, "loss": 4.234106826782226, "mean_token_accuracy": 0.7473902180790901, "num_tokens": 1790682.0, "step": 520 }, { "entropy": 0.9845283895730972, "epoch": 0.848, "grad_norm": 1.4921875, "learning_rate": 0.00016854032245897308, "loss": 4.146430969238281, "mean_token_accuracy": 0.7520387843251228, "num_tokens": 1820050.0, "step": 530 }, { "entropy": 1.0613365799188614, "epoch": 0.864, "grad_norm": 2.046875, "learning_rate": 0.00016727184930825288, "loss": 4.48931655883789, "mean_token_accuracy": 0.7396015107631684, "num_tokens": 1847079.0, "step": 540 }, { "entropy": 0.9684726029634476, "epoch": 0.88, "grad_norm": 2.03125, "learning_rate": 0.00016598328825710533, "loss": 4.12475357055664, "mean_token_accuracy": 0.7526269048452378, "num_tokens": 1872383.0, "step": 550 }, { "entropy": 0.9990086019039154, "epoch": 0.896, "grad_norm": 1.8984375, "learning_rate": 0.00016467502407993992, "loss": 4.360863494873047, "mean_token_accuracy": 0.7387492001056671, "num_tokens": 1903630.0, "step": 560 }, { "entropy": 0.9708257809281349, "epoch": 0.912, "grad_norm": 1.6328125, "learning_rate": 0.00016334744743467364, "loss": 4.054442596435547, "mean_token_accuracy": 0.7545965671539306, "num_tokens": 1930592.0, "step": 570 }, { "entropy": 1.0035204842686654, "epoch": 0.928, "grad_norm": 1.8984375, "learning_rate": 0.00016200095474607753, "loss": 4.25194206237793, "mean_token_accuracy": 0.7464351058006287, "num_tokens": 1960212.0, "step": 580 }, { "entropy": 0.9635734960436821, "epoch": 0.944, "grad_norm": 1.53125, "learning_rate": 0.00016063594808740113, "loss": 4.0587310791015625, "mean_token_accuracy": 0.7547481089830399, "num_tokens": 1986049.0, "step": 590 }, { "entropy": 0.9986386775970459, "epoch": 0.96, "grad_norm": 1.703125, "learning_rate": 0.0001592528350603103, "loss": 4.331460952758789, "mean_token_accuracy": 0.7503352165222168, "num_tokens": 2013266.0, "step": 600 }, { "entropy": 1.006168755888939, "epoch": 0.976, "grad_norm": 1.734375, "learning_rate": 0.00015785202867317407, "loss": 4.238505554199219, "mean_token_accuracy": 0.7441465124487877, "num_tokens": 2041824.0, "step": 610 }, { "entropy": 1.0482723653316497, "epoch": 0.992, "grad_norm": 1.9296875, "learning_rate": 0.0001564339472177373, "loss": 4.394336700439453, "mean_token_accuracy": 0.7416493371129036, "num_tokens": 2071693.0, "step": 620 }, { "epoch": 1.0, "eval_entropy": 1.0188530464172363, "eval_loss": 1.0276753902435303, "eval_mean_token_accuracy": 0.7517141411304474, "eval_num_tokens": 2084150.0, "eval_runtime": 72.9791, "eval_samples_per_second": 13.703, "eval_steps_per_second": 3.426, "step": 625 } ], "logging_steps": 10, "max_steps": 1875, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.400684024323021e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }