| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 1.0, |
| "eval_steps": 500, |
| "global_step": 87, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 0.3709419071674347, |
| "epoch": 0.011494252873563218, |
| "grad_norm": 320.0, |
| "learning_rate": 5e-05, |
| "loss": 4.9724, |
| "mean_token_accuracy": 0.5486404895782471, |
| "num_tokens": 1657.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.0495511293411255, |
| "epoch": 0.022988505747126436, |
| "grad_norm": 41.0, |
| "learning_rate": 5e-05, |
| "loss": 1.9782, |
| "mean_token_accuracy": 0.7108578085899353, |
| "num_tokens": 3326.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 2.898812770843506, |
| "epoch": 0.034482758620689655, |
| "grad_norm": 133.0, |
| "learning_rate": 5e-05, |
| "loss": 2.3167, |
| "mean_token_accuracy": 0.6533090472221375, |
| "num_tokens": 4975.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 2.2339625358581543, |
| "epoch": 0.04597701149425287, |
| "grad_norm": 30.5, |
| "learning_rate": 5e-05, |
| "loss": 1.384, |
| "mean_token_accuracy": 0.7507635951042175, |
| "num_tokens": 6614.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.8993045091629028, |
| "epoch": 0.05747126436781609, |
| "grad_norm": 74.5, |
| "learning_rate": 5e-05, |
| "loss": 1.2395, |
| "mean_token_accuracy": 0.7583282589912415, |
| "num_tokens": 8267.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.1725226640701294, |
| "epoch": 0.06896551724137931, |
| "grad_norm": 18.875, |
| "learning_rate": 5e-05, |
| "loss": 0.9074, |
| "mean_token_accuracy": 0.8194946050643921, |
| "num_tokens": 9931.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 0.9498358368873596, |
| "epoch": 0.08045977011494253, |
| "grad_norm": 11.875, |
| "learning_rate": 5e-05, |
| "loss": 0.6491, |
| "mean_token_accuracy": 0.8492492437362671, |
| "num_tokens": 11598.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 0.6594022512435913, |
| "epoch": 0.09195402298850575, |
| "grad_norm": 12.0625, |
| "learning_rate": 5e-05, |
| "loss": 0.4387, |
| "mean_token_accuracy": 0.888014554977417, |
| "num_tokens": 13252.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 0.3986137807369232, |
| "epoch": 0.10344827586206896, |
| "grad_norm": 9.375, |
| "learning_rate": 5e-05, |
| "loss": 0.2872, |
| "mean_token_accuracy": 0.9253910779953003, |
| "num_tokens": 14916.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 0.25546491146087646, |
| "epoch": 0.11494252873563218, |
| "grad_norm": 11.0, |
| "learning_rate": 5e-05, |
| "loss": 0.194, |
| "mean_token_accuracy": 0.9643934965133667, |
| "num_tokens": 16575.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.1774519979953766, |
| "epoch": 0.12643678160919541, |
| "grad_norm": 6.375, |
| "learning_rate": 5e-05, |
| "loss": 0.1092, |
| "mean_token_accuracy": 0.978941023349762, |
| "num_tokens": 18239.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 0.13791397213935852, |
| "epoch": 0.13793103448275862, |
| "grad_norm": 5.625, |
| "learning_rate": 5e-05, |
| "loss": 0.1203, |
| "mean_token_accuracy": 0.9824984669685364, |
| "num_tokens": 19898.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 0.08871836215257645, |
| "epoch": 0.14942528735632185, |
| "grad_norm": 3.5625, |
| "learning_rate": 5e-05, |
| "loss": 0.0618, |
| "mean_token_accuracy": 0.9909420013427734, |
| "num_tokens": 21556.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 0.06183680146932602, |
| "epoch": 0.16091954022988506, |
| "grad_norm": 3.796875, |
| "learning_rate": 5e-05, |
| "loss": 0.088, |
| "mean_token_accuracy": 0.9915305376052856, |
| "num_tokens": 23211.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 0.06007755175232887, |
| "epoch": 0.1724137931034483, |
| "grad_norm": 2.984375, |
| "learning_rate": 5e-05, |
| "loss": 0.0707, |
| "mean_token_accuracy": 0.9921497702598572, |
| "num_tokens": 24869.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 0.05882040038704872, |
| "epoch": 0.1839080459770115, |
| "grad_norm": 1.796875, |
| "learning_rate": 5e-05, |
| "loss": 0.0539, |
| "mean_token_accuracy": 0.9939903616905212, |
| "num_tokens": 26535.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 0.04828764125704765, |
| "epoch": 0.19540229885057472, |
| "grad_norm": 1.7890625, |
| "learning_rate": 5e-05, |
| "loss": 0.0478, |
| "mean_token_accuracy": 0.9945717453956604, |
| "num_tokens": 28195.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 0.044041331857442856, |
| "epoch": 0.20689655172413793, |
| "grad_norm": 2.046875, |
| "learning_rate": 5e-05, |
| "loss": 0.0495, |
| "mean_token_accuracy": 0.9951807260513306, |
| "num_tokens": 29857.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 0.0408758781850338, |
| "epoch": 0.21839080459770116, |
| "grad_norm": 1.625, |
| "learning_rate": 5e-05, |
| "loss": 0.0329, |
| "mean_token_accuracy": 0.9952038526535034, |
| "num_tokens": 31527.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 0.035789985209703445, |
| "epoch": 0.22988505747126436, |
| "grad_norm": 1.734375, |
| "learning_rate": 5e-05, |
| "loss": 0.046, |
| "mean_token_accuracy": 0.9945717453956604, |
| "num_tokens": 33187.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.035998038947582245, |
| "epoch": 0.2413793103448276, |
| "grad_norm": 1.453125, |
| "learning_rate": 5e-05, |
| "loss": 0.0357, |
| "mean_token_accuracy": 0.9933494329452515, |
| "num_tokens": 34843.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 0.03482215851545334, |
| "epoch": 0.25287356321839083, |
| "grad_norm": 1.90625, |
| "learning_rate": 5e-05, |
| "loss": 0.0454, |
| "mean_token_accuracy": 0.9927536249160767, |
| "num_tokens": 36501.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 0.028973015025258064, |
| "epoch": 0.26436781609195403, |
| "grad_norm": 1.59375, |
| "learning_rate": 5e-05, |
| "loss": 0.0345, |
| "mean_token_accuracy": 0.9963768124580383, |
| "num_tokens": 38159.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 0.029489481821656227, |
| "epoch": 0.27586206896551724, |
| "grad_norm": 0.83203125, |
| "learning_rate": 5e-05, |
| "loss": 0.0275, |
| "mean_token_accuracy": 0.996372401714325, |
| "num_tokens": 39815.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 0.03362823650240898, |
| "epoch": 0.28735632183908044, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5e-05, |
| "loss": 0.0309, |
| "mean_token_accuracy": 0.9957805871963501, |
| "num_tokens": 41476.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 0.029244303703308105, |
| "epoch": 0.2988505747126437, |
| "grad_norm": 1.5078125, |
| "learning_rate": 5e-05, |
| "loss": 0.0346, |
| "mean_token_accuracy": 0.9957805871963501, |
| "num_tokens": 43137.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 0.03644675388932228, |
| "epoch": 0.3103448275862069, |
| "grad_norm": 2.0, |
| "learning_rate": 5e-05, |
| "loss": 0.035, |
| "mean_token_accuracy": 0.9933934211730957, |
| "num_tokens": 44804.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 0.03685041889548302, |
| "epoch": 0.3218390804597701, |
| "grad_norm": 2.1875, |
| "learning_rate": 5e-05, |
| "loss": 0.047, |
| "mean_token_accuracy": 0.9933574795722961, |
| "num_tokens": 46462.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 0.02877631224691868, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.8828125, |
| "learning_rate": 5e-05, |
| "loss": 0.0217, |
| "mean_token_accuracy": 0.9957343339920044, |
| "num_tokens": 48105.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 0.040678467601537704, |
| "epoch": 0.3448275862068966, |
| "grad_norm": 1.0078125, |
| "learning_rate": 5e-05, |
| "loss": 0.0332, |
| "mean_token_accuracy": 0.9940083622932434, |
| "num_tokens": 49776.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.027814431115984917, |
| "epoch": 0.3563218390804598, |
| "grad_norm": 0.83984375, |
| "learning_rate": 5e-05, |
| "loss": 0.0301, |
| "mean_token_accuracy": 0.9957395195960999, |
| "num_tokens": 51421.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 0.027178222313523293, |
| "epoch": 0.367816091954023, |
| "grad_norm": 0.6640625, |
| "learning_rate": 5e-05, |
| "loss": 0.0214, |
| "mean_token_accuracy": 0.9963614344596863, |
| "num_tokens": 53072.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 0.031195349991321564, |
| "epoch": 0.3793103448275862, |
| "grad_norm": 0.59375, |
| "learning_rate": 5e-05, |
| "loss": 0.0248, |
| "mean_token_accuracy": 0.9957652688026428, |
| "num_tokens": 54727.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 0.029933765530586243, |
| "epoch": 0.39080459770114945, |
| "grad_norm": 0.58203125, |
| "learning_rate": 5e-05, |
| "loss": 0.0191, |
| "mean_token_accuracy": 0.9957957863807678, |
| "num_tokens": 56394.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 0.03709409758448601, |
| "epoch": 0.40229885057471265, |
| "grad_norm": 1.890625, |
| "learning_rate": 5e-05, |
| "loss": 0.0453, |
| "mean_token_accuracy": 0.9921592473983765, |
| "num_tokens": 58054.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 0.03175961226224899, |
| "epoch": 0.41379310344827586, |
| "grad_norm": 0.73828125, |
| "learning_rate": 5e-05, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.9951778054237366, |
| "num_tokens": 59715.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 0.03274337574839592, |
| "epoch": 0.42528735632183906, |
| "grad_norm": 1.0546875, |
| "learning_rate": 5e-05, |
| "loss": 0.0242, |
| "mean_token_accuracy": 0.9939283728599548, |
| "num_tokens": 61364.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 0.03900443762540817, |
| "epoch": 0.4367816091954023, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5e-05, |
| "loss": 0.0329, |
| "mean_token_accuracy": 0.9927841424942017, |
| "num_tokens": 63029.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 0.03216073289513588, |
| "epoch": 0.4482758620689655, |
| "grad_norm": 0.85546875, |
| "learning_rate": 5e-05, |
| "loss": 0.0407, |
| "mean_token_accuracy": 0.9933854341506958, |
| "num_tokens": 64694.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 0.026994826272130013, |
| "epoch": 0.45977011494252873, |
| "grad_norm": 0.9921875, |
| "learning_rate": 5e-05, |
| "loss": 0.0336, |
| "mean_token_accuracy": 0.9939283728599548, |
| "num_tokens": 66343.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.028900790959596634, |
| "epoch": 0.47126436781609193, |
| "grad_norm": 0.51171875, |
| "learning_rate": 5e-05, |
| "loss": 0.0191, |
| "mean_token_accuracy": 0.9969879388809204, |
| "num_tokens": 68005.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 0.017634548246860504, |
| "epoch": 0.4827586206896552, |
| "grad_norm": 0.609375, |
| "learning_rate": 5e-05, |
| "loss": 0.0143, |
| "mean_token_accuracy": 0.9975772500038147, |
| "num_tokens": 69658.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 0.02459569275379181, |
| "epoch": 0.4942528735632184, |
| "grad_norm": 0.8671875, |
| "learning_rate": 5e-05, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.9969861507415771, |
| "num_tokens": 71319.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 0.03161758929491043, |
| "epoch": 0.5057471264367817, |
| "grad_norm": 0.7890625, |
| "learning_rate": 5e-05, |
| "loss": 0.0183, |
| "mean_token_accuracy": 0.9970042109489441, |
| "num_tokens": 72990.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 0.02282997965812683, |
| "epoch": 0.5172413793103449, |
| "grad_norm": 0.67578125, |
| "learning_rate": 5e-05, |
| "loss": 0.0157, |
| "mean_token_accuracy": 0.9963877201080322, |
| "num_tokens": 74653.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 0.027062978595495224, |
| "epoch": 0.5287356321839081, |
| "grad_norm": 1.109375, |
| "learning_rate": 5e-05, |
| "loss": 0.0234, |
| "mean_token_accuracy": 0.9957678318023682, |
| "num_tokens": 76309.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 0.021824488416314125, |
| "epoch": 0.5402298850574713, |
| "grad_norm": 0.62890625, |
| "learning_rate": 5e-05, |
| "loss": 0.0175, |
| "mean_token_accuracy": 0.9975845217704773, |
| "num_tokens": 77967.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 0.0219434704631567, |
| "epoch": 0.5517241379310345, |
| "grad_norm": 0.59375, |
| "learning_rate": 5e-05, |
| "loss": 0.0184, |
| "mean_token_accuracy": 0.9969567656517029, |
| "num_tokens": 79612.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 0.018387747928500175, |
| "epoch": 0.5632183908045977, |
| "grad_norm": 0.68359375, |
| "learning_rate": 5e-05, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.9952009320259094, |
| "num_tokens": 81281.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 0.018757076933979988, |
| "epoch": 0.5747126436781609, |
| "grad_norm": 1.65625, |
| "learning_rate": 5e-05, |
| "loss": 0.0229, |
| "mean_token_accuracy": 0.9963658452033997, |
| "num_tokens": 82934.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.016816047951579094, |
| "epoch": 0.5862068965517241, |
| "grad_norm": 1.125, |
| "learning_rate": 5e-05, |
| "loss": 0.0268, |
| "mean_token_accuracy": 0.996964156627655, |
| "num_tokens": 84583.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 0.020629465579986572, |
| "epoch": 0.5977011494252874, |
| "grad_norm": 0.55078125, |
| "learning_rate": 5e-05, |
| "loss": 0.0212, |
| "mean_token_accuracy": 0.9952067136764526, |
| "num_tokens": 86254.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 0.01931397244334221, |
| "epoch": 0.6091954022988506, |
| "grad_norm": 0.91015625, |
| "learning_rate": 5e-05, |
| "loss": 0.0251, |
| "mean_token_accuracy": 0.9957831501960754, |
| "num_tokens": 87916.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 0.015544063411653042, |
| "epoch": 0.6206896551724138, |
| "grad_norm": 0.5625, |
| "learning_rate": 5e-05, |
| "loss": 0.015, |
| "mean_token_accuracy": 0.9969770312309265, |
| "num_tokens": 89572.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 0.016354193910956383, |
| "epoch": 0.632183908045977, |
| "grad_norm": 0.8671875, |
| "learning_rate": 5e-05, |
| "loss": 0.0276, |
| "mean_token_accuracy": 0.9957957863807678, |
| "num_tokens": 91239.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.02520756609737873, |
| "epoch": 0.6436781609195402, |
| "grad_norm": 0.7109375, |
| "learning_rate": 5e-05, |
| "loss": 0.0166, |
| "mean_token_accuracy": 0.9957882165908813, |
| "num_tokens": 92903.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 0.017131350934505463, |
| "epoch": 0.6551724137931034, |
| "grad_norm": 0.494140625, |
| "learning_rate": 5e-05, |
| "loss": 0.0127, |
| "mean_token_accuracy": 0.9975830912590027, |
| "num_tokens": 94560.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 0.01473468728363514, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.6171875, |
| "learning_rate": 5e-05, |
| "loss": 0.0117, |
| "mean_token_accuracy": 0.995785653591156, |
| "num_tokens": 96223.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 0.014780101366341114, |
| "epoch": 0.6781609195402298, |
| "grad_norm": 0.50390625, |
| "learning_rate": 5e-05, |
| "loss": 0.0166, |
| "mean_token_accuracy": 0.9969788789749146, |
| "num_tokens": 97880.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 0.02004481479525566, |
| "epoch": 0.6896551724137931, |
| "grad_norm": 3.640625, |
| "learning_rate": 5e-05, |
| "loss": 0.0245, |
| "mean_token_accuracy": 0.9957678318023682, |
| "num_tokens": 99536.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.02079402096569538, |
| "epoch": 0.7011494252873564, |
| "grad_norm": 0.8984375, |
| "learning_rate": 5e-05, |
| "loss": 0.0256, |
| "mean_token_accuracy": 0.9957652688026428, |
| "num_tokens": 101191.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 0.01947801001369953, |
| "epoch": 0.7126436781609196, |
| "grad_norm": 0.67578125, |
| "learning_rate": 5e-05, |
| "loss": 0.0214, |
| "mean_token_accuracy": 0.9970059990882874, |
| "num_tokens": 102863.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 0.021246658638119698, |
| "epoch": 0.7241379310344828, |
| "grad_norm": 12.75, |
| "learning_rate": 5e-05, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9970005750656128, |
| "num_tokens": 104532.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 0.01568767800927162, |
| "epoch": 0.735632183908046, |
| "grad_norm": 0.77734375, |
| "learning_rate": 5e-05, |
| "loss": 0.0137, |
| "mean_token_accuracy": 0.996995210647583, |
| "num_tokens": 106198.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 0.018703024834394455, |
| "epoch": 0.7471264367816092, |
| "grad_norm": 0.796875, |
| "learning_rate": 5e-05, |
| "loss": 0.0228, |
| "mean_token_accuracy": 0.9963658452033997, |
| "num_tokens": 107851.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.019862579181790352, |
| "epoch": 0.7586206896551724, |
| "grad_norm": 0.60546875, |
| "learning_rate": 5e-05, |
| "loss": 0.0229, |
| "mean_token_accuracy": 0.9957805871963501, |
| "num_tokens": 109512.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 0.012604566290974617, |
| "epoch": 0.7701149425287356, |
| "grad_norm": 1.046875, |
| "learning_rate": 5e-05, |
| "loss": 0.0147, |
| "mean_token_accuracy": 0.9975669384002686, |
| "num_tokens": 111158.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 0.01671748049557209, |
| "epoch": 0.7816091954022989, |
| "grad_norm": 0.625, |
| "learning_rate": 5e-05, |
| "loss": 0.0175, |
| "mean_token_accuracy": 0.9969987869262695, |
| "num_tokens": 112826.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 0.021043848246335983, |
| "epoch": 0.7931034482758621, |
| "grad_norm": 0.486328125, |
| "learning_rate": 5e-05, |
| "loss": 0.0144, |
| "mean_token_accuracy": 0.9958033561706543, |
| "num_tokens": 114496.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 0.016358301043510437, |
| "epoch": 0.8045977011494253, |
| "grad_norm": 0.73828125, |
| "learning_rate": 5e-05, |
| "loss": 0.0145, |
| "mean_token_accuracy": 0.9981883764266968, |
| "num_tokens": 116154.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.017069363966584206, |
| "epoch": 0.8160919540229885, |
| "grad_norm": 0.67578125, |
| "learning_rate": 5e-05, |
| "loss": 0.0193, |
| "mean_token_accuracy": 0.9963877201080322, |
| "num_tokens": 117817.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 0.014579691924154758, |
| "epoch": 0.8275862068965517, |
| "grad_norm": 0.484375, |
| "learning_rate": 5e-05, |
| "loss": 0.0147, |
| "mean_token_accuracy": 0.9975830912590027, |
| "num_tokens": 119474.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 0.016145411878824234, |
| "epoch": 0.8390804597701149, |
| "grad_norm": 0.859375, |
| "learning_rate": 5e-05, |
| "loss": 0.0173, |
| "mean_token_accuracy": 0.996372401714325, |
| "num_tokens": 121130.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 0.015459074638783932, |
| "epoch": 0.8505747126436781, |
| "grad_norm": 0.484375, |
| "learning_rate": 5e-05, |
| "loss": 0.0142, |
| "mean_token_accuracy": 0.9975845217704773, |
| "num_tokens": 122788.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 0.016901882365345955, |
| "epoch": 0.8620689655172413, |
| "grad_norm": 0.3671875, |
| "learning_rate": 5e-05, |
| "loss": 0.0142, |
| "mean_token_accuracy": 0.9969861507415771, |
| "num_tokens": 124449.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.01234087347984314, |
| "epoch": 0.8735632183908046, |
| "grad_norm": 0.345703125, |
| "learning_rate": 5e-05, |
| "loss": 0.0084, |
| "mean_token_accuracy": 0.9969915747642517, |
| "num_tokens": 126113.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 0.013088205829262733, |
| "epoch": 0.8850574712643678, |
| "grad_norm": 0.7109375, |
| "learning_rate": 5e-05, |
| "loss": 0.0201, |
| "mean_token_accuracy": 0.9969770312309265, |
| "num_tokens": 127769.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 0.014324485324323177, |
| "epoch": 0.896551724137931, |
| "grad_norm": 0.6484375, |
| "learning_rate": 5e-05, |
| "loss": 0.0176, |
| "mean_token_accuracy": 0.9969770312309265, |
| "num_tokens": 129425.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 0.015041607432067394, |
| "epoch": 0.9080459770114943, |
| "grad_norm": 0.67578125, |
| "learning_rate": 5e-05, |
| "loss": 0.0179, |
| "mean_token_accuracy": 0.9963658452033997, |
| "num_tokens": 131078.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 0.011612314730882645, |
| "epoch": 0.9195402298850575, |
| "grad_norm": 0.3515625, |
| "learning_rate": 5e-05, |
| "loss": 0.0071, |
| "mean_token_accuracy": 0.9981971383094788, |
| "num_tokens": 132744.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.014071737416088581, |
| "epoch": 0.9310344827586207, |
| "grad_norm": 0.361328125, |
| "learning_rate": 5e-05, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.9975990653038025, |
| "num_tokens": 134412.0, |
| "step": 81 |
| }, |
| { |
| "entropy": 0.013796189799904823, |
| "epoch": 0.9425287356321839, |
| "grad_norm": 0.57421875, |
| "learning_rate": 5e-05, |
| "loss": 0.0158, |
| "mean_token_accuracy": 0.9969715476036072, |
| "num_tokens": 136065.0, |
| "step": 82 |
| }, |
| { |
| "entropy": 0.01726696267724037, |
| "epoch": 0.9540229885057471, |
| "grad_norm": 0.796875, |
| "learning_rate": 5e-05, |
| "loss": 0.0319, |
| "mean_token_accuracy": 0.9946042895317078, |
| "num_tokens": 137735.0, |
| "step": 83 |
| }, |
| { |
| "entropy": 0.010913386940956116, |
| "epoch": 0.9655172413793104, |
| "grad_norm": 0.400390625, |
| "learning_rate": 5e-05, |
| "loss": 0.0136, |
| "mean_token_accuracy": 0.9975874423980713, |
| "num_tokens": 139395.0, |
| "step": 84 |
| }, |
| { |
| "entropy": 0.01786569133400917, |
| "epoch": 0.9770114942528736, |
| "grad_norm": 0.546875, |
| "learning_rate": 5e-05, |
| "loss": 0.0187, |
| "mean_token_accuracy": 0.9957907199859619, |
| "num_tokens": 141060.0, |
| "step": 85 |
| }, |
| { |
| "entropy": 0.015070064924657345, |
| "epoch": 0.9885057471264368, |
| "grad_norm": 1.890625, |
| "learning_rate": 5e-05, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9969879388809204, |
| "num_tokens": 142722.0, |
| "step": 86 |
| }, |
| { |
| "entropy": 0.018884530290961266, |
| "epoch": 1.0, |
| "grad_norm": 0.77734375, |
| "learning_rate": 5e-05, |
| "loss": 0.0208, |
| "mean_token_accuracy": 0.9951603412628174, |
| "num_tokens": 144377.0, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 0.017042334967603285, |
| "eval_loss": 0.01563352532684803, |
| "eval_mean_token_accuracy": 0.9966080685456594, |
| "eval_num_tokens": 144377.0, |
| "eval_runtime": 159.7482, |
| "eval_samples_per_second": 0.275, |
| "eval_steps_per_second": 0.038, |
| "step": 87 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 696, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 8, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 87243904817664.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|