{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 58, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017391304347826087, "grad_norm": 55.45934295654297, "learning_rate": 0.0, "loss": 7.4024, "mean_token_accuracy": 0.1686166636645794, "num_tokens": 6224.0, "step": 1 }, { "epoch": 0.034782608695652174, "grad_norm": 50.480812072753906, "learning_rate": 3.3333333333333335e-05, "loss": 7.0353, "mean_token_accuracy": 0.18751946836709976, "num_tokens": 11886.0, "step": 2 }, { "epoch": 0.05217391304347826, "grad_norm": 47.20847702026367, "learning_rate": 6.666666666666667e-05, "loss": 6.7961, "mean_token_accuracy": 0.1992376409471035, "num_tokens": 17328.0, "step": 3 }, { "epoch": 0.06956521739130435, "grad_norm": 38.443172454833984, "learning_rate": 0.0001, "loss": 5.9767, "mean_token_accuracy": 0.21114255115389824, "num_tokens": 22752.0, "step": 4 }, { "epoch": 0.08695652173913043, "grad_norm": 22.228315353393555, "learning_rate": 9.99266096766761e-05, "loss": 4.9025, "mean_token_accuracy": 0.24107784777879715, "num_tokens": 29004.0, "step": 5 }, { "epoch": 0.10434782608695652, "grad_norm": 13.45175552368164, "learning_rate": 9.970667809068476e-05, "loss": 4.2551, "mean_token_accuracy": 0.3103659078478813, "num_tokens": 35240.0, "step": 6 }, { "epoch": 0.12173913043478261, "grad_norm": 11.450566291809082, "learning_rate": 9.93409226131462e-05, "loss": 3.4704, "mean_token_accuracy": 0.3919261246919632, "num_tokens": 41848.0, "step": 7 }, { "epoch": 0.1391304347826087, "grad_norm": 11.856537818908691, "learning_rate": 9.883053626240502e-05, "loss": 3.3263, "mean_token_accuracy": 0.42777257412672043, "num_tokens": 48131.0, "step": 8 }, { "epoch": 0.1565217391304348, "grad_norm": 11.830099105834961, "learning_rate": 9.81771838126524e-05, "loss": 2.3787, "mean_token_accuracy": 0.5440730974078178, "num_tokens": 55437.0, "step": 9 }, { "epoch": 0.17391304347826086, "grad_norm": 3.2861013412475586, "learning_rate": 9.738299636377862e-05, "loss": 1.752, "mean_token_accuracy": 0.6369369179010391, "num_tokens": 61534.0, "step": 10 }, { "epoch": 0.19130434782608696, "grad_norm": 3.029527425765991, "learning_rate": 9.645056439016827e-05, "loss": 1.4911, "mean_token_accuracy": 0.67353655397892, "num_tokens": 67205.0, "step": 11 }, { "epoch": 0.20869565217391303, "grad_norm": 2.049616813659668, "learning_rate": 9.538292929111113e-05, "loss": 1.5348, "mean_token_accuracy": 0.6536547541618347, "num_tokens": 73359.0, "step": 12 }, { "epoch": 0.22608695652173913, "grad_norm": 2.315335750579834, "learning_rate": 9.418357347038998e-05, "loss": 1.3448, "mean_token_accuracy": 0.6856418550014496, "num_tokens": 79250.0, "step": 13 }, { "epoch": 0.24347826086956523, "grad_norm": 2.0546271800994873, "learning_rate": 9.285640897740315e-05, "loss": 1.4246, "mean_token_accuracy": 0.664088174700737, "num_tokens": 84981.0, "step": 14 }, { "epoch": 0.2608695652173913, "grad_norm": 1.8393396139144897, "learning_rate": 9.140576474687264e-05, "loss": 1.3907, "mean_token_accuracy": 0.6777825355529785, "num_tokens": 91084.0, "step": 15 }, { "epoch": 0.2782608695652174, "grad_norm": 1.3824291229248047, "learning_rate": 8.983637247875872e-05, "loss": 1.0379, "mean_token_accuracy": 0.7471358180046082, "num_tokens": 98793.0, "step": 16 }, { "epoch": 0.2956521739130435, "grad_norm": 1.8309787511825562, "learning_rate": 8.815335120443822e-05, "loss": 1.0902, "mean_token_accuracy": 0.7380675822496414, "num_tokens": 105183.0, "step": 17 }, { "epoch": 0.3130434782608696, "grad_norm": 1.8205807209014893, "learning_rate": 8.636219058948823e-05, "loss": 1.0416, "mean_token_accuracy": 0.7440497726202011, "num_tokens": 111071.0, "step": 18 }, { "epoch": 0.33043478260869563, "grad_norm": 1.7150083780288696, "learning_rate": 8.446873302753784e-05, "loss": 0.9923, "mean_token_accuracy": 0.7531551718711853, "num_tokens": 117451.0, "step": 19 }, { "epoch": 0.34782608695652173, "grad_norm": 1.842894434928894, "learning_rate": 8.247915458359473e-05, "loss": 0.9871, "mean_token_accuracy": 0.7675948143005371, "num_tokens": 123384.0, "step": 20 }, { "epoch": 0.3652173913043478, "grad_norm": 1.4929476976394653, "learning_rate": 8.039994484900463e-05, "loss": 0.804, "mean_token_accuracy": 0.7979889959096909, "num_tokens": 129294.0, "step": 21 }, { "epoch": 0.3826086956521739, "grad_norm": 1.15473473072052, "learning_rate": 7.82378857737533e-05, "loss": 0.8532, "mean_token_accuracy": 0.793354332447052, "num_tokens": 135724.0, "step": 22 }, { "epoch": 0.4, "grad_norm": 1.2569639682769775, "learning_rate": 7.600002954515532e-05, "loss": 0.8879, "mean_token_accuracy": 0.796680137515068, "num_tokens": 141797.0, "step": 23 }, { "epoch": 0.41739130434782606, "grad_norm": 1.0053095817565918, "learning_rate": 7.369367558508489e-05, "loss": 0.8051, "mean_token_accuracy": 0.8075380921363831, "num_tokens": 147918.0, "step": 24 }, { "epoch": 0.43478260869565216, "grad_norm": 0.793317437171936, "learning_rate": 7.132634674077883e-05, "loss": 0.9785, "mean_token_accuracy": 0.7718029171228409, "num_tokens": 154434.0, "step": 25 }, { "epoch": 0.45217391304347826, "grad_norm": 0.7299309968948364, "learning_rate": 6.890576474687263e-05, "loss": 1.0357, "mean_token_accuracy": 0.744295209646225, "num_tokens": 161540.0, "step": 26 }, { "epoch": 0.46956521739130436, "grad_norm": 1.4765475988388062, "learning_rate": 6.643982503870693e-05, "loss": 0.6015, "mean_token_accuracy": 0.8558884114027023, "num_tokens": 167030.0, "step": 27 }, { "epoch": 0.48695652173913045, "grad_norm": 0.7446058392524719, "learning_rate": 6.393657099905855e-05, "loss": 0.7297, "mean_token_accuracy": 0.8198393434286118, "num_tokens": 172575.0, "step": 28 }, { "epoch": 0.5043478260869565, "grad_norm": 0.662484347820282, "learning_rate": 6.140416772229784e-05, "loss": 0.7215, "mean_token_accuracy": 0.8261076658964157, "num_tokens": 178395.0, "step": 29 }, { "epoch": 0.5217391304347826, "grad_norm": 1.0676500797271729, "learning_rate": 5.88508753815478e-05, "loss": 0.7392, "mean_token_accuracy": 0.8211972415447235, "num_tokens": 184121.0, "step": 30 }, { "epoch": 0.5391304347826087, "grad_norm": 1.291377067565918, "learning_rate": 5.628502228571633e-05, "loss": 0.6282, "mean_token_accuracy": 0.8491714000701904, "num_tokens": 190440.0, "step": 31 }, { "epoch": 0.5565217391304348, "grad_norm": 0.5546766519546509, "learning_rate": 5.3714977714283674e-05, "loss": 0.6955, "mean_token_accuracy": 0.8281570971012115, "num_tokens": 197221.0, "step": 32 }, { "epoch": 0.5739130434782609, "grad_norm": 0.5453093647956848, "learning_rate": 5.114912461845223e-05, "loss": 0.7473, "mean_token_accuracy": 0.817421019077301, "num_tokens": 203874.0, "step": 33 }, { "epoch": 0.591304347826087, "grad_norm": 0.46395328640937805, "learning_rate": 4.859583227770218e-05, "loss": 0.7886, "mean_token_accuracy": 0.805167943239212, "num_tokens": 210064.0, "step": 34 }, { "epoch": 0.6086956521739131, "grad_norm": 0.4362604022026062, "learning_rate": 4.606342900094147e-05, "loss": 0.7002, "mean_token_accuracy": 0.829146608710289, "num_tokens": 216569.0, "step": 35 }, { "epoch": 0.6260869565217392, "grad_norm": 0.4113505184650421, "learning_rate": 4.3560174961293097e-05, "loss": 0.8059, "mean_token_accuracy": 0.8010579198598862, "num_tokens": 222896.0, "step": 36 }, { "epoch": 0.6434782608695652, "grad_norm": 0.682939350605011, "learning_rate": 4.109423525312738e-05, "loss": 0.6212, "mean_token_accuracy": 0.8504037708044052, "num_tokens": 229011.0, "step": 37 }, { "epoch": 0.6608695652173913, "grad_norm": 0.5376163125038147, "learning_rate": 3.8673653259221166e-05, "loss": 0.8097, "mean_token_accuracy": 0.8016069531440735, "num_tokens": 235909.0, "step": 38 }, { "epoch": 0.6782608695652174, "grad_norm": 0.47806745767593384, "learning_rate": 3.630632441491512e-05, "loss": 0.7105, "mean_token_accuracy": 0.8202503025531769, "num_tokens": 242513.0, "step": 39 }, { "epoch": 0.6956521739130435, "grad_norm": 0.582610011100769, "learning_rate": 3.399997045484469e-05, "loss": 0.7017, "mean_token_accuracy": 0.8234356045722961, "num_tokens": 248157.0, "step": 40 }, { "epoch": 0.7130434782608696, "grad_norm": 0.5676279664039612, "learning_rate": 3.176211422624672e-05, "loss": 0.6786, "mean_token_accuracy": 0.8357308208942413, "num_tokens": 253917.0, "step": 41 }, { "epoch": 0.7304347826086957, "grad_norm": 0.43341803550720215, "learning_rate": 2.9600055150995398e-05, "loss": 0.7935, "mean_token_accuracy": 0.802367627620697, "num_tokens": 260731.0, "step": 42 }, { "epoch": 0.7478260869565218, "grad_norm": 0.5170316100120544, "learning_rate": 2.7520845416405282e-05, "loss": 0.6804, "mean_token_accuracy": 0.8378360271453857, "num_tokens": 266433.0, "step": 43 }, { "epoch": 0.7652173913043478, "grad_norm": 0.48307615518569946, "learning_rate": 2.5531266972462177e-05, "loss": 0.7748, "mean_token_accuracy": 0.8051830232143402, "num_tokens": 273029.0, "step": 44 }, { "epoch": 0.782608695652174, "grad_norm": 0.45667076110839844, "learning_rate": 2.36378094105118e-05, "loss": 0.5937, "mean_token_accuracy": 0.8383966088294983, "num_tokens": 279655.0, "step": 45 }, { "epoch": 0.8, "grad_norm": 0.41323214769363403, "learning_rate": 2.1846648795561774e-05, "loss": 0.6688, "mean_token_accuracy": 0.8343894928693771, "num_tokens": 286066.0, "step": 46 }, { "epoch": 0.8173913043478261, "grad_norm": 0.6046397089958191, "learning_rate": 2.0163627521241292e-05, "loss": 0.614, "mean_token_accuracy": 0.8473068177700043, "num_tokens": 291788.0, "step": 47 }, { "epoch": 0.8347826086956521, "grad_norm": 0.3877752721309662, "learning_rate": 1.8594235253127375e-05, "loss": 0.4869, "mean_token_accuracy": 0.8654044568538666, "num_tokens": 298877.0, "step": 48 }, { "epoch": 0.8521739130434782, "grad_norm": 0.38120725750923157, "learning_rate": 1.7143591022596845e-05, "loss": 0.6164, "mean_token_accuracy": 0.8455094546079636, "num_tokens": 304903.0, "step": 49 }, { "epoch": 0.8695652173913043, "grad_norm": 0.4243324398994446, "learning_rate": 1.5816426529610035e-05, "loss": 0.6793, "mean_token_accuracy": 0.8279502987861633, "num_tokens": 310741.0, "step": 50 }, { "epoch": 0.8869565217391304, "grad_norm": 0.4153580963611603, "learning_rate": 1.4617070708888881e-05, "loss": 0.6215, "mean_token_accuracy": 0.8429995030164719, "num_tokens": 315959.0, "step": 51 }, { "epoch": 0.9043478260869565, "grad_norm": 0.3602670133113861, "learning_rate": 1.3549435609831752e-05, "loss": 0.7285, "mean_token_accuracy": 0.822294071316719, "num_tokens": 322024.0, "step": 52 }, { "epoch": 0.9217391304347826, "grad_norm": 0.4163176119327545, "learning_rate": 1.2617003636221395e-05, "loss": 0.7324, "mean_token_accuracy": 0.8162952065467834, "num_tokens": 328585.0, "step": 53 }, { "epoch": 0.9391304347826087, "grad_norm": 0.4420251250267029, "learning_rate": 1.1822816187347623e-05, "loss": 0.7539, "mean_token_accuracy": 0.813583567738533, "num_tokens": 334774.0, "step": 54 }, { "epoch": 0.9565217391304348, "grad_norm": 0.4156906008720398, "learning_rate": 1.1169463737594995e-05, "loss": 0.6769, "mean_token_accuracy": 0.8312394767999649, "num_tokens": 340913.0, "step": 55 }, { "epoch": 0.9739130434782609, "grad_norm": 0.4741043150424957, "learning_rate": 1.0659077386853816e-05, "loss": 0.6661, "mean_token_accuracy": 0.8269830048084259, "num_tokens": 347461.0, "step": 56 }, { "epoch": 0.991304347826087, "grad_norm": 0.4164107143878937, "learning_rate": 1.0293321909315242e-05, "loss": 0.6548, "mean_token_accuracy": 0.8303200602531433, "num_tokens": 353498.0, "step": 57 }, { "epoch": 1.0, "grad_norm": 0.6555842757225037, "learning_rate": 1.0073390323323897e-05, "loss": 0.9622, "mean_token_accuracy": 0.8200015425682068, "num_tokens": 355699.0, "step": 58 } ], "logging_steps": 1, "max_steps": 58, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.441734033785367e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }