{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 58,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017391304347826087,
      "grad_norm": 55.45934295654297,
      "learning_rate": 0.0,
      "loss": 7.4024,
      "mean_token_accuracy": 0.1686166636645794,
      "num_tokens": 6224.0,
      "step": 1
    },
    {
      "epoch": 0.034782608695652174,
      "grad_norm": 50.480812072753906,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 7.0353,
      "mean_token_accuracy": 0.18751946836709976,
      "num_tokens": 11886.0,
      "step": 2
    },
    {
      "epoch": 0.05217391304347826,
      "grad_norm": 47.20847702026367,
      "learning_rate": 6.666666666666667e-05,
      "loss": 6.7961,
      "mean_token_accuracy": 0.1992376409471035,
      "num_tokens": 17328.0,
      "step": 3
    },
    {
      "epoch": 0.06956521739130435,
      "grad_norm": 38.443172454833984,
      "learning_rate": 0.0001,
      "loss": 5.9767,
      "mean_token_accuracy": 0.21114255115389824,
      "num_tokens": 22752.0,
      "step": 4
    },
    {
      "epoch": 0.08695652173913043,
      "grad_norm": 22.228315353393555,
      "learning_rate": 9.99266096766761e-05,
      "loss": 4.9025,
      "mean_token_accuracy": 0.24107784777879715,
      "num_tokens": 29004.0,
      "step": 5
    },
    {
      "epoch": 0.10434782608695652,
      "grad_norm": 13.45175552368164,
      "learning_rate": 9.970667809068476e-05,
      "loss": 4.2551,
      "mean_token_accuracy": 0.3103659078478813,
      "num_tokens": 35240.0,
      "step": 6
    },
    {
      "epoch": 0.12173913043478261,
      "grad_norm": 11.450566291809082,
      "learning_rate": 9.93409226131462e-05,
      "loss": 3.4704,
      "mean_token_accuracy": 0.3919261246919632,
      "num_tokens": 41848.0,
      "step": 7
    },
    {
      "epoch": 0.1391304347826087,
      "grad_norm": 11.856537818908691,
      "learning_rate": 9.883053626240502e-05,
      "loss": 3.3263,
      "mean_token_accuracy": 0.42777257412672043,
      "num_tokens": 48131.0,
      "step": 8
    },
    {
      "epoch": 0.1565217391304348,
      "grad_norm": 11.830099105834961,
      "learning_rate": 9.81771838126524e-05,
      "loss": 2.3787,
      "mean_token_accuracy": 0.5440730974078178,
      "num_tokens": 55437.0,
      "step": 9
    },
    {
      "epoch": 0.17391304347826086,
      "grad_norm": 3.2861013412475586,
      "learning_rate": 9.738299636377862e-05,
      "loss": 1.752,
      "mean_token_accuracy": 0.6369369179010391,
      "num_tokens": 61534.0,
      "step": 10
    },
    {
      "epoch": 0.19130434782608696,
      "grad_norm": 3.029527425765991,
      "learning_rate": 9.645056439016827e-05,
      "loss": 1.4911,
      "mean_token_accuracy": 0.67353655397892,
      "num_tokens": 67205.0,
      "step": 11
    },
    {
      "epoch": 0.20869565217391303,
      "grad_norm": 2.049616813659668,
      "learning_rate": 9.538292929111113e-05,
      "loss": 1.5348,
      "mean_token_accuracy": 0.6536547541618347,
      "num_tokens": 73359.0,
      "step": 12
    },
    {
      "epoch": 0.22608695652173913,
      "grad_norm": 2.315335750579834,
      "learning_rate": 9.418357347038998e-05,
      "loss": 1.3448,
      "mean_token_accuracy": 0.6856418550014496,
      "num_tokens": 79250.0,
      "step": 13
    },
    {
      "epoch": 0.24347826086956523,
      "grad_norm": 2.0546271800994873,
      "learning_rate": 9.285640897740315e-05,
      "loss": 1.4246,
      "mean_token_accuracy": 0.664088174700737,
      "num_tokens": 84981.0,
      "step": 14
    },
    {
      "epoch": 0.2608695652173913,
      "grad_norm": 1.8393396139144897,
      "learning_rate": 9.140576474687264e-05,
      "loss": 1.3907,
      "mean_token_accuracy": 0.6777825355529785,
      "num_tokens": 91084.0,
      "step": 15
    },
    {
      "epoch": 0.2782608695652174,
      "grad_norm": 1.3824291229248047,
      "learning_rate": 8.983637247875872e-05,
      "loss": 1.0379,
      "mean_token_accuracy": 0.7471358180046082,
      "num_tokens": 98793.0,
      "step": 16
    },
    {
      "epoch": 0.2956521739130435,
      "grad_norm": 1.8309787511825562,
      "learning_rate": 8.815335120443822e-05,
      "loss": 1.0902,
      "mean_token_accuracy": 0.7380675822496414,
      "num_tokens": 105183.0,
      "step": 17
    },
    {
      "epoch": 0.3130434782608696,
      "grad_norm": 1.8205807209014893,
      "learning_rate": 8.636219058948823e-05,
      "loss": 1.0416,
      "mean_token_accuracy": 0.7440497726202011,
      "num_tokens": 111071.0,
      "step": 18
    },
    {
      "epoch": 0.33043478260869563,
      "grad_norm": 1.7150083780288696,
      "learning_rate": 8.446873302753784e-05,
      "loss": 0.9923,
      "mean_token_accuracy": 0.7531551718711853,
      "num_tokens": 117451.0,
      "step": 19
    },
    {
      "epoch": 0.34782608695652173,
      "grad_norm": 1.842894434928894,
      "learning_rate": 8.247915458359473e-05,
      "loss": 0.9871,
      "mean_token_accuracy": 0.7675948143005371,
      "num_tokens": 123384.0,
      "step": 20
    },
    {
      "epoch": 0.3652173913043478,
      "grad_norm": 1.4929476976394653,
      "learning_rate": 8.039994484900463e-05,
      "loss": 0.804,
      "mean_token_accuracy": 0.7979889959096909,
      "num_tokens": 129294.0,
      "step": 21
    },
    {
      "epoch": 0.3826086956521739,
      "grad_norm": 1.15473473072052,
      "learning_rate": 7.82378857737533e-05,
      "loss": 0.8532,
      "mean_token_accuracy": 0.793354332447052,
      "num_tokens": 135724.0,
      "step": 22
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.2569639682769775,
      "learning_rate": 7.600002954515532e-05,
      "loss": 0.8879,
      "mean_token_accuracy": 0.796680137515068,
      "num_tokens": 141797.0,
      "step": 23
    },
    {
      "epoch": 0.41739130434782606,
      "grad_norm": 1.0053095817565918,
      "learning_rate": 7.369367558508489e-05,
      "loss": 0.8051,
      "mean_token_accuracy": 0.8075380921363831,
      "num_tokens": 147918.0,
      "step": 24
    },
    {
      "epoch": 0.43478260869565216,
      "grad_norm": 0.793317437171936,
      "learning_rate": 7.132634674077883e-05,
      "loss": 0.9785,
      "mean_token_accuracy": 0.7718029171228409,
      "num_tokens": 154434.0,
      "step": 25
    },
    {
      "epoch": 0.45217391304347826,
      "grad_norm": 0.7299309968948364,
      "learning_rate": 6.890576474687263e-05,
      "loss": 1.0357,
      "mean_token_accuracy": 0.744295209646225,
      "num_tokens": 161540.0,
      "step": 26
    },
    {
      "epoch": 0.46956521739130436,
      "grad_norm": 1.4765475988388062,
      "learning_rate": 6.643982503870693e-05,
      "loss": 0.6015,
      "mean_token_accuracy": 0.8558884114027023,
      "num_tokens": 167030.0,
      "step": 27
    },
    {
      "epoch": 0.48695652173913045,
      "grad_norm": 0.7446058392524719,
      "learning_rate": 6.393657099905855e-05,
      "loss": 0.7297,
      "mean_token_accuracy": 0.8198393434286118,
      "num_tokens": 172575.0,
      "step": 28
    },
    {
      "epoch": 0.5043478260869565,
      "grad_norm": 0.662484347820282,
      "learning_rate": 6.140416772229784e-05,
      "loss": 0.7215,
      "mean_token_accuracy": 0.8261076658964157,
      "num_tokens": 178395.0,
      "step": 29
    },
    {
      "epoch": 0.5217391304347826,
      "grad_norm": 1.0676500797271729,
      "learning_rate": 5.88508753815478e-05,
      "loss": 0.7392,
      "mean_token_accuracy": 0.8211972415447235,
      "num_tokens": 184121.0,
      "step": 30
    },
    {
      "epoch": 0.5391304347826087,
      "grad_norm": 1.291377067565918,
      "learning_rate": 5.628502228571633e-05,
      "loss": 0.6282,
      "mean_token_accuracy": 0.8491714000701904,
      "num_tokens": 190440.0,
      "step": 31
    },
    {
      "epoch": 0.5565217391304348,
      "grad_norm": 0.5546766519546509,
      "learning_rate": 5.3714977714283674e-05,
      "loss": 0.6955,
      "mean_token_accuracy": 0.8281570971012115,
      "num_tokens": 197221.0,
      "step": 32
    },
    {
      "epoch": 0.5739130434782609,
      "grad_norm": 0.5453093647956848,
      "learning_rate": 5.114912461845223e-05,
      "loss": 0.7473,
      "mean_token_accuracy": 0.817421019077301,
      "num_tokens": 203874.0,
      "step": 33
    },
    {
      "epoch": 0.591304347826087,
      "grad_norm": 0.46395328640937805,
      "learning_rate": 4.859583227770218e-05,
      "loss": 0.7886,
      "mean_token_accuracy": 0.805167943239212,
      "num_tokens": 210064.0,
      "step": 34
    },
    {
      "epoch": 0.6086956521739131,
      "grad_norm": 0.4362604022026062,
      "learning_rate": 4.606342900094147e-05,
      "loss": 0.7002,
      "mean_token_accuracy": 0.829146608710289,
      "num_tokens": 216569.0,
      "step": 35
    },
    {
      "epoch": 0.6260869565217392,
      "grad_norm": 0.4113505184650421,
      "learning_rate": 4.3560174961293097e-05,
      "loss": 0.8059,
      "mean_token_accuracy": 0.8010579198598862,
      "num_tokens": 222896.0,
      "step": 36
    },
    {
      "epoch": 0.6434782608695652,
      "grad_norm": 0.682939350605011,
      "learning_rate": 4.109423525312738e-05,
      "loss": 0.6212,
      "mean_token_accuracy": 0.8504037708044052,
      "num_tokens": 229011.0,
      "step": 37
    },
    {
      "epoch": 0.6608695652173913,
      "grad_norm": 0.5376163125038147,
      "learning_rate": 3.8673653259221166e-05,
      "loss": 0.8097,
      "mean_token_accuracy": 0.8016069531440735,
      "num_tokens": 235909.0,
      "step": 38
    },
    {
      "epoch": 0.6782608695652174,
      "grad_norm": 0.47806745767593384,
      "learning_rate": 3.630632441491512e-05,
      "loss": 0.7105,
      "mean_token_accuracy": 0.8202503025531769,
      "num_tokens": 242513.0,
      "step": 39
    },
    {
      "epoch": 0.6956521739130435,
      "grad_norm": 0.582610011100769,
      "learning_rate": 3.399997045484469e-05,
      "loss": 0.7017,
      "mean_token_accuracy": 0.8234356045722961,
      "num_tokens": 248157.0,
      "step": 40
    },
    {
      "epoch": 0.7130434782608696,
      "grad_norm": 0.5676279664039612,
      "learning_rate": 3.176211422624672e-05,
      "loss": 0.6786,
      "mean_token_accuracy": 0.8357308208942413,
      "num_tokens": 253917.0,
      "step": 41
    },
    {
      "epoch": 0.7304347826086957,
      "grad_norm": 0.43341803550720215,
      "learning_rate": 2.9600055150995398e-05,
      "loss": 0.7935,
      "mean_token_accuracy": 0.802367627620697,
      "num_tokens": 260731.0,
      "step": 42
    },
    {
      "epoch": 0.7478260869565218,
      "grad_norm": 0.5170316100120544,
      "learning_rate": 2.7520845416405282e-05,
      "loss": 0.6804,
      "mean_token_accuracy": 0.8378360271453857,
      "num_tokens": 266433.0,
      "step": 43
    },
    {
      "epoch": 0.7652173913043478,
      "grad_norm": 0.48307615518569946,
      "learning_rate": 2.5531266972462177e-05,
      "loss": 0.7748,
      "mean_token_accuracy": 0.8051830232143402,
      "num_tokens": 273029.0,
      "step": 44
    },
    {
      "epoch": 0.782608695652174,
      "grad_norm": 0.45667076110839844,
      "learning_rate": 2.36378094105118e-05,
      "loss": 0.5937,
      "mean_token_accuracy": 0.8383966088294983,
      "num_tokens": 279655.0,
      "step": 45
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.41323214769363403,
      "learning_rate": 2.1846648795561774e-05,
      "loss": 0.6688,
      "mean_token_accuracy": 0.8343894928693771,
      "num_tokens": 286066.0,
      "step": 46
    },
    {
      "epoch": 0.8173913043478261,
      "grad_norm": 0.6046397089958191,
      "learning_rate": 2.0163627521241292e-05,
      "loss": 0.614,
      "mean_token_accuracy": 0.8473068177700043,
      "num_tokens": 291788.0,
      "step": 47
    },
    {
      "epoch": 0.8347826086956521,
      "grad_norm": 0.3877752721309662,
      "learning_rate": 1.8594235253127375e-05,
      "loss": 0.4869,
      "mean_token_accuracy": 0.8654044568538666,
      "num_tokens": 298877.0,
      "step": 48
    },
    {
      "epoch": 0.8521739130434782,
      "grad_norm": 0.38120725750923157,
      "learning_rate": 1.7143591022596845e-05,
      "loss": 0.6164,
      "mean_token_accuracy": 0.8455094546079636,
      "num_tokens": 304903.0,
      "step": 49
    },
    {
      "epoch": 0.8695652173913043,
      "grad_norm": 0.4243324398994446,
      "learning_rate": 1.5816426529610035e-05,
      "loss": 0.6793,
      "mean_token_accuracy": 0.8279502987861633,
      "num_tokens": 310741.0,
      "step": 50
    },
    {
      "epoch": 0.8869565217391304,
      "grad_norm": 0.4153580963611603,
      "learning_rate": 1.4617070708888881e-05,
      "loss": 0.6215,
      "mean_token_accuracy": 0.8429995030164719,
      "num_tokens": 315959.0,
      "step": 51
    },
    {
      "epoch": 0.9043478260869565,
      "grad_norm": 0.3602670133113861,
      "learning_rate": 1.3549435609831752e-05,
      "loss": 0.7285,
      "mean_token_accuracy": 0.822294071316719,
      "num_tokens": 322024.0,
      "step": 52
    },
    {
      "epoch": 0.9217391304347826,
      "grad_norm": 0.4163176119327545,
      "learning_rate": 1.2617003636221395e-05,
      "loss": 0.7324,
      "mean_token_accuracy": 0.8162952065467834,
      "num_tokens": 328585.0,
      "step": 53
    },
    {
      "epoch": 0.9391304347826087,
      "grad_norm": 0.4420251250267029,
      "learning_rate": 1.1822816187347623e-05,
      "loss": 0.7539,
      "mean_token_accuracy": 0.813583567738533,
      "num_tokens": 334774.0,
      "step": 54
    },
    {
      "epoch": 0.9565217391304348,
      "grad_norm": 0.4156906008720398,
      "learning_rate": 1.1169463737594995e-05,
      "loss": 0.6769,
      "mean_token_accuracy": 0.8312394767999649,
      "num_tokens": 340913.0,
      "step": 55
    },
    {
      "epoch": 0.9739130434782609,
      "grad_norm": 0.4741043150424957,
      "learning_rate": 1.0659077386853816e-05,
      "loss": 0.6661,
      "mean_token_accuracy": 0.8269830048084259,
      "num_tokens": 347461.0,
      "step": 56
    },
    {
      "epoch": 0.991304347826087,
      "grad_norm": 0.4164107143878937,
      "learning_rate": 1.0293321909315242e-05,
      "loss": 0.6548,
      "mean_token_accuracy": 0.8303200602531433,
      "num_tokens": 353498.0,
      "step": 57
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.6555842757225037,
      "learning_rate": 1.0073390323323897e-05,
      "loss": 0.9622,
      "mean_token_accuracy": 0.8200015425682068,
      "num_tokens": 355699.0,
      "step": 58
    }
  ],
  "logging_steps": 1,
  "max_steps": 58,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.441734033785367e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}