{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.8,
  "eval_steps": 500,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 27.330623626708984,
      "learning_rate": 1e-05,
      "loss": 14.9828,
      "mean_token_accuracy": 0.43762992322444916,
      "step": 1
    },
    {
      "epoch": 0.016,
      "grad_norm": 25.78533935546875,
      "learning_rate": 2e-05,
      "loss": 14.48,
      "mean_token_accuracy": 0.45947156846523285,
      "step": 2
    },
    {
      "epoch": 0.024,
      "grad_norm": 23.762821197509766,
      "learning_rate": 3e-05,
      "loss": 14.3424,
      "mean_token_accuracy": 0.4559449180960655,
      "step": 3
    },
    {
      "epoch": 0.032,
      "grad_norm": 21.071897506713867,
      "learning_rate": 4e-05,
      "loss": 13.9143,
      "mean_token_accuracy": 0.46790433675050735,
      "step": 4
    },
    {
      "epoch": 0.04,
      "grad_norm": 19.133302688598633,
      "learning_rate": 5e-05,
      "loss": 12.2497,
      "mean_token_accuracy": 0.5162213444709778,
      "step": 5
    },
    {
      "epoch": 0.048,
      "grad_norm": 17.784639358520508,
      "learning_rate": 4.9473684210526315e-05,
      "loss": 11.9364,
      "mean_token_accuracy": 0.5214760452508926,
      "step": 6
    },
    {
      "epoch": 0.056,
      "grad_norm": 18.64559555053711,
      "learning_rate": 4.8947368421052635e-05,
      "loss": 10.9395,
      "mean_token_accuracy": 0.5401209890842438,
      "step": 7
    },
    {
      "epoch": 0.064,
      "grad_norm": 17.828125,
      "learning_rate": 4.842105263157895e-05,
      "loss": 10.1876,
      "mean_token_accuracy": 0.5798913389444351,
      "step": 8
    },
    {
      "epoch": 0.072,
      "grad_norm": 18.621353149414062,
      "learning_rate": 4.789473684210526e-05,
      "loss": 9.51,
      "mean_token_accuracy": 0.6059905588626862,
      "step": 9
    },
    {
      "epoch": 0.08,
      "grad_norm": 14.266256332397461,
      "learning_rate": 4.736842105263158e-05,
      "loss": 9.324,
      "mean_token_accuracy": 0.623360276222229,
      "step": 10
    },
    {
      "epoch": 0.088,
      "grad_norm": 13.611992835998535,
      "learning_rate": 4.68421052631579e-05,
      "loss": 8.9613,
      "mean_token_accuracy": 0.6348667591810226,
      "step": 11
    },
    {
      "epoch": 0.096,
      "grad_norm": 13.141629219055176,
      "learning_rate": 4.6315789473684214e-05,
      "loss": 8.1299,
      "mean_token_accuracy": 0.6677338033914566,
      "step": 12
    },
    {
      "epoch": 0.104,
      "grad_norm": 11.582746505737305,
      "learning_rate": 4.5789473684210527e-05,
      "loss": 8.1148,
      "mean_token_accuracy": 0.663286492228508,
      "step": 13
    },
    {
      "epoch": 0.112,
      "grad_norm": 10.934531211853027,
      "learning_rate": 4.5263157894736846e-05,
      "loss": 7.5403,
      "mean_token_accuracy": 0.673496663570404,
      "step": 14
    },
    {
      "epoch": 0.12,
      "grad_norm": 9.977241516113281,
      "learning_rate": 4.473684210526316e-05,
      "loss": 7.2111,
      "mean_token_accuracy": 0.688684269785881,
      "step": 15
    },
    {
      "epoch": 0.128,
      "grad_norm": 10.482184410095215,
      "learning_rate": 4.421052631578947e-05,
      "loss": 7.5258,
      "mean_token_accuracy": 0.671795666217804,
      "step": 16
    },
    {
      "epoch": 0.136,
      "grad_norm": 10.160177230834961,
      "learning_rate": 4.368421052631579e-05,
      "loss": 7.1163,
      "mean_token_accuracy": 0.6790599226951599,
      "step": 17
    },
    {
      "epoch": 0.144,
      "grad_norm": 10.689698219299316,
      "learning_rate": 4.3157894736842105e-05,
      "loss": 7.1231,
      "mean_token_accuracy": 0.6911827921867371,
      "step": 18
    },
    {
      "epoch": 0.152,
      "grad_norm": 9.446402549743652,
      "learning_rate": 4.2631578947368425e-05,
      "loss": 7.3457,
      "mean_token_accuracy": 0.689079686999321,
      "step": 19
    },
    {
      "epoch": 0.16,
      "grad_norm": 10.626145362854004,
      "learning_rate": 4.210526315789474e-05,
      "loss": 6.9955,
      "mean_token_accuracy": 0.6898764669895172,
      "step": 20
    },
    {
      "epoch": 0.168,
      "grad_norm": 10.320823669433594,
      "learning_rate": 4.157894736842106e-05,
      "loss": 6.9504,
      "mean_token_accuracy": 0.6948718279600143,
      "step": 21
    },
    {
      "epoch": 0.176,
      "grad_norm": 10.236137390136719,
      "learning_rate": 4.105263157894737e-05,
      "loss": 6.2089,
      "mean_token_accuracy": 0.7240265011787415,
      "step": 22
    },
    {
      "epoch": 0.184,
      "grad_norm": 9.453045845031738,
      "learning_rate": 4.0526315789473684e-05,
      "loss": 6.7502,
      "mean_token_accuracy": 0.7005183100700378,
      "step": 23
    },
    {
      "epoch": 0.192,
      "grad_norm": 9.327564239501953,
      "learning_rate": 4e-05,
      "loss": 6.4527,
      "mean_token_accuracy": 0.7164693623781204,
      "step": 24
    },
    {
      "epoch": 0.2,
      "grad_norm": 9.546314239501953,
      "learning_rate": 3.9473684210526316e-05,
      "loss": 5.8974,
      "mean_token_accuracy": 0.7328019142150879,
      "step": 25
    },
    {
      "epoch": 0.208,
      "grad_norm": 8.894572257995605,
      "learning_rate": 3.894736842105263e-05,
      "loss": 6.109,
      "mean_token_accuracy": 0.7263201028108597,
      "step": 26
    },
    {
      "epoch": 0.216,
      "grad_norm": 9.127656936645508,
      "learning_rate": 3.842105263157895e-05,
      "loss": 6.4875,
      "mean_token_accuracy": 0.7085271328687668,
      "step": 27
    },
    {
      "epoch": 0.224,
      "grad_norm": 9.237127304077148,
      "learning_rate": 3.789473684210527e-05,
      "loss": 6.159,
      "mean_token_accuracy": 0.7449014335870743,
      "step": 28
    },
    {
      "epoch": 0.232,
      "grad_norm": 9.572649002075195,
      "learning_rate": 3.736842105263158e-05,
      "loss": 6.247,
      "mean_token_accuracy": 0.739266037940979,
      "step": 29
    },
    {
      "epoch": 0.24,
      "grad_norm": 9.581724166870117,
      "learning_rate": 3.6842105263157895e-05,
      "loss": 6.4162,
      "mean_token_accuracy": 0.7320354580879211,
      "step": 30
    },
    {
      "epoch": 0.248,
      "grad_norm": 9.571109771728516,
      "learning_rate": 3.6315789473684214e-05,
      "loss": 6.3865,
      "mean_token_accuracy": 0.7435038238763809,
      "step": 31
    },
    {
      "epoch": 0.256,
      "grad_norm": 9.66092300415039,
      "learning_rate": 3.578947368421053e-05,
      "loss": 5.8215,
      "mean_token_accuracy": 0.7524790912866592,
      "step": 32
    },
    {
      "epoch": 0.264,
      "grad_norm": 8.532500267028809,
      "learning_rate": 3.526315789473684e-05,
      "loss": 5.8042,
      "mean_token_accuracy": 0.7616962492465973,
      "step": 33
    },
    {
      "epoch": 0.272,
      "grad_norm": 8.403843879699707,
      "learning_rate": 3.473684210526316e-05,
      "loss": 5.9317,
      "mean_token_accuracy": 0.7603590935468674,
      "step": 34
    },
    {
      "epoch": 0.28,
      "grad_norm": 8.805196762084961,
      "learning_rate": 3.421052631578947e-05,
      "loss": 6.1436,
      "mean_token_accuracy": 0.7516646534204483,
      "step": 35
    },
    {
      "epoch": 0.288,
      "grad_norm": 8.515336036682129,
      "learning_rate": 3.368421052631579e-05,
      "loss": 6.0511,
      "mean_token_accuracy": 0.7515160739421844,
      "step": 36
    },
    {
      "epoch": 0.296,
      "grad_norm": 7.560244560241699,
      "learning_rate": 3.3157894736842106e-05,
      "loss": 5.4362,
      "mean_token_accuracy": 0.772291824221611,
      "step": 37
    },
    {
      "epoch": 0.304,
      "grad_norm": 8.334061622619629,
      "learning_rate": 3.2631578947368426e-05,
      "loss": 5.5668,
      "mean_token_accuracy": 0.7626153230667114,
      "step": 38
    },
    {
      "epoch": 0.312,
      "grad_norm": 8.146302223205566,
      "learning_rate": 3.210526315789474e-05,
      "loss": 6.1313,
      "mean_token_accuracy": 0.7336651831865311,
      "step": 39
    },
    {
      "epoch": 0.32,
      "grad_norm": 7.829502582550049,
      "learning_rate": 3.157894736842105e-05,
      "loss": 6.0167,
      "mean_token_accuracy": 0.7388159483671188,
      "step": 40
    },
    {
      "epoch": 0.328,
      "grad_norm": 8.113974571228027,
      "learning_rate": 3.105263157894737e-05,
      "loss": 5.2954,
      "mean_token_accuracy": 0.776334211230278,
      "step": 41
    },
    {
      "epoch": 0.336,
      "grad_norm": 7.286009311676025,
      "learning_rate": 3.0526315789473684e-05,
      "loss": 5.6481,
      "mean_token_accuracy": 0.7564087808132172,
      "step": 42
    },
    {
      "epoch": 0.344,
      "grad_norm": 7.154447555541992,
      "learning_rate": 3e-05,
      "loss": 5.3633,
      "mean_token_accuracy": 0.7598460763692856,
      "step": 43
    },
    {
      "epoch": 0.352,
      "grad_norm": 8.190098762512207,
      "learning_rate": 2.9473684210526314e-05,
      "loss": 5.3728,
      "mean_token_accuracy": 0.7727002501487732,
      "step": 44
    },
    {
      "epoch": 0.36,
      "grad_norm": 7.3226542472839355,
      "learning_rate": 2.8947368421052634e-05,
      "loss": 5.4446,
      "mean_token_accuracy": 0.7465341240167618,
      "step": 45
    },
    {
      "epoch": 0.368,
      "grad_norm": 8.403494834899902,
      "learning_rate": 2.842105263157895e-05,
      "loss": 4.6045,
      "mean_token_accuracy": 0.7857130914926529,
      "step": 46
    },
    {
      "epoch": 0.376,
      "grad_norm": 8.063408851623535,
      "learning_rate": 2.7894736842105263e-05,
      "loss": 5.2821,
      "mean_token_accuracy": 0.762213259935379,
      "step": 47
    },
    {
      "epoch": 0.384,
      "grad_norm": 7.193646430969238,
      "learning_rate": 2.7368421052631583e-05,
      "loss": 5.7485,
      "mean_token_accuracy": 0.7417114228010178,
      "step": 48
    },
    {
      "epoch": 0.392,
      "grad_norm": 7.28355073928833,
      "learning_rate": 2.6842105263157896e-05,
      "loss": 5.5915,
      "mean_token_accuracy": 0.7473081052303314,
      "step": 49
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.384160995483398,
      "learning_rate": 2.6315789473684212e-05,
      "loss": 6.1476,
      "mean_token_accuracy": 0.7356720864772797,
      "step": 50
    },
    {
      "epoch": 0.408,
      "grad_norm": 7.840450763702393,
      "learning_rate": 2.578947368421053e-05,
      "loss": 5.2035,
      "mean_token_accuracy": 0.7573880255222321,
      "step": 51
    },
    {
      "epoch": 0.416,
      "grad_norm": 7.19984245300293,
      "learning_rate": 2.5263157894736845e-05,
      "loss": 4.6355,
      "mean_token_accuracy": 0.7865147292613983,
      "step": 52
    },
    {
      "epoch": 0.424,
      "grad_norm": 7.227206707000732,
      "learning_rate": 2.4736842105263158e-05,
      "loss": 5.7802,
      "mean_token_accuracy": 0.7442787438631058,
      "step": 53
    },
    {
      "epoch": 0.432,
      "grad_norm": 7.431645393371582,
      "learning_rate": 2.4210526315789474e-05,
      "loss": 5.5745,
      "mean_token_accuracy": 0.7558661848306656,
      "step": 54
    },
    {
      "epoch": 0.44,
      "grad_norm": 9.079876899719238,
      "learning_rate": 2.368421052631579e-05,
      "loss": 5.6616,
      "mean_token_accuracy": 0.7577391117811203,
      "step": 55
    },
    {
      "epoch": 0.448,
      "grad_norm": 7.336010456085205,
      "learning_rate": 2.3157894736842107e-05,
      "loss": 4.7,
      "mean_token_accuracy": 0.7845469415187836,
      "step": 56
    },
    {
      "epoch": 0.456,
      "grad_norm": 6.669713020324707,
      "learning_rate": 2.2631578947368423e-05,
      "loss": 5.5094,
      "mean_token_accuracy": 0.7579665780067444,
      "step": 57
    },
    {
      "epoch": 0.464,
      "grad_norm": 7.331737995147705,
      "learning_rate": 2.2105263157894736e-05,
      "loss": 4.6607,
      "mean_token_accuracy": 0.7783682346343994,
      "step": 58
    },
    {
      "epoch": 0.472,
      "grad_norm": 6.724721431732178,
      "learning_rate": 2.1578947368421053e-05,
      "loss": 5.0542,
      "mean_token_accuracy": 0.7743726819753647,
      "step": 59
    },
    {
      "epoch": 0.48,
      "grad_norm": 6.390171527862549,
      "learning_rate": 2.105263157894737e-05,
      "loss": 5.0009,
      "mean_token_accuracy": 0.780523419380188,
      "step": 60
    },
    {
      "epoch": 0.488,
      "grad_norm": 7.519730567932129,
      "learning_rate": 2.0526315789473685e-05,
      "loss": 5.4899,
      "mean_token_accuracy": 0.7700912803411484,
      "step": 61
    },
    {
      "epoch": 0.496,
      "grad_norm": 6.765895366668701,
      "learning_rate": 2e-05,
      "loss": 4.8741,
      "mean_token_accuracy": 0.7866266071796417,
      "step": 62
    },
    {
      "epoch": 0.504,
      "grad_norm": 6.651461601257324,
      "learning_rate": 1.9473684210526315e-05,
      "loss": 5.1589,
      "mean_token_accuracy": 0.769757404923439,
      "step": 63
    },
    {
      "epoch": 0.512,
      "grad_norm": 6.601439952850342,
      "learning_rate": 1.8947368421052634e-05,
      "loss": 5.1586,
      "mean_token_accuracy": 0.7651441991329193,
      "step": 64
    },
    {
      "epoch": 0.52,
      "grad_norm": 7.300909042358398,
      "learning_rate": 1.8421052631578947e-05,
      "loss": 5.1605,
      "mean_token_accuracy": 0.7735024839639664,
      "step": 65
    },
    {
      "epoch": 0.528,
      "grad_norm": 6.311753749847412,
      "learning_rate": 1.7894736842105264e-05,
      "loss": 5.0041,
      "mean_token_accuracy": 0.7669179141521454,
      "step": 66
    },
    {
      "epoch": 0.536,
      "grad_norm": 7.332730293273926,
      "learning_rate": 1.736842105263158e-05,
      "loss": 5.2616,
      "mean_token_accuracy": 0.7765114605426788,
      "step": 67
    },
    {
      "epoch": 0.544,
      "grad_norm": 6.422898292541504,
      "learning_rate": 1.6842105263157896e-05,
      "loss": 6.1462,
      "mean_token_accuracy": 0.728233814239502,
      "step": 68
    },
    {
      "epoch": 0.552,
      "grad_norm": 6.685412883758545,
      "learning_rate": 1.6315789473684213e-05,
      "loss": 5.7309,
      "mean_token_accuracy": 0.7524611800909042,
      "step": 69
    },
    {
      "epoch": 0.56,
      "grad_norm": 7.049284934997559,
      "learning_rate": 1.5789473684210526e-05,
      "loss": 5.0148,
      "mean_token_accuracy": 0.77178093791008,
      "step": 70
    },
    {
      "epoch": 0.568,
      "grad_norm": 7.036037445068359,
      "learning_rate": 1.5263157894736842e-05,
      "loss": 5.0748,
      "mean_token_accuracy": 0.7729067206382751,
      "step": 71
    },
    {
      "epoch": 0.576,
      "grad_norm": 7.007073402404785,
      "learning_rate": 1.4736842105263157e-05,
      "loss": 4.5864,
      "mean_token_accuracy": 0.7893485873937607,
      "step": 72
    },
    {
      "epoch": 0.584,
      "grad_norm": 6.996627330780029,
      "learning_rate": 1.4210526315789475e-05,
      "loss": 5.8196,
      "mean_token_accuracy": 0.7412619888782501,
      "step": 73
    },
    {
      "epoch": 0.592,
      "grad_norm": 6.522240161895752,
      "learning_rate": 1.3684210526315791e-05,
      "loss": 4.3882,
      "mean_token_accuracy": 0.8006375879049301,
      "step": 74
    },
    {
      "epoch": 0.6,
      "grad_norm": 7.209001541137695,
      "learning_rate": 1.3157894736842106e-05,
      "loss": 4.9107,
      "mean_token_accuracy": 0.7705955505371094,
      "step": 75
    },
    {
      "epoch": 0.608,
      "grad_norm": 6.461360454559326,
      "learning_rate": 1.2631578947368422e-05,
      "loss": 4.9865,
      "mean_token_accuracy": 0.7676331996917725,
      "step": 76
    },
    {
      "epoch": 0.616,
      "grad_norm": 6.816041469573975,
      "learning_rate": 1.2105263157894737e-05,
      "loss": 4.7486,
      "mean_token_accuracy": 0.8014501333236694,
      "step": 77
    },
    {
      "epoch": 0.624,
      "grad_norm": 6.4101433753967285,
      "learning_rate": 1.1578947368421053e-05,
      "loss": 4.7705,
      "mean_token_accuracy": 0.7754499018192291,
      "step": 78
    },
    {
      "epoch": 0.632,
      "grad_norm": 6.9798970222473145,
      "learning_rate": 1.1052631578947368e-05,
      "loss": 4.9164,
      "mean_token_accuracy": 0.7753257304430008,
      "step": 79
    },
    {
      "epoch": 0.64,
      "grad_norm": 6.403416633605957,
      "learning_rate": 1.0526315789473684e-05,
      "loss": 4.7818,
      "mean_token_accuracy": 0.7874085158109665,
      "step": 80
    },
    {
      "epoch": 0.648,
      "grad_norm": 6.675773620605469,
      "learning_rate": 1e-05,
      "loss": 4.5508,
      "mean_token_accuracy": 0.795722022652626,
      "step": 81
    },
    {
      "epoch": 0.656,
      "grad_norm": 6.712479591369629,
      "learning_rate": 9.473684210526317e-06,
      "loss": 4.9168,
      "mean_token_accuracy": 0.7771104872226715,
      "step": 82
    },
    {
      "epoch": 0.664,
      "grad_norm": 5.9163665771484375,
      "learning_rate": 8.947368421052632e-06,
      "loss": 4.891,
      "mean_token_accuracy": 0.7830108255147934,
      "step": 83
    },
    {
      "epoch": 0.672,
      "grad_norm": 7.163206100463867,
      "learning_rate": 8.421052631578948e-06,
      "loss": 5.149,
      "mean_token_accuracy": 0.7670323401689529,
      "step": 84
    },
    {
      "epoch": 0.68,
      "grad_norm": 6.317421913146973,
      "learning_rate": 7.894736842105263e-06,
      "loss": 4.8002,
      "mean_token_accuracy": 0.7903847545385361,
      "step": 85
    },
    {
      "epoch": 0.688,
      "grad_norm": 6.364376544952393,
      "learning_rate": 7.3684210526315784e-06,
      "loss": 4.7666,
      "mean_token_accuracy": 0.7809462994337082,
      "step": 86
    },
    {
      "epoch": 0.696,
      "grad_norm": 6.32914924621582,
      "learning_rate": 6.842105263157896e-06,
      "loss": 5.2743,
      "mean_token_accuracy": 0.7658153772354126,
      "step": 87
    },
    {
      "epoch": 0.704,
      "grad_norm": 6.604763984680176,
      "learning_rate": 6.315789473684211e-06,
      "loss": 5.2035,
      "mean_token_accuracy": 0.776008740067482,
      "step": 88
    },
    {
      "epoch": 0.712,
      "grad_norm": 6.310863494873047,
      "learning_rate": 5.789473684210527e-06,
      "loss": 5.5161,
      "mean_token_accuracy": 0.7567505836486816,
      "step": 89
    },
    {
      "epoch": 0.72,
      "grad_norm": 6.1613945960998535,
      "learning_rate": 5.263157894736842e-06,
      "loss": 5.0648,
      "mean_token_accuracy": 0.7786727696657181,
      "step": 90
    },
    {
      "epoch": 0.728,
      "grad_norm": 6.521576404571533,
      "learning_rate": 4.736842105263159e-06,
      "loss": 4.433,
      "mean_token_accuracy": 0.7890913188457489,
      "step": 91
    },
    {
      "epoch": 0.736,
      "grad_norm": 6.6246466636657715,
      "learning_rate": 4.210526315789474e-06,
      "loss": 4.6914,
      "mean_token_accuracy": 0.7863939553499222,
      "step": 92
    },
    {
      "epoch": 0.744,
      "grad_norm": 7.000185489654541,
      "learning_rate": 3.6842105263157892e-06,
      "loss": 5.0143,
      "mean_token_accuracy": 0.7784310281276703,
      "step": 93
    },
    {
      "epoch": 0.752,
      "grad_norm": 6.975312232971191,
      "learning_rate": 3.1578947368421056e-06,
      "loss": 4.7931,
      "mean_token_accuracy": 0.7743921875953674,
      "step": 94
    },
    {
      "epoch": 0.76,
      "grad_norm": 6.039007663726807,
      "learning_rate": 2.631578947368421e-06,
      "loss": 4.7647,
      "mean_token_accuracy": 0.780229240655899,
      "step": 95
    },
    {
      "epoch": 0.768,
      "grad_norm": 6.673982620239258,
      "learning_rate": 2.105263157894737e-06,
      "loss": 4.7419,
      "mean_token_accuracy": 0.7849721014499664,
      "step": 96
    },
    {
      "epoch": 0.776,
      "grad_norm": 6.379744529724121,
      "learning_rate": 1.5789473684210528e-06,
      "loss": 4.7868,
      "mean_token_accuracy": 0.7784082293510437,
      "step": 97
    },
    {
      "epoch": 0.784,
      "grad_norm": 6.387270450592041,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 4.6571,
      "mean_token_accuracy": 0.7906839102506638,
      "step": 98
    },
    {
      "epoch": 0.792,
      "grad_norm": 6.2963056564331055,
      "learning_rate": 5.263157894736843e-07,
      "loss": 5.1662,
      "mean_token_accuracy": 0.7627293914556503,
      "step": 99
    },
    {
      "epoch": 0.8,
      "grad_norm": 7.28849983215332,
      "learning_rate": 0.0,
      "loss": 5.0461,
      "mean_token_accuracy": 0.7743319720029831,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 522011226931200.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}