| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9917808219178084, | |
| "eval_steps": 100000.0, | |
| "global_step": 364, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005479452054794521, | |
| "grad_norm": 432.0, | |
| "learning_rate": 0.0, | |
| "loss": 5.7373, | |
| "mean_token_accuracy": 0.6561740338802338, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.010958904109589041, | |
| "grad_norm": 334.0, | |
| "learning_rate": 5.454545454545455e-07, | |
| "loss": 5.8256, | |
| "mean_token_accuracy": 0.6489620804786682, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01643835616438356, | |
| "grad_norm": 268.0, | |
| "learning_rate": 1.090909090909091e-06, | |
| "loss": 5.6617, | |
| "mean_token_accuracy": 0.6564787924289703, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.021917808219178082, | |
| "grad_norm": 235.0, | |
| "learning_rate": 1.6363636363636363e-06, | |
| "loss": 5.3991, | |
| "mean_token_accuracy": 0.6644423305988312, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0273972602739726, | |
| "grad_norm": 208.0, | |
| "learning_rate": 2.181818181818182e-06, | |
| "loss": 5.2301, | |
| "mean_token_accuracy": 0.6637968420982361, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03287671232876712, | |
| "grad_norm": 137.0, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "loss": 4.7387, | |
| "mean_token_accuracy": 0.664221853017807, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.038356164383561646, | |
| "grad_norm": 127.0, | |
| "learning_rate": 3.2727272727272725e-06, | |
| "loss": 4.2579, | |
| "mean_token_accuracy": 0.6794511079788208, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.043835616438356165, | |
| "grad_norm": 77.5, | |
| "learning_rate": 3.818181818181818e-06, | |
| "loss": 3.9438, | |
| "mean_token_accuracy": 0.6846511363983154, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.049315068493150684, | |
| "grad_norm": 63.25, | |
| "learning_rate": 4.363636363636364e-06, | |
| "loss": 3.6434, | |
| "mean_token_accuracy": 0.6952552497386932, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0547945205479452, | |
| "grad_norm": 38.25, | |
| "learning_rate": 4.90909090909091e-06, | |
| "loss": 3.1414, | |
| "mean_token_accuracy": 0.7147609293460846, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06027397260273973, | |
| "grad_norm": 30.625, | |
| "learning_rate": 5.4545454545454545e-06, | |
| "loss": 2.846, | |
| "mean_token_accuracy": 0.7295940220355988, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.06575342465753424, | |
| "grad_norm": 24.125, | |
| "learning_rate": 6e-06, | |
| "loss": 2.6953, | |
| "mean_token_accuracy": 0.7373267114162445, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.07123287671232877, | |
| "grad_norm": 24.25, | |
| "learning_rate": 6.545454545454545e-06, | |
| "loss": 2.457, | |
| "mean_token_accuracy": 0.7515099048614502, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.07671232876712329, | |
| "grad_norm": 28.375, | |
| "learning_rate": 7.090909090909091e-06, | |
| "loss": 2.3428, | |
| "mean_token_accuracy": 0.7600045800209045, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0821917808219178, | |
| "grad_norm": 25.0, | |
| "learning_rate": 7.636363636363636e-06, | |
| "loss": 2.1486, | |
| "mean_token_accuracy": 0.7709822058677673, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08767123287671233, | |
| "grad_norm": 53.0, | |
| "learning_rate": 8.181818181818181e-06, | |
| "loss": 1.9053, | |
| "mean_token_accuracy": 0.7999999523162842, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.09315068493150686, | |
| "grad_norm": 15.1875, | |
| "learning_rate": 8.727272727272728e-06, | |
| "loss": 1.8888, | |
| "mean_token_accuracy": 0.8073928356170654, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.09863013698630137, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 9.272727272727273e-06, | |
| "loss": 1.7244, | |
| "mean_token_accuracy": 0.8240509331226349, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.10410958904109589, | |
| "grad_norm": 20.875, | |
| "learning_rate": 9.81818181818182e-06, | |
| "loss": 1.4995, | |
| "mean_token_accuracy": 0.8403702080249786, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.1095890410958904, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.0363636363636364e-05, | |
| "loss": 1.3693, | |
| "mean_token_accuracy": 0.8489924073219299, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11506849315068493, | |
| "grad_norm": 10.5625, | |
| "learning_rate": 1.0909090909090909e-05, | |
| "loss": 1.3431, | |
| "mean_token_accuracy": 0.8515270948410034, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.12054794520547946, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 1.1454545454545455e-05, | |
| "loss": 1.2566, | |
| "mean_token_accuracy": 0.8611365258693695, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.12602739726027398, | |
| "grad_norm": 7.125, | |
| "learning_rate": 1.2e-05, | |
| "loss": 1.2904, | |
| "mean_token_accuracy": 0.8501682281494141, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.13150684931506848, | |
| "grad_norm": 6.40625, | |
| "learning_rate": 1.2545454545454545e-05, | |
| "loss": 1.2241, | |
| "mean_token_accuracy": 0.8605895340442657, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.136986301369863, | |
| "grad_norm": 7.96875, | |
| "learning_rate": 1.309090909090909e-05, | |
| "loss": 1.2675, | |
| "mean_token_accuracy": 0.8513283133506775, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.14246575342465753, | |
| "grad_norm": 22.0, | |
| "learning_rate": 1.3636363636363637e-05, | |
| "loss": 1.1597, | |
| "mean_token_accuracy": 0.8639258444309235, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.14794520547945206, | |
| "grad_norm": 14.625, | |
| "learning_rate": 1.4181818181818181e-05, | |
| "loss": 1.1697, | |
| "mean_token_accuracy": 0.8613505661487579, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.15342465753424658, | |
| "grad_norm": 32.75, | |
| "learning_rate": 1.4727272727272728e-05, | |
| "loss": 1.1789, | |
| "mean_token_accuracy": 0.864201158285141, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1589041095890411, | |
| "grad_norm": 3.71875, | |
| "learning_rate": 1.5272727272727273e-05, | |
| "loss": 1.0445, | |
| "mean_token_accuracy": 0.8769258260726929, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.1643835616438356, | |
| "grad_norm": 3.171875, | |
| "learning_rate": 1.5818181818181818e-05, | |
| "loss": 1.1214, | |
| "mean_token_accuracy": 0.8670870959758759, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.16986301369863013, | |
| "grad_norm": 6.96875, | |
| "learning_rate": 1.6363636363636363e-05, | |
| "loss": 1.1023, | |
| "mean_token_accuracy": 0.8681823015213013, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.17534246575342466, | |
| "grad_norm": 3.5, | |
| "learning_rate": 1.6909090909090907e-05, | |
| "loss": 1.0772, | |
| "mean_token_accuracy": 0.8703848421573639, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.18082191780821918, | |
| "grad_norm": 3.390625, | |
| "learning_rate": 1.7454545454545456e-05, | |
| "loss": 1.0942, | |
| "mean_token_accuracy": 0.8670253753662109, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1863013698630137, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.1039, | |
| "mean_token_accuracy": 0.8668511509895325, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.1917808219178082, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 1.8545454545454545e-05, | |
| "loss": 1.0494, | |
| "mean_token_accuracy": 0.8717499673366547, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.19726027397260273, | |
| "grad_norm": 2.953125, | |
| "learning_rate": 1.909090909090909e-05, | |
| "loss": 1.0125, | |
| "mean_token_accuracy": 0.8741099536418915, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.20273972602739726, | |
| "grad_norm": 2.65625, | |
| "learning_rate": 1.963636363636364e-05, | |
| "loss": 1.0391, | |
| "mean_token_accuracy": 0.8721306025981903, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.20821917808219179, | |
| "grad_norm": 2.375, | |
| "learning_rate": 2.0181818181818183e-05, | |
| "loss": 1.0215, | |
| "mean_token_accuracy": 0.8722277879714966, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.2136986301369863, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 2.0727272727272728e-05, | |
| "loss": 1.223, | |
| "mean_token_accuracy": 0.8559636473655701, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.2191780821917808, | |
| "grad_norm": 10.625, | |
| "learning_rate": 2.1272727272727273e-05, | |
| "loss": 1.0187, | |
| "mean_token_accuracy": 0.8717967867851257, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.22465753424657534, | |
| "grad_norm": 2.625, | |
| "learning_rate": 2.1818181818181818e-05, | |
| "loss": 0.9888, | |
| "mean_token_accuracy": 0.8752720355987549, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.23013698630136986, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.2363636363636366e-05, | |
| "loss": 0.9372, | |
| "mean_token_accuracy": 0.8807816803455353, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.2356164383561644, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 2.290909090909091e-05, | |
| "loss": 0.9489, | |
| "mean_token_accuracy": 0.878354549407959, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.2410958904109589, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.3454545454545456e-05, | |
| "loss": 0.9668, | |
| "mean_token_accuracy": 0.8792111873626709, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.2465753424657534, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.4e-05, | |
| "loss": 0.9728, | |
| "mean_token_accuracy": 0.8788473904132843, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.25205479452054796, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 2.454545454545455e-05, | |
| "loss": 0.9948, | |
| "mean_token_accuracy": 0.8776916861534119, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.25753424657534246, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.509090909090909e-05, | |
| "loss": 0.944, | |
| "mean_token_accuracy": 0.8777068257331848, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.26301369863013696, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 2.5636363636363635e-05, | |
| "loss": 1.0027, | |
| "mean_token_accuracy": 0.8723031580448151, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2684931506849315, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 2.618181818181818e-05, | |
| "loss": 0.9266, | |
| "mean_token_accuracy": 0.8784593641757965, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.273972602739726, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 2.6727272727272728e-05, | |
| "loss": 0.9568, | |
| "mean_token_accuracy": 0.8799601793289185, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.27945205479452057, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 2.7272727272727273e-05, | |
| "loss": 0.9519, | |
| "mean_token_accuracy": 0.8768041133880615, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.28493150684931506, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 2.7818181818181818e-05, | |
| "loss": 0.9182, | |
| "mean_token_accuracy": 0.8792722523212433, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.29041095890410956, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 2.8363636363636363e-05, | |
| "loss": 0.9485, | |
| "mean_token_accuracy": 0.8771733045578003, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2958904109589041, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 2.890909090909091e-05, | |
| "loss": 0.9637, | |
| "mean_token_accuracy": 0.8786839842796326, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.3013698630136986, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 2.9454545454545456e-05, | |
| "loss": 0.9598, | |
| "mean_token_accuracy": 0.8768008947372437, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.30684931506849317, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 3e-05, | |
| "loss": 0.938, | |
| "mean_token_accuracy": 0.88134765625, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.31232876712328766, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.9902912621359224e-05, | |
| "loss": 0.9513, | |
| "mean_token_accuracy": 0.8794196844100952, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3178082191780822, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.9805825242718447e-05, | |
| "loss": 0.9744, | |
| "mean_token_accuracy": 0.8775176107883453, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3232876712328767, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.970873786407767e-05, | |
| "loss": 0.8583, | |
| "mean_token_accuracy": 0.8896125555038452, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.3287671232876712, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.9611650485436896e-05, | |
| "loss": 0.8944, | |
| "mean_token_accuracy": 0.8858632743358612, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.33424657534246577, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 2.9514563106796115e-05, | |
| "loss": 0.9149, | |
| "mean_token_accuracy": 0.8818275928497314, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.33972602739726027, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 2.941747572815534e-05, | |
| "loss": 0.9311, | |
| "mean_token_accuracy": 0.8800110220909119, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.3452054794520548, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 2.9320388349514565e-05, | |
| "loss": 0.9611, | |
| "mean_token_accuracy": 0.8757826089859009, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.3506849315068493, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.9223300970873787e-05, | |
| "loss": 0.8995, | |
| "mean_token_accuracy": 0.8855718970298767, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.3561643835616438, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.912621359223301e-05, | |
| "loss": 0.7973, | |
| "mean_token_accuracy": 0.899163693189621, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.36164383561643837, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.9029126213592237e-05, | |
| "loss": 0.873, | |
| "mean_token_accuracy": 0.8860227465629578, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.36712328767123287, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.8932038834951456e-05, | |
| "loss": 0.9315, | |
| "mean_token_accuracy": 0.8812584280967712, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.3726027397260274, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 2.8834951456310683e-05, | |
| "loss": 0.989, | |
| "mean_token_accuracy": 0.876181036233902, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.3780821917808219, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 2.8737864077669902e-05, | |
| "loss": 0.956, | |
| "mean_token_accuracy": 0.8795388042926788, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.3835616438356164, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 2.8640776699029125e-05, | |
| "loss": 0.943, | |
| "mean_token_accuracy": 0.8829234540462494, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.38904109589041097, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 2.854368932038835e-05, | |
| "loss": 0.9224, | |
| "mean_token_accuracy": 0.8784515261650085, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.39452054794520547, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 2.844660194174757e-05, | |
| "loss": 0.8924, | |
| "mean_token_accuracy": 0.8858824968338013, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 2.8349514563106797e-05, | |
| "loss": 0.8978, | |
| "mean_token_accuracy": 0.8824579417705536, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.4054794520547945, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 2.825242718446602e-05, | |
| "loss": 0.9025, | |
| "mean_token_accuracy": 0.8830565512180328, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.410958904109589, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 2.8155339805825243e-05, | |
| "loss": 0.9297, | |
| "mean_token_accuracy": 0.8840304613113403, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.41643835616438357, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.8058252427184466e-05, | |
| "loss": 0.8745, | |
| "mean_token_accuracy": 0.887761652469635, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.42191780821917807, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.7961165048543692e-05, | |
| "loss": 0.8751, | |
| "mean_token_accuracy": 0.8896037936210632, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.4273972602739726, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 2.786407766990291e-05, | |
| "loss": 0.8875, | |
| "mean_token_accuracy": 0.8843996524810791, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.4328767123287671, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.7766990291262138e-05, | |
| "loss": 0.8524, | |
| "mean_token_accuracy": 0.8866465091705322, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.4383561643835616, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 2.766990291262136e-05, | |
| "loss": 0.9276, | |
| "mean_token_accuracy": 0.8828848004341125, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.4438356164383562, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.757281553398058e-05, | |
| "loss": 0.9067, | |
| "mean_token_accuracy": 0.8834775686264038, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.44931506849315067, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.7475728155339807e-05, | |
| "loss": 0.8763, | |
| "mean_token_accuracy": 0.8872078657150269, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.4547945205479452, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 2.737864077669903e-05, | |
| "loss": 0.9264, | |
| "mean_token_accuracy": 0.8818502724170685, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.4602739726027397, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 2.7281553398058253e-05, | |
| "loss": 0.848, | |
| "mean_token_accuracy": 0.8881970345973969, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.4657534246575342, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.7184466019417475e-05, | |
| "loss": 0.845, | |
| "mean_token_accuracy": 0.8866861760616302, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.4712328767123288, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 2.7087378640776702e-05, | |
| "loss": 0.8696, | |
| "mean_token_accuracy": 0.8877186477184296, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.4767123287671233, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.699029126213592e-05, | |
| "loss": 0.838, | |
| "mean_token_accuracy": 0.8892369568347931, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.4821917808219178, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 2.6893203883495148e-05, | |
| "loss": 0.8945, | |
| "mean_token_accuracy": 0.8853996694087982, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.4876712328767123, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 2.679611650485437e-05, | |
| "loss": 0.8833, | |
| "mean_token_accuracy": 0.8856672644615173, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.4931506849315068, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 2.6699029126213593e-05, | |
| "loss": 0.8055, | |
| "mean_token_accuracy": 0.8934458792209625, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4986301369863014, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.6601941747572816e-05, | |
| "loss": 0.852, | |
| "mean_token_accuracy": 0.8876021206378937, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.5041095890410959, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 2.6504854368932043e-05, | |
| "loss": 0.8645, | |
| "mean_token_accuracy": 0.8882910907268524, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.5095890410958904, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.6407766990291262e-05, | |
| "loss": 0.8375, | |
| "mean_token_accuracy": 0.8917896747589111, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.5150684931506849, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 2.6310679611650485e-05, | |
| "loss": 0.8659, | |
| "mean_token_accuracy": 0.8871166706085205, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.5205479452054794, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 2.6213592233009708e-05, | |
| "loss": 0.7912, | |
| "mean_token_accuracy": 0.8958589434623718, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.5260273972602739, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.611650485436893e-05, | |
| "loss": 0.8078, | |
| "mean_token_accuracy": 0.8912127017974854, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.5315068493150685, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.6019417475728157e-05, | |
| "loss": 0.9165, | |
| "mean_token_accuracy": 0.8832310140132904, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.536986301369863, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 2.5922330097087377e-05, | |
| "loss": 0.8473, | |
| "mean_token_accuracy": 0.8883266150951385, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.5424657534246575, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.5825242718446603e-05, | |
| "loss": 0.8002, | |
| "mean_token_accuracy": 0.8930515050888062, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.547945205479452, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 2.5728155339805826e-05, | |
| "loss": 0.9012, | |
| "mean_token_accuracy": 0.8813818693161011, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5534246575342465, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.563106796116505e-05, | |
| "loss": 0.7787, | |
| "mean_token_accuracy": 0.8958908021450043, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.5589041095890411, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.5533980582524272e-05, | |
| "loss": 0.8255, | |
| "mean_token_accuracy": 0.8933219909667969, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.5643835616438356, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.5436893203883498e-05, | |
| "loss": 0.7465, | |
| "mean_token_accuracy": 0.9005298018455505, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.5698630136986301, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 2.5339805825242718e-05, | |
| "loss": 0.8528, | |
| "mean_token_accuracy": 0.8887440264225006, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.5753424657534246, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 2.5242718446601944e-05, | |
| "loss": 0.8662, | |
| "mean_token_accuracy": 0.8833577632904053, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.5808219178082191, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 2.5145631067961167e-05, | |
| "loss": 0.8146, | |
| "mean_token_accuracy": 0.894057959318161, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.5863013698630137, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 2.5048543689320386e-05, | |
| "loss": 0.8284, | |
| "mean_token_accuracy": 0.8923339545726776, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.5917808219178082, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 2.4951456310679613e-05, | |
| "loss": 0.8293, | |
| "mean_token_accuracy": 0.8885443806648254, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.5972602739726027, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.4854368932038836e-05, | |
| "loss": 0.7819, | |
| "mean_token_accuracy": 0.8964778184890747, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.6027397260273972, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 2.475728155339806e-05, | |
| "loss": 0.8938, | |
| "mean_token_accuracy": 0.8822830021381378, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6082191780821918, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.466019417475728e-05, | |
| "loss": 0.8234, | |
| "mean_token_accuracy": 0.8917379379272461, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.6136986301369863, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 2.4563106796116508e-05, | |
| "loss": 0.8456, | |
| "mean_token_accuracy": 0.8898132741451263, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.6191780821917808, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 2.4466019417475727e-05, | |
| "loss": 0.7978, | |
| "mean_token_accuracy": 0.8950874209403992, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.6246575342465753, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.4368932038834954e-05, | |
| "loss": 0.7492, | |
| "mean_token_accuracy": 0.9010921716690063, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.6301369863013698, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 2.4271844660194176e-05, | |
| "loss": 0.8407, | |
| "mean_token_accuracy": 0.891789972782135, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.6356164383561644, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 2.41747572815534e-05, | |
| "loss": 0.7685, | |
| "mean_token_accuracy": 0.8990534842014313, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.6410958904109589, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 2.4077669902912622e-05, | |
| "loss": 0.777, | |
| "mean_token_accuracy": 0.8945018351078033, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.6465753424657534, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.3980582524271845e-05, | |
| "loss": 0.7919, | |
| "mean_token_accuracy": 0.8964290022850037, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.6520547945205479, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.3883495145631068e-05, | |
| "loss": 0.8213, | |
| "mean_token_accuracy": 0.8932888507843018, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.6575342465753424, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 2.378640776699029e-05, | |
| "loss": 0.7114, | |
| "mean_token_accuracy": 0.9056997001171112, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.663013698630137, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 2.3689320388349514e-05, | |
| "loss": 0.8486, | |
| "mean_token_accuracy": 0.8888402283191681, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.6684931506849315, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.3592233009708737e-05, | |
| "loss": 0.7702, | |
| "mean_token_accuracy": 0.8981845676898956, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.673972602739726, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.3495145631067963e-05, | |
| "loss": 0.7341, | |
| "mean_token_accuracy": 0.9025160670280457, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.6794520547945205, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.3398058252427183e-05, | |
| "loss": 0.833, | |
| "mean_token_accuracy": 0.8900012671947479, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.684931506849315, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 2.330097087378641e-05, | |
| "loss": 0.8187, | |
| "mean_token_accuracy": 0.8908300995826721, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.6904109589041096, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.3203883495145632e-05, | |
| "loss": 0.807, | |
| "mean_token_accuracy": 0.8888507187366486, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.6958904109589041, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 2.3106796116504855e-05, | |
| "loss": 0.7383, | |
| "mean_token_accuracy": 0.8997284471988678, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.7013698630136986, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 2.3009708737864078e-05, | |
| "loss": 0.7662, | |
| "mean_token_accuracy": 0.8989343047142029, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.7068493150684931, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 2.2912621359223304e-05, | |
| "loss": 0.7907, | |
| "mean_token_accuracy": 0.8955272138118744, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.7123287671232876, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 2.2815533980582524e-05, | |
| "loss": 0.7521, | |
| "mean_token_accuracy": 0.897832453250885, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7178082191780822, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 2.2718446601941746e-05, | |
| "loss": 0.7513, | |
| "mean_token_accuracy": 0.8988269567489624, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.7232876712328767, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.2621359223300973e-05, | |
| "loss": 0.7355, | |
| "mean_token_accuracy": 0.9037497639656067, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.7287671232876712, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 2.2524271844660192e-05, | |
| "loss": 0.8022, | |
| "mean_token_accuracy": 0.8948836922645569, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.7342465753424657, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.242718446601942e-05, | |
| "loss": 0.7153, | |
| "mean_token_accuracy": 0.9037838876247406, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.7397260273972602, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.233009708737864e-05, | |
| "loss": 0.7801, | |
| "mean_token_accuracy": 0.8935177326202393, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.7452054794520548, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 2.2233009708737864e-05, | |
| "loss": 0.742, | |
| "mean_token_accuracy": 0.8993457555770874, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.7506849315068493, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 2.2135922330097087e-05, | |
| "loss": 0.7563, | |
| "mean_token_accuracy": 0.9012070000171661, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.7561643835616438, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.2038834951456314e-05, | |
| "loss": 0.666, | |
| "mean_token_accuracy": 0.9094895124435425, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.7616438356164383, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.1941747572815533e-05, | |
| "loss": 0.8154, | |
| "mean_token_accuracy": 0.8935888111591339, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.7671232876712328, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 2.184466019417476e-05, | |
| "loss": 0.7885, | |
| "mean_token_accuracy": 0.8952848613262177, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7726027397260274, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 2.1747572815533982e-05, | |
| "loss": 0.8052, | |
| "mean_token_accuracy": 0.895246148109436, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.7780821917808219, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 2.1650485436893205e-05, | |
| "loss": 0.7475, | |
| "mean_token_accuracy": 0.9011639654636383, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.7835616438356164, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 2.1553398058252428e-05, | |
| "loss": 0.7869, | |
| "mean_token_accuracy": 0.8955070972442627, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.7890410958904109, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 2.145631067961165e-05, | |
| "loss": 0.7642, | |
| "mean_token_accuracy": 0.900310605764389, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.7945205479452054, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 2.1359223300970874e-05, | |
| "loss": 0.7373, | |
| "mean_token_accuracy": 0.9015851616859436, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 2.1262135922330097e-05, | |
| "loss": 0.6909, | |
| "mean_token_accuracy": 0.9065253138542175, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.8054794520547945, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.116504854368932e-05, | |
| "loss": 0.8044, | |
| "mean_token_accuracy": 0.8923492431640625, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.810958904109589, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 2.1067961165048543e-05, | |
| "loss": 0.7616, | |
| "mean_token_accuracy": 0.898425966501236, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.8164383561643835, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.097087378640777e-05, | |
| "loss": 0.7691, | |
| "mean_token_accuracy": 0.8980711996555328, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.821917808219178, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 2.087378640776699e-05, | |
| "loss": 0.6681, | |
| "mean_token_accuracy": 0.9098504185676575, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8273972602739726, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.0776699029126215e-05, | |
| "loss": 0.7424, | |
| "mean_token_accuracy": 0.9017688035964966, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.8328767123287671, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 2.0679611650485438e-05, | |
| "loss": 0.7083, | |
| "mean_token_accuracy": 0.9045331180095673, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.8383561643835616, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 2.058252427184466e-05, | |
| "loss": 0.6966, | |
| "mean_token_accuracy": 0.9057001769542694, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.8438356164383561, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 2.0485436893203884e-05, | |
| "loss": 0.7436, | |
| "mean_token_accuracy": 0.9005197286605835, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.8493150684931506, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 2.0388349514563107e-05, | |
| "loss": 0.7627, | |
| "mean_token_accuracy": 0.8969089984893799, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.8547945205479452, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.029126213592233e-05, | |
| "loss": 0.8324, | |
| "mean_token_accuracy": 0.8917456269264221, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.8602739726027397, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.0194174757281552e-05, | |
| "loss": 0.7393, | |
| "mean_token_accuracy": 0.8997950255870819, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.8657534246575342, | |
| "grad_norm": 1.625, | |
| "learning_rate": 2.009708737864078e-05, | |
| "loss": 0.7519, | |
| "mean_token_accuracy": 0.8997250497341156, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.8712328767123287, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.9999999999999998e-05, | |
| "loss": 0.7531, | |
| "mean_token_accuracy": 0.8967012465000153, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.8767123287671232, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.9902912621359225e-05, | |
| "loss": 0.7243, | |
| "mean_token_accuracy": 0.9039227664470673, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8821917808219178, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 1.9805825242718447e-05, | |
| "loss": 0.7315, | |
| "mean_token_accuracy": 0.9020437300205231, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.8876712328767123, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.970873786407767e-05, | |
| "loss": 0.8194, | |
| "mean_token_accuracy": 0.8896900415420532, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.8931506849315068, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.9611650485436893e-05, | |
| "loss": 0.7287, | |
| "mean_token_accuracy": 0.9017397165298462, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.8986301369863013, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.951456310679612e-05, | |
| "loss": 0.8069, | |
| "mean_token_accuracy": 0.896651417016983, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.9041095890410958, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.941747572815534e-05, | |
| "loss": 0.8212, | |
| "mean_token_accuracy": 0.890827476978302, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.9095890410958904, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 1.9320388349514565e-05, | |
| "loss": 0.7502, | |
| "mean_token_accuracy": 0.9001190960407257, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.915068493150685, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.922330097087379e-05, | |
| "loss": 0.7855, | |
| "mean_token_accuracy": 0.8954821825027466, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.9205479452054794, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.9126213592233008e-05, | |
| "loss": 0.7034, | |
| "mean_token_accuracy": 0.9049507081508636, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.9260273972602739, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.9029126213592234e-05, | |
| "loss": 0.7162, | |
| "mean_token_accuracy": 0.9048155248165131, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.9315068493150684, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 1.8932038834951457e-05, | |
| "loss": 0.7308, | |
| "mean_token_accuracy": 0.9018749892711639, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.936986301369863, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.883495145631068e-05, | |
| "loss": 0.7384, | |
| "mean_token_accuracy": 0.9014742374420166, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.9424657534246575, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.8737864077669903e-05, | |
| "loss": 0.797, | |
| "mean_token_accuracy": 0.8951345086097717, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.947945205479452, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.8640776699029126e-05, | |
| "loss": 0.7149, | |
| "mean_token_accuracy": 0.9054000973701477, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.9534246575342465, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.854368932038835e-05, | |
| "loss": 0.693, | |
| "mean_token_accuracy": 0.9068967401981354, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.958904109589041, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.8446601941747575e-05, | |
| "loss": 0.7656, | |
| "mean_token_accuracy": 0.8967747688293457, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.9643835616438357, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 1.8349514563106795e-05, | |
| "loss": 0.7075, | |
| "mean_token_accuracy": 0.9012430012226105, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.9698630136986301, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.825242718446602e-05, | |
| "loss": 0.7531, | |
| "mean_token_accuracy": 0.9003328382968903, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.9753424657534246, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 1.8155339805825244e-05, | |
| "loss": 0.7687, | |
| "mean_token_accuracy": 0.8957654237747192, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.9808219178082191, | |
| "grad_norm": 1.625, | |
| "learning_rate": 1.8058252427184467e-05, | |
| "loss": 0.7372, | |
| "mean_token_accuracy": 0.9013311266899109, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.9863013698630136, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 1.796116504854369e-05, | |
| "loss": 0.7142, | |
| "mean_token_accuracy": 0.9064333140850067, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9917808219178083, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.7864077669902913e-05, | |
| "loss": 0.6527, | |
| "mean_token_accuracy": 0.9137677848339081, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.9972602739726028, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.7766990291262135e-05, | |
| "loss": 0.631, | |
| "mean_token_accuracy": 0.9135155379772186, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 1.766990291262136e-05, | |
| "loss": 0.3493, | |
| "mean_token_accuracy": 0.9048470258712769, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.0054794520547945, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.7572815533980585e-05, | |
| "loss": 0.5393, | |
| "mean_token_accuracy": 0.9253315627574921, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.010958904109589, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.7475728155339804e-05, | |
| "loss": 0.6205, | |
| "mean_token_accuracy": 0.9160933494567871, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.0164383561643835, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.737864077669903e-05, | |
| "loss": 0.6018, | |
| "mean_token_accuracy": 0.9159659445285797, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.021917808219178, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.7281553398058253e-05, | |
| "loss": 0.6171, | |
| "mean_token_accuracy": 0.9156332015991211, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.0273972602739727, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.7184466019417476e-05, | |
| "loss": 0.5991, | |
| "mean_token_accuracy": 0.9195460677146912, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.0328767123287672, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.70873786407767e-05, | |
| "loss": 0.6002, | |
| "mean_token_accuracy": 0.9162729382514954, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.0383561643835617, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.6990291262135926e-05, | |
| "loss": 0.5521, | |
| "mean_token_accuracy": 0.9219829440116882, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.0438356164383562, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.6893203883495145e-05, | |
| "loss": 0.5947, | |
| "mean_token_accuracy": 0.9174557328224182, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.0493150684931507, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.6796116504854368e-05, | |
| "loss": 0.563, | |
| "mean_token_accuracy": 0.9200533628463745, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.0547945205479452, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 1.6699029126213594e-05, | |
| "loss": 0.5265, | |
| "mean_token_accuracy": 0.9236235916614532, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.0602739726027397, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.6601941747572814e-05, | |
| "loss": 0.6031, | |
| "mean_token_accuracy": 0.9163488149642944, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.0657534246575342, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.650485436893204e-05, | |
| "loss": 0.5715, | |
| "mean_token_accuracy": 0.9195785820484161, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.0712328767123287, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.6407766990291263e-05, | |
| "loss": 0.5419, | |
| "mean_token_accuracy": 0.9279742538928986, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.0767123287671232, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.6310679611650486e-05, | |
| "loss": 0.5914, | |
| "mean_token_accuracy": 0.9168040454387665, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.0821917808219177, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.621359223300971e-05, | |
| "loss": 0.5428, | |
| "mean_token_accuracy": 0.9232941567897797, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.0876712328767124, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.6116504854368932e-05, | |
| "loss": 0.5731, | |
| "mean_token_accuracy": 0.9218332469463348, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.093150684931507, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.6019417475728155e-05, | |
| "loss": 0.5765, | |
| "mean_token_accuracy": 0.9191625118255615, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0986301369863014, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.592233009708738e-05, | |
| "loss": 0.5568, | |
| "mean_token_accuracy": 0.9245510995388031, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.104109589041096, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.58252427184466e-05, | |
| "loss": 0.623, | |
| "mean_token_accuracy": 0.9128701388835907, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.1095890410958904, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.5728155339805827e-05, | |
| "loss": 0.5732, | |
| "mean_token_accuracy": 0.9211214780807495, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.115068493150685, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.563106796116505e-05, | |
| "loss": 0.5804, | |
| "mean_token_accuracy": 0.9198083877563477, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.1205479452054794, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.553398058252427e-05, | |
| "loss": 0.5704, | |
| "mean_token_accuracy": 0.9222142696380615, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.126027397260274, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 1.5436893203883496e-05, | |
| "loss": 0.5466, | |
| "mean_token_accuracy": 0.9223719537258148, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.1315068493150684, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.533980582524272e-05, | |
| "loss": 0.5751, | |
| "mean_token_accuracy": 0.91919806599617, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.1369863013698631, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 1.5242718446601941e-05, | |
| "loss": 0.5742, | |
| "mean_token_accuracy": 0.9198531806468964, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.1424657534246576, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.5145631067961166e-05, | |
| "loss": 0.5554, | |
| "mean_token_accuracy": 0.921671062707901, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.1479452054794521, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.504854368932039e-05, | |
| "loss": 0.637, | |
| "mean_token_accuracy": 0.9104412496089935, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.1534246575342466, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.4951456310679612e-05, | |
| "loss": 0.5364, | |
| "mean_token_accuracy": 0.924155980348587, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.158904109589041, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.4854368932038835e-05, | |
| "loss": 0.5095, | |
| "mean_token_accuracy": 0.928600013256073, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.1643835616438356, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.4757281553398058e-05, | |
| "loss": 0.5027, | |
| "mean_token_accuracy": 0.9302247166633606, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.16986301369863, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.4660194174757282e-05, | |
| "loss": 0.4987, | |
| "mean_token_accuracy": 0.9295750260353088, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.1753424657534246, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.4563106796116505e-05, | |
| "loss": 0.5288, | |
| "mean_token_accuracy": 0.9235924184322357, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.180821917808219, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.4466019417475728e-05, | |
| "loss": 0.5665, | |
| "mean_token_accuracy": 0.9195916950702667, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.1863013698630138, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.4368932038834951e-05, | |
| "loss": 0.5959, | |
| "mean_token_accuracy": 0.9180994033813477, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.191780821917808, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.4271844660194176e-05, | |
| "loss": 0.5501, | |
| "mean_token_accuracy": 0.9220919609069824, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.1972602739726028, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.4174757281553399e-05, | |
| "loss": 0.6014, | |
| "mean_token_accuracy": 0.9183669984340668, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.2027397260273973, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.4077669902912621e-05, | |
| "loss": 0.5593, | |
| "mean_token_accuracy": 0.9229636490345001, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.2082191780821918, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.3980582524271846e-05, | |
| "loss": 0.5657, | |
| "mean_token_accuracy": 0.9209258854389191, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.2136986301369863, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.3883495145631069e-05, | |
| "loss": 0.5373, | |
| "mean_token_accuracy": 0.9277951121330261, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.2191780821917808, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.378640776699029e-05, | |
| "loss": 0.4988, | |
| "mean_token_accuracy": 0.9296124577522278, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.2246575342465753, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.3689320388349515e-05, | |
| "loss": 0.5201, | |
| "mean_token_accuracy": 0.9280821681022644, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.2301369863013698, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 1.3592233009708738e-05, | |
| "loss": 0.5496, | |
| "mean_token_accuracy": 0.9242799878120422, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.2356164383561643, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.349514563106796e-05, | |
| "loss": 0.5598, | |
| "mean_token_accuracy": 0.9225422143936157, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.2410958904109588, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 1.3398058252427185e-05, | |
| "loss": 0.5737, | |
| "mean_token_accuracy": 0.919199138879776, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.2465753424657535, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.3300970873786408e-05, | |
| "loss": 0.5535, | |
| "mean_token_accuracy": 0.9228192865848541, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.252054794520548, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.3203883495145631e-05, | |
| "loss": 0.5765, | |
| "mean_token_accuracy": 0.9180297553539276, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.2575342465753425, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.3106796116504854e-05, | |
| "loss": 0.537, | |
| "mean_token_accuracy": 0.9278073906898499, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.263013698630137, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.3009708737864079e-05, | |
| "loss": 0.5653, | |
| "mean_token_accuracy": 0.9221877753734589, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.2684931506849315, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.2912621359223302e-05, | |
| "loss": 0.5512, | |
| "mean_token_accuracy": 0.925201803445816, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.273972602739726, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.2815533980582524e-05, | |
| "loss": 0.5753, | |
| "mean_token_accuracy": 0.9201514422893524, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.2794520547945205, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.2718446601941749e-05, | |
| "loss": 0.6003, | |
| "mean_token_accuracy": 0.91791832447052, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.284931506849315, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.2621359223300972e-05, | |
| "loss": 0.5394, | |
| "mean_token_accuracy": 0.9252720773220062, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.2904109589041095, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.2524271844660193e-05, | |
| "loss": 0.5893, | |
| "mean_token_accuracy": 0.921189695596695, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.2958904109589042, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.2427184466019418e-05, | |
| "loss": 0.5125, | |
| "mean_token_accuracy": 0.9273927509784698, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.3013698630136985, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.233009708737864e-05, | |
| "loss": 0.5869, | |
| "mean_token_accuracy": 0.9177975952625275, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.3068493150684932, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.2233009708737864e-05, | |
| "loss": 0.5528, | |
| "mean_token_accuracy": 0.9212169349193573, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.3123287671232877, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.2135922330097088e-05, | |
| "loss": 0.5988, | |
| "mean_token_accuracy": 0.9181526899337769, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.3178082191780822, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.2038834951456311e-05, | |
| "loss": 0.5451, | |
| "mean_token_accuracy": 0.924535721540451, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.3232876712328767, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.1941747572815534e-05, | |
| "loss": 0.5542, | |
| "mean_token_accuracy": 0.9256895780563354, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.3287671232876712, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.1844660194174757e-05, | |
| "loss": 0.5968, | |
| "mean_token_accuracy": 0.9179144501686096, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.3342465753424657, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.1747572815533982e-05, | |
| "loss": 0.5547, | |
| "mean_token_accuracy": 0.9240420460700989, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.3397260273972602, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.1650485436893204e-05, | |
| "loss": 0.5952, | |
| "mean_token_accuracy": 0.9179329574108124, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.345205479452055, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.1553398058252427e-05, | |
| "loss": 0.5305, | |
| "mean_token_accuracy": 0.9257975220680237, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.3506849315068492, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.1456310679611652e-05, | |
| "loss": 0.5206, | |
| "mean_token_accuracy": 0.9270393252372742, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.356164383561644, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.1359223300970873e-05, | |
| "loss": 0.5533, | |
| "mean_token_accuracy": 0.9239944219589233, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.3616438356164384, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.1262135922330096e-05, | |
| "loss": 0.5726, | |
| "mean_token_accuracy": 0.9222332239151001, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.367123287671233, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.116504854368932e-05, | |
| "loss": 0.5758, | |
| "mean_token_accuracy": 0.9191364943981171, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3726027397260274, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.1067961165048544e-05, | |
| "loss": 0.4782, | |
| "mean_token_accuracy": 0.9341015517711639, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.378082191780822, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.0970873786407767e-05, | |
| "loss": 0.5065, | |
| "mean_token_accuracy": 0.929078996181488, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.3835616438356164, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.0873786407766991e-05, | |
| "loss": 0.5044, | |
| "mean_token_accuracy": 0.929146945476532, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.389041095890411, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.0776699029126214e-05, | |
| "loss": 0.5588, | |
| "mean_token_accuracy": 0.9236064851284027, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.3945205479452054, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.0679611650485437e-05, | |
| "loss": 0.5258, | |
| "mean_token_accuracy": 0.9257136583328247, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.058252427184466e-05, | |
| "loss": 0.5045, | |
| "mean_token_accuracy": 0.9280400276184082, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.4054794520547946, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.0485436893203885e-05, | |
| "loss": 0.4898, | |
| "mean_token_accuracy": 0.9313413798809052, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.410958904109589, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.0388349514563107e-05, | |
| "loss": 0.531, | |
| "mean_token_accuracy": 0.9248749315738678, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.4164383561643836, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.029126213592233e-05, | |
| "loss": 0.5366, | |
| "mean_token_accuracy": 0.9258707165718079, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.4219178082191781, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.0194174757281553e-05, | |
| "loss": 0.6007, | |
| "mean_token_accuracy": 0.9167526066303253, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.4273972602739726, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.0097087378640776e-05, | |
| "loss": 0.5566, | |
| "mean_token_accuracy": 0.9223202764987946, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.4328767123287671, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 9.999999999999999e-06, | |
| "loss": 0.4951, | |
| "mean_token_accuracy": 0.9306872189044952, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.4383561643835616, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 9.902912621359224e-06, | |
| "loss": 0.5958, | |
| "mean_token_accuracy": 0.9156571626663208, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.4438356164383561, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 9.805825242718447e-06, | |
| "loss": 0.5026, | |
| "mean_token_accuracy": 0.9286511540412903, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.4493150684931506, | |
| "grad_norm": 2.0, | |
| "learning_rate": 9.70873786407767e-06, | |
| "loss": 0.539, | |
| "mean_token_accuracy": 0.9245258867740631, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.4547945205479453, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 9.611650485436894e-06, | |
| "loss": 0.5329, | |
| "mean_token_accuracy": 0.9257172644138336, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.4602739726027396, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 9.514563106796117e-06, | |
| "loss": 0.5172, | |
| "mean_token_accuracy": 0.9249266386032104, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.4657534246575343, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 9.41747572815534e-06, | |
| "loss": 0.5082, | |
| "mean_token_accuracy": 0.9297462403774261, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.4712328767123288, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 9.320388349514563e-06, | |
| "loss": 0.5195, | |
| "mean_token_accuracy": 0.9255764186382294, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.4767123287671233, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 9.223300970873788e-06, | |
| "loss": 0.5049, | |
| "mean_token_accuracy": 0.9289407432079315, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4821917808219178, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 9.12621359223301e-06, | |
| "loss": 0.5602, | |
| "mean_token_accuracy": 0.9205317795276642, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.4876712328767123, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 9.029126213592233e-06, | |
| "loss": 0.5368, | |
| "mean_token_accuracy": 0.9262135624885559, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.4931506849315068, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 8.932038834951456e-06, | |
| "loss": 0.5809, | |
| "mean_token_accuracy": 0.9190745651721954, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.4986301369863013, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 8.83495145631068e-06, | |
| "loss": 0.4738, | |
| "mean_token_accuracy": 0.9319880604743958, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.504109589041096, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 8.737864077669902e-06, | |
| "loss": 0.5187, | |
| "mean_token_accuracy": 0.9270003736019135, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.5095890410958903, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 8.640776699029127e-06, | |
| "loss": 0.5182, | |
| "mean_token_accuracy": 0.9283578097820282, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.515068493150685, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 8.54368932038835e-06, | |
| "loss": 0.5368, | |
| "mean_token_accuracy": 0.9256248772144318, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.5205479452054793, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 8.446601941747573e-06, | |
| "loss": 0.5506, | |
| "mean_token_accuracy": 0.9241002202033997, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.526027397260274, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 8.349514563106797e-06, | |
| "loss": 0.5656, | |
| "mean_token_accuracy": 0.9233757555484772, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.5315068493150685, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 8.25242718446602e-06, | |
| "loss": 0.5238, | |
| "mean_token_accuracy": 0.9281691014766693, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.536986301369863, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8.155339805825243e-06, | |
| "loss": 0.4985, | |
| "mean_token_accuracy": 0.9294464886188507, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.5424657534246575, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 8.058252427184466e-06, | |
| "loss": 0.562, | |
| "mean_token_accuracy": 0.9225472807884216, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.547945205479452, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 7.96116504854369e-06, | |
| "loss": 0.4537, | |
| "mean_token_accuracy": 0.9356608986854553, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.5534246575342465, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 7.864077669902913e-06, | |
| "loss": 0.5397, | |
| "mean_token_accuracy": 0.9253256320953369, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.558904109589041, | |
| "grad_norm": 1.875, | |
| "learning_rate": 7.766990291262135e-06, | |
| "loss": 0.5313, | |
| "mean_token_accuracy": 0.9250599443912506, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.5643835616438357, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 7.66990291262136e-06, | |
| "loss": 0.5112, | |
| "mean_token_accuracy": 0.92934450507164, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.56986301369863, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 7.572815533980583e-06, | |
| "loss": 0.5062, | |
| "mean_token_accuracy": 0.928922027349472, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.5753424657534247, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 7.475728155339806e-06, | |
| "loss": 0.4996, | |
| "mean_token_accuracy": 0.9288285672664642, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.580821917808219, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 7.378640776699029e-06, | |
| "loss": 0.5267, | |
| "mean_token_accuracy": 0.9273545145988464, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.5863013698630137, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 7.281553398058253e-06, | |
| "loss": 0.5721, | |
| "mean_token_accuracy": 0.9204368591308594, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5917808219178082, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 7.1844660194174755e-06, | |
| "loss": 0.5647, | |
| "mean_token_accuracy": 0.9210836887359619, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.5972602739726027, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 7.087378640776699e-06, | |
| "loss": 0.5867, | |
| "mean_token_accuracy": 0.9216422140598297, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.6027397260273972, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 6.990291262135923e-06, | |
| "loss": 0.5114, | |
| "mean_token_accuracy": 0.9296845495700836, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.6082191780821917, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 6.893203883495145e-06, | |
| "loss": 0.5886, | |
| "mean_token_accuracy": 0.9185077250003815, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.6136986301369864, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 6.796116504854369e-06, | |
| "loss": 0.5852, | |
| "mean_token_accuracy": 0.9188163578510284, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.6191780821917807, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.699029126213593e-06, | |
| "loss": 0.5313, | |
| "mean_token_accuracy": 0.9268418252468109, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.6246575342465754, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 6.6019417475728155e-06, | |
| "loss": 0.504, | |
| "mean_token_accuracy": 0.9275769889354706, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.6301369863013697, | |
| "grad_norm": 1.875, | |
| "learning_rate": 6.504854368932039e-06, | |
| "loss": 0.4644, | |
| "mean_token_accuracy": 0.9344028532505035, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.6356164383561644, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 6.407766990291262e-06, | |
| "loss": 0.5318, | |
| "mean_token_accuracy": 0.9281158149242401, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.641095890410959, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 6.310679611650486e-06, | |
| "loss": 0.5019, | |
| "mean_token_accuracy": 0.9296712577342987, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.6465753424657534, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 6.213592233009709e-06, | |
| "loss": 0.5682, | |
| "mean_token_accuracy": 0.9229175746440887, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.652054794520548, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 6.116504854368932e-06, | |
| "loss": 0.5365, | |
| "mean_token_accuracy": 0.9239448308944702, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.6575342465753424, | |
| "grad_norm": 1.875, | |
| "learning_rate": 6.0194174757281556e-06, | |
| "loss": 0.5741, | |
| "mean_token_accuracy": 0.9217245876789093, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.6630136986301371, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 5.9223300970873785e-06, | |
| "loss": 0.5606, | |
| "mean_token_accuracy": 0.9235257804393768, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.6684931506849314, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 5.825242718446602e-06, | |
| "loss": 0.4991, | |
| "mean_token_accuracy": 0.9290522634983063, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.6739726027397261, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 5.728155339805826e-06, | |
| "loss": 0.5246, | |
| "mean_token_accuracy": 0.9278529584407806, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.6794520547945204, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 5.631067961165048e-06, | |
| "loss": 0.5088, | |
| "mean_token_accuracy": 0.9272408485412598, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.6849315068493151, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 5.533980582524272e-06, | |
| "loss": 0.5482, | |
| "mean_token_accuracy": 0.923981636762619, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.6904109589041096, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 5.436893203883496e-06, | |
| "loss": 0.5489, | |
| "mean_token_accuracy": 0.9235535562038422, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.6958904109589041, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 5.3398058252427185e-06, | |
| "loss": 0.5515, | |
| "mean_token_accuracy": 0.924066424369812, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.7013698630136986, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 5.242718446601942e-06, | |
| "loss": 0.5299, | |
| "mean_token_accuracy": 0.9252340495586395, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.7068493150684931, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 5.145631067961165e-06, | |
| "loss": 0.5121, | |
| "mean_token_accuracy": 0.9283900856971741, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.7123287671232876, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 5.048543689320388e-06, | |
| "loss": 0.4787, | |
| "mean_token_accuracy": 0.9316571056842804, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.7178082191780821, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.951456310679612e-06, | |
| "loss": 0.5452, | |
| "mean_token_accuracy": 0.9234668910503387, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.7232876712328768, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 4.854368932038835e-06, | |
| "loss": 0.5079, | |
| "mean_token_accuracy": 0.9304155111312866, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.7287671232876711, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.7572815533980585e-06, | |
| "loss": 0.5, | |
| "mean_token_accuracy": 0.9282440841197968, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.7342465753424658, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.6601941747572815e-06, | |
| "loss": 0.5233, | |
| "mean_token_accuracy": 0.9292930364608765, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.7397260273972601, | |
| "grad_norm": 1.75, | |
| "learning_rate": 4.563106796116505e-06, | |
| "loss": 0.4966, | |
| "mean_token_accuracy": 0.9306870996952057, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.7452054794520548, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 4.466019417475728e-06, | |
| "loss": 0.5406, | |
| "mean_token_accuracy": 0.92387256026268, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.7506849315068493, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 4.368932038834951e-06, | |
| "loss": 0.471, | |
| "mean_token_accuracy": 0.9328140020370483, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.7561643835616438, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 4.271844660194175e-06, | |
| "loss": 0.54, | |
| "mean_token_accuracy": 0.9251973628997803, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.7616438356164383, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 4.1747572815533986e-06, | |
| "loss": 0.4637, | |
| "mean_token_accuracy": 0.9338904619216919, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.7671232876712328, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 4.0776699029126215e-06, | |
| "loss": 0.6104, | |
| "mean_token_accuracy": 0.91615429520607, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.7726027397260276, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 3.980582524271845e-06, | |
| "loss": 0.5368, | |
| "mean_token_accuracy": 0.9243874251842499, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.7780821917808218, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 3.883495145631067e-06, | |
| "loss": 0.5259, | |
| "mean_token_accuracy": 0.9274910092353821, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.7835616438356166, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 3.7864077669902915e-06, | |
| "loss": 0.5615, | |
| "mean_token_accuracy": 0.9228672385215759, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.7890410958904108, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 3.6893203883495144e-06, | |
| "loss": 0.5299, | |
| "mean_token_accuracy": 0.9265862703323364, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.7945205479452055, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 3.5922330097087378e-06, | |
| "loss": 0.5325, | |
| "mean_token_accuracy": 0.9260352849960327, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 3.4951456310679615e-06, | |
| "loss": 0.4843, | |
| "mean_token_accuracy": 0.9326441884040833, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.8054794520547945, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 3.3980582524271844e-06, | |
| "loss": 0.4794, | |
| "mean_token_accuracy": 0.9303202331066132, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.810958904109589, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 3.3009708737864078e-06, | |
| "loss": 0.553, | |
| "mean_token_accuracy": 0.9218713343143463, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.8164383561643835, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 3.203883495145631e-06, | |
| "loss": 0.5198, | |
| "mean_token_accuracy": 0.9283939898014069, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.821917808219178, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 3.1067961165048544e-06, | |
| "loss": 0.499, | |
| "mean_token_accuracy": 0.9306790828704834, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.8273972602739725, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 3.0097087378640778e-06, | |
| "loss": 0.5532, | |
| "mean_token_accuracy": 0.9235433638095856, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.8328767123287673, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 2.912621359223301e-06, | |
| "loss": 0.4962, | |
| "mean_token_accuracy": 0.9297014772891998, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.8383561643835615, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 2.815533980582524e-06, | |
| "loss": 0.5164, | |
| "mean_token_accuracy": 0.9275369644165039, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.8438356164383563, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 2.718446601941748e-06, | |
| "loss": 0.5393, | |
| "mean_token_accuracy": 0.922819197177887, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.8493150684931505, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 2.621359223300971e-06, | |
| "loss": 0.5587, | |
| "mean_token_accuracy": 0.9225592315196991, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.8547945205479452, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.524271844660194e-06, | |
| "loss": 0.5852, | |
| "mean_token_accuracy": 0.9194580018520355, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.8602739726027397, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 2.4271844660194174e-06, | |
| "loss": 0.5182, | |
| "mean_token_accuracy": 0.928735226392746, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.8657534246575342, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.3300970873786407e-06, | |
| "loss": 0.5115, | |
| "mean_token_accuracy": 0.9302666187286377, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.8712328767123287, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 2.233009708737864e-06, | |
| "loss": 0.5361, | |
| "mean_token_accuracy": 0.9279023706912994, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.8767123287671232, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 2.1359223300970874e-06, | |
| "loss": 0.5928, | |
| "mean_token_accuracy": 0.921282947063446, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.882191780821918, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 2.0388349514563107e-06, | |
| "loss": 0.537, | |
| "mean_token_accuracy": 0.9240512549877167, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.8876712328767122, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.9417475728155337e-06, | |
| "loss": 0.4962, | |
| "mean_token_accuracy": 0.9302894473075867, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.893150684931507, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.8446601941747572e-06, | |
| "loss": 0.5294, | |
| "mean_token_accuracy": 0.9255349636077881, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.8986301369863012, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.7475728155339808e-06, | |
| "loss": 0.5209, | |
| "mean_token_accuracy": 0.9264850616455078, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.904109589041096, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.6504854368932039e-06, | |
| "loss": 0.5459, | |
| "mean_token_accuracy": 0.9249573945999146, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.9095890410958904, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.5533980582524272e-06, | |
| "loss": 0.4897, | |
| "mean_token_accuracy": 0.9331128895282745, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.915068493150685, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.4563106796116506e-06, | |
| "loss": 0.5147, | |
| "mean_token_accuracy": 0.9279215335845947, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.9205479452054794, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.359223300970874e-06, | |
| "loss": 0.4624, | |
| "mean_token_accuracy": 0.9308468103408813, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.926027397260274, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.262135922330097e-06, | |
| "loss": 0.5045, | |
| "mean_token_accuracy": 0.9297245442867279, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.9315068493150684, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.1650485436893204e-06, | |
| "loss": 0.4788, | |
| "mean_token_accuracy": 0.932204008102417, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.936986301369863, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.0679611650485437e-06, | |
| "loss": 0.5322, | |
| "mean_token_accuracy": 0.9246480166912079, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.9424657534246577, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 9.708737864077668e-07, | |
| "loss": 0.5149, | |
| "mean_token_accuracy": 0.9272936284542084, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.947945205479452, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 8.737864077669904e-07, | |
| "loss": 0.5474, | |
| "mean_token_accuracy": 0.9222309589385986, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.9534246575342467, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 7.766990291262136e-07, | |
| "loss": 0.4871, | |
| "mean_token_accuracy": 0.9330261945724487, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.958904109589041, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 6.79611650485437e-07, | |
| "loss": 0.5604, | |
| "mean_token_accuracy": 0.9217767119407654, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.9643835616438357, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 5.825242718446602e-07, | |
| "loss": 0.5734, | |
| "mean_token_accuracy": 0.9202806055545807, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.9698630136986301, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.854368932038834e-07, | |
| "loss": 0.537, | |
| "mean_token_accuracy": 0.9262631833553314, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.9753424657534246, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 3.883495145631068e-07, | |
| "loss": 0.5366, | |
| "mean_token_accuracy": 0.9244089424610138, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.9808219178082191, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 2.912621359223301e-07, | |
| "loss": 0.5556, | |
| "mean_token_accuracy": 0.9226070046424866, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.9863013698630136, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.941747572815534e-07, | |
| "loss": 0.505, | |
| "mean_token_accuracy": 0.9307898581027985, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.9917808219178084, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 9.70873786407767e-08, | |
| "loss": 0.5307, | |
| "mean_token_accuracy": 0.9279144406318665, | |
| "step": 364 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 364, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.568355335831552e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |