| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.9917808219178084, | |
| "eval_steps": 100000.0, | |
| "global_step": 364, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005479452054794521, | |
| "grad_norm": 432.0, | |
| "learning_rate": 0.0, | |
| "loss": 5.7373, | |
| "mean_token_accuracy": 0.6561740338802338, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.010958904109589041, | |
| "grad_norm": 334.0, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 5.8256, | |
| "mean_token_accuracy": 0.6489620804786682, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.01643835616438356, | |
| "grad_norm": 181.0, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 5.088, | |
| "mean_token_accuracy": 0.6650368273258209, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.021917808219178082, | |
| "grad_norm": 75.5, | |
| "learning_rate": 6e-06, | |
| "loss": 3.994, | |
| "mean_token_accuracy": 0.6906364560127258, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.0273972602739726, | |
| "grad_norm": 44.75, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 3.4044, | |
| "mean_token_accuracy": 0.7045449316501617, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.03287671232876712, | |
| "grad_norm": 28.125, | |
| "learning_rate": 1e-05, | |
| "loss": 2.8925, | |
| "mean_token_accuracy": 0.7223854660987854, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.038356164383561646, | |
| "grad_norm": 20.0, | |
| "learning_rate": 1.2e-05, | |
| "loss": 2.3515, | |
| "mean_token_accuracy": 0.7618080377578735, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.043835616438356165, | |
| "grad_norm": 19.0, | |
| "learning_rate": 1.4e-05, | |
| "loss": 2.0269, | |
| "mean_token_accuracy": 0.7856993675231934, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.049315068493150684, | |
| "grad_norm": 12.8125, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 1.7614, | |
| "mean_token_accuracy": 0.8198637962341309, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.0547945205479452, | |
| "grad_norm": 15.875, | |
| "learning_rate": 1.8e-05, | |
| "loss": 1.4779, | |
| "mean_token_accuracy": 0.8413160443305969, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06027397260273973, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 2e-05, | |
| "loss": 1.3764, | |
| "mean_token_accuracy": 0.8465317487716675, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.06575342465753424, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 1.9943502824858758e-05, | |
| "loss": 1.2839, | |
| "mean_token_accuracy": 0.8562129735946655, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.07123287671232877, | |
| "grad_norm": 10.6875, | |
| "learning_rate": 1.9887005649717518e-05, | |
| "loss": 1.186, | |
| "mean_token_accuracy": 0.8632737696170807, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.07671232876712329, | |
| "grad_norm": 6.46875, | |
| "learning_rate": 1.9830508474576275e-05, | |
| "loss": 1.2342, | |
| "mean_token_accuracy": 0.8569013476371765, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.0821917808219178, | |
| "grad_norm": 3.671875, | |
| "learning_rate": 1.977401129943503e-05, | |
| "loss": 1.1773, | |
| "mean_token_accuracy": 0.861639678478241, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.08767123287671233, | |
| "grad_norm": 4.1875, | |
| "learning_rate": 1.9717514124293785e-05, | |
| "loss": 1.0634, | |
| "mean_token_accuracy": 0.8734889626502991, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.09315068493150686, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 1.9661016949152545e-05, | |
| "loss": 1.1534, | |
| "mean_token_accuracy": 0.8641785085201263, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.09863013698630137, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 1.96045197740113e-05, | |
| "loss": 1.1272, | |
| "mean_token_accuracy": 0.8654375374317169, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.10410958904109589, | |
| "grad_norm": 2.375, | |
| "learning_rate": 1.9548022598870058e-05, | |
| "loss": 1.0411, | |
| "mean_token_accuracy": 0.8728566467761993, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.1095890410958904, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 1.9491525423728814e-05, | |
| "loss": 1.0268, | |
| "mean_token_accuracy": 0.8728775084018707, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11506849315068493, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 1.9435028248587574e-05, | |
| "loss": 1.0674, | |
| "mean_token_accuracy": 0.8690488934516907, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.12054794520547946, | |
| "grad_norm": 2.59375, | |
| "learning_rate": 1.937853107344633e-05, | |
| "loss": 0.9995, | |
| "mean_token_accuracy": 0.8791483938694, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.12602739726027398, | |
| "grad_norm": 2.328125, | |
| "learning_rate": 1.9322033898305087e-05, | |
| "loss": 1.0724, | |
| "mean_token_accuracy": 0.8647687137126923, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.13150684931506848, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 1.9265536723163844e-05, | |
| "loss": 1.0266, | |
| "mean_token_accuracy": 0.8738211989402771, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.136986301369863, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.92090395480226e-05, | |
| "loss": 1.0761, | |
| "mean_token_accuracy": 0.8665441274642944, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.14246575342465753, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 1.9152542372881357e-05, | |
| "loss": 0.9683, | |
| "mean_token_accuracy": 0.8777200281620026, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.14794520547945206, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.9096045197740114e-05, | |
| "loss": 0.9868, | |
| "mean_token_accuracy": 0.8754189312458038, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.15342465753424658, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 1.9039548022598874e-05, | |
| "loss": 1.0158, | |
| "mean_token_accuracy": 0.8761765658855438, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.1589041095890411, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.898305084745763e-05, | |
| "loss": 0.8789, | |
| "mean_token_accuracy": 0.8906848430633545, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.1643835616438356, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.8926553672316387e-05, | |
| "loss": 0.9747, | |
| "mean_token_accuracy": 0.8772170841693878, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.16986301369863013, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.8870056497175144e-05, | |
| "loss": 0.9559, | |
| "mean_token_accuracy": 0.879201203584671, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.17534246575342466, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.88135593220339e-05, | |
| "loss": 0.9505, | |
| "mean_token_accuracy": 0.8790359497070312, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.18082191780821918, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.8757062146892657e-05, | |
| "loss": 0.9744, | |
| "mean_token_accuracy": 0.8767738342285156, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.1863013698630137, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.8700564971751413e-05, | |
| "loss": 0.9881, | |
| "mean_token_accuracy": 0.8765853643417358, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.1917808219178082, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.864406779661017e-05, | |
| "loss": 0.9423, | |
| "mean_token_accuracy": 0.8802583813667297, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.19726027397260273, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.858757062146893e-05, | |
| "loss": 0.9277, | |
| "mean_token_accuracy": 0.8835765421390533, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.20273972602739726, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.8531073446327686e-05, | |
| "loss": 0.9619, | |
| "mean_token_accuracy": 0.8785396814346313, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.20821917808219179, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.8474576271186443e-05, | |
| "loss": 0.9415, | |
| "mean_token_accuracy": 0.8792627155780792, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.2136986301369863, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 1.84180790960452e-05, | |
| "loss": 1.1318, | |
| "mean_token_accuracy": 0.8636864423751831, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.2191780821917808, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.8361581920903956e-05, | |
| "loss": 0.9542, | |
| "mean_token_accuracy": 0.8771731555461884, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.22465753424657534, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.8305084745762713e-05, | |
| "loss": 0.9122, | |
| "mean_token_accuracy": 0.881610095500946, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.23013698630136986, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.824858757062147e-05, | |
| "loss": 0.8768, | |
| "mean_token_accuracy": 0.8854578733444214, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.2356164383561644, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.8192090395480226e-05, | |
| "loss": 0.8985, | |
| "mean_token_accuracy": 0.8832479119300842, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.2410958904109589, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.8135593220338986e-05, | |
| "loss": 0.9174, | |
| "mean_token_accuracy": 0.88409024477005, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.2465753424657534, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.8079096045197743e-05, | |
| "loss": 0.9281, | |
| "mean_token_accuracy": 0.8829980492591858, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.25205479452054796, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.80225988700565e-05, | |
| "loss": 0.9484, | |
| "mean_token_accuracy": 0.8815726637840271, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.25753424657534246, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.7966101694915256e-05, | |
| "loss": 0.9032, | |
| "mean_token_accuracy": 0.8823907375335693, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.26301369863013696, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.7909604519774012e-05, | |
| "loss": 0.954, | |
| "mean_token_accuracy": 0.8775873780250549, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.2684931506849315, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.785310734463277e-05, | |
| "loss": 0.8923, | |
| "mean_token_accuracy": 0.8820536136627197, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.273972602739726, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.7796610169491526e-05, | |
| "loss": 0.9235, | |
| "mean_token_accuracy": 0.8839817941188812, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.27945205479452057, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.7740112994350286e-05, | |
| "loss": 0.9095, | |
| "mean_token_accuracy": 0.8809832036495209, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.28493150684931506, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.7683615819209042e-05, | |
| "loss": 0.8763, | |
| "mean_token_accuracy": 0.8849304616451263, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.29041095890410956, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.76271186440678e-05, | |
| "loss": 0.9113, | |
| "mean_token_accuracy": 0.8810350298881531, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.2958904109589041, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.7570621468926555e-05, | |
| "loss": 0.9356, | |
| "mean_token_accuracy": 0.8820917904376984, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.3013698630136986, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.7514124293785312e-05, | |
| "loss": 0.9315, | |
| "mean_token_accuracy": 0.8792105317115784, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.30684931506849317, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.745762711864407e-05, | |
| "loss": 0.9129, | |
| "mean_token_accuracy": 0.884558379650116, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.31232876712328766, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.7401129943502825e-05, | |
| "loss": 0.9175, | |
| "mean_token_accuracy": 0.8841174840927124, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.3178082191780822, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.734463276836158e-05, | |
| "loss": 0.9441, | |
| "mean_token_accuracy": 0.8799735009670258, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.3232876712328767, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.728813559322034e-05, | |
| "loss": 0.8381, | |
| "mean_token_accuracy": 0.8915592730045319, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.3287671232876712, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.7231638418079098e-05, | |
| "loss": 0.878, | |
| "mean_token_accuracy": 0.8879745006561279, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.33424657534246577, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.7175141242937855e-05, | |
| "loss": 0.894, | |
| "mean_token_accuracy": 0.8848598599433899, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.33972602739726027, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.711864406779661e-05, | |
| "loss": 0.9141, | |
| "mean_token_accuracy": 0.882304459810257, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.3452054794520548, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.7062146892655368e-05, | |
| "loss": 0.9312, | |
| "mean_token_accuracy": 0.8793206214904785, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.3506849315068493, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.7005649717514125e-05, | |
| "loss": 0.8844, | |
| "mean_token_accuracy": 0.8880196511745453, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.3561643835616438, | |
| "grad_norm": 2.125, | |
| "learning_rate": 1.694915254237288e-05, | |
| "loss": 0.7884, | |
| "mean_token_accuracy": 0.9000090658664703, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.36164383561643837, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.689265536723164e-05, | |
| "loss": 0.856, | |
| "mean_token_accuracy": 0.8874339759349823, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.36712328767123287, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.6836158192090398e-05, | |
| "loss": 0.9166, | |
| "mean_token_accuracy": 0.8831603229045868, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.3726027397260274, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.6779661016949154e-05, | |
| "loss": 0.9728, | |
| "mean_token_accuracy": 0.8779590725898743, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.3780821917808219, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.672316384180791e-05, | |
| "loss": 0.9378, | |
| "mean_token_accuracy": 0.8816089332103729, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.3835616438356164, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.6666666666666667e-05, | |
| "loss": 0.9222, | |
| "mean_token_accuracy": 0.8854541778564453, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.38904109589041097, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.6610169491525424e-05, | |
| "loss": 0.9116, | |
| "mean_token_accuracy": 0.8790720105171204, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.39452054794520547, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.655367231638418e-05, | |
| "loss": 0.8784, | |
| "mean_token_accuracy": 0.8863929808139801, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.6497175141242937e-05, | |
| "loss": 0.8976, | |
| "mean_token_accuracy": 0.8833265900611877, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.4054794520547945, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.6440677966101697e-05, | |
| "loss": 0.8872, | |
| "mean_token_accuracy": 0.8866287767887115, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.410958904109589, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.6384180790960454e-05, | |
| "loss": 0.9294, | |
| "mean_token_accuracy": 0.8842478394508362, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.41643835616438357, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.632768361581921e-05, | |
| "loss": 0.8754, | |
| "mean_token_accuracy": 0.887272983789444, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.42191780821917807, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.6271186440677967e-05, | |
| "loss": 0.8779, | |
| "mean_token_accuracy": 0.8887339234352112, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.4273972602739726, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.6214689265536724e-05, | |
| "loss": 0.8863, | |
| "mean_token_accuracy": 0.8841427862644196, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.4328767123287671, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 1.615819209039548e-05, | |
| "loss": 0.8632, | |
| "mean_token_accuracy": 0.8868177235126495, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.4383561643835616, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 1.6101694915254237e-05, | |
| "loss": 0.9242, | |
| "mean_token_accuracy": 0.8839016258716583, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.4438356164383562, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.6045197740112997e-05, | |
| "loss": 0.908, | |
| "mean_token_accuracy": 0.8833514153957367, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.44931506849315067, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.5988700564971753e-05, | |
| "loss": 0.8747, | |
| "mean_token_accuracy": 0.8873944878578186, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.4547945205479452, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.593220338983051e-05, | |
| "loss": 0.9303, | |
| "mean_token_accuracy": 0.8823438286781311, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.4602739726027397, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.5875706214689266e-05, | |
| "loss": 0.8441, | |
| "mean_token_accuracy": 0.888810396194458, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.4657534246575342, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.5819209039548023e-05, | |
| "loss": 0.8512, | |
| "mean_token_accuracy": 0.8866813480854034, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.4712328767123288, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.576271186440678e-05, | |
| "loss": 0.8743, | |
| "mean_token_accuracy": 0.8884658217430115, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.4767123287671233, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.5706214689265536e-05, | |
| "loss": 0.8523, | |
| "mean_token_accuracy": 0.8898887932300568, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.4821917808219178, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.5649717514124293e-05, | |
| "loss": 0.8947, | |
| "mean_token_accuracy": 0.8856352865695953, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.4876712328767123, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.5593220338983053e-05, | |
| "loss": 0.8895, | |
| "mean_token_accuracy": 0.8845047950744629, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.4931506849315068, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.553672316384181e-05, | |
| "loss": 0.8215, | |
| "mean_token_accuracy": 0.8922451138496399, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.4986301369863014, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.5480225988700566e-05, | |
| "loss": 0.8677, | |
| "mean_token_accuracy": 0.8868069648742676, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.5041095890410959, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.5423728813559326e-05, | |
| "loss": 0.8843, | |
| "mean_token_accuracy": 0.8863621056079865, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.5095890410958904, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.536723163841808e-05, | |
| "loss": 0.8593, | |
| "mean_token_accuracy": 0.8890643417835236, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.5150684931506849, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.5310734463276836e-05, | |
| "loss": 0.8961, | |
| "mean_token_accuracy": 0.885326474905014, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.5205479452054794, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.5254237288135594e-05, | |
| "loss": 0.8115, | |
| "mean_token_accuracy": 0.8935891091823578, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.5260273972602739, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.5197740112994352e-05, | |
| "loss": 0.8157, | |
| "mean_token_accuracy": 0.8906770646572113, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.5315068493150685, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.5141242937853109e-05, | |
| "loss": 0.9447, | |
| "mean_token_accuracy": 0.8807980120182037, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.536986301369863, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 1.5084745762711865e-05, | |
| "loss": 0.8652, | |
| "mean_token_accuracy": 0.886509358882904, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.5424657534246575, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.5028248587570622e-05, | |
| "loss": 0.8307, | |
| "mean_token_accuracy": 0.8903360962867737, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.547945205479452, | |
| "grad_norm": 6.25, | |
| "learning_rate": 1.497175141242938e-05, | |
| "loss": 0.918, | |
| "mean_token_accuracy": 0.8805244266986847, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5534246575342465, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.4915254237288137e-05, | |
| "loss": 0.8078, | |
| "mean_token_accuracy": 0.8928936421871185, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.5589041095890411, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.4858757062146894e-05, | |
| "loss": 0.8592, | |
| "mean_token_accuracy": 0.889348953962326, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.5643835616438356, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.480225988700565e-05, | |
| "loss": 0.7756, | |
| "mean_token_accuracy": 0.8963395059108734, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.5698630136986301, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.4745762711864408e-05, | |
| "loss": 0.8789, | |
| "mean_token_accuracy": 0.886920839548111, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.5753424657534246, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.4689265536723165e-05, | |
| "loss": 0.8967, | |
| "mean_token_accuracy": 0.8798324763774872, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.5808219178082191, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.4632768361581922e-05, | |
| "loss": 0.843, | |
| "mean_token_accuracy": 0.8907739818096161, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.5863013698630137, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.4576271186440678e-05, | |
| "loss": 0.8541, | |
| "mean_token_accuracy": 0.8892745971679688, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.5917808219178082, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.4519774011299436e-05, | |
| "loss": 0.8462, | |
| "mean_token_accuracy": 0.8868447542190552, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.5972602739726027, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 1.4463276836158193e-05, | |
| "loss": 0.8057, | |
| "mean_token_accuracy": 0.8942738175392151, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.6027397260273972, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.440677966101695e-05, | |
| "loss": 0.9267, | |
| "mean_token_accuracy": 0.879928857088089, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6082191780821918, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.4350282485875708e-05, | |
| "loss": 0.8554, | |
| "mean_token_accuracy": 0.8891916871070862, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.6136986301369863, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.4293785310734465e-05, | |
| "loss": 0.8786, | |
| "mean_token_accuracy": 0.8868878483772278, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.6191780821917808, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.4237288135593221e-05, | |
| "loss": 0.8238, | |
| "mean_token_accuracy": 0.8923602402210236, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.6246575342465753, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.4180790960451978e-05, | |
| "loss": 0.7824, | |
| "mean_token_accuracy": 0.898999810218811, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.6301369863013698, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 1.4124293785310736e-05, | |
| "loss": 0.8753, | |
| "mean_token_accuracy": 0.8899329602718353, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.6356164383561644, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 1.4067796610169493e-05, | |
| "loss": 0.8015, | |
| "mean_token_accuracy": 0.8944090604782104, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.6410958904109589, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.4011299435028249e-05, | |
| "loss": 0.8078, | |
| "mean_token_accuracy": 0.8907613754272461, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.6465753424657534, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 1.3954802259887006e-05, | |
| "loss": 0.8241, | |
| "mean_token_accuracy": 0.8921016752719879, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.6520547945205479, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.3898305084745764e-05, | |
| "loss": 0.8554, | |
| "mean_token_accuracy": 0.8893947899341583, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.6575342465753424, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.384180790960452e-05, | |
| "loss": 0.7452, | |
| "mean_token_accuracy": 0.9025129973888397, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.663013698630137, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.3785310734463277e-05, | |
| "loss": 0.8917, | |
| "mean_token_accuracy": 0.8841381669044495, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.6684931506849315, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.3728813559322034e-05, | |
| "loss": 0.7997, | |
| "mean_token_accuracy": 0.8949976563453674, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.673972602739726, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 1.3672316384180792e-05, | |
| "loss": 0.77, | |
| "mean_token_accuracy": 0.8981111347675323, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.6794520547945205, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.3615819209039549e-05, | |
| "loss": 0.8735, | |
| "mean_token_accuracy": 0.88605797290802, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.684931506849315, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.3559322033898305e-05, | |
| "loss": 0.8578, | |
| "mean_token_accuracy": 0.8875480592250824, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.6904109589041096, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.3502824858757064e-05, | |
| "loss": 0.842, | |
| "mean_token_accuracy": 0.8861750662326813, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.6958904109589041, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.344632768361582e-05, | |
| "loss": 0.7661, | |
| "mean_token_accuracy": 0.8968884646892548, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.7013698630136986, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.3389830508474577e-05, | |
| "loss": 0.8218, | |
| "mean_token_accuracy": 0.89415243268013, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.7068493150684931, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 1.3333333333333333e-05, | |
| "loss": 0.84, | |
| "mean_token_accuracy": 0.8905225694179535, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.7123287671232876, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.3276836158192092e-05, | |
| "loss": 0.7886, | |
| "mean_token_accuracy": 0.8948017656803131, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7178082191780822, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.3220338983050848e-05, | |
| "loss": 0.7962, | |
| "mean_token_accuracy": 0.8944378197193146, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.7232876712328767, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.3163841807909605e-05, | |
| "loss": 0.7744, | |
| "mean_token_accuracy": 0.8996008336544037, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.7287671232876712, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.3107344632768361e-05, | |
| "loss": 0.8511, | |
| "mean_token_accuracy": 0.8891786336898804, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.7342465753424657, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.305084745762712e-05, | |
| "loss": 0.7581, | |
| "mean_token_accuracy": 0.8997050821781158, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.7397260273972602, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.2994350282485876e-05, | |
| "loss": 0.8179, | |
| "mean_token_accuracy": 0.889685183763504, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.7452054794520548, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.2937853107344633e-05, | |
| "loss": 0.7822, | |
| "mean_token_accuracy": 0.8955155909061432, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.7506849315068493, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.288135593220339e-05, | |
| "loss": 0.8004, | |
| "mean_token_accuracy": 0.8967688679695129, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.7561643835616438, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 1.282485875706215e-05, | |
| "loss": 0.7105, | |
| "mean_token_accuracy": 0.903778463602066, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.7616438356164383, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.2768361581920904e-05, | |
| "loss": 0.8719, | |
| "mean_token_accuracy": 0.8871429562568665, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.7671232876712328, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 1.2711864406779661e-05, | |
| "loss": 0.8294, | |
| "mean_token_accuracy": 0.8909505307674408, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7726027397260274, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.265536723163842e-05, | |
| "loss": 0.8538, | |
| "mean_token_accuracy": 0.8904447853565216, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.7780821917808219, | |
| "grad_norm": 1.75, | |
| "learning_rate": 1.2598870056497177e-05, | |
| "loss": 0.7883, | |
| "mean_token_accuracy": 0.8976991772651672, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.7835616438356164, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2542372881355932e-05, | |
| "loss": 0.8273, | |
| "mean_token_accuracy": 0.8918053209781647, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.7890410958904109, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.2485875706214689e-05, | |
| "loss": 0.8064, | |
| "mean_token_accuracy": 0.8940227329730988, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.7945205479452054, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.2429378531073449e-05, | |
| "loss": 0.8038, | |
| "mean_token_accuracy": 0.893746942281723, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.2372881355932205e-05, | |
| "loss": 0.7326, | |
| "mean_token_accuracy": 0.9016251862049103, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.8054794520547945, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.2316384180790962e-05, | |
| "loss": 0.8565, | |
| "mean_token_accuracy": 0.8860533237457275, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.810958904109589, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.2259887005649717e-05, | |
| "loss": 0.814, | |
| "mean_token_accuracy": 0.8916102349758148, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.8164383561643835, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 1.2203389830508477e-05, | |
| "loss": 0.826, | |
| "mean_token_accuracy": 0.8912359774112701, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.821917808219178, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.2146892655367234e-05, | |
| "loss": 0.7332, | |
| "mean_token_accuracy": 0.901479035615921, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8273972602739726, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.209039548022599e-05, | |
| "loss": 0.8137, | |
| "mean_token_accuracy": 0.8948657810688019, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.8328767123287671, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 1.2033898305084745e-05, | |
| "loss": 0.7541, | |
| "mean_token_accuracy": 0.8997522294521332, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.8383561643835616, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.1977401129943505e-05, | |
| "loss": 0.7506, | |
| "mean_token_accuracy": 0.9004031419754028, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.8438356164383561, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.1920903954802262e-05, | |
| "loss": 0.8034, | |
| "mean_token_accuracy": 0.8948807418346405, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.8493150684931506, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.1864406779661018e-05, | |
| "loss": 0.8141, | |
| "mean_token_accuracy": 0.8910720944404602, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.8547945205479452, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 1.1807909604519776e-05, | |
| "loss": 0.8654, | |
| "mean_token_accuracy": 0.8876812160015106, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.8602739726027397, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 1.1751412429378533e-05, | |
| "loss": 0.7872, | |
| "mean_token_accuracy": 0.8944331705570221, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.8657534246575342, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 1.169491525423729e-05, | |
| "loss": 0.8064, | |
| "mean_token_accuracy": 0.8923972845077515, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.8712328767123287, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.1638418079096046e-05, | |
| "loss": 0.8012, | |
| "mean_token_accuracy": 0.8913676142692566, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.8767123287671232, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 1.1581920903954804e-05, | |
| "loss": 0.7828, | |
| "mean_token_accuracy": 0.8984484672546387, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8821917808219178, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.1525423728813561e-05, | |
| "loss": 0.7947, | |
| "mean_token_accuracy": 0.8937750458717346, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.8876712328767123, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.1468926553672318e-05, | |
| "loss": 0.8615, | |
| "mean_token_accuracy": 0.8844136893749237, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.8931506849315068, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.1412429378531074e-05, | |
| "loss": 0.7851, | |
| "mean_token_accuracy": 0.8954149484634399, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.8986301369863013, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.1355932203389833e-05, | |
| "loss": 0.8646, | |
| "mean_token_accuracy": 0.8902953565120697, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.9041095890410958, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 1.1299435028248589e-05, | |
| "loss": 0.8723, | |
| "mean_token_accuracy": 0.8843137621879578, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.9095890410958904, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.1242937853107346e-05, | |
| "loss": 0.8073, | |
| "mean_token_accuracy": 0.8933804333209991, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.915068493150685, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.1186440677966102e-05, | |
| "loss": 0.8571, | |
| "mean_token_accuracy": 0.8872124254703522, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.9205479452054794, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 1.112994350282486e-05, | |
| "loss": 0.7582, | |
| "mean_token_accuracy": 0.8991103768348694, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.9260273972602739, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.1073446327683617e-05, | |
| "loss": 0.7952, | |
| "mean_token_accuracy": 0.8964874744415283, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.9315068493150684, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 1.1016949152542374e-05, | |
| "loss": 0.7878, | |
| "mean_token_accuracy": 0.894893229007721, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.936986301369863, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.096045197740113e-05, | |
| "loss": 0.8082, | |
| "mean_token_accuracy": 0.8924291729927063, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.9424657534246575, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 1.0903954802259889e-05, | |
| "loss": 0.8655, | |
| "mean_token_accuracy": 0.8870832324028015, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.947945205479452, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 1.0847457627118645e-05, | |
| "loss": 0.7696, | |
| "mean_token_accuracy": 0.8987171053886414, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.9534246575342465, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 1.0790960451977402e-05, | |
| "loss": 0.7408, | |
| "mean_token_accuracy": 0.9013040661811829, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.958904109589041, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 1.073446327683616e-05, | |
| "loss": 0.8175, | |
| "mean_token_accuracy": 0.8909050524234772, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.9643835616438357, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 1.0677966101694917e-05, | |
| "loss": 0.7626, | |
| "mean_token_accuracy": 0.895355612039566, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.9698630136986301, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 1.0621468926553673e-05, | |
| "loss": 0.8223, | |
| "mean_token_accuracy": 0.8929450511932373, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.9753424657534246, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.056497175141243e-05, | |
| "loss": 0.8457, | |
| "mean_token_accuracy": 0.8877719342708588, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.9808219178082191, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.0508474576271188e-05, | |
| "loss": 0.8065, | |
| "mean_token_accuracy": 0.8943466544151306, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.9863013698630136, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 1.0451977401129945e-05, | |
| "loss": 0.7862, | |
| "mean_token_accuracy": 0.8981420993804932, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9917808219178083, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 1.0395480225988701e-05, | |
| "loss": 0.7166, | |
| "mean_token_accuracy": 0.9056210517883301, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.9972602739726028, | |
| "grad_norm": 1.7734375, | |
| "learning_rate": 1.0338983050847458e-05, | |
| "loss": 0.716, | |
| "mean_token_accuracy": 0.9040741622447968, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 1.0282485875706216e-05, | |
| "loss": 0.366, | |
| "mean_token_accuracy": 0.8993819952011108, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 1.0054794520547945, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.0225988700564973e-05, | |
| "loss": 0.6589, | |
| "mean_token_accuracy": 0.9100659489631653, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 1.010958904109589, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.016949152542373e-05, | |
| "loss": 0.7632, | |
| "mean_token_accuracy": 0.8995705246925354, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 1.0164383561643835, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 1.0112994350282486e-05, | |
| "loss": 0.7087, | |
| "mean_token_accuracy": 0.9034123718738556, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 1.021917808219178, | |
| "grad_norm": 1.8125, | |
| "learning_rate": 1.0056497175141244e-05, | |
| "loss": 0.7332, | |
| "mean_token_accuracy": 0.8997578024864197, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 1.0273972602739727, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.7192, | |
| "mean_token_accuracy": 0.9052003026008606, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 1.0328767123287672, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 9.943502824858759e-06, | |
| "loss": 0.6996, | |
| "mean_token_accuracy": 0.9051499664783478, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 1.0383561643835617, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 9.887005649717516e-06, | |
| "loss": 0.6603, | |
| "mean_token_accuracy": 0.9084216058254242, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.0438356164383562, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 9.830508474576272e-06, | |
| "loss": 0.6934, | |
| "mean_token_accuracy": 0.905331015586853, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 1.0493150684931507, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 9.774011299435029e-06, | |
| "loss": 0.6852, | |
| "mean_token_accuracy": 0.907427966594696, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 1.0547945205479452, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 9.717514124293787e-06, | |
| "loss": 0.6364, | |
| "mean_token_accuracy": 0.9119535982608795, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 1.0602739726027397, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 9.661016949152544e-06, | |
| "loss": 0.7319, | |
| "mean_token_accuracy": 0.902245968580246, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 1.0657534246575342, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 9.6045197740113e-06, | |
| "loss": 0.7186, | |
| "mean_token_accuracy": 0.9031082093715668, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 1.0712328767123287, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 9.548022598870057e-06, | |
| "loss": 0.7056, | |
| "mean_token_accuracy": 0.9083144962787628, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 1.0767123287671232, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 9.491525423728815e-06, | |
| "loss": 0.6955, | |
| "mean_token_accuracy": 0.9058070778846741, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 1.0821917808219177, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 9.435028248587572e-06, | |
| "loss": 0.658, | |
| "mean_token_accuracy": 0.9093527495861053, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 1.0876712328767124, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 9.378531073446328e-06, | |
| "loss": 0.6913, | |
| "mean_token_accuracy": 0.907717764377594, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 1.093150684931507, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 9.322033898305085e-06, | |
| "loss": 0.6677, | |
| "mean_token_accuracy": 0.9069054424762726, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0986301369863014, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 9.265536723163843e-06, | |
| "loss": 0.6996, | |
| "mean_token_accuracy": 0.9076306223869324, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 1.104109589041096, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 9.2090395480226e-06, | |
| "loss": 0.7397, | |
| "mean_token_accuracy": 0.8992961049079895, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 1.1095890410958904, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 9.152542372881356e-06, | |
| "loss": 0.7211, | |
| "mean_token_accuracy": 0.9030121862888336, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 1.115068493150685, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 9.096045197740113e-06, | |
| "loss": 0.7143, | |
| "mean_token_accuracy": 0.9033773839473724, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 1.1205479452054794, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 9.039548022598871e-06, | |
| "loss": 0.7024, | |
| "mean_token_accuracy": 0.905937910079956, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 1.126027397260274, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 8.983050847457628e-06, | |
| "loss": 0.671, | |
| "mean_token_accuracy": 0.9082026183605194, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 1.1315068493150684, | |
| "grad_norm": 1.875, | |
| "learning_rate": 8.926553672316384e-06, | |
| "loss": 0.713, | |
| "mean_token_accuracy": 0.903482049703598, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 1.1369863013698631, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8.870056497175143e-06, | |
| "loss": 0.7127, | |
| "mean_token_accuracy": 0.9049177765846252, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 1.1424657534246576, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 8.8135593220339e-06, | |
| "loss": 0.7038, | |
| "mean_token_accuracy": 0.9026915431022644, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 1.1479452054794521, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 8.757062146892656e-06, | |
| "loss": 0.7828, | |
| "mean_token_accuracy": 0.8936010599136353, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.1534246575342466, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 8.700564971751413e-06, | |
| "loss": 0.6303, | |
| "mean_token_accuracy": 0.9128701090812683, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 1.158904109589041, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 8.64406779661017e-06, | |
| "loss": 0.6587, | |
| "mean_token_accuracy": 0.9105681478977203, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 1.1643835616438356, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 8.587570621468927e-06, | |
| "loss": 0.6522, | |
| "mean_token_accuracy": 0.9126417934894562, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 1.16986301369863, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 8.531073446327684e-06, | |
| "loss": 0.6384, | |
| "mean_token_accuracy": 0.9142054915428162, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 1.1753424657534246, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 8.47457627118644e-06, | |
| "loss": 0.6443, | |
| "mean_token_accuracy": 0.9108563363552094, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 1.180821917808219, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 8.418079096045199e-06, | |
| "loss": 0.6724, | |
| "mean_token_accuracy": 0.9071345031261444, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 1.1863013698630138, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 8.361581920903955e-06, | |
| "loss": 0.7334, | |
| "mean_token_accuracy": 0.9011849761009216, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 1.191780821917808, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 8.305084745762712e-06, | |
| "loss": 0.6668, | |
| "mean_token_accuracy": 0.9077614843845367, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 1.1972602739726028, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 8.248587570621469e-06, | |
| "loss": 0.7365, | |
| "mean_token_accuracy": 0.9032963216304779, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 1.2027397260273973, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 8.192090395480227e-06, | |
| "loss": 0.6649, | |
| "mean_token_accuracy": 0.9089824557304382, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.2082191780821918, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 8.135593220338983e-06, | |
| "loss": 0.6773, | |
| "mean_token_accuracy": 0.9071504771709442, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 1.2136986301369863, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 8.07909604519774e-06, | |
| "loss": 0.69, | |
| "mean_token_accuracy": 0.9098499119281769, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 1.2191780821917808, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 8.022598870056498e-06, | |
| "loss": 0.6418, | |
| "mean_token_accuracy": 0.9123164415359497, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 1.2246575342465753, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 7.966101694915255e-06, | |
| "loss": 0.6542, | |
| "mean_token_accuracy": 0.9111972451210022, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 1.2301369863013698, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 7.909604519774012e-06, | |
| "loss": 0.7119, | |
| "mean_token_accuracy": 0.9055051803588867, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 1.2356164383561643, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.853107344632768e-06, | |
| "loss": 0.6844, | |
| "mean_token_accuracy": 0.9083731472492218, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 1.2410958904109588, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 7.796610169491526e-06, | |
| "loss": 0.6986, | |
| "mean_token_accuracy": 0.9047116041183472, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 1.2465753424657535, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 7.740112994350283e-06, | |
| "loss": 0.6755, | |
| "mean_token_accuracy": 0.9082626402378082, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 1.252054794520548, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 7.68361581920904e-06, | |
| "loss": 0.6935, | |
| "mean_token_accuracy": 0.9040849804878235, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 1.2575342465753425, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 7.627118644067797e-06, | |
| "loss": 0.6887, | |
| "mean_token_accuracy": 0.9099703729152679, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.263013698630137, | |
| "grad_norm": 2.0, | |
| "learning_rate": 7.5706214689265545e-06, | |
| "loss": 0.7033, | |
| "mean_token_accuracy": 0.9058608114719391, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 1.2684931506849315, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 7.514124293785311e-06, | |
| "loss": 0.7058, | |
| "mean_token_accuracy": 0.9063239991664886, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 1.273972602739726, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 7.4576271186440685e-06, | |
| "loss": 0.7218, | |
| "mean_token_accuracy": 0.9018439948558807, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 1.2794520547945205, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 7.401129943502825e-06, | |
| "loss": 0.7134, | |
| "mean_token_accuracy": 0.9037202000617981, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 1.284931506849315, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 7.3446327683615825e-06, | |
| "loss": 0.6849, | |
| "mean_token_accuracy": 0.9080476462841034, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 1.2904109589041095, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 7.288135593220339e-06, | |
| "loss": 0.747, | |
| "mean_token_accuracy": 0.9032059609889984, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 1.2958904109589042, | |
| "grad_norm": 1.875, | |
| "learning_rate": 7.2316384180790965e-06, | |
| "loss": 0.6309, | |
| "mean_token_accuracy": 0.9128157496452332, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 1.3013698630136985, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 7.175141242937854e-06, | |
| "loss": 0.7132, | |
| "mean_token_accuracy": 0.9027237892150879, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 1.3068493150684932, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 7.1186440677966106e-06, | |
| "loss": 0.6806, | |
| "mean_token_accuracy": 0.9067763984203339, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 1.3123287671232877, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 7.062146892655368e-06, | |
| "loss": 0.7434, | |
| "mean_token_accuracy": 0.9005565345287323, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.3178082191780822, | |
| "grad_norm": 1.875, | |
| "learning_rate": 7.0056497175141246e-06, | |
| "loss": 0.6809, | |
| "mean_token_accuracy": 0.9072897434234619, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 1.3232876712328767, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.949152542372882e-06, | |
| "loss": 0.7208, | |
| "mean_token_accuracy": 0.906402200460434, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 1.3287671232876712, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.892655367231639e-06, | |
| "loss": 0.7418, | |
| "mean_token_accuracy": 0.9020512700080872, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 1.3342465753424657, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 6.836158192090396e-06, | |
| "loss": 0.7158, | |
| "mean_token_accuracy": 0.9037717282772064, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 1.3397260273972602, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 6.779661016949153e-06, | |
| "loss": 0.7274, | |
| "mean_token_accuracy": 0.9033022224903107, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 1.345205479452055, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.72316384180791e-06, | |
| "loss": 0.6736, | |
| "mean_token_accuracy": 0.9079640805721283, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 1.3506849315068492, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.6624, | |
| "mean_token_accuracy": 0.9096674621105194, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 1.356164383561644, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 6.610169491525424e-06, | |
| "loss": 0.7013, | |
| "mean_token_accuracy": 0.9066566228866577, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 1.3616438356164384, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 6.553672316384181e-06, | |
| "loss": 0.7107, | |
| "mean_token_accuracy": 0.9048294425010681, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 1.367123287671233, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 6.497175141242938e-06, | |
| "loss": 0.6964, | |
| "mean_token_accuracy": 0.9053798615932465, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3726027397260274, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 6.440677966101695e-06, | |
| "loss": 0.6184, | |
| "mean_token_accuracy": 0.9159266352653503, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 1.378082191780822, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 6.384180790960452e-06, | |
| "loss": 0.6435, | |
| "mean_token_accuracy": 0.9122322797775269, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 1.3835616438356164, | |
| "grad_norm": 1.8359375, | |
| "learning_rate": 6.32768361581921e-06, | |
| "loss": 0.6622, | |
| "mean_token_accuracy": 0.910215824842453, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 1.389041095890411, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 6.271186440677966e-06, | |
| "loss": 0.6991, | |
| "mean_token_accuracy": 0.907020092010498, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 1.3945205479452054, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 6.2146892655367244e-06, | |
| "loss": 0.6726, | |
| "mean_token_accuracy": 0.9085971713066101, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 6.158192090395481e-06, | |
| "loss": 0.6498, | |
| "mean_token_accuracy": 0.9123066365718842, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 1.4054794520547946, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 6.1016949152542385e-06, | |
| "loss": 0.6304, | |
| "mean_token_accuracy": 0.914670318365097, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 1.410958904109589, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 6.045197740112995e-06, | |
| "loss": 0.6677, | |
| "mean_token_accuracy": 0.9098580479621887, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 1.4164383561643836, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 5.9887005649717525e-06, | |
| "loss": 0.6927, | |
| "mean_token_accuracy": 0.9073116779327393, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 1.4219178082191781, | |
| "grad_norm": 2.25, | |
| "learning_rate": 5.932203389830509e-06, | |
| "loss": 0.7589, | |
| "mean_token_accuracy": 0.8991564214229584, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.4273972602739726, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 5.8757062146892665e-06, | |
| "loss": 0.7304, | |
| "mean_token_accuracy": 0.9021161198616028, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 1.4328767123287671, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 5.819209039548023e-06, | |
| "loss": 0.6267, | |
| "mean_token_accuracy": 0.9146751463413239, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 1.4383561643835616, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 5.7627118644067805e-06, | |
| "loss": 0.7622, | |
| "mean_token_accuracy": 0.8948341906070709, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 1.4438356164383561, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.706214689265537e-06, | |
| "loss": 0.6191, | |
| "mean_token_accuracy": 0.913863331079483, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 1.4493150684931506, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 5.6497175141242946e-06, | |
| "loss": 0.6872, | |
| "mean_token_accuracy": 0.9074727296829224, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 1.4547945205479453, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 5.593220338983051e-06, | |
| "loss": 0.6691, | |
| "mean_token_accuracy": 0.910420149564743, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 1.4602739726027396, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 5.536723163841809e-06, | |
| "loss": 0.6667, | |
| "mean_token_accuracy": 0.9060676395893097, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 1.4657534246575343, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 5.480225988700565e-06, | |
| "loss": 0.6477, | |
| "mean_token_accuracy": 0.9133667647838593, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 1.4712328767123288, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 5.423728813559323e-06, | |
| "loss": 0.6444, | |
| "mean_token_accuracy": 0.9106875658035278, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 1.4767123287671233, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 5.36723163841808e-06, | |
| "loss": 0.6404, | |
| "mean_token_accuracy": 0.9118073582649231, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4821917808219178, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 5.310734463276837e-06, | |
| "loss": 0.6769, | |
| "mean_token_accuracy": 0.9056595265865326, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 1.4876712328767123, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.254237288135594e-06, | |
| "loss": 0.6808, | |
| "mean_token_accuracy": 0.9079870283603668, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 1.4931506849315068, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 5.197740112994351e-06, | |
| "loss": 0.7428, | |
| "mean_token_accuracy": 0.9004494547843933, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 1.4986301369863013, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 5.141242937853108e-06, | |
| "loss": 0.6059, | |
| "mean_token_accuracy": 0.9166204333305359, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 1.504109589041096, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 5.084745762711865e-06, | |
| "loss": 0.6387, | |
| "mean_token_accuracy": 0.9125949144363403, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 1.5095890410958903, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 5.028248587570622e-06, | |
| "loss": 0.674, | |
| "mean_token_accuracy": 0.910666286945343, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 1.515068493150685, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 4.9717514124293796e-06, | |
| "loss": 0.653, | |
| "mean_token_accuracy": 0.9097437858581543, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 1.5205479452054793, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 4.915254237288136e-06, | |
| "loss": 0.6891, | |
| "mean_token_accuracy": 0.9078112840652466, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 1.526027397260274, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 4.8587570621468936e-06, | |
| "loss": 0.7249, | |
| "mean_token_accuracy": 0.9042028188705444, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 1.5315068493150685, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 4.80225988700565e-06, | |
| "loss": 0.6809, | |
| "mean_token_accuracy": 0.9092828631401062, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.536986301369863, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 4.745762711864408e-06, | |
| "loss": 0.6141, | |
| "mean_token_accuracy": 0.9147942662239075, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 1.5424657534246575, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 4.689265536723164e-06, | |
| "loss": 0.7209, | |
| "mean_token_accuracy": 0.9041316211223602, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 1.547945205479452, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 4.632768361581922e-06, | |
| "loss": 0.6145, | |
| "mean_token_accuracy": 0.9166280925273895, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 1.5534246575342465, | |
| "grad_norm": 2.125, | |
| "learning_rate": 4.576271186440678e-06, | |
| "loss": 0.673, | |
| "mean_token_accuracy": 0.9096376001834869, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 1.558904109589041, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 4.519774011299436e-06, | |
| "loss": 0.6784, | |
| "mean_token_accuracy": 0.9063901305198669, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 1.5643835616438357, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 4.463276836158192e-06, | |
| "loss": 0.6594, | |
| "mean_token_accuracy": 0.9109133183956146, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 1.56986301369863, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 4.40677966101695e-06, | |
| "loss": 0.6409, | |
| "mean_token_accuracy": 0.9116988480091095, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 1.5753424657534247, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 4.350282485875706e-06, | |
| "loss": 0.6547, | |
| "mean_token_accuracy": 0.9099021852016449, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 1.580821917808219, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 4.293785310734464e-06, | |
| "loss": 0.6769, | |
| "mean_token_accuracy": 0.9094822108745575, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 1.5863013698630137, | |
| "grad_norm": 2.0, | |
| "learning_rate": 4.23728813559322e-06, | |
| "loss": 0.7131, | |
| "mean_token_accuracy": 0.903436928987503, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5917808219178082, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 4.180790960451978e-06, | |
| "loss": 0.6988, | |
| "mean_token_accuracy": 0.9046348929405212, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 1.5972602739726027, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.124293785310734e-06, | |
| "loss": 0.7204, | |
| "mean_token_accuracy": 0.9052576124668121, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 1.6027397260273972, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 4.067796610169492e-06, | |
| "loss": 0.6905, | |
| "mean_token_accuracy": 0.9082843363285065, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 1.6082191780821917, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.011299435028249e-06, | |
| "loss": 0.7316, | |
| "mean_token_accuracy": 0.9009381234645844, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 1.6136986301369864, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 3.954802259887006e-06, | |
| "loss": 0.7316, | |
| "mean_token_accuracy": 0.9005630910396576, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 1.6191780821917807, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 3.898305084745763e-06, | |
| "loss": 0.6994, | |
| "mean_token_accuracy": 0.905254602432251, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 1.6246575342465754, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 3.84180790960452e-06, | |
| "loss": 0.6493, | |
| "mean_token_accuracy": 0.9093045294284821, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 1.6301369863013697, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 3.7853107344632772e-06, | |
| "loss": 0.6282, | |
| "mean_token_accuracy": 0.9144234955310822, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 1.6356164383561644, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 3.7288135593220342e-06, | |
| "loss": 0.6878, | |
| "mean_token_accuracy": 0.9078341126441956, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 1.641095890410959, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 3.6723163841807913e-06, | |
| "loss": 0.6357, | |
| "mean_token_accuracy": 0.9131599366664886, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.6465753424657534, | |
| "grad_norm": 1.875, | |
| "learning_rate": 3.6158192090395483e-06, | |
| "loss": 0.7303, | |
| "mean_token_accuracy": 0.9041744768619537, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 1.652054794520548, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 3.5593220338983053e-06, | |
| "loss": 0.6885, | |
| "mean_token_accuracy": 0.9045538306236267, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 1.6575342465753424, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 3.5028248587570623e-06, | |
| "loss": 0.6915, | |
| "mean_token_accuracy": 0.9064763784408569, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 1.6630136986301371, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 3.4463276836158193e-06, | |
| "loss": 0.7073, | |
| "mean_token_accuracy": 0.9064249396324158, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 1.6684931506849314, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 3.3898305084745763e-06, | |
| "loss": 0.6497, | |
| "mean_token_accuracy": 0.9118121564388275, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 1.6739726027397261, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 3.3333333333333333e-06, | |
| "loss": 0.6628, | |
| "mean_token_accuracy": 0.9104187488555908, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 1.6794520547945204, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 3.2768361581920903e-06, | |
| "loss": 0.6534, | |
| "mean_token_accuracy": 0.909776359796524, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 1.6849315068493151, | |
| "grad_norm": 1.875, | |
| "learning_rate": 3.2203389830508473e-06, | |
| "loss": 0.6717, | |
| "mean_token_accuracy": 0.9090818762779236, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 1.6904109589041096, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 3.163841807909605e-06, | |
| "loss": 0.6735, | |
| "mean_token_accuracy": 0.9090877175331116, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 1.6958904109589041, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 3.1073446327683622e-06, | |
| "loss": 0.6932, | |
| "mean_token_accuracy": 0.9055996835231781, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.7013698630136986, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 3.0508474576271192e-06, | |
| "loss": 0.6919, | |
| "mean_token_accuracy": 0.9036900401115417, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.7068493150684931, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 2.9943502824858762e-06, | |
| "loss": 0.6737, | |
| "mean_token_accuracy": 0.9093412756919861, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 1.7123287671232876, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 2.9378531073446333e-06, | |
| "loss": 0.6038, | |
| "mean_token_accuracy": 0.9156779944896698, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 1.7178082191780821, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.8813559322033903e-06, | |
| "loss": 0.6681, | |
| "mean_token_accuracy": 0.9089544415473938, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 1.7232876712328768, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.8248587570621473e-06, | |
| "loss": 0.6544, | |
| "mean_token_accuracy": 0.911058783531189, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 1.7287671232876711, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 2.7683615819209043e-06, | |
| "loss": 0.6405, | |
| "mean_token_accuracy": 0.9108243882656097, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 1.7342465753424658, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.7118644067796613e-06, | |
| "loss": 0.6556, | |
| "mean_token_accuracy": 0.9137302935123444, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 1.7397260273972601, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 2.6553672316384183e-06, | |
| "loss": 0.6323, | |
| "mean_token_accuracy": 0.9138603806495667, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 1.7452054794520548, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.5988700564971753e-06, | |
| "loss": 0.662, | |
| "mean_token_accuracy": 0.908368855714798, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 1.7506849315068493, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 2.5423728813559323e-06, | |
| "loss": 0.602, | |
| "mean_token_accuracy": 0.9159910678863525, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.7561643835616438, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 2.4858757062146898e-06, | |
| "loss": 0.6691, | |
| "mean_token_accuracy": 0.9087992608547211, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 1.7616438356164383, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 2.4293785310734468e-06, | |
| "loss": 0.596, | |
| "mean_token_accuracy": 0.917241632938385, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 1.7671232876712328, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 2.372881355932204e-06, | |
| "loss": 0.7606, | |
| "mean_token_accuracy": 0.8987223207950592, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 1.7726027397260276, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 2.316384180790961e-06, | |
| "loss": 0.6812, | |
| "mean_token_accuracy": 0.9057367146015167, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 1.7780821917808218, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 2.259887005649718e-06, | |
| "loss": 0.6828, | |
| "mean_token_accuracy": 0.9079809784889221, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 1.7835616438356166, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.203389830508475e-06, | |
| "loss": 0.7241, | |
| "mean_token_accuracy": 0.9030264317989349, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 1.7890410958904108, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 2.146892655367232e-06, | |
| "loss": 0.6908, | |
| "mean_token_accuracy": 0.9060869216918945, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 1.7945205479452055, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.090395480225989e-06, | |
| "loss": 0.6877, | |
| "mean_token_accuracy": 0.9075476229190826, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 2.0, | |
| "learning_rate": 2.033898305084746e-06, | |
| "loss": 0.6607, | |
| "mean_token_accuracy": 0.9095008373260498, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 1.8054794520547945, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 1.977401129943503e-06, | |
| "loss": 0.5915, | |
| "mean_token_accuracy": 0.9170799255371094, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.810958904109589, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.92090395480226e-06, | |
| "loss": 0.7045, | |
| "mean_token_accuracy": 0.9037127196788788, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 1.8164383561643835, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 1.8644067796610171e-06, | |
| "loss": 0.6665, | |
| "mean_token_accuracy": 0.9105578064918518, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 1.821917808219178, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 1.8079096045197741e-06, | |
| "loss": 0.6302, | |
| "mean_token_accuracy": 0.9144696891307831, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 1.8273972602739725, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 1.7514124293785311e-06, | |
| "loss": 0.7235, | |
| "mean_token_accuracy": 0.9047011733055115, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 1.8328767123287673, | |
| "grad_norm": 1.875, | |
| "learning_rate": 1.6949152542372882e-06, | |
| "loss": 0.6352, | |
| "mean_token_accuracy": 0.9123874604701996, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 1.8383561643835615, | |
| "grad_norm": 1.9609375, | |
| "learning_rate": 1.6384180790960452e-06, | |
| "loss": 0.6611, | |
| "mean_token_accuracy": 0.9092899858951569, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 1.8438356164383563, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.5819209039548026e-06, | |
| "loss": 0.7016, | |
| "mean_token_accuracy": 0.9032659232616425, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 1.8493150684931505, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.5254237288135596e-06, | |
| "loss": 0.7062, | |
| "mean_token_accuracy": 0.9050180912017822, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 1.8547945205479452, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.4689265536723166e-06, | |
| "loss": 0.7284, | |
| "mean_token_accuracy": 0.9015350937843323, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 1.8602739726027397, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 1.4124293785310736e-06, | |
| "loss": 0.6355, | |
| "mean_token_accuracy": 0.9123442471027374, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.8657534246575342, | |
| "grad_norm": 1.9921875, | |
| "learning_rate": 1.3559322033898307e-06, | |
| "loss": 0.6472, | |
| "mean_token_accuracy": 0.9134511053562164, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 1.8712328767123287, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 1.2994350282485877e-06, | |
| "loss": 0.6786, | |
| "mean_token_accuracy": 0.9099195599555969, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 1.8767123287671232, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 1.2429378531073449e-06, | |
| "loss": 0.747, | |
| "mean_token_accuracy": 0.9031140804290771, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 1.882191780821918, | |
| "grad_norm": 2.0, | |
| "learning_rate": 1.186440677966102e-06, | |
| "loss": 0.6904, | |
| "mean_token_accuracy": 0.9059503078460693, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 1.8876712328767122, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.129943502824859e-06, | |
| "loss": 0.6317, | |
| "mean_token_accuracy": 0.913758397102356, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 1.893150684931507, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 1.073446327683616e-06, | |
| "loss": 0.6718, | |
| "mean_token_accuracy": 0.9079194068908691, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 1.8986301369863012, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 1.016949152542373e-06, | |
| "loss": 0.6781, | |
| "mean_token_accuracy": 0.9061055779457092, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 1.904109589041096, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 9.6045197740113e-07, | |
| "loss": 0.7169, | |
| "mean_token_accuracy": 0.9037725031375885, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 1.9095890410958904, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 9.039548022598871e-07, | |
| "loss": 0.6882, | |
| "mean_token_accuracy": 0.9081202149391174, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 1.915068493150685, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 8.474576271186441e-07, | |
| "loss": 0.6714, | |
| "mean_token_accuracy": 0.9084700644016266, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.9205479452054794, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 7.909604519774013e-07, | |
| "loss": 0.5951, | |
| "mean_token_accuracy": 0.915435403585434, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 1.926027397260274, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 7.344632768361583e-07, | |
| "loss": 0.6639, | |
| "mean_token_accuracy": 0.9109577238559723, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 1.9315068493150684, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 6.779661016949153e-07, | |
| "loss": 0.6354, | |
| "mean_token_accuracy": 0.9138354063034058, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 1.936986301369863, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.214689265536724e-07, | |
| "loss": 0.6613, | |
| "mean_token_accuracy": 0.9091556370258331, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 1.9424657534246577, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 5.649717514124295e-07, | |
| "loss": 0.6825, | |
| "mean_token_accuracy": 0.9077677130699158, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 1.947945205479452, | |
| "grad_norm": 1.890625, | |
| "learning_rate": 5.084745762711865e-07, | |
| "loss": 0.6799, | |
| "mean_token_accuracy": 0.9061026573181152, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 1.9534246575342467, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 4.5197740112994353e-07, | |
| "loss": 0.631, | |
| "mean_token_accuracy": 0.914902925491333, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 1.958904109589041, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 3.9548022598870065e-07, | |
| "loss": 0.7215, | |
| "mean_token_accuracy": 0.9018444120883942, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 1.9643835616438357, | |
| "grad_norm": 1.90625, | |
| "learning_rate": 3.3898305084745766e-07, | |
| "loss": 0.7234, | |
| "mean_token_accuracy": 0.9011828899383545, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 1.9698630136986301, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 2.8248587570621473e-07, | |
| "loss": 0.6796, | |
| "mean_token_accuracy": 0.9087766408920288, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.9753424657534246, | |
| "grad_norm": 1.921875, | |
| "learning_rate": 2.2598870056497177e-07, | |
| "loss": 0.6692, | |
| "mean_token_accuracy": 0.9075906872749329, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 1.9808219178082191, | |
| "grad_norm": 1.9453125, | |
| "learning_rate": 1.6949152542372883e-07, | |
| "loss": 0.7053, | |
| "mean_token_accuracy": 0.9050938785076141, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 1.9863013698630136, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 1.1299435028248588e-07, | |
| "loss": 0.6466, | |
| "mean_token_accuracy": 0.9130730330944061, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 1.9917808219178084, | |
| "grad_norm": 1.8671875, | |
| "learning_rate": 5.649717514124294e-08, | |
| "loss": 0.6636, | |
| "mean_token_accuracy": 0.9109528958797455, | |
| "step": 364 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 364, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.568355335831552e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |