diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,50542 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 7212, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 2.829824447631836, + "eval_runtime": 408.1353, + "eval_samples_per_second": 100.489, + "eval_steps_per_second": 1.571, + "step": 0 + }, + { + "epoch": 0.00027731558513588466, + "grad_norm": 46.86890411376953, + "learning_rate": 0.0, + "loss": 2.4417, + "step": 1 + }, + { + "epoch": 0.0005546311702717693, + "grad_norm": 69.00373077392578, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.3396, + "step": 2 + }, + { + "epoch": 0.0008319467554076539, + "grad_norm": 53.255943298339844, + "learning_rate": 5.000000000000001e-07, + "loss": 2.3484, + "step": 3 + }, + { + "epoch": 0.0011092623405435386, + "grad_norm": 21.859601974487305, + "learning_rate": 7.5e-07, + "loss": 2.3949, + "step": 4 + }, + { + "epoch": 0.0013865779256794233, + "grad_norm": 18.474227905273438, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.3011, + "step": 5 + }, + { + "epoch": 0.0016638935108153079, + "grad_norm": 18.92083740234375, + "learning_rate": 1.25e-06, + "loss": 2.1921, + "step": 6 + }, + { + "epoch": 0.0019412090959511925, + "grad_norm": 17.222856521606445, + "learning_rate": 1.5e-06, + "loss": 2.2153, + "step": 7 + }, + { + "epoch": 0.0022185246810870773, + "grad_norm": 31.592514038085938, + "learning_rate": 1.7500000000000002e-06, + "loss": 2.0413, + "step": 8 + }, + { + "epoch": 0.0024958402662229617, + "grad_norm": 17.17295265197754, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.0861, + "step": 9 + }, + { + "epoch": 0.0027731558513588465, + "grad_norm": 17.179834365844727, + "learning_rate": 2.25e-06, + "loss": 2.0175, + "step": 10 + }, + { + "epoch": 0.003050471436494731, + "grad_norm": 17.518646240234375, + "learning_rate": 2.5e-06, + "loss": 1.9135, + "step": 11 + }, + { + "epoch": 0.0033277870216306157, + "grad_norm": 14.855606079101562, + "learning_rate": 2.7500000000000004e-06, + "loss": 1.7351, + "step": 12 + }, + { + "epoch": 0.0036051026067665, + "grad_norm": 13.514114379882812, + "learning_rate": 3e-06, + "loss": 1.5465, + "step": 13 + }, + { + "epoch": 0.003882418191902385, + "grad_norm": 12.117414474487305, + "learning_rate": 3.2500000000000002e-06, + "loss": 1.4642, + "step": 14 + }, + { + "epoch": 0.004159733777038269, + "grad_norm": 10.935081481933594, + "learning_rate": 3.5000000000000004e-06, + "loss": 1.3851, + "step": 15 + }, + { + "epoch": 0.004437049362174155, + "grad_norm": 7.275962829589844, + "learning_rate": 3.75e-06, + "loss": 1.1289, + "step": 16 + }, + { + "epoch": 0.004714364947310039, + "grad_norm": 6.399021148681641, + "learning_rate": 4.000000000000001e-06, + "loss": 1.1277, + "step": 17 + }, + { + "epoch": 0.004991680532445923, + "grad_norm": 6.132956027984619, + "learning_rate": 4.250000000000001e-06, + "loss": 1.1483, + "step": 18 + }, + { + "epoch": 0.005268996117581808, + "grad_norm": 5.525564670562744, + "learning_rate": 4.5e-06, + "loss": 1.0578, + "step": 19 + }, + { + "epoch": 0.005546311702717693, + "grad_norm": 5.441694259643555, + "learning_rate": 4.75e-06, + "loss": 1.0647, + "step": 20 + }, + { + "epoch": 0.005823627287853577, + "grad_norm": 5.160792827606201, + "learning_rate": 5e-06, + "loss": 0.9961, + "step": 21 + }, + { + "epoch": 0.006100942872989462, + "grad_norm": 5.569485664367676, + "learning_rate": 5.25e-06, + "loss": 1.0063, + "step": 22 + }, + { + "epoch": 0.006378258458125347, + "grad_norm": 4.869104385375977, + "learning_rate": 5.500000000000001e-06, + "loss": 0.9848, + "step": 23 + }, + { + "epoch": 0.0066555740432612314, + "grad_norm": 3.172858238220215, + "learning_rate": 5.750000000000001e-06, + "loss": 0.9317, + "step": 24 + }, + { + "epoch": 0.006932889628397116, + "grad_norm": 2.5935134887695312, + "learning_rate": 6e-06, + "loss": 0.9086, + "step": 25 + }, + { + "epoch": 0.007210205213533, + "grad_norm": 1.7386329174041748, + "learning_rate": 6.25e-06, + "loss": 0.8976, + "step": 26 + }, + { + "epoch": 0.0074875207986688855, + "grad_norm": 1.3860479593276978, + "learning_rate": 6.5000000000000004e-06, + "loss": 0.8668, + "step": 27 + }, + { + "epoch": 0.00776483638380477, + "grad_norm": 1.3284790515899658, + "learning_rate": 6.750000000000001e-06, + "loss": 0.8438, + "step": 28 + }, + { + "epoch": 0.008042151968940654, + "grad_norm": 1.208060383796692, + "learning_rate": 7.000000000000001e-06, + "loss": 0.8601, + "step": 29 + }, + { + "epoch": 0.008319467554076539, + "grad_norm": 0.99210125207901, + "learning_rate": 7.25e-06, + "loss": 0.836, + "step": 30 + }, + { + "epoch": 0.008596783139212423, + "grad_norm": 0.7937288284301758, + "learning_rate": 7.5e-06, + "loss": 0.829, + "step": 31 + }, + { + "epoch": 0.00887409872434831, + "grad_norm": 0.706200361251831, + "learning_rate": 7.75e-06, + "loss": 0.8006, + "step": 32 + }, + { + "epoch": 0.009151414309484194, + "grad_norm": 0.9658659100532532, + "learning_rate": 8.000000000000001e-06, + "loss": 0.7989, + "step": 33 + }, + { + "epoch": 0.009428729894620078, + "grad_norm": 1.174869418144226, + "learning_rate": 8.25e-06, + "loss": 0.8159, + "step": 34 + }, + { + "epoch": 0.009706045479755962, + "grad_norm": 0.5839990973472595, + "learning_rate": 8.500000000000002e-06, + "loss": 0.7837, + "step": 35 + }, + { + "epoch": 0.009983361064891847, + "grad_norm": 0.6130610704421997, + "learning_rate": 8.75e-06, + "loss": 0.7799, + "step": 36 + }, + { + "epoch": 0.010260676650027731, + "grad_norm": 0.7108742594718933, + "learning_rate": 9e-06, + "loss": 0.7611, + "step": 37 + }, + { + "epoch": 0.010537992235163616, + "grad_norm": 0.7302682995796204, + "learning_rate": 9.25e-06, + "loss": 0.7942, + "step": 38 + }, + { + "epoch": 0.010815307820299502, + "grad_norm": 0.5843620896339417, + "learning_rate": 9.5e-06, + "loss": 0.765, + "step": 39 + }, + { + "epoch": 0.011092623405435386, + "grad_norm": 0.41768163442611694, + "learning_rate": 9.750000000000002e-06, + "loss": 0.7112, + "step": 40 + }, + { + "epoch": 0.01136993899057127, + "grad_norm": 0.5103988647460938, + "learning_rate": 1e-05, + "loss": 0.762, + "step": 41 + }, + { + "epoch": 0.011647254575707155, + "grad_norm": 0.47128552198410034, + "learning_rate": 1.025e-05, + "loss": 0.7509, + "step": 42 + }, + { + "epoch": 0.01192457016084304, + "grad_norm": 1.0201480388641357, + "learning_rate": 1.05e-05, + "loss": 0.7507, + "step": 43 + }, + { + "epoch": 0.012201885745978924, + "grad_norm": 0.3908264935016632, + "learning_rate": 1.075e-05, + "loss": 0.7629, + "step": 44 + }, + { + "epoch": 0.012479201331114808, + "grad_norm": 0.4154920279979706, + "learning_rate": 1.1000000000000001e-05, + "loss": 0.7531, + "step": 45 + }, + { + "epoch": 0.012756516916250694, + "grad_norm": 0.4213290512561798, + "learning_rate": 1.125e-05, + "loss": 0.7472, + "step": 46 + }, + { + "epoch": 0.013033832501386578, + "grad_norm": 0.6245641112327576, + "learning_rate": 1.1500000000000002e-05, + "loss": 0.7404, + "step": 47 + }, + { + "epoch": 0.013311148086522463, + "grad_norm": 0.44496941566467285, + "learning_rate": 1.175e-05, + "loss": 0.7427, + "step": 48 + }, + { + "epoch": 0.013588463671658347, + "grad_norm": 0.4155629575252533, + "learning_rate": 1.2e-05, + "loss": 0.7147, + "step": 49 + }, + { + "epoch": 0.013865779256794232, + "grad_norm": 0.37920621037483215, + "learning_rate": 1.225e-05, + "loss": 0.7426, + "step": 50 + }, + { + "epoch": 0.014143094841930116, + "grad_norm": 0.3893055319786072, + "learning_rate": 1.25e-05, + "loss": 0.7176, + "step": 51 + }, + { + "epoch": 0.014420410427066, + "grad_norm": 0.3363882005214691, + "learning_rate": 1.2750000000000002e-05, + "loss": 0.7063, + "step": 52 + }, + { + "epoch": 0.014697726012201887, + "grad_norm": 0.4444830119609833, + "learning_rate": 1.3000000000000001e-05, + "loss": 0.6945, + "step": 53 + }, + { + "epoch": 0.014975041597337771, + "grad_norm": 0.3413512706756592, + "learning_rate": 1.3250000000000002e-05, + "loss": 0.7288, + "step": 54 + }, + { + "epoch": 0.015252357182473655, + "grad_norm": 0.4114389717578888, + "learning_rate": 1.3500000000000001e-05, + "loss": 0.7575, + "step": 55 + }, + { + "epoch": 0.01552967276760954, + "grad_norm": 0.36049914360046387, + "learning_rate": 1.3750000000000002e-05, + "loss": 0.7217, + "step": 56 + }, + { + "epoch": 0.015806988352745424, + "grad_norm": 0.41267284750938416, + "learning_rate": 1.4000000000000001e-05, + "loss": 0.7166, + "step": 57 + }, + { + "epoch": 0.01608430393788131, + "grad_norm": 0.4639422297477722, + "learning_rate": 1.4249999999999999e-05, + "loss": 0.7069, + "step": 58 + }, + { + "epoch": 0.016361619523017193, + "grad_norm": 0.36772483587265015, + "learning_rate": 1.45e-05, + "loss": 0.7247, + "step": 59 + }, + { + "epoch": 0.016638935108153077, + "grad_norm": 0.3546575903892517, + "learning_rate": 1.475e-05, + "loss": 0.7128, + "step": 60 + }, + { + "epoch": 0.01691625069328896, + "grad_norm": 0.31919416785240173, + "learning_rate": 1.5e-05, + "loss": 0.7207, + "step": 61 + }, + { + "epoch": 0.017193566278424846, + "grad_norm": 0.3498699367046356, + "learning_rate": 1.525e-05, + "loss": 0.7065, + "step": 62 + }, + { + "epoch": 0.01747088186356073, + "grad_norm": 0.35648590326309204, + "learning_rate": 1.55e-05, + "loss": 0.7146, + "step": 63 + }, + { + "epoch": 0.01774819744869662, + "grad_norm": 0.30697041749954224, + "learning_rate": 1.575e-05, + "loss": 0.6805, + "step": 64 + }, + { + "epoch": 0.018025513033832503, + "grad_norm": 0.5759001970291138, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.7267, + "step": 65 + }, + { + "epoch": 0.018302828618968387, + "grad_norm": 0.263336718082428, + "learning_rate": 1.6250000000000002e-05, + "loss": 0.6936, + "step": 66 + }, + { + "epoch": 0.01858014420410427, + "grad_norm": 0.2977915108203888, + "learning_rate": 1.65e-05, + "loss": 0.6685, + "step": 67 + }, + { + "epoch": 0.018857459789240156, + "grad_norm": 0.3028334081172943, + "learning_rate": 1.675e-05, + "loss": 0.7109, + "step": 68 + }, + { + "epoch": 0.01913477537437604, + "grad_norm": 0.3265489935874939, + "learning_rate": 1.7000000000000003e-05, + "loss": 0.7023, + "step": 69 + }, + { + "epoch": 0.019412090959511925, + "grad_norm": 0.2899531126022339, + "learning_rate": 1.725e-05, + "loss": 0.6985, + "step": 70 + }, + { + "epoch": 0.01968940654464781, + "grad_norm": 0.29272034764289856, + "learning_rate": 1.75e-05, + "loss": 0.7125, + "step": 71 + }, + { + "epoch": 0.019966722129783693, + "grad_norm": 0.3114602863788605, + "learning_rate": 1.775e-05, + "loss": 0.6836, + "step": 72 + }, + { + "epoch": 0.020244037714919578, + "grad_norm": 0.28768229484558105, + "learning_rate": 1.8e-05, + "loss": 0.6809, + "step": 73 + }, + { + "epoch": 0.020521353300055462, + "grad_norm": 0.270345002412796, + "learning_rate": 1.825e-05, + "loss": 0.6776, + "step": 74 + }, + { + "epoch": 0.020798668885191347, + "grad_norm": 0.2635841369628906, + "learning_rate": 1.85e-05, + "loss": 0.6645, + "step": 75 + }, + { + "epoch": 0.02107598447032723, + "grad_norm": 0.3204723000526428, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.6631, + "step": 76 + }, + { + "epoch": 0.021353300055463115, + "grad_norm": 0.27179455757141113, + "learning_rate": 1.9e-05, + "loss": 0.693, + "step": 77 + }, + { + "epoch": 0.021630615640599003, + "grad_norm": 0.26558464765548706, + "learning_rate": 1.925e-05, + "loss": 0.6715, + "step": 78 + }, + { + "epoch": 0.021907931225734888, + "grad_norm": 0.3682558834552765, + "learning_rate": 1.9500000000000003e-05, + "loss": 0.6826, + "step": 79 + }, + { + "epoch": 0.022185246810870772, + "grad_norm": 0.281429648399353, + "learning_rate": 1.9750000000000002e-05, + "loss": 0.6599, + "step": 80 + }, + { + "epoch": 0.022462562396006656, + "grad_norm": 0.28537672758102417, + "learning_rate": 2e-05, + "loss": 0.6695, + "step": 81 + }, + { + "epoch": 0.02273987798114254, + "grad_norm": 0.274913489818573, + "learning_rate": 2.025e-05, + "loss": 0.678, + "step": 82 + }, + { + "epoch": 0.023017193566278425, + "grad_norm": 0.28847208619117737, + "learning_rate": 2.05e-05, + "loss": 0.669, + "step": 83 + }, + { + "epoch": 0.02329450915141431, + "grad_norm": 0.30678853392601013, + "learning_rate": 2.075e-05, + "loss": 0.6647, + "step": 84 + }, + { + "epoch": 0.023571824736550194, + "grad_norm": 0.28266021609306335, + "learning_rate": 2.1e-05, + "loss": 0.671, + "step": 85 + }, + { + "epoch": 0.02384914032168608, + "grad_norm": 0.2712315320968628, + "learning_rate": 2.125e-05, + "loss": 0.6579, + "step": 86 + }, + { + "epoch": 0.024126455906821963, + "grad_norm": 0.30666086077690125, + "learning_rate": 2.15e-05, + "loss": 0.6598, + "step": 87 + }, + { + "epoch": 0.024403771491957847, + "grad_norm": 0.257932186126709, + "learning_rate": 2.175e-05, + "loss": 0.6348, + "step": 88 + }, + { + "epoch": 0.02468108707709373, + "grad_norm": 0.3133629560470581, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.6611, + "step": 89 + }, + { + "epoch": 0.024958402662229616, + "grad_norm": 0.27258774638175964, + "learning_rate": 2.2250000000000002e-05, + "loss": 0.6736, + "step": 90 + }, + { + "epoch": 0.0252357182473655, + "grad_norm": 0.3201597034931183, + "learning_rate": 2.25e-05, + "loss": 0.6918, + "step": 91 + }, + { + "epoch": 0.025513033832501388, + "grad_norm": 0.26909735798835754, + "learning_rate": 2.275e-05, + "loss": 0.6738, + "step": 92 + }, + { + "epoch": 0.025790349417637273, + "grad_norm": 0.40945449471473694, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.6615, + "step": 93 + }, + { + "epoch": 0.026067665002773157, + "grad_norm": 0.3059796392917633, + "learning_rate": 2.3250000000000003e-05, + "loss": 0.6677, + "step": 94 + }, + { + "epoch": 0.02634498058790904, + "grad_norm": 0.2737233638763428, + "learning_rate": 2.35e-05, + "loss": 0.6436, + "step": 95 + }, + { + "epoch": 0.026622296173044926, + "grad_norm": 0.3231774866580963, + "learning_rate": 2.375e-05, + "loss": 0.6735, + "step": 96 + }, + { + "epoch": 0.02689961175818081, + "grad_norm": 0.28404903411865234, + "learning_rate": 2.4e-05, + "loss": 0.6688, + "step": 97 + }, + { + "epoch": 0.027176927343316695, + "grad_norm": 0.3392276465892792, + "learning_rate": 2.425e-05, + "loss": 0.684, + "step": 98 + }, + { + "epoch": 0.02745424292845258, + "grad_norm": 0.3021562099456787, + "learning_rate": 2.45e-05, + "loss": 0.6886, + "step": 99 + }, + { + "epoch": 0.027731558513588463, + "grad_norm": 0.2843964099884033, + "learning_rate": 2.4750000000000002e-05, + "loss": 0.6358, + "step": 100 + }, + { + "epoch": 0.028008874098724348, + "grad_norm": 0.3120342195034027, + "learning_rate": 2.5e-05, + "loss": 0.669, + "step": 101 + }, + { + "epoch": 0.028286189683860232, + "grad_norm": 0.2752966284751892, + "learning_rate": 2.499999878045941e-05, + "loss": 0.6231, + "step": 102 + }, + { + "epoch": 0.028563505268996116, + "grad_norm": 0.28501492738723755, + "learning_rate": 2.4999995121837877e-05, + "loss": 0.6205, + "step": 103 + }, + { + "epoch": 0.028840820854132, + "grad_norm": 0.2797880172729492, + "learning_rate": 2.4999989024136113e-05, + "loss": 0.6697, + "step": 104 + }, + { + "epoch": 0.029118136439267885, + "grad_norm": 0.2929855287075043, + "learning_rate": 2.4999980487355314e-05, + "loss": 0.642, + "step": 105 + }, + { + "epoch": 0.029395452024403773, + "grad_norm": 0.264911025762558, + "learning_rate": 2.4999969511497135e-05, + "loss": 0.6575, + "step": 106 + }, + { + "epoch": 0.029672767609539658, + "grad_norm": 0.2797522246837616, + "learning_rate": 2.4999956096563725e-05, + "loss": 0.6566, + "step": 107 + }, + { + "epoch": 0.029950083194675542, + "grad_norm": 0.24777652323246002, + "learning_rate": 2.49999402425577e-05, + "loss": 0.6288, + "step": 108 + }, + { + "epoch": 0.030227398779811426, + "grad_norm": 0.34936287999153137, + "learning_rate": 2.4999921949482157e-05, + "loss": 0.6506, + "step": 109 + }, + { + "epoch": 0.03050471436494731, + "grad_norm": 0.30599579215049744, + "learning_rate": 2.499990121734066e-05, + "loss": 0.6554, + "step": 110 + }, + { + "epoch": 0.030782029950083195, + "grad_norm": 0.29751792550086975, + "learning_rate": 2.499987804613726e-05, + "loss": 0.6698, + "step": 111 + }, + { + "epoch": 0.03105934553521908, + "grad_norm": 0.2642778754234314, + "learning_rate": 2.4999852435876473e-05, + "loss": 0.6337, + "step": 112 + }, + { + "epoch": 0.031336661120354964, + "grad_norm": 0.2584931552410126, + "learning_rate": 2.49998243865633e-05, + "loss": 0.661, + "step": 113 + }, + { + "epoch": 0.03161397670549085, + "grad_norm": 0.266797810792923, + "learning_rate": 2.4999793898203212e-05, + "loss": 0.6368, + "step": 114 + }, + { + "epoch": 0.03189129229062673, + "grad_norm": 0.35552042722702026, + "learning_rate": 2.4999760970802155e-05, + "loss": 0.6364, + "step": 115 + }, + { + "epoch": 0.03216860787576262, + "grad_norm": 0.28450194001197815, + "learning_rate": 2.4999725604366562e-05, + "loss": 0.661, + "step": 116 + }, + { + "epoch": 0.0324459234608985, + "grad_norm": 0.3352636694908142, + "learning_rate": 2.4999687798903327e-05, + "loss": 0.6439, + "step": 117 + }, + { + "epoch": 0.032723239046034386, + "grad_norm": 0.2475953847169876, + "learning_rate": 2.499964755441983e-05, + "loss": 0.6344, + "step": 118 + }, + { + "epoch": 0.03300055463117027, + "grad_norm": 0.30431386828422546, + "learning_rate": 2.4999604870923926e-05, + "loss": 0.6459, + "step": 119 + }, + { + "epoch": 0.033277870216306155, + "grad_norm": 0.2653152644634247, + "learning_rate": 2.499955974842394e-05, + "loss": 0.6425, + "step": 120 + }, + { + "epoch": 0.03355518580144204, + "grad_norm": 0.29490575194358826, + "learning_rate": 2.4999512186928675e-05, + "loss": 0.6427, + "step": 121 + }, + { + "epoch": 0.03383250138657792, + "grad_norm": 0.2630308270454407, + "learning_rate": 2.4999462186447415e-05, + "loss": 0.6597, + "step": 122 + }, + { + "epoch": 0.03410981697171381, + "grad_norm": 0.26787513494491577, + "learning_rate": 2.4999409746989914e-05, + "loss": 0.6622, + "step": 123 + }, + { + "epoch": 0.03438713255684969, + "grad_norm": 0.25667890906333923, + "learning_rate": 2.499935486856641e-05, + "loss": 0.6335, + "step": 124 + }, + { + "epoch": 0.03466444814198558, + "grad_norm": 0.26751402020454407, + "learning_rate": 2.4999297551187603e-05, + "loss": 0.6358, + "step": 125 + }, + { + "epoch": 0.03494176372712146, + "grad_norm": 0.2815951108932495, + "learning_rate": 2.4999237794864683e-05, + "loss": 0.6615, + "step": 126 + }, + { + "epoch": 0.03521907931225735, + "grad_norm": 0.2573346793651581, + "learning_rate": 2.499917559960931e-05, + "loss": 0.6463, + "step": 127 + }, + { + "epoch": 0.03549639489739324, + "grad_norm": 0.26202693581581116, + "learning_rate": 2.4999110965433615e-05, + "loss": 0.6436, + "step": 128 + }, + { + "epoch": 0.03577371048252912, + "grad_norm": 0.267046719789505, + "learning_rate": 2.4999043892350213e-05, + "loss": 0.6433, + "step": 129 + }, + { + "epoch": 0.036051026067665005, + "grad_norm": 0.2713761329650879, + "learning_rate": 2.499897438037219e-05, + "loss": 0.6314, + "step": 130 + }, + { + "epoch": 0.03632834165280089, + "grad_norm": 0.2704955041408539, + "learning_rate": 2.4998902429513115e-05, + "loss": 0.6471, + "step": 131 + }, + { + "epoch": 0.036605657237936774, + "grad_norm": 0.25811654329299927, + "learning_rate": 2.4998828039787027e-05, + "loss": 0.6346, + "step": 132 + }, + { + "epoch": 0.03688297282307266, + "grad_norm": 0.2768125832080841, + "learning_rate": 2.4998751211208432e-05, + "loss": 0.6327, + "step": 133 + }, + { + "epoch": 0.03716028840820854, + "grad_norm": 0.29026105999946594, + "learning_rate": 2.499867194379233e-05, + "loss": 0.6632, + "step": 134 + }, + { + "epoch": 0.03743760399334443, + "grad_norm": 0.26648250222206116, + "learning_rate": 2.4998590237554182e-05, + "loss": 0.6414, + "step": 135 + }, + { + "epoch": 0.03771491957848031, + "grad_norm": 0.2578074336051941, + "learning_rate": 2.4998506092509938e-05, + "loss": 0.6459, + "step": 136 + }, + { + "epoch": 0.037992235163616196, + "grad_norm": 0.2555679678916931, + "learning_rate": 2.4998419508676014e-05, + "loss": 0.6561, + "step": 137 + }, + { + "epoch": 0.03826955074875208, + "grad_norm": 0.25471994280815125, + "learning_rate": 2.4998330486069304e-05, + "loss": 0.6616, + "step": 138 + }, + { + "epoch": 0.038546866333887965, + "grad_norm": 0.2434554398059845, + "learning_rate": 2.4998239024707183e-05, + "loss": 0.6423, + "step": 139 + }, + { + "epoch": 0.03882418191902385, + "grad_norm": 0.23697395622730255, + "learning_rate": 2.4998145124607485e-05, + "loss": 0.629, + "step": 140 + }, + { + "epoch": 0.039101497504159734, + "grad_norm": 0.2652537226676941, + "learning_rate": 2.4998048785788547e-05, + "loss": 0.6558, + "step": 141 + }, + { + "epoch": 0.03937881308929562, + "grad_norm": 0.2602185606956482, + "learning_rate": 2.499795000826916e-05, + "loss": 0.6427, + "step": 142 + }, + { + "epoch": 0.0396561286744315, + "grad_norm": 0.23875969648361206, + "learning_rate": 2.49978487920686e-05, + "loss": 0.6097, + "step": 143 + }, + { + "epoch": 0.03993344425956739, + "grad_norm": 0.2549594044685364, + "learning_rate": 2.4997745137206618e-05, + "loss": 0.6477, + "step": 144 + }, + { + "epoch": 0.04021075984470327, + "grad_norm": 0.2528778910636902, + "learning_rate": 2.4997639043703437e-05, + "loss": 0.6028, + "step": 145 + }, + { + "epoch": 0.040488075429839156, + "grad_norm": 0.252888023853302, + "learning_rate": 2.499753051157976e-05, + "loss": 0.6547, + "step": 146 + }, + { + "epoch": 0.04076539101497504, + "grad_norm": 0.289661705493927, + "learning_rate": 2.4997419540856762e-05, + "loss": 0.6604, + "step": 147 + }, + { + "epoch": 0.041042706600110924, + "grad_norm": 0.27772676944732666, + "learning_rate": 2.49973061315561e-05, + "loss": 0.6321, + "step": 148 + }, + { + "epoch": 0.04132002218524681, + "grad_norm": 0.29435357451438904, + "learning_rate": 2.4997190283699904e-05, + "loss": 0.6539, + "step": 149 + }, + { + "epoch": 0.04159733777038269, + "grad_norm": 0.2796315848827362, + "learning_rate": 2.4997071997310774e-05, + "loss": 0.6816, + "step": 150 + }, + { + "epoch": 0.04187465335551858, + "grad_norm": 0.2854909598827362, + "learning_rate": 2.4996951272411794e-05, + "loss": 0.621, + "step": 151 + }, + { + "epoch": 0.04215196894065446, + "grad_norm": 0.3513517677783966, + "learning_rate": 2.499682810902652e-05, + "loss": 0.6462, + "step": 152 + }, + { + "epoch": 0.042429284525790346, + "grad_norm": 0.2492416799068451, + "learning_rate": 2.4996702507178988e-05, + "loss": 0.6455, + "step": 153 + }, + { + "epoch": 0.04270660011092623, + "grad_norm": 0.2352532148361206, + "learning_rate": 2.49965744668937e-05, + "loss": 0.6168, + "step": 154 + }, + { + "epoch": 0.04298391569606212, + "grad_norm": 0.25850751996040344, + "learning_rate": 2.4996443988195644e-05, + "loss": 0.6452, + "step": 155 + }, + { + "epoch": 0.04326123128119801, + "grad_norm": 0.23972827196121216, + "learning_rate": 2.499631107111028e-05, + "loss": 0.6447, + "step": 156 + }, + { + "epoch": 0.04353854686633389, + "grad_norm": 0.24847468733787537, + "learning_rate": 2.499617571566354e-05, + "loss": 0.6386, + "step": 157 + }, + { + "epoch": 0.043815862451469775, + "grad_norm": 0.25739696621894836, + "learning_rate": 2.4996037921881837e-05, + "loss": 0.6417, + "step": 158 + }, + { + "epoch": 0.04409317803660566, + "grad_norm": 0.23640736937522888, + "learning_rate": 2.4995897689792062e-05, + "loss": 0.6451, + "step": 159 + }, + { + "epoch": 0.044370493621741544, + "grad_norm": 0.25362861156463623, + "learning_rate": 2.4995755019421577e-05, + "loss": 0.6525, + "step": 160 + }, + { + "epoch": 0.04464780920687743, + "grad_norm": 0.2607216536998749, + "learning_rate": 2.4995609910798214e-05, + "loss": 0.6276, + "step": 161 + }, + { + "epoch": 0.04492512479201331, + "grad_norm": 0.2426438182592392, + "learning_rate": 2.4995462363950295e-05, + "loss": 0.6375, + "step": 162 + }, + { + "epoch": 0.0452024403771492, + "grad_norm": 0.35536178946495056, + "learning_rate": 2.499531237890661e-05, + "loss": 0.6502, + "step": 163 + }, + { + "epoch": 0.04547975596228508, + "grad_norm": 0.2616370618343353, + "learning_rate": 2.4995159955696417e-05, + "loss": 0.6422, + "step": 164 + }, + { + "epoch": 0.045757071547420966, + "grad_norm": 0.2493521124124527, + "learning_rate": 2.4995005094349473e-05, + "loss": 0.6314, + "step": 165 + }, + { + "epoch": 0.04603438713255685, + "grad_norm": 0.25228554010391235, + "learning_rate": 2.4994847794895977e-05, + "loss": 0.6154, + "step": 166 + }, + { + "epoch": 0.046311702717692735, + "grad_norm": 0.24656261503696442, + "learning_rate": 2.4994688057366635e-05, + "loss": 0.6241, + "step": 167 + }, + { + "epoch": 0.04658901830282862, + "grad_norm": 0.27083900570869446, + "learning_rate": 2.4994525881792612e-05, + "loss": 0.627, + "step": 168 + }, + { + "epoch": 0.046866333887964504, + "grad_norm": 0.24816080927848816, + "learning_rate": 2.499436126820555e-05, + "loss": 0.6237, + "step": 169 + }, + { + "epoch": 0.04714364947310039, + "grad_norm": 0.2589535415172577, + "learning_rate": 2.499419421663758e-05, + "loss": 0.6139, + "step": 170 + }, + { + "epoch": 0.04742096505823627, + "grad_norm": 0.2533140182495117, + "learning_rate": 2.499402472712129e-05, + "loss": 0.6533, + "step": 171 + }, + { + "epoch": 0.04769828064337216, + "grad_norm": 0.23976951837539673, + "learning_rate": 2.499385279968975e-05, + "loss": 0.645, + "step": 172 + }, + { + "epoch": 0.04797559622850804, + "grad_norm": 0.24593569338321686, + "learning_rate": 2.4993678434376507e-05, + "loss": 0.5958, + "step": 173 + }, + { + "epoch": 0.048252911813643926, + "grad_norm": 0.283794641494751, + "learning_rate": 2.4993501631215593e-05, + "loss": 0.6565, + "step": 174 + }, + { + "epoch": 0.04853022739877981, + "grad_norm": 0.23555535078048706, + "learning_rate": 2.4993322390241496e-05, + "loss": 0.6077, + "step": 175 + }, + { + "epoch": 0.048807542983915694, + "grad_norm": 0.24843829870224, + "learning_rate": 2.4993140711489203e-05, + "loss": 0.6001, + "step": 176 + }, + { + "epoch": 0.04908485856905158, + "grad_norm": 0.24111098051071167, + "learning_rate": 2.4992956594994156e-05, + "loss": 0.6445, + "step": 177 + }, + { + "epoch": 0.04936217415418746, + "grad_norm": 0.2340569943189621, + "learning_rate": 2.499277004079228e-05, + "loss": 0.6174, + "step": 178 + }, + { + "epoch": 0.04963948973932335, + "grad_norm": 0.26766470074653625, + "learning_rate": 2.499258104891998e-05, + "loss": 0.6167, + "step": 179 + }, + { + "epoch": 0.04991680532445923, + "grad_norm": 0.2567934989929199, + "learning_rate": 2.499238961941413e-05, + "loss": 0.6394, + "step": 180 + }, + { + "epoch": 0.050194120909595116, + "grad_norm": 0.27323460578918457, + "learning_rate": 2.4992195752312093e-05, + "loss": 0.6337, + "step": 181 + }, + { + "epoch": 0.050471436494731, + "grad_norm": 0.3052992522716522, + "learning_rate": 2.4991999447651686e-05, + "loss": 0.6389, + "step": 182 + }, + { + "epoch": 0.050748752079866885, + "grad_norm": 0.24447570741176605, + "learning_rate": 2.4991800705471218e-05, + "loss": 0.6165, + "step": 183 + }, + { + "epoch": 0.051026067665002776, + "grad_norm": 0.319871187210083, + "learning_rate": 2.499159952580947e-05, + "loss": 0.5816, + "step": 184 + }, + { + "epoch": 0.05130338325013866, + "grad_norm": 0.2554628551006317, + "learning_rate": 2.4991395908705693e-05, + "loss": 0.646, + "step": 185 + }, + { + "epoch": 0.051580698835274545, + "grad_norm": 0.2671261727809906, + "learning_rate": 2.499118985419962e-05, + "loss": 0.6592, + "step": 186 + }, + { + "epoch": 0.05185801442041043, + "grad_norm": 0.2524307370185852, + "learning_rate": 2.4990981362331462e-05, + "loss": 0.6178, + "step": 187 + }, + { + "epoch": 0.052135330005546314, + "grad_norm": 0.2809012532234192, + "learning_rate": 2.4990770433141898e-05, + "loss": 0.6234, + "step": 188 + }, + { + "epoch": 0.0524126455906822, + "grad_norm": 0.23483595252037048, + "learning_rate": 2.499055706667208e-05, + "loss": 0.6209, + "step": 189 + }, + { + "epoch": 0.05268996117581808, + "grad_norm": 0.2775763273239136, + "learning_rate": 2.4990341262963654e-05, + "loss": 0.618, + "step": 190 + }, + { + "epoch": 0.05296727676095397, + "grad_norm": 0.27081090211868286, + "learning_rate": 2.499012302205872e-05, + "loss": 0.6377, + "step": 191 + }, + { + "epoch": 0.05324459234608985, + "grad_norm": 0.24989834427833557, + "learning_rate": 2.4989902343999865e-05, + "loss": 0.6179, + "step": 192 + }, + { + "epoch": 0.053521907931225736, + "grad_norm": 0.27669793367385864, + "learning_rate": 2.498967922883015e-05, + "loss": 0.6298, + "step": 193 + }, + { + "epoch": 0.05379922351636162, + "grad_norm": 0.2672564387321472, + "learning_rate": 2.4989453676593106e-05, + "loss": 0.6536, + "step": 194 + }, + { + "epoch": 0.054076539101497505, + "grad_norm": 0.24099047482013702, + "learning_rate": 2.4989225687332752e-05, + "loss": 0.61, + "step": 195 + }, + { + "epoch": 0.05435385468663339, + "grad_norm": 0.235582634806633, + "learning_rate": 2.4988995261093566e-05, + "loss": 0.654, + "step": 196 + }, + { + "epoch": 0.054631170271769273, + "grad_norm": 0.2795652747154236, + "learning_rate": 2.4988762397920517e-05, + "loss": 0.6224, + "step": 197 + }, + { + "epoch": 0.05490848585690516, + "grad_norm": 0.22800379991531372, + "learning_rate": 2.4988527097859045e-05, + "loss": 0.6186, + "step": 198 + }, + { + "epoch": 0.05518580144204104, + "grad_norm": 0.24810528755187988, + "learning_rate": 2.4988289360955053e-05, + "loss": 0.6286, + "step": 199 + }, + { + "epoch": 0.05546311702717693, + "grad_norm": 0.21688294410705566, + "learning_rate": 2.4988049187254935e-05, + "loss": 0.598, + "step": 200 + }, + { + "epoch": 0.05574043261231281, + "grad_norm": 0.23709554970264435, + "learning_rate": 2.4987806576805562e-05, + "loss": 0.6598, + "step": 201 + }, + { + "epoch": 0.056017748197448695, + "grad_norm": 0.24982847273349762, + "learning_rate": 2.4987561529654263e-05, + "loss": 0.6342, + "step": 202 + }, + { + "epoch": 0.05629506378258458, + "grad_norm": 0.2215258628129959, + "learning_rate": 2.498731404584886e-05, + "loss": 0.6159, + "step": 203 + }, + { + "epoch": 0.056572379367720464, + "grad_norm": 0.25177863240242004, + "learning_rate": 2.4987064125437643e-05, + "loss": 0.6289, + "step": 204 + }, + { + "epoch": 0.05684969495285635, + "grad_norm": 0.24716275930404663, + "learning_rate": 2.498681176846937e-05, + "loss": 0.628, + "step": 205 + }, + { + "epoch": 0.05712701053799223, + "grad_norm": 0.2888506054878235, + "learning_rate": 2.49865569749933e-05, + "loss": 0.6216, + "step": 206 + }, + { + "epoch": 0.05740432612312812, + "grad_norm": 0.24658267199993134, + "learning_rate": 2.4986299745059127e-05, + "loss": 0.6132, + "step": 207 + }, + { + "epoch": 0.057681641708264, + "grad_norm": 0.24297240376472473, + "learning_rate": 2.4986040078717063e-05, + "loss": 0.6201, + "step": 208 + }, + { + "epoch": 0.057958957293399886, + "grad_norm": 0.2425074279308319, + "learning_rate": 2.4985777976017767e-05, + "loss": 0.5997, + "step": 209 + }, + { + "epoch": 0.05823627287853577, + "grad_norm": 0.25336262583732605, + "learning_rate": 2.498551343701238e-05, + "loss": 0.6489, + "step": 210 + }, + { + "epoch": 0.058513588463671655, + "grad_norm": 0.23498830199241638, + "learning_rate": 2.498524646175253e-05, + "loss": 0.6341, + "step": 211 + }, + { + "epoch": 0.058790904048807546, + "grad_norm": 0.24754488468170166, + "learning_rate": 2.49849770502903e-05, + "loss": 0.6538, + "step": 212 + }, + { + "epoch": 0.05906821963394343, + "grad_norm": 0.24818097054958344, + "learning_rate": 2.4984705202678266e-05, + "loss": 0.6098, + "step": 213 + }, + { + "epoch": 0.059345535219079315, + "grad_norm": 0.22981123626232147, + "learning_rate": 2.498443091896947e-05, + "loss": 0.6072, + "step": 214 + }, + { + "epoch": 0.0596228508042152, + "grad_norm": 0.2612292766571045, + "learning_rate": 2.4984154199217434e-05, + "loss": 0.626, + "step": 215 + }, + { + "epoch": 0.059900166389351084, + "grad_norm": 0.26491644978523254, + "learning_rate": 2.4983875043476153e-05, + "loss": 0.6495, + "step": 216 + }, + { + "epoch": 0.06017748197448697, + "grad_norm": 0.22399267554283142, + "learning_rate": 2.4983593451800096e-05, + "loss": 0.6341, + "step": 217 + }, + { + "epoch": 0.06045479755962285, + "grad_norm": 0.2318061739206314, + "learning_rate": 2.498330942424421e-05, + "loss": 0.5787, + "step": 218 + }, + { + "epoch": 0.06073211314475874, + "grad_norm": 0.2700578272342682, + "learning_rate": 2.498302296086392e-05, + "loss": 0.6314, + "step": 219 + }, + { + "epoch": 0.06100942872989462, + "grad_norm": 0.22675910592079163, + "learning_rate": 2.4982734061715112e-05, + "loss": 0.5714, + "step": 220 + }, + { + "epoch": 0.061286744315030506, + "grad_norm": 0.2522087097167969, + "learning_rate": 2.4982442726854173e-05, + "loss": 0.6053, + "step": 221 + }, + { + "epoch": 0.06156405990016639, + "grad_norm": 0.22665978968143463, + "learning_rate": 2.4982148956337935e-05, + "loss": 0.6156, + "step": 222 + }, + { + "epoch": 0.061841375485302275, + "grad_norm": 0.24832209944725037, + "learning_rate": 2.4981852750223726e-05, + "loss": 0.6406, + "step": 223 + }, + { + "epoch": 0.06211869107043816, + "grad_norm": 0.2526067793369293, + "learning_rate": 2.498155410856935e-05, + "loss": 0.6312, + "step": 224 + }, + { + "epoch": 0.06239600665557404, + "grad_norm": 0.24257095158100128, + "learning_rate": 2.4981253031433076e-05, + "loss": 0.6456, + "step": 225 + }, + { + "epoch": 0.06267332224070993, + "grad_norm": 0.25497207045555115, + "learning_rate": 2.4980949518873648e-05, + "loss": 0.6047, + "step": 226 + }, + { + "epoch": 0.06295063782584581, + "grad_norm": 0.23874424397945404, + "learning_rate": 2.498064357095029e-05, + "loss": 0.614, + "step": 227 + }, + { + "epoch": 0.0632279534109817, + "grad_norm": 0.24398040771484375, + "learning_rate": 2.498033518772271e-05, + "loss": 0.6212, + "step": 228 + }, + { + "epoch": 0.06350526899611758, + "grad_norm": 0.27126333117485046, + "learning_rate": 2.498002436925107e-05, + "loss": 0.6051, + "step": 229 + }, + { + "epoch": 0.06378258458125347, + "grad_norm": 0.22852414846420288, + "learning_rate": 2.497971111559602e-05, + "loss": 0.6133, + "step": 230 + }, + { + "epoch": 0.06405990016638935, + "grad_norm": 0.22752775251865387, + "learning_rate": 2.4979395426818696e-05, + "loss": 0.5893, + "step": 231 + }, + { + "epoch": 0.06433721575152523, + "grad_norm": 0.27361559867858887, + "learning_rate": 2.4979077302980683e-05, + "loss": 0.6431, + "step": 232 + }, + { + "epoch": 0.06461453133666112, + "grad_norm": 0.3071225881576538, + "learning_rate": 2.497875674414406e-05, + "loss": 0.617, + "step": 233 + }, + { + "epoch": 0.064891846921797, + "grad_norm": 0.3025614023208618, + "learning_rate": 2.4978433750371382e-05, + "loss": 0.6294, + "step": 234 + }, + { + "epoch": 0.06516916250693289, + "grad_norm": 0.21824614703655243, + "learning_rate": 2.4978108321725667e-05, + "loss": 0.6189, + "step": 235 + }, + { + "epoch": 0.06544647809206877, + "grad_norm": 0.23781217634677887, + "learning_rate": 2.497778045827042e-05, + "loss": 0.6115, + "step": 236 + }, + { + "epoch": 0.06572379367720466, + "grad_norm": 0.3532952666282654, + "learning_rate": 2.497745016006961e-05, + "loss": 0.6091, + "step": 237 + }, + { + "epoch": 0.06600110926234054, + "grad_norm": 0.26994985342025757, + "learning_rate": 2.4977117427187692e-05, + "loss": 0.6078, + "step": 238 + }, + { + "epoch": 0.06627842484747642, + "grad_norm": 0.2395590841770172, + "learning_rate": 2.4976782259689587e-05, + "loss": 0.6437, + "step": 239 + }, + { + "epoch": 0.06655574043261231, + "grad_norm": 0.2347521334886551, + "learning_rate": 2.49764446576407e-05, + "loss": 0.6015, + "step": 240 + }, + { + "epoch": 0.0668330560177482, + "grad_norm": 0.2603704333305359, + "learning_rate": 2.49761046211069e-05, + "loss": 0.6441, + "step": 241 + }, + { + "epoch": 0.06711037160288408, + "grad_norm": 0.21548427641391754, + "learning_rate": 2.4975762150154542e-05, + "loss": 0.6059, + "step": 242 + }, + { + "epoch": 0.06738768718801996, + "grad_norm": 0.2270384132862091, + "learning_rate": 2.497541724485045e-05, + "loss": 0.6148, + "step": 243 + }, + { + "epoch": 0.06766500277315585, + "grad_norm": 0.23716577887535095, + "learning_rate": 2.497506990526192e-05, + "loss": 0.6135, + "step": 244 + }, + { + "epoch": 0.06794231835829173, + "grad_norm": 0.24414962530136108, + "learning_rate": 2.4974720131456736e-05, + "loss": 0.6363, + "step": 245 + }, + { + "epoch": 0.06821963394342762, + "grad_norm": 0.24200287461280823, + "learning_rate": 2.497436792350314e-05, + "loss": 0.6278, + "step": 246 + }, + { + "epoch": 0.0684969495285635, + "grad_norm": 0.25273025035858154, + "learning_rate": 2.497401328146986e-05, + "loss": 0.6012, + "step": 247 + }, + { + "epoch": 0.06877426511369938, + "grad_norm": 0.2444678694009781, + "learning_rate": 2.4973656205426094e-05, + "loss": 0.6218, + "step": 248 + }, + { + "epoch": 0.06905158069883527, + "grad_norm": 0.25649040937423706, + "learning_rate": 2.4973296695441523e-05, + "loss": 0.6678, + "step": 249 + }, + { + "epoch": 0.06932889628397115, + "grad_norm": 0.2729082703590393, + "learning_rate": 2.4972934751586292e-05, + "loss": 0.6018, + "step": 250 + }, + { + "epoch": 0.06960621186910704, + "grad_norm": 0.23950397968292236, + "learning_rate": 2.4972570373931026e-05, + "loss": 0.6342, + "step": 251 + }, + { + "epoch": 0.06988352745424292, + "grad_norm": 0.27791211009025574, + "learning_rate": 2.4972203562546825e-05, + "loss": 0.5948, + "step": 252 + }, + { + "epoch": 0.0701608430393788, + "grad_norm": 0.24157491326332092, + "learning_rate": 2.4971834317505266e-05, + "loss": 0.6346, + "step": 253 + }, + { + "epoch": 0.0704381586245147, + "grad_norm": 0.23233160376548767, + "learning_rate": 2.4971462638878394e-05, + "loss": 0.6023, + "step": 254 + }, + { + "epoch": 0.07071547420965059, + "grad_norm": 0.22705158591270447, + "learning_rate": 2.4971088526738737e-05, + "loss": 0.6314, + "step": 255 + }, + { + "epoch": 0.07099278979478647, + "grad_norm": 0.2364932894706726, + "learning_rate": 2.4970711981159294e-05, + "loss": 0.6239, + "step": 256 + }, + { + "epoch": 0.07127010537992236, + "grad_norm": 0.2369173765182495, + "learning_rate": 2.4970333002213535e-05, + "loss": 0.6056, + "step": 257 + }, + { + "epoch": 0.07154742096505824, + "grad_norm": 0.23944415152072906, + "learning_rate": 2.4969951589975415e-05, + "loss": 0.6188, + "step": 258 + }, + { + "epoch": 0.07182473655019413, + "grad_norm": 0.24431641399860382, + "learning_rate": 2.4969567744519357e-05, + "loss": 0.6393, + "step": 259 + }, + { + "epoch": 0.07210205213533001, + "grad_norm": 0.23058977723121643, + "learning_rate": 2.4969181465920254e-05, + "loss": 0.623, + "step": 260 + }, + { + "epoch": 0.0723793677204659, + "grad_norm": 0.2396584004163742, + "learning_rate": 2.4968792754253483e-05, + "loss": 0.6085, + "step": 261 + }, + { + "epoch": 0.07265668330560178, + "grad_norm": 0.28117048740386963, + "learning_rate": 2.496840160959489e-05, + "loss": 0.612, + "step": 262 + }, + { + "epoch": 0.07293399889073766, + "grad_norm": 0.22773273289203644, + "learning_rate": 2.49680080320208e-05, + "loss": 0.6215, + "step": 263 + }, + { + "epoch": 0.07321131447587355, + "grad_norm": 0.25395745038986206, + "learning_rate": 2.496761202160801e-05, + "loss": 0.6197, + "step": 264 + }, + { + "epoch": 0.07348863006100943, + "grad_norm": 0.2748405337333679, + "learning_rate": 2.496721357843379e-05, + "loss": 0.5983, + "step": 265 + }, + { + "epoch": 0.07376594564614532, + "grad_norm": 0.37056779861450195, + "learning_rate": 2.496681270257589e-05, + "loss": 0.5974, + "step": 266 + }, + { + "epoch": 0.0740432612312812, + "grad_norm": 0.2506537139415741, + "learning_rate": 2.4966409394112528e-05, + "loss": 0.6279, + "step": 267 + }, + { + "epoch": 0.07432057681641709, + "grad_norm": 0.3248004615306854, + "learning_rate": 2.4966003653122406e-05, + "loss": 0.5968, + "step": 268 + }, + { + "epoch": 0.07459789240155297, + "grad_norm": 0.27633965015411377, + "learning_rate": 2.4965595479684685e-05, + "loss": 0.6207, + "step": 269 + }, + { + "epoch": 0.07487520798668885, + "grad_norm": 0.2533873915672302, + "learning_rate": 2.4965184873879015e-05, + "loss": 0.6428, + "step": 270 + }, + { + "epoch": 0.07515252357182474, + "grad_norm": 0.24860017001628876, + "learning_rate": 2.496477183578552e-05, + "loss": 0.5717, + "step": 271 + }, + { + "epoch": 0.07542983915696062, + "grad_norm": 0.2541423738002777, + "learning_rate": 2.4964356365484797e-05, + "loss": 0.6331, + "step": 272 + }, + { + "epoch": 0.07570715474209651, + "grad_norm": 0.21915870904922485, + "learning_rate": 2.4963938463057907e-05, + "loss": 0.6167, + "step": 273 + }, + { + "epoch": 0.07598447032723239, + "grad_norm": 0.21635891497135162, + "learning_rate": 2.4963518128586393e-05, + "loss": 0.602, + "step": 274 + }, + { + "epoch": 0.07626178591236828, + "grad_norm": 0.23918819427490234, + "learning_rate": 2.4963095362152282e-05, + "loss": 0.5869, + "step": 275 + }, + { + "epoch": 0.07653910149750416, + "grad_norm": 0.22294320166110992, + "learning_rate": 2.496267016383806e-05, + "loss": 0.6256, + "step": 276 + }, + { + "epoch": 0.07681641708264005, + "grad_norm": 0.2167348712682724, + "learning_rate": 2.49622425337267e-05, + "loss": 0.5936, + "step": 277 + }, + { + "epoch": 0.07709373266777593, + "grad_norm": 0.21333451569080353, + "learning_rate": 2.496181247190164e-05, + "loss": 0.6221, + "step": 278 + }, + { + "epoch": 0.07737104825291181, + "grad_norm": 0.22917450964450836, + "learning_rate": 2.4961379978446793e-05, + "loss": 0.6132, + "step": 279 + }, + { + "epoch": 0.0776483638380477, + "grad_norm": 0.228413388133049, + "learning_rate": 2.496094505344656e-05, + "loss": 0.5953, + "step": 280 + }, + { + "epoch": 0.07792567942318358, + "grad_norm": 0.20908214151859283, + "learning_rate": 2.4960507696985796e-05, + "loss": 0.6081, + "step": 281 + }, + { + "epoch": 0.07820299500831947, + "grad_norm": 0.2270585596561432, + "learning_rate": 2.4960067909149846e-05, + "loss": 0.5915, + "step": 282 + }, + { + "epoch": 0.07848031059345535, + "grad_norm": 0.23176094889640808, + "learning_rate": 2.4959625690024524e-05, + "loss": 0.6126, + "step": 283 + }, + { + "epoch": 0.07875762617859124, + "grad_norm": 0.24496515095233917, + "learning_rate": 2.495918103969612e-05, + "loss": 0.6245, + "step": 284 + }, + { + "epoch": 0.07903494176372712, + "grad_norm": 0.21696555614471436, + "learning_rate": 2.4958733958251394e-05, + "loss": 0.6104, + "step": 285 + }, + { + "epoch": 0.079312257348863, + "grad_norm": 0.22292231023311615, + "learning_rate": 2.4958284445777584e-05, + "loss": 0.6164, + "step": 286 + }, + { + "epoch": 0.07958957293399889, + "grad_norm": 0.24104639887809753, + "learning_rate": 2.4957832502362404e-05, + "loss": 0.6002, + "step": 287 + }, + { + "epoch": 0.07986688851913477, + "grad_norm": 0.22866299748420715, + "learning_rate": 2.495737812809404e-05, + "loss": 0.6143, + "step": 288 + }, + { + "epoch": 0.08014420410427066, + "grad_norm": 0.23108075559139252, + "learning_rate": 2.495692132306115e-05, + "loss": 0.6591, + "step": 289 + }, + { + "epoch": 0.08042151968940654, + "grad_norm": 0.24755387008190155, + "learning_rate": 2.4956462087352868e-05, + "loss": 0.5883, + "step": 290 + }, + { + "epoch": 0.08069883527454243, + "grad_norm": 0.23270832002162933, + "learning_rate": 2.4956000421058807e-05, + "loss": 0.5727, + "step": 291 + }, + { + "epoch": 0.08097615085967831, + "grad_norm": 0.23588238656520844, + "learning_rate": 2.4955536324269048e-05, + "loss": 0.6178, + "step": 292 + }, + { + "epoch": 0.0812534664448142, + "grad_norm": 0.26772478222846985, + "learning_rate": 2.4955069797074147e-05, + "loss": 0.6214, + "step": 293 + }, + { + "epoch": 0.08153078202995008, + "grad_norm": 0.22470323741436005, + "learning_rate": 2.495460083956514e-05, + "loss": 0.61, + "step": 294 + }, + { + "epoch": 0.08180809761508596, + "grad_norm": 0.22901985049247742, + "learning_rate": 2.495412945183353e-05, + "loss": 0.5965, + "step": 295 + }, + { + "epoch": 0.08208541320022185, + "grad_norm": 0.23927484452724457, + "learning_rate": 2.49536556339713e-05, + "loss": 0.6229, + "step": 296 + }, + { + "epoch": 0.08236272878535773, + "grad_norm": 0.2505173087120056, + "learning_rate": 2.49531793860709e-05, + "loss": 0.5974, + "step": 297 + }, + { + "epoch": 0.08264004437049362, + "grad_norm": 0.7846159934997559, + "learning_rate": 2.4952700708225263e-05, + "loss": 0.6097, + "step": 298 + }, + { + "epoch": 0.0829173599556295, + "grad_norm": 0.23284678161144257, + "learning_rate": 2.4952219600527786e-05, + "loss": 0.6161, + "step": 299 + }, + { + "epoch": 0.08319467554076539, + "grad_norm": 0.22659331560134888, + "learning_rate": 2.4951736063072356e-05, + "loss": 0.5917, + "step": 300 + }, + { + "epoch": 0.08347199112590127, + "grad_norm": 0.24401767551898956, + "learning_rate": 2.4951250095953315e-05, + "loss": 0.6163, + "step": 301 + }, + { + "epoch": 0.08374930671103716, + "grad_norm": 0.23994800448417664, + "learning_rate": 2.4950761699265487e-05, + "loss": 0.6035, + "step": 302 + }, + { + "epoch": 0.08402662229617304, + "grad_norm": 0.293527752161026, + "learning_rate": 2.495027087310418e-05, + "loss": 0.6148, + "step": 303 + }, + { + "epoch": 0.08430393788130892, + "grad_norm": 0.2797812819480896, + "learning_rate": 2.4949777617565156e-05, + "loss": 0.6249, + "step": 304 + }, + { + "epoch": 0.08458125346644481, + "grad_norm": 0.2422715574502945, + "learning_rate": 2.4949281932744672e-05, + "loss": 0.6064, + "step": 305 + }, + { + "epoch": 0.08485856905158069, + "grad_norm": 0.2489105761051178, + "learning_rate": 2.4948783818739446e-05, + "loss": 0.6176, + "step": 306 + }, + { + "epoch": 0.08513588463671658, + "grad_norm": 0.23189565539360046, + "learning_rate": 2.4948283275646672e-05, + "loss": 0.6172, + "step": 307 + }, + { + "epoch": 0.08541320022185246, + "grad_norm": 0.21257632970809937, + "learning_rate": 2.4947780303564015e-05, + "loss": 0.6132, + "step": 308 + }, + { + "epoch": 0.08569051580698835, + "grad_norm": 0.23266074061393738, + "learning_rate": 2.4947274902589628e-05, + "loss": 0.6001, + "step": 309 + }, + { + "epoch": 0.08596783139212424, + "grad_norm": 0.21587929129600525, + "learning_rate": 2.4946767072822126e-05, + "loss": 0.6381, + "step": 310 + }, + { + "epoch": 0.08624514697726013, + "grad_norm": 0.23052595555782318, + "learning_rate": 2.4946256814360594e-05, + "loss": 0.6643, + "step": 311 + }, + { + "epoch": 0.08652246256239601, + "grad_norm": 0.2822146713733673, + "learning_rate": 2.4945744127304598e-05, + "loss": 0.6331, + "step": 312 + }, + { + "epoch": 0.0867997781475319, + "grad_norm": 0.22692646086215973, + "learning_rate": 2.4945229011754184e-05, + "loss": 0.6126, + "step": 313 + }, + { + "epoch": 0.08707709373266778, + "grad_norm": 0.2250347137451172, + "learning_rate": 2.4944711467809855e-05, + "loss": 0.6308, + "step": 314 + }, + { + "epoch": 0.08735440931780367, + "grad_norm": 0.21644283831119537, + "learning_rate": 2.4944191495572604e-05, + "loss": 0.587, + "step": 315 + }, + { + "epoch": 0.08763172490293955, + "grad_norm": 0.22959665954113007, + "learning_rate": 2.494366909514389e-05, + "loss": 0.6138, + "step": 316 + }, + { + "epoch": 0.08790904048807544, + "grad_norm": 0.24681390821933746, + "learning_rate": 2.4943144266625645e-05, + "loss": 0.6309, + "step": 317 + }, + { + "epoch": 0.08818635607321132, + "grad_norm": 0.22859139740467072, + "learning_rate": 2.4942617010120282e-05, + "loss": 0.5937, + "step": 318 + }, + { + "epoch": 0.0884636716583472, + "grad_norm": 0.20714016258716583, + "learning_rate": 2.4942087325730678e-05, + "loss": 0.5925, + "step": 319 + }, + { + "epoch": 0.08874098724348309, + "grad_norm": 0.2056405246257782, + "learning_rate": 2.494155521356019e-05, + "loss": 0.5922, + "step": 320 + }, + { + "epoch": 0.08901830282861897, + "grad_norm": 0.22429367899894714, + "learning_rate": 2.4941020673712644e-05, + "loss": 0.6141, + "step": 321 + }, + { + "epoch": 0.08929561841375486, + "grad_norm": 0.2454768568277359, + "learning_rate": 2.494048370629235e-05, + "loss": 0.6221, + "step": 322 + }, + { + "epoch": 0.08957293399889074, + "grad_norm": 0.21887235343456268, + "learning_rate": 2.493994431140408e-05, + "loss": 0.6249, + "step": 323 + }, + { + "epoch": 0.08985024958402663, + "grad_norm": 0.23439091444015503, + "learning_rate": 2.493940248915308e-05, + "loss": 0.6145, + "step": 324 + }, + { + "epoch": 0.09012756516916251, + "grad_norm": 0.21770575642585754, + "learning_rate": 2.4938858239645087e-05, + "loss": 0.6123, + "step": 325 + }, + { + "epoch": 0.0904048807542984, + "grad_norm": 0.24734006822109222, + "learning_rate": 2.4938311562986284e-05, + "loss": 0.6223, + "step": 326 + }, + { + "epoch": 0.09068219633943428, + "grad_norm": 0.22917009890079498, + "learning_rate": 2.4937762459283348e-05, + "loss": 0.6041, + "step": 327 + }, + { + "epoch": 0.09095951192457016, + "grad_norm": 0.22535157203674316, + "learning_rate": 2.4937210928643423e-05, + "loss": 0.6449, + "step": 328 + }, + { + "epoch": 0.09123682750970605, + "grad_norm": 0.21863703429698944, + "learning_rate": 2.4936656971174134e-05, + "loss": 0.6144, + "step": 329 + }, + { + "epoch": 0.09151414309484193, + "grad_norm": 0.24071593582630157, + "learning_rate": 2.4936100586983563e-05, + "loss": 0.6391, + "step": 330 + }, + { + "epoch": 0.09179145867997782, + "grad_norm": 0.21045182645320892, + "learning_rate": 2.4935541776180275e-05, + "loss": 0.613, + "step": 331 + }, + { + "epoch": 0.0920687742651137, + "grad_norm": 0.250699520111084, + "learning_rate": 2.493498053887332e-05, + "loss": 0.6155, + "step": 332 + }, + { + "epoch": 0.09234608985024959, + "grad_norm": 0.22076334059238434, + "learning_rate": 2.4934416875172202e-05, + "loss": 0.6184, + "step": 333 + }, + { + "epoch": 0.09262340543538547, + "grad_norm": 0.22932595014572144, + "learning_rate": 2.4933850785186906e-05, + "loss": 0.6234, + "step": 334 + }, + { + "epoch": 0.09290072102052135, + "grad_norm": 0.2126377820968628, + "learning_rate": 2.4933282269027898e-05, + "loss": 0.5768, + "step": 335 + }, + { + "epoch": 0.09317803660565724, + "grad_norm": 0.21872107684612274, + "learning_rate": 2.49327113268061e-05, + "loss": 0.5847, + "step": 336 + }, + { + "epoch": 0.09345535219079312, + "grad_norm": 0.22751103341579437, + "learning_rate": 2.4932137958632922e-05, + "loss": 0.6241, + "step": 337 + }, + { + "epoch": 0.09373266777592901, + "grad_norm": 0.24364197254180908, + "learning_rate": 2.493156216462025e-05, + "loss": 0.5956, + "step": 338 + }, + { + "epoch": 0.09400998336106489, + "grad_norm": 0.2077159285545349, + "learning_rate": 2.493098394488043e-05, + "loss": 0.5911, + "step": 339 + }, + { + "epoch": 0.09428729894620078, + "grad_norm": 0.24238905310630798, + "learning_rate": 2.4930403299526292e-05, + "loss": 0.629, + "step": 340 + }, + { + "epoch": 0.09456461453133666, + "grad_norm": 0.22944410145282745, + "learning_rate": 2.492982022867113e-05, + "loss": 0.596, + "step": 341 + }, + { + "epoch": 0.09484193011647254, + "grad_norm": 0.22006259858608246, + "learning_rate": 2.492923473242872e-05, + "loss": 0.581, + "step": 342 + }, + { + "epoch": 0.09511924570160843, + "grad_norm": 0.2297179251909256, + "learning_rate": 2.4928646810913307e-05, + "loss": 0.6107, + "step": 343 + }, + { + "epoch": 0.09539656128674431, + "grad_norm": 0.21393971145153046, + "learning_rate": 2.4928056464239614e-05, + "loss": 0.5773, + "step": 344 + }, + { + "epoch": 0.0956738768718802, + "grad_norm": 0.23898737132549286, + "learning_rate": 2.4927463692522825e-05, + "loss": 0.6119, + "step": 345 + }, + { + "epoch": 0.09595119245701608, + "grad_norm": 0.22290126979351044, + "learning_rate": 2.4926868495878613e-05, + "loss": 0.5721, + "step": 346 + }, + { + "epoch": 0.09622850804215197, + "grad_norm": 0.23102609813213348, + "learning_rate": 2.4926270874423113e-05, + "loss": 0.5735, + "step": 347 + }, + { + "epoch": 0.09650582362728785, + "grad_norm": 0.22276602685451508, + "learning_rate": 2.4925670828272935e-05, + "loss": 0.5799, + "step": 348 + }, + { + "epoch": 0.09678313921242374, + "grad_norm": 0.229088693857193, + "learning_rate": 2.492506835754517e-05, + "loss": 0.6191, + "step": 349 + }, + { + "epoch": 0.09706045479755962, + "grad_norm": 0.22365529835224152, + "learning_rate": 2.4924463462357373e-05, + "loss": 0.5932, + "step": 350 + }, + { + "epoch": 0.0973377703826955, + "grad_norm": 0.21552829444408417, + "learning_rate": 2.492385614282757e-05, + "loss": 0.6091, + "step": 351 + }, + { + "epoch": 0.09761508596783139, + "grad_norm": 0.2322327196598053, + "learning_rate": 2.4923246399074272e-05, + "loss": 0.6216, + "step": 352 + }, + { + "epoch": 0.09789240155296727, + "grad_norm": 0.27425798773765564, + "learning_rate": 2.4922634231216458e-05, + "loss": 0.5915, + "step": 353 + }, + { + "epoch": 0.09816971713810316, + "grad_norm": 0.21547527611255646, + "learning_rate": 2.492201963937357e-05, + "loss": 0.6003, + "step": 354 + }, + { + "epoch": 0.09844703272323904, + "grad_norm": 0.24001803994178772, + "learning_rate": 2.4921402623665535e-05, + "loss": 0.5879, + "step": 355 + }, + { + "epoch": 0.09872434830837493, + "grad_norm": 0.2128361016511917, + "learning_rate": 2.492078318421275e-05, + "loss": 0.6192, + "step": 356 + }, + { + "epoch": 0.09900166389351081, + "grad_norm": 0.23891720175743103, + "learning_rate": 2.492016132113608e-05, + "loss": 0.6237, + "step": 357 + }, + { + "epoch": 0.0992789794786467, + "grad_norm": 0.2530137300491333, + "learning_rate": 2.4919537034556876e-05, + "loss": 0.5975, + "step": 358 + }, + { + "epoch": 0.09955629506378258, + "grad_norm": 0.22864577174186707, + "learning_rate": 2.4918910324596944e-05, + "loss": 0.6085, + "step": 359 + }, + { + "epoch": 0.09983361064891846, + "grad_norm": 0.21646267175674438, + "learning_rate": 2.4918281191378573e-05, + "loss": 0.5734, + "step": 360 + }, + { + "epoch": 0.10011092623405435, + "grad_norm": 0.21921531856060028, + "learning_rate": 2.491764963502453e-05, + "loss": 0.6003, + "step": 361 + }, + { + "epoch": 0.10038824181919023, + "grad_norm": 0.22741259634494781, + "learning_rate": 2.491701565565804e-05, + "loss": 0.59, + "step": 362 + }, + { + "epoch": 0.10066555740432612, + "grad_norm": 0.2382003366947174, + "learning_rate": 2.4916379253402815e-05, + "loss": 0.6021, + "step": 363 + }, + { + "epoch": 0.100942872989462, + "grad_norm": 0.20885150134563446, + "learning_rate": 2.4915740428383032e-05, + "loss": 0.5973, + "step": 364 + }, + { + "epoch": 0.10122018857459789, + "grad_norm": 0.1941784769296646, + "learning_rate": 2.491509918072334e-05, + "loss": 0.591, + "step": 365 + }, + { + "epoch": 0.10149750415973377, + "grad_norm": 0.21724678575992584, + "learning_rate": 2.491445551054887e-05, + "loss": 0.6212, + "step": 366 + }, + { + "epoch": 0.10177481974486967, + "grad_norm": 0.29596778750419617, + "learning_rate": 2.4913809417985213e-05, + "loss": 0.6241, + "step": 367 + }, + { + "epoch": 0.10205213533000555, + "grad_norm": 0.23365665972232819, + "learning_rate": 2.4913160903158443e-05, + "loss": 0.6243, + "step": 368 + }, + { + "epoch": 0.10232945091514144, + "grad_norm": 0.23263859748840332, + "learning_rate": 2.4912509966195098e-05, + "loss": 0.5946, + "step": 369 + }, + { + "epoch": 0.10260676650027732, + "grad_norm": 0.22337226569652557, + "learning_rate": 2.4911856607222196e-05, + "loss": 0.6287, + "step": 370 + }, + { + "epoch": 0.1028840820854132, + "grad_norm": 0.23225417733192444, + "learning_rate": 2.491120082636722e-05, + "loss": 0.5927, + "step": 371 + }, + { + "epoch": 0.10316139767054909, + "grad_norm": 0.22292552888393402, + "learning_rate": 2.4910542623758142e-05, + "loss": 0.6208, + "step": 372 + }, + { + "epoch": 0.10343871325568497, + "grad_norm": 0.21180188655853271, + "learning_rate": 2.4909881999523382e-05, + "loss": 0.5652, + "step": 373 + }, + { + "epoch": 0.10371602884082086, + "grad_norm": 0.2395281195640564, + "learning_rate": 2.4909218953791853e-05, + "loss": 0.5922, + "step": 374 + }, + { + "epoch": 0.10399334442595674, + "grad_norm": 0.2313883900642395, + "learning_rate": 2.4908553486692926e-05, + "loss": 0.6083, + "step": 375 + }, + { + "epoch": 0.10427066001109263, + "grad_norm": 0.21677231788635254, + "learning_rate": 2.4907885598356456e-05, + "loss": 0.6115, + "step": 376 + }, + { + "epoch": 0.10454797559622851, + "grad_norm": 0.21811628341674805, + "learning_rate": 2.4907215288912766e-05, + "loss": 0.5815, + "step": 377 + }, + { + "epoch": 0.1048252911813644, + "grad_norm": 0.22422359883785248, + "learning_rate": 2.4906542558492652e-05, + "loss": 0.6161, + "step": 378 + }, + { + "epoch": 0.10510260676650028, + "grad_norm": 0.2190743088722229, + "learning_rate": 2.4905867407227377e-05, + "loss": 0.5554, + "step": 379 + }, + { + "epoch": 0.10537992235163617, + "grad_norm": 0.25590968132019043, + "learning_rate": 2.490518983524869e-05, + "loss": 0.5856, + "step": 380 + }, + { + "epoch": 0.10565723793677205, + "grad_norm": 0.26324909925460815, + "learning_rate": 2.490450984268879e-05, + "loss": 0.6057, + "step": 381 + }, + { + "epoch": 0.10593455352190793, + "grad_norm": 0.2394174039363861, + "learning_rate": 2.490382742968037e-05, + "loss": 0.6045, + "step": 382 + }, + { + "epoch": 0.10621186910704382, + "grad_norm": 0.23230458796024323, + "learning_rate": 2.4903142596356586e-05, + "loss": 0.6188, + "step": 383 + }, + { + "epoch": 0.1064891846921797, + "grad_norm": 0.21763205528259277, + "learning_rate": 2.4902455342851067e-05, + "loss": 0.5626, + "step": 384 + }, + { + "epoch": 0.10676650027731559, + "grad_norm": 0.469051718711853, + "learning_rate": 2.490176566929791e-05, + "loss": 0.5909, + "step": 385 + }, + { + "epoch": 0.10704381586245147, + "grad_norm": 0.24806742370128632, + "learning_rate": 2.4901073575831697e-05, + "loss": 0.6215, + "step": 386 + }, + { + "epoch": 0.10732113144758736, + "grad_norm": 0.22851231694221497, + "learning_rate": 2.4900379062587463e-05, + "loss": 0.593, + "step": 387 + }, + { + "epoch": 0.10759844703272324, + "grad_norm": 0.24515169858932495, + "learning_rate": 2.489968212970074e-05, + "loss": 0.6036, + "step": 388 + }, + { + "epoch": 0.10787576261785913, + "grad_norm": 0.24662603437900543, + "learning_rate": 2.4898982777307506e-05, + "loss": 0.6153, + "step": 389 + }, + { + "epoch": 0.10815307820299501, + "grad_norm": 0.2459113895893097, + "learning_rate": 2.4898281005544227e-05, + "loss": 0.5771, + "step": 390 + }, + { + "epoch": 0.1084303937881309, + "grad_norm": 0.23075874149799347, + "learning_rate": 2.489757681454784e-05, + "loss": 0.6297, + "step": 391 + }, + { + "epoch": 0.10870770937326678, + "grad_norm": 0.24344393610954285, + "learning_rate": 2.4896870204455746e-05, + "loss": 0.5993, + "step": 392 + }, + { + "epoch": 0.10898502495840266, + "grad_norm": 0.2444470226764679, + "learning_rate": 2.4896161175405826e-05, + "loss": 0.6159, + "step": 393 + }, + { + "epoch": 0.10926234054353855, + "grad_norm": 0.24199549853801727, + "learning_rate": 2.4895449727536435e-05, + "loss": 0.6177, + "step": 394 + }, + { + "epoch": 0.10953965612867443, + "grad_norm": 0.20678602159023285, + "learning_rate": 2.4894735860986385e-05, + "loss": 0.5894, + "step": 395 + }, + { + "epoch": 0.10981697171381032, + "grad_norm": 0.25881609320640564, + "learning_rate": 2.489401957589498e-05, + "loss": 0.631, + "step": 396 + }, + { + "epoch": 0.1100942872989462, + "grad_norm": 0.2568078637123108, + "learning_rate": 2.489330087240198e-05, + "loss": 0.5902, + "step": 397 + }, + { + "epoch": 0.11037160288408208, + "grad_norm": 0.2495458871126175, + "learning_rate": 2.489257975064763e-05, + "loss": 0.6141, + "step": 398 + }, + { + "epoch": 0.11064891846921797, + "grad_norm": 0.6050971150398254, + "learning_rate": 2.489185621077263e-05, + "loss": 0.597, + "step": 399 + }, + { + "epoch": 0.11092623405435385, + "grad_norm": 0.22337263822555542, + "learning_rate": 2.489113025291817e-05, + "loss": 0.58, + "step": 400 + }, + { + "epoch": 0.11120354963948974, + "grad_norm": 0.20583049952983856, + "learning_rate": 2.4890401877225898e-05, + "loss": 0.5751, + "step": 401 + }, + { + "epoch": 0.11148086522462562, + "grad_norm": 0.2487124800682068, + "learning_rate": 2.488967108383795e-05, + "loss": 0.6009, + "step": 402 + }, + { + "epoch": 0.1117581808097615, + "grad_norm": 0.24986512959003448, + "learning_rate": 2.4888937872896908e-05, + "loss": 0.6203, + "step": 403 + }, + { + "epoch": 0.11203549639489739, + "grad_norm": 0.28655165433883667, + "learning_rate": 2.488820224454585e-05, + "loss": 0.6037, + "step": 404 + }, + { + "epoch": 0.11231281198003328, + "grad_norm": 0.24651272594928741, + "learning_rate": 2.4887464198928317e-05, + "loss": 0.5853, + "step": 405 + }, + { + "epoch": 0.11259012756516916, + "grad_norm": 0.1938582807779312, + "learning_rate": 2.4886723736188318e-05, + "loss": 0.5888, + "step": 406 + }, + { + "epoch": 0.11286744315030504, + "grad_norm": 0.22223535180091858, + "learning_rate": 2.4885980856470338e-05, + "loss": 0.627, + "step": 407 + }, + { + "epoch": 0.11314475873544093, + "grad_norm": 0.24378454685211182, + "learning_rate": 2.4885235559919328e-05, + "loss": 0.5827, + "step": 408 + }, + { + "epoch": 0.11342207432057681, + "grad_norm": 0.2019236534833908, + "learning_rate": 2.4884487846680727e-05, + "loss": 0.5976, + "step": 409 + }, + { + "epoch": 0.1136993899057127, + "grad_norm": 0.21661922335624695, + "learning_rate": 2.4883737716900424e-05, + "loss": 0.6013, + "step": 410 + }, + { + "epoch": 0.11397670549084858, + "grad_norm": 0.26957792043685913, + "learning_rate": 2.4882985170724787e-05, + "loss": 0.63, + "step": 411 + }, + { + "epoch": 0.11425402107598447, + "grad_norm": 0.21899108588695526, + "learning_rate": 2.4882230208300668e-05, + "loss": 0.5935, + "step": 412 + }, + { + "epoch": 0.11453133666112035, + "grad_norm": 0.2505897879600525, + "learning_rate": 2.488147282977537e-05, + "loss": 0.5689, + "step": 413 + }, + { + "epoch": 0.11480865224625623, + "grad_norm": 0.20966675877571106, + "learning_rate": 2.4880713035296686e-05, + "loss": 0.5893, + "step": 414 + }, + { + "epoch": 0.11508596783139212, + "grad_norm": 0.26599064469337463, + "learning_rate": 2.4879950825012864e-05, + "loss": 0.5912, + "step": 415 + }, + { + "epoch": 0.115363283416528, + "grad_norm": 0.22095918655395508, + "learning_rate": 2.487918619907264e-05, + "loss": 0.6068, + "step": 416 + }, + { + "epoch": 0.11564059900166389, + "grad_norm": 0.20822377502918243, + "learning_rate": 2.4878419157625206e-05, + "loss": 0.5783, + "step": 417 + }, + { + "epoch": 0.11591791458679977, + "grad_norm": 0.20983396470546722, + "learning_rate": 2.4877649700820232e-05, + "loss": 0.6258, + "step": 418 + }, + { + "epoch": 0.11619523017193566, + "grad_norm": 0.2288864701986313, + "learning_rate": 2.4876877828807864e-05, + "loss": 0.6196, + "step": 419 + }, + { + "epoch": 0.11647254575707154, + "grad_norm": 0.20762163400650024, + "learning_rate": 2.4876103541738714e-05, + "loss": 0.5674, + "step": 420 + }, + { + "epoch": 0.11674986134220743, + "grad_norm": 0.2152256816625595, + "learning_rate": 2.4875326839763863e-05, + "loss": 0.5681, + "step": 421 + }, + { + "epoch": 0.11702717692734331, + "grad_norm": 0.25224751234054565, + "learning_rate": 2.4874547723034865e-05, + "loss": 0.5948, + "step": 422 + }, + { + "epoch": 0.11730449251247921, + "grad_norm": 0.21316662430763245, + "learning_rate": 2.4873766191703752e-05, + "loss": 0.5757, + "step": 423 + }, + { + "epoch": 0.11758180809761509, + "grad_norm": 0.20757247507572174, + "learning_rate": 2.4872982245923014e-05, + "loss": 0.5903, + "step": 424 + }, + { + "epoch": 0.11785912368275098, + "grad_norm": 0.23846663534641266, + "learning_rate": 2.487219588584563e-05, + "loss": 0.5735, + "step": 425 + }, + { + "epoch": 0.11813643926788686, + "grad_norm": 0.21389099955558777, + "learning_rate": 2.4871407111625027e-05, + "loss": 0.5998, + "step": 426 + }, + { + "epoch": 0.11841375485302275, + "grad_norm": 0.21840502321720123, + "learning_rate": 2.487061592341513e-05, + "loss": 0.5854, + "step": 427 + }, + { + "epoch": 0.11869107043815863, + "grad_norm": 0.23358672857284546, + "learning_rate": 2.4869822321370308e-05, + "loss": 0.6212, + "step": 428 + }, + { + "epoch": 0.11896838602329451, + "grad_norm": 0.24467387795448303, + "learning_rate": 2.4869026305645418e-05, + "loss": 0.5937, + "step": 429 + }, + { + "epoch": 0.1192457016084304, + "grad_norm": 0.24679329991340637, + "learning_rate": 2.486822787639579e-05, + "loss": 0.6027, + "step": 430 + }, + { + "epoch": 0.11952301719356628, + "grad_norm": 0.22588002681732178, + "learning_rate": 2.4867427033777206e-05, + "loss": 0.5707, + "step": 431 + }, + { + "epoch": 0.11980033277870217, + "grad_norm": 0.20728443562984467, + "learning_rate": 2.486662377794594e-05, + "loss": 0.5857, + "step": 432 + }, + { + "epoch": 0.12007764836383805, + "grad_norm": 0.2292574942111969, + "learning_rate": 2.4865818109058732e-05, + "loss": 0.6288, + "step": 433 + }, + { + "epoch": 0.12035496394897394, + "grad_norm": 0.22358085215091705, + "learning_rate": 2.4865010027272784e-05, + "loss": 0.6043, + "step": 434 + }, + { + "epoch": 0.12063227953410982, + "grad_norm": 0.21650134027004242, + "learning_rate": 2.4864199532745776e-05, + "loss": 0.5772, + "step": 435 + }, + { + "epoch": 0.1209095951192457, + "grad_norm": 0.21783700585365295, + "learning_rate": 2.486338662563585e-05, + "loss": 0.608, + "step": 436 + }, + { + "epoch": 0.12118691070438159, + "grad_norm": 0.2252453863620758, + "learning_rate": 2.4862571306101633e-05, + "loss": 0.5783, + "step": 437 + }, + { + "epoch": 0.12146422628951747, + "grad_norm": 0.22224466502666473, + "learning_rate": 2.4861753574302217e-05, + "loss": 0.5823, + "step": 438 + }, + { + "epoch": 0.12174154187465336, + "grad_norm": 0.24375957250595093, + "learning_rate": 2.486093343039716e-05, + "loss": 0.5872, + "step": 439 + }, + { + "epoch": 0.12201885745978924, + "grad_norm": 0.20903299748897552, + "learning_rate": 2.4860110874546495e-05, + "loss": 0.6237, + "step": 440 + }, + { + "epoch": 0.12229617304492513, + "grad_norm": 0.23007185757160187, + "learning_rate": 2.485928590691072e-05, + "loss": 0.6188, + "step": 441 + }, + { + "epoch": 0.12257348863006101, + "grad_norm": 0.23085376620292664, + "learning_rate": 2.4858458527650814e-05, + "loss": 0.5693, + "step": 442 + }, + { + "epoch": 0.1228508042151969, + "grad_norm": 0.2241743952035904, + "learning_rate": 2.485762873692822e-05, + "loss": 0.6294, + "step": 443 + }, + { + "epoch": 0.12312811980033278, + "grad_norm": 0.20904746651649475, + "learning_rate": 2.4856796534904845e-05, + "loss": 0.6301, + "step": 444 + }, + { + "epoch": 0.12340543538546866, + "grad_norm": 0.6742352843284607, + "learning_rate": 2.4855961921743083e-05, + "loss": 0.5524, + "step": 445 + }, + { + "epoch": 0.12368275097060455, + "grad_norm": 0.20682546496391296, + "learning_rate": 2.4855124897605782e-05, + "loss": 0.5907, + "step": 446 + }, + { + "epoch": 0.12396006655574043, + "grad_norm": 0.2383589744567871, + "learning_rate": 2.485428546265627e-05, + "loss": 0.5865, + "step": 447 + }, + { + "epoch": 0.12423738214087632, + "grad_norm": 0.2051754891872406, + "learning_rate": 2.4853443617058348e-05, + "loss": 0.6112, + "step": 448 + }, + { + "epoch": 0.1245146977260122, + "grad_norm": 0.2156454175710678, + "learning_rate": 2.4852599360976274e-05, + "loss": 0.5913, + "step": 449 + }, + { + "epoch": 0.12479201331114809, + "grad_norm": 0.22987020015716553, + "learning_rate": 2.485175269457479e-05, + "loss": 0.5873, + "step": 450 + }, + { + "epoch": 0.12506932889628397, + "grad_norm": 0.20809032022953033, + "learning_rate": 2.4850903618019102e-05, + "loss": 0.582, + "step": 451 + }, + { + "epoch": 0.12534664448141986, + "grad_norm": 0.2254360467195511, + "learning_rate": 2.485005213147489e-05, + "loss": 0.5998, + "step": 452 + }, + { + "epoch": 0.12562396006655574, + "grad_norm": 0.214163139462471, + "learning_rate": 2.4849198235108296e-05, + "loss": 0.5884, + "step": 453 + }, + { + "epoch": 0.12590127565169162, + "grad_norm": 0.21463198959827423, + "learning_rate": 2.484834192908594e-05, + "loss": 0.606, + "step": 454 + }, + { + "epoch": 0.1261785912368275, + "grad_norm": 0.20102332532405853, + "learning_rate": 2.4847483213574908e-05, + "loss": 0.6012, + "step": 455 + }, + { + "epoch": 0.1264559068219634, + "grad_norm": 0.19328515231609344, + "learning_rate": 2.4846622088742765e-05, + "loss": 0.5749, + "step": 456 + }, + { + "epoch": 0.12673322240709928, + "grad_norm": 0.20251993834972382, + "learning_rate": 2.484575855475753e-05, + "loss": 0.6121, + "step": 457 + }, + { + "epoch": 0.12701053799223516, + "grad_norm": 0.21547801792621613, + "learning_rate": 2.484489261178771e-05, + "loss": 0.6019, + "step": 458 + }, + { + "epoch": 0.12728785357737105, + "grad_norm": 0.21968044340610504, + "learning_rate": 2.4844024260002276e-05, + "loss": 0.5863, + "step": 459 + }, + { + "epoch": 0.12756516916250693, + "grad_norm": 0.21164929866790771, + "learning_rate": 2.4843153499570648e-05, + "loss": 0.5995, + "step": 460 + }, + { + "epoch": 0.12784248474764282, + "grad_norm": 0.2152341902256012, + "learning_rate": 2.4842280330662753e-05, + "loss": 0.6374, + "step": 461 + }, + { + "epoch": 0.1281198003327787, + "grad_norm": 0.19914227724075317, + "learning_rate": 2.4841404753448963e-05, + "loss": 0.5919, + "step": 462 + }, + { + "epoch": 0.12839711591791458, + "grad_norm": 0.2268274873495102, + "learning_rate": 2.4840526768100124e-05, + "loss": 0.5913, + "step": 463 + }, + { + "epoch": 0.12867443150305047, + "grad_norm": 0.21451812982559204, + "learning_rate": 2.483964637478756e-05, + "loss": 0.6146, + "step": 464 + }, + { + "epoch": 0.12895174708818635, + "grad_norm": 0.1978655308485031, + "learning_rate": 2.483876357368305e-05, + "loss": 0.5938, + "step": 465 + }, + { + "epoch": 0.12922906267332224, + "grad_norm": 0.20545656979084015, + "learning_rate": 2.4837878364958865e-05, + "loss": 0.6172, + "step": 466 + }, + { + "epoch": 0.12950637825845812, + "grad_norm": 0.21529193222522736, + "learning_rate": 2.483699074878772e-05, + "loss": 0.5794, + "step": 467 + }, + { + "epoch": 0.129783693843594, + "grad_norm": 0.2971234917640686, + "learning_rate": 2.4836100725342818e-05, + "loss": 0.6166, + "step": 468 + }, + { + "epoch": 0.1300610094287299, + "grad_norm": 0.1968923807144165, + "learning_rate": 2.4835208294797824e-05, + "loss": 0.5898, + "step": 469 + }, + { + "epoch": 0.13033832501386577, + "grad_norm": 0.2248852252960205, + "learning_rate": 2.483431345732688e-05, + "loss": 0.5984, + "step": 470 + }, + { + "epoch": 0.13061564059900166, + "grad_norm": 0.21942903101444244, + "learning_rate": 2.4833416213104588e-05, + "loss": 0.5984, + "step": 471 + }, + { + "epoch": 0.13089295618413754, + "grad_norm": 0.22266723215579987, + "learning_rate": 2.4832516562306024e-05, + "loss": 0.5858, + "step": 472 + }, + { + "epoch": 0.13117027176927343, + "grad_norm": 0.21460357308387756, + "learning_rate": 2.483161450510674e-05, + "loss": 0.5763, + "step": 473 + }, + { + "epoch": 0.1314475873544093, + "grad_norm": 0.333474725484848, + "learning_rate": 2.4830710041682735e-05, + "loss": 0.6024, + "step": 474 + }, + { + "epoch": 0.1317249029395452, + "grad_norm": 0.1983480155467987, + "learning_rate": 2.4829803172210515e-05, + "loss": 0.5898, + "step": 475 + }, + { + "epoch": 0.13200221852468108, + "grad_norm": 0.2835070788860321, + "learning_rate": 2.482889389686702e-05, + "loss": 0.571, + "step": 476 + }, + { + "epoch": 0.13227953410981697, + "grad_norm": 0.2176080197095871, + "learning_rate": 2.4827982215829674e-05, + "loss": 0.5875, + "step": 477 + }, + { + "epoch": 0.13255684969495285, + "grad_norm": 0.2436138391494751, + "learning_rate": 2.482706812927638e-05, + "loss": 0.5965, + "step": 478 + }, + { + "epoch": 0.13283416528008873, + "grad_norm": 0.21060815453529358, + "learning_rate": 2.4826151637385495e-05, + "loss": 0.5881, + "step": 479 + }, + { + "epoch": 0.13311148086522462, + "grad_norm": 0.49135246872901917, + "learning_rate": 2.4825232740335847e-05, + "loss": 0.5742, + "step": 480 + }, + { + "epoch": 0.1333887964503605, + "grad_norm": 0.20535485446453094, + "learning_rate": 2.4824311438306742e-05, + "loss": 0.5877, + "step": 481 + }, + { + "epoch": 0.1336661120354964, + "grad_norm": 0.20854201912879944, + "learning_rate": 2.482338773147795e-05, + "loss": 0.6065, + "step": 482 + }, + { + "epoch": 0.13394342762063227, + "grad_norm": 0.20914287865161896, + "learning_rate": 2.4822461620029708e-05, + "loss": 0.5919, + "step": 483 + }, + { + "epoch": 0.13422074320576816, + "grad_norm": 0.20028036832809448, + "learning_rate": 2.4821533104142724e-05, + "loss": 0.5707, + "step": 484 + }, + { + "epoch": 0.13449805879090404, + "grad_norm": 0.22616969048976898, + "learning_rate": 2.4820602183998185e-05, + "loss": 0.5896, + "step": 485 + }, + { + "epoch": 0.13477537437603992, + "grad_norm": 0.2049257457256317, + "learning_rate": 2.4819668859777728e-05, + "loss": 0.5693, + "step": 486 + }, + { + "epoch": 0.1350526899611758, + "grad_norm": 0.21746453642845154, + "learning_rate": 2.4818733131663473e-05, + "loss": 0.6177, + "step": 487 + }, + { + "epoch": 0.1353300055463117, + "grad_norm": 0.20084752142429352, + "learning_rate": 2.4817794999838004e-05, + "loss": 0.5871, + "step": 488 + }, + { + "epoch": 0.13560732113144758, + "grad_norm": 0.2062511444091797, + "learning_rate": 2.4816854464484378e-05, + "loss": 0.5975, + "step": 489 + }, + { + "epoch": 0.13588463671658346, + "grad_norm": 0.2201562523841858, + "learning_rate": 2.4815911525786118e-05, + "loss": 0.5683, + "step": 490 + }, + { + "epoch": 0.13616195230171935, + "grad_norm": 0.22616079449653625, + "learning_rate": 2.4814966183927213e-05, + "loss": 0.6306, + "step": 491 + }, + { + "epoch": 0.13643926788685523, + "grad_norm": 0.21003180742263794, + "learning_rate": 2.4814018439092128e-05, + "loss": 0.6064, + "step": 492 + }, + { + "epoch": 0.13671658347199112, + "grad_norm": 0.2046622782945633, + "learning_rate": 2.481306829146579e-05, + "loss": 0.6107, + "step": 493 + }, + { + "epoch": 0.136993899057127, + "grad_norm": 0.2102370411157608, + "learning_rate": 2.4812115741233606e-05, + "loss": 0.596, + "step": 494 + }, + { + "epoch": 0.13727121464226288, + "grad_norm": 0.20774902403354645, + "learning_rate": 2.4811160788581434e-05, + "loss": 0.6111, + "step": 495 + }, + { + "epoch": 0.13754853022739877, + "grad_norm": 0.20868700742721558, + "learning_rate": 2.481020343369561e-05, + "loss": 0.604, + "step": 496 + }, + { + "epoch": 0.13782584581253465, + "grad_norm": 0.20590144395828247, + "learning_rate": 2.4809243676762947e-05, + "loss": 0.606, + "step": 497 + }, + { + "epoch": 0.13810316139767054, + "grad_norm": 0.2019280344247818, + "learning_rate": 2.4808281517970716e-05, + "loss": 0.6034, + "step": 498 + }, + { + "epoch": 0.13838047698280642, + "grad_norm": 0.22689440846443176, + "learning_rate": 2.4807316957506656e-05, + "loss": 0.5715, + "step": 499 + }, + { + "epoch": 0.1386577925679423, + "grad_norm": 0.2134653776884079, + "learning_rate": 2.4806349995558986e-05, + "loss": 0.6184, + "step": 500 + }, + { + "epoch": 0.1389351081530782, + "grad_norm": 0.20334339141845703, + "learning_rate": 2.4805380632316377e-05, + "loss": 0.5804, + "step": 501 + }, + { + "epoch": 0.13921242373821408, + "grad_norm": 0.20713390409946442, + "learning_rate": 2.4804408867967984e-05, + "loss": 0.5898, + "step": 502 + }, + { + "epoch": 0.13948973932334996, + "grad_norm": 0.21584905683994293, + "learning_rate": 2.4803434702703422e-05, + "loss": 0.5957, + "step": 503 + }, + { + "epoch": 0.13976705490848584, + "grad_norm": 0.21197180449962616, + "learning_rate": 2.4802458136712775e-05, + "loss": 0.5981, + "step": 504 + }, + { + "epoch": 0.14004437049362173, + "grad_norm": 0.19864031672477722, + "learning_rate": 2.4801479170186597e-05, + "loss": 0.6027, + "step": 505 + }, + { + "epoch": 0.1403216860787576, + "grad_norm": 0.21110500395298004, + "learning_rate": 2.4800497803315913e-05, + "loss": 0.5882, + "step": 506 + }, + { + "epoch": 0.1405990016638935, + "grad_norm": 0.20834285020828247, + "learning_rate": 2.4799514036292215e-05, + "loss": 0.5935, + "step": 507 + }, + { + "epoch": 0.1408763172490294, + "grad_norm": 0.22122903168201447, + "learning_rate": 2.4798527869307454e-05, + "loss": 0.6011, + "step": 508 + }, + { + "epoch": 0.1411536328341653, + "grad_norm": 0.21510954201221466, + "learning_rate": 2.4797539302554064e-05, + "loss": 0.6266, + "step": 509 + }, + { + "epoch": 0.14143094841930118, + "grad_norm": 0.20589859783649445, + "learning_rate": 2.479654833622494e-05, + "loss": 0.5858, + "step": 510 + }, + { + "epoch": 0.14170826400443706, + "grad_norm": 0.20928624272346497, + "learning_rate": 2.4795554970513445e-05, + "loss": 0.6006, + "step": 511 + }, + { + "epoch": 0.14198557958957295, + "grad_norm": 0.2174837589263916, + "learning_rate": 2.4794559205613412e-05, + "loss": 0.5792, + "step": 512 + }, + { + "epoch": 0.14226289517470883, + "grad_norm": 0.20877033472061157, + "learning_rate": 2.4793561041719137e-05, + "loss": 0.5662, + "step": 513 + }, + { + "epoch": 0.14254021075984472, + "grad_norm": 0.240639790892601, + "learning_rate": 2.479256047902539e-05, + "loss": 0.5824, + "step": 514 + }, + { + "epoch": 0.1428175263449806, + "grad_norm": 0.21567635238170624, + "learning_rate": 2.479155751772741e-05, + "loss": 0.5833, + "step": 515 + }, + { + "epoch": 0.14309484193011648, + "grad_norm": 0.2284121960401535, + "learning_rate": 2.4790552158020896e-05, + "loss": 0.6057, + "step": 516 + }, + { + "epoch": 0.14337215751525237, + "grad_norm": 0.19480617344379425, + "learning_rate": 2.478954440010203e-05, + "loss": 0.5972, + "step": 517 + }, + { + "epoch": 0.14364947310038825, + "grad_norm": 0.20838883519172668, + "learning_rate": 2.4788534244167443e-05, + "loss": 0.6373, + "step": 518 + }, + { + "epoch": 0.14392678868552414, + "grad_norm": 0.21365465223789215, + "learning_rate": 2.4787521690414245e-05, + "loss": 0.5796, + "step": 519 + }, + { + "epoch": 0.14420410427066002, + "grad_norm": 2.2805471420288086, + "learning_rate": 2.4786506739040018e-05, + "loss": 0.5915, + "step": 520 + }, + { + "epoch": 0.1444814198557959, + "grad_norm": 0.34635624289512634, + "learning_rate": 2.47854893902428e-05, + "loss": 0.6325, + "step": 521 + }, + { + "epoch": 0.1447587354409318, + "grad_norm": 0.39266762137413025, + "learning_rate": 2.47844696442211e-05, + "loss": 0.5756, + "step": 522 + }, + { + "epoch": 0.14503605102606767, + "grad_norm": 0.31766456365585327, + "learning_rate": 2.4783447501173907e-05, + "loss": 0.5703, + "step": 523 + }, + { + "epoch": 0.14531336661120356, + "grad_norm": 0.24752533435821533, + "learning_rate": 2.478242296130066e-05, + "loss": 0.5878, + "step": 524 + }, + { + "epoch": 0.14559068219633944, + "grad_norm": 0.24595655500888824, + "learning_rate": 2.4781396024801272e-05, + "loss": 0.5819, + "step": 525 + }, + { + "epoch": 0.14586799778147533, + "grad_norm": 0.2457636296749115, + "learning_rate": 2.478036669187614e-05, + "loss": 0.599, + "step": 526 + }, + { + "epoch": 0.1461453133666112, + "grad_norm": 0.244289368391037, + "learning_rate": 2.4779334962726096e-05, + "loss": 0.5922, + "step": 527 + }, + { + "epoch": 0.1464226289517471, + "grad_norm": 0.23528233170509338, + "learning_rate": 2.477830083755247e-05, + "loss": 0.6032, + "step": 528 + }, + { + "epoch": 0.14669994453688298, + "grad_norm": 0.2198038101196289, + "learning_rate": 2.477726431655704e-05, + "loss": 0.5954, + "step": 529 + }, + { + "epoch": 0.14697726012201887, + "grad_norm": 0.23673711717128754, + "learning_rate": 2.4776225399942066e-05, + "loss": 0.5938, + "step": 530 + }, + { + "epoch": 0.14725457570715475, + "grad_norm": 0.2085774540901184, + "learning_rate": 2.4775184087910262e-05, + "loss": 0.5856, + "step": 531 + }, + { + "epoch": 0.14753189129229063, + "grad_norm": 0.21415582299232483, + "learning_rate": 2.4774140380664816e-05, + "loss": 0.5751, + "step": 532 + }, + { + "epoch": 0.14780920687742652, + "grad_norm": 0.2082296758890152, + "learning_rate": 2.4773094278409388e-05, + "loss": 0.5573, + "step": 533 + }, + { + "epoch": 0.1480865224625624, + "grad_norm": 0.20202411711215973, + "learning_rate": 2.4772045781348093e-05, + "loss": 0.5883, + "step": 534 + }, + { + "epoch": 0.1483638380476983, + "grad_norm": 0.20766015350818634, + "learning_rate": 2.477099488968553e-05, + "loss": 0.6066, + "step": 535 + }, + { + "epoch": 0.14864115363283417, + "grad_norm": 0.2137647122144699, + "learning_rate": 2.4769941603626744e-05, + "loss": 0.597, + "step": 536 + }, + { + "epoch": 0.14891846921797006, + "grad_norm": 0.23699134588241577, + "learning_rate": 2.4768885923377265e-05, + "loss": 0.587, + "step": 537 + }, + { + "epoch": 0.14919578480310594, + "grad_norm": 0.21466752886772156, + "learning_rate": 2.4767827849143087e-05, + "loss": 0.5725, + "step": 538 + }, + { + "epoch": 0.14947310038824183, + "grad_norm": 0.20940807461738586, + "learning_rate": 2.476676738113067e-05, + "loss": 0.5807, + "step": 539 + }, + { + "epoch": 0.1497504159733777, + "grad_norm": 0.22769619524478912, + "learning_rate": 2.476570451954693e-05, + "loss": 0.6089, + "step": 540 + }, + { + "epoch": 0.1500277315585136, + "grad_norm": 0.20399393141269684, + "learning_rate": 2.4764639264599266e-05, + "loss": 0.5705, + "step": 541 + }, + { + "epoch": 0.15030504714364948, + "grad_norm": 0.2241872102022171, + "learning_rate": 2.4763571616495535e-05, + "loss": 0.5731, + "step": 542 + }, + { + "epoch": 0.15058236272878536, + "grad_norm": 0.20283614099025726, + "learning_rate": 2.4762501575444062e-05, + "loss": 0.6051, + "step": 543 + }, + { + "epoch": 0.15085967831392125, + "grad_norm": 0.2145642638206482, + "learning_rate": 2.4761429141653646e-05, + "loss": 0.6069, + "step": 544 + }, + { + "epoch": 0.15113699389905713, + "grad_norm": 0.2139946073293686, + "learning_rate": 2.4760354315333546e-05, + "loss": 0.6055, + "step": 545 + }, + { + "epoch": 0.15141430948419302, + "grad_norm": 0.22807584702968597, + "learning_rate": 2.4759277096693486e-05, + "loss": 0.5945, + "step": 546 + }, + { + "epoch": 0.1516916250693289, + "grad_norm": 0.2132754623889923, + "learning_rate": 2.4758197485943657e-05, + "loss": 0.5975, + "step": 547 + }, + { + "epoch": 0.15196894065446478, + "grad_norm": 0.2016879767179489, + "learning_rate": 2.4757115483294724e-05, + "loss": 0.5863, + "step": 548 + }, + { + "epoch": 0.15224625623960067, + "grad_norm": 0.227370485663414, + "learning_rate": 2.475603108895782e-05, + "loss": 0.583, + "step": 549 + }, + { + "epoch": 0.15252357182473655, + "grad_norm": 0.22234570980072021, + "learning_rate": 2.475494430314453e-05, + "loss": 0.5962, + "step": 550 + }, + { + "epoch": 0.15280088740987244, + "grad_norm": 0.20360559225082397, + "learning_rate": 2.4753855126066916e-05, + "loss": 0.587, + "step": 551 + }, + { + "epoch": 0.15307820299500832, + "grad_norm": 0.23359502851963043, + "learning_rate": 2.475276355793751e-05, + "loss": 0.5967, + "step": 552 + }, + { + "epoch": 0.1533555185801442, + "grad_norm": 0.23216257989406586, + "learning_rate": 2.47516695989693e-05, + "loss": 0.5845, + "step": 553 + }, + { + "epoch": 0.1536328341652801, + "grad_norm": 0.21213343739509583, + "learning_rate": 2.475057324937575e-05, + "loss": 0.5821, + "step": 554 + }, + { + "epoch": 0.15391014975041598, + "grad_norm": 0.21203738451004028, + "learning_rate": 2.4749474509370784e-05, + "loss": 0.5792, + "step": 555 + }, + { + "epoch": 0.15418746533555186, + "grad_norm": 0.21234023571014404, + "learning_rate": 2.4748373379168805e-05, + "loss": 0.5985, + "step": 556 + }, + { + "epoch": 0.15446478092068774, + "grad_norm": 0.20847538113594055, + "learning_rate": 2.4747269858984658e-05, + "loss": 0.595, + "step": 557 + }, + { + "epoch": 0.15474209650582363, + "grad_norm": 0.20475518703460693, + "learning_rate": 2.474616394903368e-05, + "loss": 0.5821, + "step": 558 + }, + { + "epoch": 0.1550194120909595, + "grad_norm": 0.211504727602005, + "learning_rate": 2.474505564953166e-05, + "loss": 0.572, + "step": 559 + }, + { + "epoch": 0.1552967276760954, + "grad_norm": 0.21250484883785248, + "learning_rate": 2.4743944960694854e-05, + "loss": 0.5748, + "step": 560 + }, + { + "epoch": 0.15557404326123128, + "grad_norm": 0.2148432582616806, + "learning_rate": 2.4742831882739988e-05, + "loss": 0.5881, + "step": 561 + }, + { + "epoch": 0.15585135884636717, + "grad_norm": 0.19098572432994843, + "learning_rate": 2.4741716415884257e-05, + "loss": 0.5989, + "step": 562 + }, + { + "epoch": 0.15612867443150305, + "grad_norm": 0.20260894298553467, + "learning_rate": 2.474059856034531e-05, + "loss": 0.567, + "step": 563 + }, + { + "epoch": 0.15640599001663893, + "grad_norm": 0.21840746700763702, + "learning_rate": 2.4739478316341282e-05, + "loss": 0.6054, + "step": 564 + }, + { + "epoch": 0.15668330560177482, + "grad_norm": 0.2050980031490326, + "learning_rate": 2.473835568409075e-05, + "loss": 0.5842, + "step": 565 + }, + { + "epoch": 0.1569606211869107, + "grad_norm": 0.20163971185684204, + "learning_rate": 2.473723066381278e-05, + "loss": 0.5823, + "step": 566 + }, + { + "epoch": 0.1572379367720466, + "grad_norm": 0.2088451236486435, + "learning_rate": 2.473610325572689e-05, + "loss": 0.5995, + "step": 567 + }, + { + "epoch": 0.15751525235718247, + "grad_norm": 0.20921272039413452, + "learning_rate": 2.4734973460053056e-05, + "loss": 0.585, + "step": 568 + }, + { + "epoch": 0.15779256794231836, + "grad_norm": 0.22330057621002197, + "learning_rate": 2.473384127701175e-05, + "loss": 0.5888, + "step": 569 + }, + { + "epoch": 0.15806988352745424, + "grad_norm": 0.2152683287858963, + "learning_rate": 2.4732706706823876e-05, + "loss": 0.5942, + "step": 570 + }, + { + "epoch": 0.15834719911259013, + "grad_norm": 0.20223170518875122, + "learning_rate": 2.4731569749710824e-05, + "loss": 0.5781, + "step": 571 + }, + { + "epoch": 0.158624514697726, + "grad_norm": 0.20824022591114044, + "learning_rate": 2.4730430405894446e-05, + "loss": 0.6404, + "step": 572 + }, + { + "epoch": 0.1589018302828619, + "grad_norm": 0.19907240569591522, + "learning_rate": 2.4729288675597058e-05, + "loss": 0.5983, + "step": 573 + }, + { + "epoch": 0.15917914586799778, + "grad_norm": 0.20674046874046326, + "learning_rate": 2.472814455904144e-05, + "loss": 0.5595, + "step": 574 + }, + { + "epoch": 0.15945646145313366, + "grad_norm": 0.19486385583877563, + "learning_rate": 2.4726998056450833e-05, + "loss": 0.5783, + "step": 575 + }, + { + "epoch": 0.15973377703826955, + "grad_norm": 0.2102123498916626, + "learning_rate": 2.4725849168048965e-05, + "loss": 0.5809, + "step": 576 + }, + { + "epoch": 0.16001109262340543, + "grad_norm": 0.21006052196025848, + "learning_rate": 2.4724697894060005e-05, + "loss": 0.5882, + "step": 577 + }, + { + "epoch": 0.16028840820854132, + "grad_norm": 0.22287555038928986, + "learning_rate": 2.47235442347086e-05, + "loss": 0.6012, + "step": 578 + }, + { + "epoch": 0.1605657237936772, + "grad_norm": 0.20599472522735596, + "learning_rate": 2.4722388190219852e-05, + "loss": 0.5971, + "step": 579 + }, + { + "epoch": 0.16084303937881309, + "grad_norm": 0.21176591515541077, + "learning_rate": 2.4721229760819348e-05, + "loss": 0.5954, + "step": 580 + }, + { + "epoch": 0.16112035496394897, + "grad_norm": 0.24732773005962372, + "learning_rate": 2.4720068946733123e-05, + "loss": 0.5818, + "step": 581 + }, + { + "epoch": 0.16139767054908485, + "grad_norm": 0.20434054732322693, + "learning_rate": 2.4718905748187677e-05, + "loss": 0.5745, + "step": 582 + }, + { + "epoch": 0.16167498613422074, + "grad_norm": 0.20684310793876648, + "learning_rate": 2.4717740165409988e-05, + "loss": 0.5663, + "step": 583 + }, + { + "epoch": 0.16195230171935662, + "grad_norm": 0.2029474377632141, + "learning_rate": 2.471657219862749e-05, + "loss": 0.5855, + "step": 584 + }, + { + "epoch": 0.1622296173044925, + "grad_norm": 0.2033785730600357, + "learning_rate": 2.4715401848068086e-05, + "loss": 0.6119, + "step": 585 + }, + { + "epoch": 0.1625069328896284, + "grad_norm": 0.21371322870254517, + "learning_rate": 2.4714229113960135e-05, + "loss": 0.6022, + "step": 586 + }, + { + "epoch": 0.16278424847476428, + "grad_norm": 0.20918406546115875, + "learning_rate": 2.4713053996532477e-05, + "loss": 0.569, + "step": 587 + }, + { + "epoch": 0.16306156405990016, + "grad_norm": 0.2060522437095642, + "learning_rate": 2.4711876496014407e-05, + "loss": 0.5982, + "step": 588 + }, + { + "epoch": 0.16333887964503604, + "grad_norm": 0.20782527327537537, + "learning_rate": 2.4710696612635688e-05, + "loss": 0.6015, + "step": 589 + }, + { + "epoch": 0.16361619523017193, + "grad_norm": 0.20826764404773712, + "learning_rate": 2.4709514346626536e-05, + "loss": 0.6094, + "step": 590 + }, + { + "epoch": 0.1638935108153078, + "grad_norm": 0.20720824599266052, + "learning_rate": 2.4708329698217652e-05, + "loss": 0.6054, + "step": 591 + }, + { + "epoch": 0.1641708264004437, + "grad_norm": 0.19394385814666748, + "learning_rate": 2.4707142667640193e-05, + "loss": 0.5812, + "step": 592 + }, + { + "epoch": 0.16444814198557958, + "grad_norm": 0.2022271454334259, + "learning_rate": 2.4705953255125777e-05, + "loss": 0.6084, + "step": 593 + }, + { + "epoch": 0.16472545757071547, + "grad_norm": 0.21304059028625488, + "learning_rate": 2.4704761460906488e-05, + "loss": 0.5673, + "step": 594 + }, + { + "epoch": 0.16500277315585135, + "grad_norm": 0.20137831568717957, + "learning_rate": 2.470356728521488e-05, + "loss": 0.5945, + "step": 595 + }, + { + "epoch": 0.16528008874098724, + "grad_norm": 0.20188415050506592, + "learning_rate": 2.470237072828397e-05, + "loss": 0.5849, + "step": 596 + }, + { + "epoch": 0.16555740432612312, + "grad_norm": 0.206806018948555, + "learning_rate": 2.4701171790347233e-05, + "loss": 0.5863, + "step": 597 + }, + { + "epoch": 0.165834719911259, + "grad_norm": 0.2093089371919632, + "learning_rate": 2.4699970471638613e-05, + "loss": 0.601, + "step": 598 + }, + { + "epoch": 0.1661120354963949, + "grad_norm": 0.19595085084438324, + "learning_rate": 2.4698766772392524e-05, + "loss": 0.5993, + "step": 599 + }, + { + "epoch": 0.16638935108153077, + "grad_norm": 0.20450963079929352, + "learning_rate": 2.469756069284384e-05, + "loss": 0.5875, + "step": 600 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.18902625143527985, + "learning_rate": 2.4696352233227894e-05, + "loss": 0.5943, + "step": 601 + }, + { + "epoch": 0.16694398225180254, + "grad_norm": 0.19143831729888916, + "learning_rate": 2.469514139378049e-05, + "loss": 0.5925, + "step": 602 + }, + { + "epoch": 0.16722129783693843, + "grad_norm": 0.20280803740024567, + "learning_rate": 2.46939281747379e-05, + "loss": 0.596, + "step": 603 + }, + { + "epoch": 0.1674986134220743, + "grad_norm": 0.20762760937213898, + "learning_rate": 2.4692712576336848e-05, + "loss": 0.5951, + "step": 604 + }, + { + "epoch": 0.1677759290072102, + "grad_norm": 0.209476500749588, + "learning_rate": 2.4691494598814536e-05, + "loss": 0.5988, + "step": 605 + }, + { + "epoch": 0.16805324459234608, + "grad_norm": 0.23190903663635254, + "learning_rate": 2.4690274242408617e-05, + "loss": 0.5928, + "step": 606 + }, + { + "epoch": 0.16833056017748196, + "grad_norm": 0.20941099524497986, + "learning_rate": 2.4689051507357218e-05, + "loss": 0.6001, + "step": 607 + }, + { + "epoch": 0.16860787576261785, + "grad_norm": 0.20067985355854034, + "learning_rate": 2.468782639389893e-05, + "loss": 0.572, + "step": 608 + }, + { + "epoch": 0.16888519134775373, + "grad_norm": 0.20099548995494843, + "learning_rate": 2.4686598902272793e-05, + "loss": 0.5603, + "step": 609 + }, + { + "epoch": 0.16916250693288962, + "grad_norm": 0.20298384130001068, + "learning_rate": 2.4685369032718343e-05, + "loss": 0.5657, + "step": 610 + }, + { + "epoch": 0.1694398225180255, + "grad_norm": 0.19231431186199188, + "learning_rate": 2.4684136785475544e-05, + "loss": 0.5628, + "step": 611 + }, + { + "epoch": 0.16971713810316139, + "grad_norm": 0.20296776294708252, + "learning_rate": 2.468290216078485e-05, + "loss": 0.6011, + "step": 612 + }, + { + "epoch": 0.16999445368829727, + "grad_norm": 0.19989420473575592, + "learning_rate": 2.468166515888716e-05, + "loss": 0.5876, + "step": 613 + }, + { + "epoch": 0.17027176927343315, + "grad_norm": 0.19636170566082, + "learning_rate": 2.4680425780023852e-05, + "loss": 0.5852, + "step": 614 + }, + { + "epoch": 0.17054908485856904, + "grad_norm": 0.21776345372200012, + "learning_rate": 2.4679184024436757e-05, + "loss": 0.5988, + "step": 615 + }, + { + "epoch": 0.17082640044370492, + "grad_norm": 0.2017216682434082, + "learning_rate": 2.4677939892368183e-05, + "loss": 0.6135, + "step": 616 + }, + { + "epoch": 0.1711037160288408, + "grad_norm": 0.218113973736763, + "learning_rate": 2.4676693384060884e-05, + "loss": 0.5727, + "step": 617 + }, + { + "epoch": 0.1713810316139767, + "grad_norm": 0.2070799022912979, + "learning_rate": 2.4675444499758093e-05, + "loss": 0.6229, + "step": 618 + }, + { + "epoch": 0.17165834719911258, + "grad_norm": 0.211971253156662, + "learning_rate": 2.4674193239703496e-05, + "loss": 0.5909, + "step": 619 + }, + { + "epoch": 0.1719356627842485, + "grad_norm": 0.21103401482105255, + "learning_rate": 2.4672939604141248e-05, + "loss": 0.5805, + "step": 620 + }, + { + "epoch": 0.17221297836938437, + "grad_norm": 0.22362381219863892, + "learning_rate": 2.467168359331597e-05, + "loss": 0.6006, + "step": 621 + }, + { + "epoch": 0.17249029395452026, + "grad_norm": 0.1989423632621765, + "learning_rate": 2.4670425207472737e-05, + "loss": 0.5895, + "step": 622 + }, + { + "epoch": 0.17276760953965614, + "grad_norm": 0.20032650232315063, + "learning_rate": 2.46691644468571e-05, + "loss": 0.5856, + "step": 623 + }, + { + "epoch": 0.17304492512479203, + "grad_norm": 0.21420548856258392, + "learning_rate": 2.466790131171506e-05, + "loss": 0.5835, + "step": 624 + }, + { + "epoch": 0.1733222407099279, + "grad_norm": 0.2181633710861206, + "learning_rate": 2.466663580229309e-05, + "loss": 0.5608, + "step": 625 + }, + { + "epoch": 0.1735995562950638, + "grad_norm": 0.19224753975868225, + "learning_rate": 2.4665367918838135e-05, + "loss": 0.5826, + "step": 626 + }, + { + "epoch": 0.17387687188019968, + "grad_norm": 0.20331954956054688, + "learning_rate": 2.4664097661597576e-05, + "loss": 0.5948, + "step": 627 + }, + { + "epoch": 0.17415418746533556, + "grad_norm": 0.20249582827091217, + "learning_rate": 2.4662825030819282e-05, + "loss": 0.5894, + "step": 628 + }, + { + "epoch": 0.17443150305047145, + "grad_norm": 0.19642974436283112, + "learning_rate": 2.466155002675158e-05, + "loss": 0.5938, + "step": 629 + }, + { + "epoch": 0.17470881863560733, + "grad_norm": 0.20837126672267914, + "learning_rate": 2.466027264964325e-05, + "loss": 0.589, + "step": 630 + }, + { + "epoch": 0.17498613422074322, + "grad_norm": 0.1986762434244156, + "learning_rate": 2.465899289974355e-05, + "loss": 0.5677, + "step": 631 + }, + { + "epoch": 0.1752634498058791, + "grad_norm": 0.47144341468811035, + "learning_rate": 2.4657710777302183e-05, + "loss": 0.6075, + "step": 632 + }, + { + "epoch": 0.17554076539101499, + "grad_norm": 0.2005637288093567, + "learning_rate": 2.465642628256934e-05, + "loss": 0.5863, + "step": 633 + }, + { + "epoch": 0.17581808097615087, + "grad_norm": 0.2160159796476364, + "learning_rate": 2.465513941579564e-05, + "loss": 0.5661, + "step": 634 + }, + { + "epoch": 0.17609539656128675, + "grad_norm": 0.22309495508670807, + "learning_rate": 2.4653850177232203e-05, + "loss": 0.6029, + "step": 635 + }, + { + "epoch": 0.17637271214642264, + "grad_norm": 0.20880426466464996, + "learning_rate": 2.4652558567130585e-05, + "loss": 0.6039, + "step": 636 + }, + { + "epoch": 0.17665002773155852, + "grad_norm": 0.20371706783771515, + "learning_rate": 2.4651264585742813e-05, + "loss": 0.5974, + "step": 637 + }, + { + "epoch": 0.1769273433166944, + "grad_norm": 0.19604718685150146, + "learning_rate": 2.464996823332138e-05, + "loss": 0.6012, + "step": 638 + }, + { + "epoch": 0.1772046589018303, + "grad_norm": 0.2000109851360321, + "learning_rate": 2.4648669510119235e-05, + "loss": 0.6038, + "step": 639 + }, + { + "epoch": 0.17748197448696618, + "grad_norm": 0.19588269293308258, + "learning_rate": 2.46473684163898e-05, + "loss": 0.5949, + "step": 640 + }, + { + "epoch": 0.17775929007210206, + "grad_norm": 0.2040427029132843, + "learning_rate": 2.4646064952386945e-05, + "loss": 0.5616, + "step": 641 + }, + { + "epoch": 0.17803660565723795, + "grad_norm": 0.2059299200773239, + "learning_rate": 2.4644759118365014e-05, + "loss": 0.5785, + "step": 642 + }, + { + "epoch": 0.17831392124237383, + "grad_norm": 0.1959904134273529, + "learning_rate": 2.464345091457881e-05, + "loss": 0.5691, + "step": 643 + }, + { + "epoch": 0.17859123682750971, + "grad_norm": 0.20045484602451324, + "learning_rate": 2.46421403412836e-05, + "loss": 0.5831, + "step": 644 + }, + { + "epoch": 0.1788685524126456, + "grad_norm": 0.18542559444904327, + "learning_rate": 2.4640827398735105e-05, + "loss": 0.5666, + "step": 645 + }, + { + "epoch": 0.17914586799778148, + "grad_norm": 0.29157590866088867, + "learning_rate": 2.463951208718952e-05, + "loss": 0.5841, + "step": 646 + }, + { + "epoch": 0.17942318358291737, + "grad_norm": 0.20582380890846252, + "learning_rate": 2.46381944069035e-05, + "loss": 0.618, + "step": 647 + }, + { + "epoch": 0.17970049916805325, + "grad_norm": 0.20922648906707764, + "learning_rate": 2.4636874358134153e-05, + "loss": 0.5831, + "step": 648 + }, + { + "epoch": 0.17997781475318914, + "grad_norm": 0.19867978990077972, + "learning_rate": 2.463555194113906e-05, + "loss": 0.5734, + "step": 649 + }, + { + "epoch": 0.18025513033832502, + "grad_norm": 0.19810400903224945, + "learning_rate": 2.463422715617626e-05, + "loss": 0.6086, + "step": 650 + }, + { + "epoch": 0.1805324459234609, + "grad_norm": 0.22697949409484863, + "learning_rate": 2.4632900003504246e-05, + "loss": 0.5942, + "step": 651 + }, + { + "epoch": 0.1808097615085968, + "grad_norm": 0.21418651938438416, + "learning_rate": 2.4631570483381992e-05, + "loss": 0.5793, + "step": 652 + }, + { + "epoch": 0.18108707709373267, + "grad_norm": 0.22276844084262848, + "learning_rate": 2.4630238596068914e-05, + "loss": 0.5998, + "step": 653 + }, + { + "epoch": 0.18136439267886856, + "grad_norm": 0.237818643450737, + "learning_rate": 2.4628904341824898e-05, + "loss": 0.5787, + "step": 654 + }, + { + "epoch": 0.18164170826400444, + "grad_norm": 0.20757392048835754, + "learning_rate": 2.46275677209103e-05, + "loss": 0.5603, + "step": 655 + }, + { + "epoch": 0.18191902384914033, + "grad_norm": 0.20871873199939728, + "learning_rate": 2.4626228733585926e-05, + "loss": 0.5689, + "step": 656 + }, + { + "epoch": 0.1821963394342762, + "grad_norm": 0.2344467043876648, + "learning_rate": 2.4624887380113048e-05, + "loss": 0.5887, + "step": 657 + }, + { + "epoch": 0.1824736550194121, + "grad_norm": 0.20889438688755035, + "learning_rate": 2.4623543660753397e-05, + "loss": 0.5699, + "step": 658 + }, + { + "epoch": 0.18275097060454798, + "grad_norm": 0.23723891377449036, + "learning_rate": 2.4622197575769173e-05, + "loss": 0.5691, + "step": 659 + }, + { + "epoch": 0.18302828618968386, + "grad_norm": 0.20685730874538422, + "learning_rate": 2.462084912542303e-05, + "loss": 0.585, + "step": 660 + }, + { + "epoch": 0.18330560177481975, + "grad_norm": 0.18916456401348114, + "learning_rate": 2.4619498309978085e-05, + "loss": 0.5785, + "step": 661 + }, + { + "epoch": 0.18358291735995563, + "grad_norm": 0.19421158730983734, + "learning_rate": 2.4618145129697916e-05, + "loss": 0.5742, + "step": 662 + }, + { + "epoch": 0.18386023294509152, + "grad_norm": 0.19799606502056122, + "learning_rate": 2.4616789584846575e-05, + "loss": 0.5642, + "step": 663 + }, + { + "epoch": 0.1841375485302274, + "grad_norm": 0.20754088461399078, + "learning_rate": 2.4615431675688556e-05, + "loss": 0.5793, + "step": 664 + }, + { + "epoch": 0.1844148641153633, + "grad_norm": 0.20479615032672882, + "learning_rate": 2.4614071402488822e-05, + "loss": 0.6009, + "step": 665 + }, + { + "epoch": 0.18469217970049917, + "grad_norm": 0.20695596933364868, + "learning_rate": 2.4612708765512803e-05, + "loss": 0.607, + "step": 666 + }, + { + "epoch": 0.18496949528563505, + "grad_norm": 0.21166419982910156, + "learning_rate": 2.4611343765026385e-05, + "loss": 0.5889, + "step": 667 + }, + { + "epoch": 0.18524681087077094, + "grad_norm": 0.19035880267620087, + "learning_rate": 2.4609976401295914e-05, + "loss": 0.5596, + "step": 668 + }, + { + "epoch": 0.18552412645590682, + "grad_norm": 0.2189519852399826, + "learning_rate": 2.4608606674588196e-05, + "loss": 0.595, + "step": 669 + }, + { + "epoch": 0.1858014420410427, + "grad_norm": 0.20535226166248322, + "learning_rate": 2.4607234585170506e-05, + "loss": 0.5785, + "step": 670 + }, + { + "epoch": 0.1860787576261786, + "grad_norm": 0.20723526179790497, + "learning_rate": 2.4605860133310577e-05, + "loss": 0.6205, + "step": 671 + }, + { + "epoch": 0.18635607321131448, + "grad_norm": 0.22765034437179565, + "learning_rate": 2.4604483319276596e-05, + "loss": 0.5739, + "step": 672 + }, + { + "epoch": 0.18663338879645036, + "grad_norm": 0.19783975183963776, + "learning_rate": 2.4603104143337212e-05, + "loss": 0.6001, + "step": 673 + }, + { + "epoch": 0.18691070438158625, + "grad_norm": 0.21098697185516357, + "learning_rate": 2.4601722605761547e-05, + "loss": 0.5636, + "step": 674 + }, + { + "epoch": 0.18718801996672213, + "grad_norm": 0.20571714639663696, + "learning_rate": 2.4600338706819175e-05, + "loss": 0.6031, + "step": 675 + }, + { + "epoch": 0.18746533555185801, + "grad_norm": 0.2101418673992157, + "learning_rate": 2.4598952446780127e-05, + "loss": 0.5854, + "step": 676 + }, + { + "epoch": 0.1877426511369939, + "grad_norm": 0.20562447607517242, + "learning_rate": 2.45975638259149e-05, + "loss": 0.6007, + "step": 677 + }, + { + "epoch": 0.18801996672212978, + "grad_norm": 0.20215946435928345, + "learning_rate": 2.4596172844494454e-05, + "loss": 0.601, + "step": 678 + }, + { + "epoch": 0.18829728230726567, + "grad_norm": 0.2149016410112381, + "learning_rate": 2.45947795027902e-05, + "loss": 0.5965, + "step": 679 + }, + { + "epoch": 0.18857459789240155, + "grad_norm": 0.1985412836074829, + "learning_rate": 2.4593383801074025e-05, + "loss": 0.563, + "step": 680 + }, + { + "epoch": 0.18885191347753744, + "grad_norm": 0.20387370884418488, + "learning_rate": 2.459198573961826e-05, + "loss": 0.5816, + "step": 681 + }, + { + "epoch": 0.18912922906267332, + "grad_norm": 0.19687046110630035, + "learning_rate": 2.4590585318695703e-05, + "loss": 0.5761, + "step": 682 + }, + { + "epoch": 0.1894065446478092, + "grad_norm": 0.2053958773612976, + "learning_rate": 2.458918253857962e-05, + "loss": 0.5931, + "step": 683 + }, + { + "epoch": 0.1896838602329451, + "grad_norm": 0.20455330610275269, + "learning_rate": 2.4587777399543726e-05, + "loss": 0.5739, + "step": 684 + }, + { + "epoch": 0.18996117581808097, + "grad_norm": 0.2084437757730484, + "learning_rate": 2.4586369901862204e-05, + "loss": 0.5659, + "step": 685 + }, + { + "epoch": 0.19023849140321686, + "grad_norm": 0.20842646062374115, + "learning_rate": 2.4584960045809686e-05, + "loss": 0.5863, + "step": 686 + }, + { + "epoch": 0.19051580698835274, + "grad_norm": 0.19156675040721893, + "learning_rate": 2.4583547831661283e-05, + "loss": 0.5738, + "step": 687 + }, + { + "epoch": 0.19079312257348863, + "grad_norm": 0.19893507659435272, + "learning_rate": 2.4582133259692546e-05, + "loss": 0.5739, + "step": 688 + }, + { + "epoch": 0.1910704381586245, + "grad_norm": 0.20070527493953705, + "learning_rate": 2.4580716330179505e-05, + "loss": 0.5703, + "step": 689 + }, + { + "epoch": 0.1913477537437604, + "grad_norm": 0.20454630255699158, + "learning_rate": 2.4579297043398636e-05, + "loss": 0.5735, + "step": 690 + }, + { + "epoch": 0.19162506932889628, + "grad_norm": 0.21206796169281006, + "learning_rate": 2.4577875399626877e-05, + "loss": 0.5852, + "step": 691 + }, + { + "epoch": 0.19190238491403216, + "grad_norm": 0.20151346921920776, + "learning_rate": 2.4576451399141627e-05, + "loss": 0.6033, + "step": 692 + }, + { + "epoch": 0.19217970049916805, + "grad_norm": 0.1981053203344345, + "learning_rate": 2.457502504222075e-05, + "loss": 0.5874, + "step": 693 + }, + { + "epoch": 0.19245701608430393, + "grad_norm": 0.197993203997612, + "learning_rate": 2.457359632914257e-05, + "loss": 0.5918, + "step": 694 + }, + { + "epoch": 0.19273433166943982, + "grad_norm": 0.2022867351770401, + "learning_rate": 2.4572165260185857e-05, + "loss": 0.5786, + "step": 695 + }, + { + "epoch": 0.1930116472545757, + "grad_norm": 0.20423941314220428, + "learning_rate": 2.457073183562986e-05, + "loss": 0.5781, + "step": 696 + }, + { + "epoch": 0.1932889628397116, + "grad_norm": 0.2390977442264557, + "learning_rate": 2.4569296055754275e-05, + "loss": 0.5727, + "step": 697 + }, + { + "epoch": 0.19356627842484747, + "grad_norm": 0.2169281542301178, + "learning_rate": 2.4567857920839256e-05, + "loss": 0.5638, + "step": 698 + }, + { + "epoch": 0.19384359400998336, + "grad_norm": 0.18818534910678864, + "learning_rate": 2.4566417431165427e-05, + "loss": 0.5722, + "step": 699 + }, + { + "epoch": 0.19412090959511924, + "grad_norm": 0.1905328780412674, + "learning_rate": 2.456497458701386e-05, + "loss": 0.5632, + "step": 700 + }, + { + "epoch": 0.19439822518025512, + "grad_norm": 0.19498996436595917, + "learning_rate": 2.45635293886661e-05, + "loss": 0.5953, + "step": 701 + }, + { + "epoch": 0.194675540765391, + "grad_norm": 0.19959990680217743, + "learning_rate": 2.456208183640414e-05, + "loss": 0.583, + "step": 702 + }, + { + "epoch": 0.1949528563505269, + "grad_norm": 0.2034797966480255, + "learning_rate": 2.456063193051043e-05, + "loss": 0.5883, + "step": 703 + }, + { + "epoch": 0.19523017193566278, + "grad_norm": 0.20221780240535736, + "learning_rate": 2.455917967126789e-05, + "loss": 0.5928, + "step": 704 + }, + { + "epoch": 0.19550748752079866, + "grad_norm": 0.21273115277290344, + "learning_rate": 2.4557725058959895e-05, + "loss": 0.5719, + "step": 705 + }, + { + "epoch": 0.19578480310593455, + "grad_norm": 0.1873910129070282, + "learning_rate": 2.455626809387028e-05, + "loss": 0.5642, + "step": 706 + }, + { + "epoch": 0.19606211869107043, + "grad_norm": 0.19268092513084412, + "learning_rate": 2.4554808776283334e-05, + "loss": 0.5555, + "step": 707 + }, + { + "epoch": 0.19633943427620631, + "grad_norm": 0.20540937781333923, + "learning_rate": 2.4553347106483808e-05, + "loss": 0.6076, + "step": 708 + }, + { + "epoch": 0.1966167498613422, + "grad_norm": 0.19650278985500336, + "learning_rate": 2.4551883084756917e-05, + "loss": 0.5866, + "step": 709 + }, + { + "epoch": 0.19689406544647808, + "grad_norm": 0.221206396818161, + "learning_rate": 2.4550416711388327e-05, + "loss": 0.581, + "step": 710 + }, + { + "epoch": 0.19717138103161397, + "grad_norm": 0.19788506627082825, + "learning_rate": 2.4548947986664167e-05, + "loss": 0.5667, + "step": 711 + }, + { + "epoch": 0.19744869661674985, + "grad_norm": 0.22713138163089752, + "learning_rate": 2.454747691087102e-05, + "loss": 0.5732, + "step": 712 + }, + { + "epoch": 0.19772601220188574, + "grad_norm": 0.19035859405994415, + "learning_rate": 2.454600348429594e-05, + "loss": 0.6072, + "step": 713 + }, + { + "epoch": 0.19800332778702162, + "grad_norm": 0.19724468886852264, + "learning_rate": 2.4544527707226428e-05, + "loss": 0.5958, + "step": 714 + }, + { + "epoch": 0.1982806433721575, + "grad_norm": 0.20074382424354553, + "learning_rate": 2.4543049579950445e-05, + "loss": 0.6006, + "step": 715 + }, + { + "epoch": 0.1985579589572934, + "grad_norm": 0.19603832066059113, + "learning_rate": 2.4541569102756414e-05, + "loss": 0.5901, + "step": 716 + }, + { + "epoch": 0.19883527454242927, + "grad_norm": 0.1956927478313446, + "learning_rate": 2.4540086275933215e-05, + "loss": 0.5731, + "step": 717 + }, + { + "epoch": 0.19911259012756516, + "grad_norm": 0.19697944819927216, + "learning_rate": 2.4538601099770187e-05, + "loss": 0.5778, + "step": 718 + }, + { + "epoch": 0.19938990571270104, + "grad_norm": 0.19818982481956482, + "learning_rate": 2.453711357455713e-05, + "loss": 0.5588, + "step": 719 + }, + { + "epoch": 0.19966722129783693, + "grad_norm": 0.18997104465961456, + "learning_rate": 2.4535623700584297e-05, + "loss": 0.5789, + "step": 720 + }, + { + "epoch": 0.1999445368829728, + "grad_norm": 0.21167294681072235, + "learning_rate": 2.4534131478142402e-05, + "loss": 0.5804, + "step": 721 + }, + { + "epoch": 0.2002218524681087, + "grad_norm": 0.21374750137329102, + "learning_rate": 2.4532636907522617e-05, + "loss": 0.5727, + "step": 722 + }, + { + "epoch": 0.20049916805324458, + "grad_norm": 0.20970353484153748, + "learning_rate": 2.453113998901657e-05, + "loss": 0.5766, + "step": 723 + }, + { + "epoch": 0.20077648363838047, + "grad_norm": 0.19419021904468536, + "learning_rate": 2.4529640722916355e-05, + "loss": 0.5755, + "step": 724 + }, + { + "epoch": 0.20105379922351635, + "grad_norm": 0.20143002271652222, + "learning_rate": 2.4528139109514513e-05, + "loss": 0.5627, + "step": 725 + }, + { + "epoch": 0.20133111480865223, + "grad_norm": 0.20216137170791626, + "learning_rate": 2.4526635149104056e-05, + "loss": 0.5771, + "step": 726 + }, + { + "epoch": 0.20160843039378812, + "grad_norm": 0.20338623225688934, + "learning_rate": 2.452512884197844e-05, + "loss": 0.605, + "step": 727 + }, + { + "epoch": 0.201885745978924, + "grad_norm": 0.20078277587890625, + "learning_rate": 2.4523620188431585e-05, + "loss": 0.5954, + "step": 728 + }, + { + "epoch": 0.2021630615640599, + "grad_norm": 0.2046244889497757, + "learning_rate": 2.4522109188757875e-05, + "loss": 0.5945, + "step": 729 + }, + { + "epoch": 0.20244037714919577, + "grad_norm": 0.20462092757225037, + "learning_rate": 2.4520595843252138e-05, + "loss": 0.5762, + "step": 730 + }, + { + "epoch": 0.20271769273433166, + "grad_norm": 0.20749370753765106, + "learning_rate": 2.4519080152209675e-05, + "loss": 0.5435, + "step": 731 + }, + { + "epoch": 0.20299500831946754, + "grad_norm": 0.21734094619750977, + "learning_rate": 2.4517562115926233e-05, + "loss": 0.5961, + "step": 732 + }, + { + "epoch": 0.20327232390460345, + "grad_norm": 0.20754100382328033, + "learning_rate": 2.4516041734698024e-05, + "loss": 0.5548, + "step": 733 + }, + { + "epoch": 0.20354963948973934, + "grad_norm": 0.20619900524616241, + "learning_rate": 2.451451900882172e-05, + "loss": 0.5897, + "step": 734 + }, + { + "epoch": 0.20382695507487522, + "grad_norm": 0.28535279631614685, + "learning_rate": 2.451299393859443e-05, + "loss": 0.5815, + "step": 735 + }, + { + "epoch": 0.2041042706600111, + "grad_norm": 0.19404634833335876, + "learning_rate": 2.4511466524313748e-05, + "loss": 0.5743, + "step": 736 + }, + { + "epoch": 0.204381586245147, + "grad_norm": 0.18895412981510162, + "learning_rate": 2.4509936766277706e-05, + "loss": 0.5876, + "step": 737 + }, + { + "epoch": 0.20465890183028287, + "grad_norm": 0.22120583057403564, + "learning_rate": 2.4508404664784808e-05, + "loss": 0.5873, + "step": 738 + }, + { + "epoch": 0.20493621741541876, + "grad_norm": 0.21361954510211945, + "learning_rate": 2.4506870220134e-05, + "loss": 0.6002, + "step": 739 + }, + { + "epoch": 0.20521353300055464, + "grad_norm": 0.19417433440685272, + "learning_rate": 2.4505333432624694e-05, + "loss": 0.5673, + "step": 740 + }, + { + "epoch": 0.20549084858569053, + "grad_norm": 0.19543422758579254, + "learning_rate": 2.4503794302556765e-05, + "loss": 0.5628, + "step": 741 + }, + { + "epoch": 0.2057681641708264, + "grad_norm": 0.1954490691423416, + "learning_rate": 2.450225283023053e-05, + "loss": 0.6246, + "step": 742 + }, + { + "epoch": 0.2060454797559623, + "grad_norm": 0.1868993192911148, + "learning_rate": 2.4500709015946776e-05, + "loss": 0.5858, + "step": 743 + }, + { + "epoch": 0.20632279534109818, + "grad_norm": 0.2035941481590271, + "learning_rate": 2.449916286000674e-05, + "loss": 0.5974, + "step": 744 + }, + { + "epoch": 0.20660011092623407, + "grad_norm": 0.1918855458498001, + "learning_rate": 2.4497614362712118e-05, + "loss": 0.5843, + "step": 745 + }, + { + "epoch": 0.20687742651136995, + "grad_norm": 0.19244706630706787, + "learning_rate": 2.4496063524365063e-05, + "loss": 0.5535, + "step": 746 + }, + { + "epoch": 0.20715474209650583, + "grad_norm": 0.20424753427505493, + "learning_rate": 2.4494510345268185e-05, + "loss": 0.5835, + "step": 747 + }, + { + "epoch": 0.20743205768164172, + "grad_norm": 0.19604821503162384, + "learning_rate": 2.4492954825724544e-05, + "loss": 0.5701, + "step": 748 + }, + { + "epoch": 0.2077093732667776, + "grad_norm": 0.19546863436698914, + "learning_rate": 2.4491396966037678e-05, + "loss": 0.5898, + "step": 749 + }, + { + "epoch": 0.2079866888519135, + "grad_norm": 0.19861635565757751, + "learning_rate": 2.4489836766511555e-05, + "loss": 0.587, + "step": 750 + }, + { + "epoch": 0.20826400443704937, + "grad_norm": 0.19369752705097198, + "learning_rate": 2.4488274227450613e-05, + "loss": 0.6027, + "step": 751 + }, + { + "epoch": 0.20854132002218526, + "grad_norm": 0.4158318340778351, + "learning_rate": 2.448670934915975e-05, + "loss": 0.602, + "step": 752 + }, + { + "epoch": 0.20881863560732114, + "grad_norm": 0.20547251403331757, + "learning_rate": 2.4485142131944306e-05, + "loss": 0.5949, + "step": 753 + }, + { + "epoch": 0.20909595119245702, + "grad_norm": 0.21317002177238464, + "learning_rate": 2.4483572576110093e-05, + "loss": 0.5862, + "step": 754 + }, + { + "epoch": 0.2093732667775929, + "grad_norm": 0.19712896645069122, + "learning_rate": 2.448200068196337e-05, + "loss": 0.5983, + "step": 755 + }, + { + "epoch": 0.2096505823627288, + "grad_norm": 0.2054811269044876, + "learning_rate": 2.448042644981086e-05, + "loss": 0.5983, + "step": 756 + }, + { + "epoch": 0.20992789794786468, + "grad_norm": 0.19795221090316772, + "learning_rate": 2.447884987995973e-05, + "loss": 0.6208, + "step": 757 + }, + { + "epoch": 0.21020521353300056, + "grad_norm": 0.21395504474639893, + "learning_rate": 2.447727097271762e-05, + "loss": 0.5983, + "step": 758 + }, + { + "epoch": 0.21048252911813645, + "grad_norm": 0.19311439990997314, + "learning_rate": 2.447568972839261e-05, + "loss": 0.592, + "step": 759 + }, + { + "epoch": 0.21075984470327233, + "grad_norm": 0.21382609009742737, + "learning_rate": 2.4474106147293242e-05, + "loss": 0.5752, + "step": 760 + }, + { + "epoch": 0.21103716028840822, + "grad_norm": 0.19354097545146942, + "learning_rate": 2.447252022972852e-05, + "loss": 0.5911, + "step": 761 + }, + { + "epoch": 0.2113144758735441, + "grad_norm": 0.19984754920005798, + "learning_rate": 2.4470931976007894e-05, + "loss": 0.5995, + "step": 762 + }, + { + "epoch": 0.21159179145867998, + "grad_norm": 0.20404407382011414, + "learning_rate": 2.4469341386441274e-05, + "loss": 0.5551, + "step": 763 + }, + { + "epoch": 0.21186910704381587, + "grad_norm": 0.2025006264448166, + "learning_rate": 2.446774846133903e-05, + "loss": 0.6105, + "step": 764 + }, + { + "epoch": 0.21214642262895175, + "grad_norm": 0.20010975003242493, + "learning_rate": 2.446615320101198e-05, + "loss": 0.5788, + "step": 765 + }, + { + "epoch": 0.21242373821408764, + "grad_norm": 0.20225434005260468, + "learning_rate": 2.4464555605771404e-05, + "loss": 0.5636, + "step": 766 + }, + { + "epoch": 0.21270105379922352, + "grad_norm": 0.19845524430274963, + "learning_rate": 2.4462955675929032e-05, + "loss": 0.5758, + "step": 767 + }, + { + "epoch": 0.2129783693843594, + "grad_norm": 0.19598202407360077, + "learning_rate": 2.446135341179706e-05, + "loss": 0.5456, + "step": 768 + }, + { + "epoch": 0.2132556849694953, + "grad_norm": 0.2050497829914093, + "learning_rate": 2.445974881368812e-05, + "loss": 0.5912, + "step": 769 + }, + { + "epoch": 0.21353300055463117, + "grad_norm": 0.19924525916576385, + "learning_rate": 2.4458141881915324e-05, + "loss": 0.5479, + "step": 770 + }, + { + "epoch": 0.21381031613976706, + "grad_norm": 0.20329277217388153, + "learning_rate": 2.445653261679222e-05, + "loss": 0.6006, + "step": 771 + }, + { + "epoch": 0.21408763172490294, + "grad_norm": 0.19327110052108765, + "learning_rate": 2.4454921018632827e-05, + "loss": 0.5739, + "step": 772 + }, + { + "epoch": 0.21436494731003883, + "grad_norm": 0.19316452741622925, + "learning_rate": 2.4453307087751594e-05, + "loss": 0.5953, + "step": 773 + }, + { + "epoch": 0.2146422628951747, + "grad_norm": 0.19617030024528503, + "learning_rate": 2.4451690824463457e-05, + "loss": 0.5686, + "step": 774 + }, + { + "epoch": 0.2149195784803106, + "grad_norm": 0.20528316497802734, + "learning_rate": 2.4450072229083786e-05, + "loss": 0.5691, + "step": 775 + }, + { + "epoch": 0.21519689406544648, + "grad_norm": 0.20544420182704926, + "learning_rate": 2.4448451301928408e-05, + "loss": 0.5776, + "step": 776 + }, + { + "epoch": 0.21547420965058237, + "grad_norm": 0.21979959309101105, + "learning_rate": 2.4446828043313614e-05, + "loss": 0.5947, + "step": 777 + }, + { + "epoch": 0.21575152523571825, + "grad_norm": 0.2081802487373352, + "learning_rate": 2.4445202453556145e-05, + "loss": 0.5752, + "step": 778 + }, + { + "epoch": 0.21602884082085413, + "grad_norm": 0.2012367993593216, + "learning_rate": 2.4443574532973195e-05, + "loss": 0.5671, + "step": 779 + }, + { + "epoch": 0.21630615640599002, + "grad_norm": 0.1998508721590042, + "learning_rate": 2.4441944281882415e-05, + "loss": 0.6154, + "step": 780 + }, + { + "epoch": 0.2165834719911259, + "grad_norm": 0.20325055718421936, + "learning_rate": 2.444031170060191e-05, + "loss": 0.5743, + "step": 781 + }, + { + "epoch": 0.2168607875762618, + "grad_norm": 0.20255804061889648, + "learning_rate": 2.443867678945024e-05, + "loss": 0.5748, + "step": 782 + }, + { + "epoch": 0.21713810316139767, + "grad_norm": 0.1919908970594406, + "learning_rate": 2.4437039548746415e-05, + "loss": 0.5709, + "step": 783 + }, + { + "epoch": 0.21741541874653356, + "grad_norm": 0.20014292001724243, + "learning_rate": 2.443539997880991e-05, + "loss": 0.5636, + "step": 784 + }, + { + "epoch": 0.21769273433166944, + "grad_norm": 0.19818776845932007, + "learning_rate": 2.4433758079960647e-05, + "loss": 0.5649, + "step": 785 + }, + { + "epoch": 0.21797004991680533, + "grad_norm": 0.18718703091144562, + "learning_rate": 2.4432113852519005e-05, + "loss": 0.5917, + "step": 786 + }, + { + "epoch": 0.2182473655019412, + "grad_norm": 0.32280299067497253, + "learning_rate": 2.4430467296805816e-05, + "loss": 0.5864, + "step": 787 + }, + { + "epoch": 0.2185246810870771, + "grad_norm": 0.20851466059684753, + "learning_rate": 2.442881841314236e-05, + "loss": 0.5837, + "step": 788 + }, + { + "epoch": 0.21880199667221298, + "grad_norm": 0.1917923539876938, + "learning_rate": 2.442716720185039e-05, + "loss": 0.6032, + "step": 789 + }, + { + "epoch": 0.21907931225734886, + "grad_norm": 0.25185203552246094, + "learning_rate": 2.442551366325209e-05, + "loss": 0.5873, + "step": 790 + }, + { + "epoch": 0.21935662784248475, + "grad_norm": 0.1962638646364212, + "learning_rate": 2.4423857797670118e-05, + "loss": 0.577, + "step": 791 + }, + { + "epoch": 0.21963394342762063, + "grad_norm": 0.19475746154785156, + "learning_rate": 2.4422199605427572e-05, + "loss": 0.5677, + "step": 792 + }, + { + "epoch": 0.21991125901275652, + "grad_norm": 0.517663836479187, + "learning_rate": 2.4420539086848007e-05, + "loss": 0.5718, + "step": 793 + }, + { + "epoch": 0.2201885745978924, + "grad_norm": 0.1952415555715561, + "learning_rate": 2.441887624225544e-05, + "loss": 0.5801, + "step": 794 + }, + { + "epoch": 0.22046589018302828, + "grad_norm": 0.20376408100128174, + "learning_rate": 2.441721107197433e-05, + "loss": 0.6038, + "step": 795 + }, + { + "epoch": 0.22074320576816417, + "grad_norm": 0.21492497622966766, + "learning_rate": 2.4415543576329604e-05, + "loss": 0.5626, + "step": 796 + }, + { + "epoch": 0.22102052135330005, + "grad_norm": 0.21810825169086456, + "learning_rate": 2.4413873755646627e-05, + "loss": 0.566, + "step": 797 + }, + { + "epoch": 0.22129783693843594, + "grad_norm": 0.2002691626548767, + "learning_rate": 2.4412201610251232e-05, + "loss": 0.5706, + "step": 798 + }, + { + "epoch": 0.22157515252357182, + "grad_norm": 0.24929803609848022, + "learning_rate": 2.441052714046969e-05, + "loss": 0.5878, + "step": 799 + }, + { + "epoch": 0.2218524681087077, + "grad_norm": 0.20125854015350342, + "learning_rate": 2.440885034662874e-05, + "loss": 0.5869, + "step": 800 + }, + { + "epoch": 0.2221297836938436, + "grad_norm": 0.19206437468528748, + "learning_rate": 2.4407171229055574e-05, + "loss": 0.5911, + "step": 801 + }, + { + "epoch": 0.22240709927897948, + "grad_norm": 0.22323836386203766, + "learning_rate": 2.4405489788077823e-05, + "loss": 0.5725, + "step": 802 + }, + { + "epoch": 0.22268441486411536, + "grad_norm": 0.2044333517551422, + "learning_rate": 2.4403806024023584e-05, + "loss": 0.6243, + "step": 803 + }, + { + "epoch": 0.22296173044925124, + "grad_norm": 0.20750725269317627, + "learning_rate": 2.44021199372214e-05, + "loss": 0.6002, + "step": 804 + }, + { + "epoch": 0.22323904603438713, + "grad_norm": 0.2218470573425293, + "learning_rate": 2.4400431528000284e-05, + "loss": 0.5886, + "step": 805 + }, + { + "epoch": 0.223516361619523, + "grad_norm": 0.20135878026485443, + "learning_rate": 2.4398740796689676e-05, + "loss": 0.5711, + "step": 806 + }, + { + "epoch": 0.2237936772046589, + "grad_norm": 0.19776999950408936, + "learning_rate": 2.439704774361949e-05, + "loss": 0.5706, + "step": 807 + }, + { + "epoch": 0.22407099278979478, + "grad_norm": 0.22162839770317078, + "learning_rate": 2.4395352369120078e-05, + "loss": 0.593, + "step": 808 + }, + { + "epoch": 0.22434830837493067, + "grad_norm": 0.1974382847547531, + "learning_rate": 2.4393654673522264e-05, + "loss": 0.5657, + "step": 809 + }, + { + "epoch": 0.22462562396006655, + "grad_norm": 0.2362552434206009, + "learning_rate": 2.4391954657157302e-05, + "loss": 0.5844, + "step": 810 + }, + { + "epoch": 0.22490293954520243, + "grad_norm": 0.2056739628314972, + "learning_rate": 2.4390252320356915e-05, + "loss": 0.5959, + "step": 811 + }, + { + "epoch": 0.22518025513033832, + "grad_norm": 0.19037006795406342, + "learning_rate": 2.4388547663453275e-05, + "loss": 0.5638, + "step": 812 + }, + { + "epoch": 0.2254575707154742, + "grad_norm": 0.21129223704338074, + "learning_rate": 2.4386840686779004e-05, + "loss": 0.5954, + "step": 813 + }, + { + "epoch": 0.2257348863006101, + "grad_norm": 0.2718643546104431, + "learning_rate": 2.4385131390667184e-05, + "loss": 0.5881, + "step": 814 + }, + { + "epoch": 0.22601220188574597, + "grad_norm": 0.2181466519832611, + "learning_rate": 2.4383419775451334e-05, + "loss": 0.5858, + "step": 815 + }, + { + "epoch": 0.22628951747088186, + "grad_norm": 0.22737659513950348, + "learning_rate": 2.438170584146544e-05, + "loss": 0.5732, + "step": 816 + }, + { + "epoch": 0.22656683305601774, + "grad_norm": 0.20686711370944977, + "learning_rate": 2.437998958904394e-05, + "loss": 0.5524, + "step": 817 + }, + { + "epoch": 0.22684414864115363, + "grad_norm": 0.2140977680683136, + "learning_rate": 2.4378271018521714e-05, + "loss": 0.5628, + "step": 818 + }, + { + "epoch": 0.2271214642262895, + "grad_norm": 0.19793452322483063, + "learning_rate": 2.4376550130234104e-05, + "loss": 0.5796, + "step": 819 + }, + { + "epoch": 0.2273987798114254, + "grad_norm": 0.20289914309978485, + "learning_rate": 2.4374826924516903e-05, + "loss": 0.5848, + "step": 820 + }, + { + "epoch": 0.22767609539656128, + "grad_norm": 0.19081373512744904, + "learning_rate": 2.437310140170635e-05, + "loss": 0.6011, + "step": 821 + }, + { + "epoch": 0.22795341098169716, + "grad_norm": 0.20546457171440125, + "learning_rate": 2.437137356213914e-05, + "loss": 0.5476, + "step": 822 + }, + { + "epoch": 0.22823072656683305, + "grad_norm": 0.22531366348266602, + "learning_rate": 2.4369643406152422e-05, + "loss": 0.577, + "step": 823 + }, + { + "epoch": 0.22850804215196893, + "grad_norm": 0.1964918076992035, + "learning_rate": 2.4367910934083795e-05, + "loss": 0.5733, + "step": 824 + }, + { + "epoch": 0.22878535773710482, + "grad_norm": 0.1976742297410965, + "learning_rate": 2.4366176146271313e-05, + "loss": 0.576, + "step": 825 + }, + { + "epoch": 0.2290626733222407, + "grad_norm": 0.20134706795215607, + "learning_rate": 2.4364439043053475e-05, + "loss": 0.5715, + "step": 826 + }, + { + "epoch": 0.22933998890737659, + "grad_norm": 0.28538307547569275, + "learning_rate": 2.4362699624769236e-05, + "loss": 0.6009, + "step": 827 + }, + { + "epoch": 0.22961730449251247, + "grad_norm": 0.21845568716526031, + "learning_rate": 2.4360957891758006e-05, + "loss": 0.5936, + "step": 828 + }, + { + "epoch": 0.22989462007764835, + "grad_norm": 0.1977756768465042, + "learning_rate": 2.435921384435964e-05, + "loss": 0.5651, + "step": 829 + }, + { + "epoch": 0.23017193566278424, + "grad_norm": 0.20483648777008057, + "learning_rate": 2.4357467482914447e-05, + "loss": 0.5861, + "step": 830 + }, + { + "epoch": 0.23044925124792012, + "grad_norm": 0.191145658493042, + "learning_rate": 2.4355718807763196e-05, + "loss": 0.5951, + "step": 831 + }, + { + "epoch": 0.230726566833056, + "grad_norm": 0.20611602067947388, + "learning_rate": 2.4353967819247093e-05, + "loss": 0.5762, + "step": 832 + }, + { + "epoch": 0.2310038824181919, + "grad_norm": 0.34163740277290344, + "learning_rate": 2.43522145177078e-05, + "loss": 0.5573, + "step": 833 + }, + { + "epoch": 0.23128119800332778, + "grad_norm": 0.1989511400461197, + "learning_rate": 2.4350458903487438e-05, + "loss": 0.5799, + "step": 834 + }, + { + "epoch": 0.23155851358846366, + "grad_norm": 0.1951713114976883, + "learning_rate": 2.434870097692857e-05, + "loss": 0.5763, + "step": 835 + }, + { + "epoch": 0.23183582917359954, + "grad_norm": 0.19492702186107635, + "learning_rate": 2.4346940738374217e-05, + "loss": 0.5751, + "step": 836 + }, + { + "epoch": 0.23211314475873543, + "grad_norm": 0.20524460077285767, + "learning_rate": 2.434517818816785e-05, + "loss": 0.5959, + "step": 837 + }, + { + "epoch": 0.2323904603438713, + "grad_norm": 0.19833968579769135, + "learning_rate": 2.4343413326653384e-05, + "loss": 0.5881, + "step": 838 + }, + { + "epoch": 0.2326677759290072, + "grad_norm": 0.19568949937820435, + "learning_rate": 2.4341646154175192e-05, + "loss": 0.5449, + "step": 839 + }, + { + "epoch": 0.23294509151414308, + "grad_norm": 0.19156986474990845, + "learning_rate": 2.43398766710781e-05, + "loss": 0.5703, + "step": 840 + }, + { + "epoch": 0.23322240709927897, + "grad_norm": 0.20440654456615448, + "learning_rate": 2.4338104877707372e-05, + "loss": 0.5783, + "step": 841 + }, + { + "epoch": 0.23349972268441485, + "grad_norm": 0.20396758615970612, + "learning_rate": 2.4336330774408744e-05, + "loss": 0.5911, + "step": 842 + }, + { + "epoch": 0.23377703826955074, + "grad_norm": 0.19637946784496307, + "learning_rate": 2.4334554361528376e-05, + "loss": 0.5708, + "step": 843 + }, + { + "epoch": 0.23405435385468662, + "grad_norm": 0.21237438917160034, + "learning_rate": 2.433277563941291e-05, + "loss": 0.6104, + "step": 844 + }, + { + "epoch": 0.2343316694398225, + "grad_norm": 0.18440033495426178, + "learning_rate": 2.433099460840941e-05, + "loss": 0.5745, + "step": 845 + }, + { + "epoch": 0.23460898502495842, + "grad_norm": 0.19301645457744598, + "learning_rate": 2.4329211268865406e-05, + "loss": 0.5621, + "step": 846 + }, + { + "epoch": 0.2348863006100943, + "grad_norm": 0.2056163102388382, + "learning_rate": 2.4327425621128873e-05, + "loss": 0.5973, + "step": 847 + }, + { + "epoch": 0.23516361619523019, + "grad_norm": 0.20398737490177155, + "learning_rate": 2.432563766554824e-05, + "loss": 0.5795, + "step": 848 + }, + { + "epoch": 0.23544093178036607, + "grad_norm": 0.19015717506408691, + "learning_rate": 2.432384740247239e-05, + "loss": 0.5563, + "step": 849 + }, + { + "epoch": 0.23571824736550195, + "grad_norm": 0.19576147198677063, + "learning_rate": 2.4322054832250636e-05, + "loss": 0.5757, + "step": 850 + }, + { + "epoch": 0.23599556295063784, + "grad_norm": 0.1978127360343933, + "learning_rate": 2.4320259955232773e-05, + "loss": 0.5832, + "step": 851 + }, + { + "epoch": 0.23627287853577372, + "grad_norm": 0.19191214442253113, + "learning_rate": 2.4318462771769012e-05, + "loss": 0.5812, + "step": 852 + }, + { + "epoch": 0.2365501941209096, + "grad_norm": 0.18717870116233826, + "learning_rate": 2.4316663282210046e-05, + "loss": 0.5753, + "step": 853 + }, + { + "epoch": 0.2368275097060455, + "grad_norm": 0.18739578127861023, + "learning_rate": 2.4314861486906996e-05, + "loss": 0.5665, + "step": 854 + }, + { + "epoch": 0.23710482529118138, + "grad_norm": 0.1928299367427826, + "learning_rate": 2.431305738621144e-05, + "loss": 0.5836, + "step": 855 + }, + { + "epoch": 0.23738214087631726, + "grad_norm": 0.20048747956752777, + "learning_rate": 2.4311250980475408e-05, + "loss": 0.5673, + "step": 856 + }, + { + "epoch": 0.23765945646145314, + "grad_norm": 0.2141515463590622, + "learning_rate": 2.4309442270051376e-05, + "loss": 0.5426, + "step": 857 + }, + { + "epoch": 0.23793677204658903, + "grad_norm": 0.19637706875801086, + "learning_rate": 2.4307631255292273e-05, + "loss": 0.5542, + "step": 858 + }, + { + "epoch": 0.2382140876317249, + "grad_norm": 0.21245527267456055, + "learning_rate": 2.4305817936551472e-05, + "loss": 0.5867, + "step": 859 + }, + { + "epoch": 0.2384914032168608, + "grad_norm": 0.19474704563617706, + "learning_rate": 2.4304002314182804e-05, + "loss": 0.5716, + "step": 860 + }, + { + "epoch": 0.23876871880199668, + "grad_norm": 0.19590826332569122, + "learning_rate": 2.4302184388540544e-05, + "loss": 0.5746, + "step": 861 + }, + { + "epoch": 0.23904603438713257, + "grad_norm": 0.21298062801361084, + "learning_rate": 2.4300364159979418e-05, + "loss": 0.5788, + "step": 862 + }, + { + "epoch": 0.23932334997226845, + "grad_norm": 0.21597878634929657, + "learning_rate": 2.4298541628854597e-05, + "loss": 0.5868, + "step": 863 + }, + { + "epoch": 0.23960066555740434, + "grad_norm": 0.2077784389257431, + "learning_rate": 2.4296716795521707e-05, + "loss": 0.5878, + "step": 864 + }, + { + "epoch": 0.23987798114254022, + "grad_norm": 0.1982557773590088, + "learning_rate": 2.4294889660336823e-05, + "loss": 0.5734, + "step": 865 + }, + { + "epoch": 0.2401552967276761, + "grad_norm": 0.19443267583847046, + "learning_rate": 2.4293060223656465e-05, + "loss": 0.5845, + "step": 866 + }, + { + "epoch": 0.240432612312812, + "grad_norm": 0.2007235586643219, + "learning_rate": 2.4291228485837613e-05, + "loss": 0.5829, + "step": 867 + }, + { + "epoch": 0.24070992789794787, + "grad_norm": 0.18996436893939972, + "learning_rate": 2.4289394447237674e-05, + "loss": 0.5918, + "step": 868 + }, + { + "epoch": 0.24098724348308376, + "grad_norm": 0.19309580326080322, + "learning_rate": 2.4287558108214527e-05, + "loss": 0.5962, + "step": 869 + }, + { + "epoch": 0.24126455906821964, + "grad_norm": 0.19195586442947388, + "learning_rate": 2.428571946912649e-05, + "loss": 0.5738, + "step": 870 + }, + { + "epoch": 0.24154187465335553, + "grad_norm": 0.19671426713466644, + "learning_rate": 2.4283878530332322e-05, + "loss": 0.5551, + "step": 871 + }, + { + "epoch": 0.2418191902384914, + "grad_norm": 0.19544430077075958, + "learning_rate": 2.4282035292191247e-05, + "loss": 0.5876, + "step": 872 + }, + { + "epoch": 0.2420965058236273, + "grad_norm": 0.19258378446102142, + "learning_rate": 2.4280189755062928e-05, + "loss": 0.559, + "step": 873 + }, + { + "epoch": 0.24237382140876318, + "grad_norm": 0.21361422538757324, + "learning_rate": 2.427834191930748e-05, + "loss": 0.5753, + "step": 874 + }, + { + "epoch": 0.24265113699389906, + "grad_norm": 0.20279040932655334, + "learning_rate": 2.4276491785285457e-05, + "loss": 0.5724, + "step": 875 + }, + { + "epoch": 0.24292845257903495, + "grad_norm": 0.20467157661914825, + "learning_rate": 2.427463935335788e-05, + "loss": 0.5989, + "step": 876 + }, + { + "epoch": 0.24320576816417083, + "grad_norm": 0.20144453644752502, + "learning_rate": 2.4272784623886195e-05, + "loss": 0.5943, + "step": 877 + }, + { + "epoch": 0.24348308374930672, + "grad_norm": 0.19686299562454224, + "learning_rate": 2.4270927597232325e-05, + "loss": 0.5692, + "step": 878 + }, + { + "epoch": 0.2437603993344426, + "grad_norm": 0.22722774744033813, + "learning_rate": 2.426906827375861e-05, + "loss": 0.566, + "step": 879 + }, + { + "epoch": 0.24403771491957849, + "grad_norm": 0.19703295826911926, + "learning_rate": 2.4267206653827856e-05, + "loss": 0.5627, + "step": 880 + }, + { + "epoch": 0.24431503050471437, + "grad_norm": 0.2020971179008484, + "learning_rate": 2.4265342737803327e-05, + "loss": 0.5836, + "step": 881 + }, + { + "epoch": 0.24459234608985025, + "grad_norm": 0.1921062171459198, + "learning_rate": 2.4263476526048707e-05, + "loss": 0.5651, + "step": 882 + }, + { + "epoch": 0.24486966167498614, + "grad_norm": 0.20176348090171814, + "learning_rate": 2.4261608018928147e-05, + "loss": 0.5883, + "step": 883 + }, + { + "epoch": 0.24514697726012202, + "grad_norm": 0.19450893998146057, + "learning_rate": 2.425973721680625e-05, + "loss": 0.5591, + "step": 884 + }, + { + "epoch": 0.2454242928452579, + "grad_norm": 0.8580565452575684, + "learning_rate": 2.425786412004805e-05, + "loss": 0.5592, + "step": 885 + }, + { + "epoch": 0.2457016084303938, + "grad_norm": 0.24260735511779785, + "learning_rate": 2.4255988729019042e-05, + "loss": 0.5902, + "step": 886 + }, + { + "epoch": 0.24597892401552968, + "grad_norm": 0.19789321720600128, + "learning_rate": 2.4254111044085163e-05, + "loss": 0.5745, + "step": 887 + }, + { + "epoch": 0.24625623960066556, + "grad_norm": 0.21979939937591553, + "learning_rate": 2.4252231065612805e-05, + "loss": 0.5551, + "step": 888 + }, + { + "epoch": 0.24653355518580145, + "grad_norm": 0.20352937281131744, + "learning_rate": 2.425034879396879e-05, + "loss": 0.5591, + "step": 889 + }, + { + "epoch": 0.24681087077093733, + "grad_norm": 0.2068743109703064, + "learning_rate": 2.424846422952041e-05, + "loss": 0.5848, + "step": 890 + }, + { + "epoch": 0.24708818635607321, + "grad_norm": 0.19195155799388885, + "learning_rate": 2.4246577372635387e-05, + "loss": 0.5621, + "step": 891 + }, + { + "epoch": 0.2473655019412091, + "grad_norm": 0.2014138102531433, + "learning_rate": 2.42446882236819e-05, + "loss": 0.5689, + "step": 892 + }, + { + "epoch": 0.24764281752634498, + "grad_norm": 0.2104417085647583, + "learning_rate": 2.4242796783028573e-05, + "loss": 0.5898, + "step": 893 + }, + { + "epoch": 0.24792013311148087, + "grad_norm": 0.22085507214069366, + "learning_rate": 2.4240903051044474e-05, + "loss": 0.5762, + "step": 894 + }, + { + "epoch": 0.24819744869661675, + "grad_norm": 0.20655465126037598, + "learning_rate": 2.4239007028099117e-05, + "loss": 0.5654, + "step": 895 + }, + { + "epoch": 0.24847476428175264, + "grad_norm": 0.2050492912530899, + "learning_rate": 2.4237108714562474e-05, + "loss": 0.5859, + "step": 896 + }, + { + "epoch": 0.24875207986688852, + "grad_norm": 0.21355165541172028, + "learning_rate": 2.4235208110804947e-05, + "loss": 0.5695, + "step": 897 + }, + { + "epoch": 0.2490293954520244, + "grad_norm": 0.20924112200737, + "learning_rate": 2.42333052171974e-05, + "loss": 0.5593, + "step": 898 + }, + { + "epoch": 0.2493067110371603, + "grad_norm": 0.22572918236255646, + "learning_rate": 2.423140003411114e-05, + "loss": 0.6217, + "step": 899 + }, + { + "epoch": 0.24958402662229617, + "grad_norm": 0.2063211351633072, + "learning_rate": 2.4229492561917914e-05, + "loss": 0.5765, + "step": 900 + }, + { + "epoch": 0.24986134220743206, + "grad_norm": 0.21796129643917084, + "learning_rate": 2.4227582800989923e-05, + "loss": 0.5932, + "step": 901 + }, + { + "epoch": 0.25013865779256794, + "grad_norm": 0.20169825851917267, + "learning_rate": 2.4225670751699808e-05, + "loss": 0.5858, + "step": 902 + }, + { + "epoch": 0.2504159733777038, + "grad_norm": 0.2097984254360199, + "learning_rate": 2.4223756414420668e-05, + "loss": 0.5888, + "step": 903 + }, + { + "epoch": 0.2506932889628397, + "grad_norm": 0.2516496479511261, + "learning_rate": 2.4221839789526033e-05, + "loss": 0.5812, + "step": 904 + }, + { + "epoch": 0.2509706045479756, + "grad_norm": 0.2110574096441269, + "learning_rate": 2.421992087738989e-05, + "loss": 0.5859, + "step": 905 + }, + { + "epoch": 0.2512479201331115, + "grad_norm": 0.1935090869665146, + "learning_rate": 2.4217999678386673e-05, + "loss": 0.5768, + "step": 906 + }, + { + "epoch": 0.25152523571824736, + "grad_norm": 0.20760700106620789, + "learning_rate": 2.4216076192891257e-05, + "loss": 0.5856, + "step": 907 + }, + { + "epoch": 0.25180255130338325, + "grad_norm": 0.20197226107120514, + "learning_rate": 2.4214150421278964e-05, + "loss": 0.6041, + "step": 908 + }, + { + "epoch": 0.25207986688851913, + "grad_norm": 0.20894859731197357, + "learning_rate": 2.4212222363925563e-05, + "loss": 0.5821, + "step": 909 + }, + { + "epoch": 0.252357182473655, + "grad_norm": 0.20673821866512299, + "learning_rate": 2.4210292021207268e-05, + "loss": 0.5813, + "step": 910 + }, + { + "epoch": 0.2526344980587909, + "grad_norm": 0.23159608244895935, + "learning_rate": 2.420835939350074e-05, + "loss": 0.5803, + "step": 911 + }, + { + "epoch": 0.2529118136439268, + "grad_norm": 0.19740775227546692, + "learning_rate": 2.420642448118309e-05, + "loss": 0.5721, + "step": 912 + }, + { + "epoch": 0.25318912922906267, + "grad_norm": 0.20442472398281097, + "learning_rate": 2.4204487284631866e-05, + "loss": 0.5681, + "step": 913 + }, + { + "epoch": 0.25346644481419855, + "grad_norm": 0.19745062291622162, + "learning_rate": 2.420254780422507e-05, + "loss": 0.573, + "step": 914 + }, + { + "epoch": 0.25374376039933444, + "grad_norm": 0.19819265604019165, + "learning_rate": 2.4200606040341147e-05, + "loss": 0.6015, + "step": 915 + }, + { + "epoch": 0.2540210759844703, + "grad_norm": 0.20158332586288452, + "learning_rate": 2.4198661993358976e-05, + "loss": 0.5557, + "step": 916 + }, + { + "epoch": 0.2542983915696062, + "grad_norm": 0.19300688803195953, + "learning_rate": 2.4196715663657903e-05, + "loss": 0.5691, + "step": 917 + }, + { + "epoch": 0.2545757071547421, + "grad_norm": 0.19178220629692078, + "learning_rate": 2.4194767051617707e-05, + "loss": 0.5877, + "step": 918 + }, + { + "epoch": 0.254853022739878, + "grad_norm": 0.19551022350788116, + "learning_rate": 2.4192816157618615e-05, + "loss": 0.5719, + "step": 919 + }, + { + "epoch": 0.25513033832501386, + "grad_norm": 0.2008076012134552, + "learning_rate": 2.419086298204129e-05, + "loss": 0.579, + "step": 920 + }, + { + "epoch": 0.25540765391014975, + "grad_norm": 0.19526442885398865, + "learning_rate": 2.4188907525266856e-05, + "loss": 0.546, + "step": 921 + }, + { + "epoch": 0.25568496949528563, + "grad_norm": 0.1844739466905594, + "learning_rate": 2.418694978767687e-05, + "loss": 0.5759, + "step": 922 + }, + { + "epoch": 0.2559622850804215, + "grad_norm": 0.20165039598941803, + "learning_rate": 2.4184989769653343e-05, + "loss": 0.5618, + "step": 923 + }, + { + "epoch": 0.2562396006655574, + "grad_norm": 0.19225285947322845, + "learning_rate": 2.418302747157872e-05, + "loss": 0.5627, + "step": 924 + }, + { + "epoch": 0.2565169162506933, + "grad_norm": 0.19688788056373596, + "learning_rate": 2.418106289383591e-05, + "loss": 0.5714, + "step": 925 + }, + { + "epoch": 0.25679423183582917, + "grad_norm": 0.2039179801940918, + "learning_rate": 2.417909603680824e-05, + "loss": 0.5975, + "step": 926 + }, + { + "epoch": 0.25707154742096505, + "grad_norm": 0.21120522916316986, + "learning_rate": 2.41771269008795e-05, + "loss": 0.6026, + "step": 927 + }, + { + "epoch": 0.25734886300610094, + "grad_norm": 0.21704575419425964, + "learning_rate": 2.4175155486433927e-05, + "loss": 0.5798, + "step": 928 + }, + { + "epoch": 0.2576261785912368, + "grad_norm": 0.18682295083999634, + "learning_rate": 2.4173181793856187e-05, + "loss": 0.5872, + "step": 929 + }, + { + "epoch": 0.2579034941763727, + "grad_norm": 0.200609490275383, + "learning_rate": 2.4171205823531402e-05, + "loss": 0.5568, + "step": 930 + }, + { + "epoch": 0.2581808097615086, + "grad_norm": 0.1996905356645584, + "learning_rate": 2.416922757584514e-05, + "loss": 0.5965, + "step": 931 + }, + { + "epoch": 0.2584581253466445, + "grad_norm": 0.1988278329372406, + "learning_rate": 2.4167247051183412e-05, + "loss": 0.5819, + "step": 932 + }, + { + "epoch": 0.25873544093178036, + "grad_norm": 0.20553693175315857, + "learning_rate": 2.4165264249932662e-05, + "loss": 0.591, + "step": 933 + }, + { + "epoch": 0.25901275651691624, + "grad_norm": 0.19097572565078735, + "learning_rate": 2.416327917247979e-05, + "loss": 0.5517, + "step": 934 + }, + { + "epoch": 0.2592900721020521, + "grad_norm": 0.191832035779953, + "learning_rate": 2.4161291819212144e-05, + "loss": 0.5811, + "step": 935 + }, + { + "epoch": 0.259567387687188, + "grad_norm": 0.19829009473323822, + "learning_rate": 2.4159302190517496e-05, + "loss": 0.5966, + "step": 936 + }, + { + "epoch": 0.2598447032723239, + "grad_norm": 0.1965586543083191, + "learning_rate": 2.415731028678409e-05, + "loss": 0.5458, + "step": 937 + }, + { + "epoch": 0.2601220188574598, + "grad_norm": 0.20748619735240936, + "learning_rate": 2.4155316108400593e-05, + "loss": 0.5843, + "step": 938 + }, + { + "epoch": 0.26039933444259566, + "grad_norm": 0.20656223595142365, + "learning_rate": 2.415331965575612e-05, + "loss": 0.5749, + "step": 939 + }, + { + "epoch": 0.26067665002773155, + "grad_norm": 0.186055988073349, + "learning_rate": 2.4151320929240227e-05, + "loss": 0.5749, + "step": 940 + }, + { + "epoch": 0.26095396561286743, + "grad_norm": 0.21665525436401367, + "learning_rate": 2.4149319929242934e-05, + "loss": 0.5825, + "step": 941 + }, + { + "epoch": 0.2612312811980033, + "grad_norm": 0.2007439136505127, + "learning_rate": 2.4147316656154674e-05, + "loss": 0.5904, + "step": 942 + }, + { + "epoch": 0.2615085967831392, + "grad_norm": 0.20290826261043549, + "learning_rate": 2.4145311110366347e-05, + "loss": 0.6019, + "step": 943 + }, + { + "epoch": 0.2617859123682751, + "grad_norm": 0.20062971115112305, + "learning_rate": 2.4143303292269286e-05, + "loss": 0.5571, + "step": 944 + }, + { + "epoch": 0.26206322795341097, + "grad_norm": 0.2056947946548462, + "learning_rate": 2.414129320225527e-05, + "loss": 0.5683, + "step": 945 + }, + { + "epoch": 0.26234054353854686, + "grad_norm": 0.18966248631477356, + "learning_rate": 2.4139280840716517e-05, + "loss": 0.5962, + "step": 946 + }, + { + "epoch": 0.26261785912368274, + "grad_norm": 0.20114421844482422, + "learning_rate": 2.41372662080457e-05, + "loss": 0.5528, + "step": 947 + }, + { + "epoch": 0.2628951747088186, + "grad_norm": 0.1974896341562271, + "learning_rate": 2.4135249304635914e-05, + "loss": 0.5802, + "step": 948 + }, + { + "epoch": 0.2631724902939545, + "grad_norm": 0.1997508406639099, + "learning_rate": 2.4133230130880726e-05, + "loss": 0.6122, + "step": 949 + }, + { + "epoch": 0.2634498058790904, + "grad_norm": 0.18641312420368195, + "learning_rate": 2.413120868717412e-05, + "loss": 0.5705, + "step": 950 + }, + { + "epoch": 0.2637271214642263, + "grad_norm": 0.19226068258285522, + "learning_rate": 2.4129184973910533e-05, + "loss": 0.5737, + "step": 951 + }, + { + "epoch": 0.26400443704936216, + "grad_norm": 0.18802867829799652, + "learning_rate": 2.4127158991484855e-05, + "loss": 0.562, + "step": 952 + }, + { + "epoch": 0.26428175263449805, + "grad_norm": 0.19342157244682312, + "learning_rate": 2.41251307402924e-05, + "loss": 0.5483, + "step": 953 + }, + { + "epoch": 0.26455906821963393, + "grad_norm": 0.1865961253643036, + "learning_rate": 2.4123100220728935e-05, + "loss": 0.5638, + "step": 954 + }, + { + "epoch": 0.2648363838047698, + "grad_norm": 0.22456099092960358, + "learning_rate": 2.4121067433190666e-05, + "loss": 0.5918, + "step": 955 + }, + { + "epoch": 0.2651136993899057, + "grad_norm": 0.24317651987075806, + "learning_rate": 2.4119032378074245e-05, + "loss": 0.5477, + "step": 956 + }, + { + "epoch": 0.2653910149750416, + "grad_norm": 0.19641970098018646, + "learning_rate": 2.411699505577677e-05, + "loss": 0.5554, + "step": 957 + }, + { + "epoch": 0.26566833056017747, + "grad_norm": 0.20111410319805145, + "learning_rate": 2.4114955466695773e-05, + "loss": 0.562, + "step": 958 + }, + { + "epoch": 0.26594564614531335, + "grad_norm": 0.19789332151412964, + "learning_rate": 2.411291361122923e-05, + "loss": 0.5919, + "step": 959 + }, + { + "epoch": 0.26622296173044924, + "grad_norm": 0.20293276011943817, + "learning_rate": 2.4110869489775567e-05, + "loss": 0.5839, + "step": 960 + }, + { + "epoch": 0.2665002773155851, + "grad_norm": 0.18560869991779327, + "learning_rate": 2.410882310273364e-05, + "loss": 0.5476, + "step": 961 + }, + { + "epoch": 0.266777592900721, + "grad_norm": 0.1932571828365326, + "learning_rate": 2.410677445050276e-05, + "loss": 0.5736, + "step": 962 + }, + { + "epoch": 0.2670549084858569, + "grad_norm": 0.21673519909381866, + "learning_rate": 2.4104723533482664e-05, + "loss": 0.5558, + "step": 963 + }, + { + "epoch": 0.2673322240709928, + "grad_norm": 0.21258150041103363, + "learning_rate": 2.4102670352073548e-05, + "loss": 0.591, + "step": 964 + }, + { + "epoch": 0.26760953965612866, + "grad_norm": 0.20553314685821533, + "learning_rate": 2.4100614906676036e-05, + "loss": 0.5901, + "step": 965 + }, + { + "epoch": 0.26788685524126454, + "grad_norm": 0.23216207325458527, + "learning_rate": 2.4098557197691204e-05, + "loss": 0.5677, + "step": 966 + }, + { + "epoch": 0.2681641708264004, + "grad_norm": 0.20041371881961823, + "learning_rate": 2.4096497225520564e-05, + "loss": 0.566, + "step": 967 + }, + { + "epoch": 0.2684414864115363, + "grad_norm": 0.20803777873516083, + "learning_rate": 2.4094434990566076e-05, + "loss": 0.5714, + "step": 968 + }, + { + "epoch": 0.2687188019966722, + "grad_norm": 0.19973017275333405, + "learning_rate": 2.409237049323013e-05, + "loss": 0.5805, + "step": 969 + }, + { + "epoch": 0.2689961175818081, + "grad_norm": 0.20199733972549438, + "learning_rate": 2.4090303733915567e-05, + "loss": 0.5475, + "step": 970 + }, + { + "epoch": 0.26927343316694397, + "grad_norm": 0.2112300544977188, + "learning_rate": 2.4088234713025664e-05, + "loss": 0.5821, + "step": 971 + }, + { + "epoch": 0.26955074875207985, + "grad_norm": 0.18687258660793304, + "learning_rate": 2.408616343096415e-05, + "loss": 0.5369, + "step": 972 + }, + { + "epoch": 0.26982806433721573, + "grad_norm": 0.20027992129325867, + "learning_rate": 2.4084089888135176e-05, + "loss": 0.551, + "step": 973 + }, + { + "epoch": 0.2701053799223516, + "grad_norm": 0.1915608048439026, + "learning_rate": 2.408201408494335e-05, + "loss": 0.5653, + "step": 974 + }, + { + "epoch": 0.2703826955074875, + "grad_norm": 0.2044133096933365, + "learning_rate": 2.407993602179372e-05, + "loss": 0.597, + "step": 975 + }, + { + "epoch": 0.2706600110926234, + "grad_norm": 0.20056426525115967, + "learning_rate": 2.4077855699091764e-05, + "loss": 0.5864, + "step": 976 + }, + { + "epoch": 0.27093732667775927, + "grad_norm": 0.19527383148670197, + "learning_rate": 2.407577311724341e-05, + "loss": 0.565, + "step": 977 + }, + { + "epoch": 0.27121464226289516, + "grad_norm": 0.2120949625968933, + "learning_rate": 2.407368827665503e-05, + "loss": 0.5621, + "step": 978 + }, + { + "epoch": 0.27149195784803104, + "grad_norm": 0.18631702661514282, + "learning_rate": 2.407160117773343e-05, + "loss": 0.5657, + "step": 979 + }, + { + "epoch": 0.2717692734331669, + "grad_norm": 0.19784550368785858, + "learning_rate": 2.4069511820885854e-05, + "loss": 0.5547, + "step": 980 + }, + { + "epoch": 0.2720465890183028, + "grad_norm": 0.19714047014713287, + "learning_rate": 2.4067420206519993e-05, + "loss": 0.5793, + "step": 981 + }, + { + "epoch": 0.2723239046034387, + "grad_norm": 0.2170424610376358, + "learning_rate": 2.4065326335043976e-05, + "loss": 0.5745, + "step": 982 + }, + { + "epoch": 0.2726012201885746, + "grad_norm": 0.20346680283546448, + "learning_rate": 2.4063230206866377e-05, + "loss": 0.5828, + "step": 983 + }, + { + "epoch": 0.27287853577371046, + "grad_norm": 0.2216998040676117, + "learning_rate": 2.40611318223962e-05, + "loss": 0.5762, + "step": 984 + }, + { + "epoch": 0.27315585135884635, + "grad_norm": 0.20975996553897858, + "learning_rate": 2.4059031182042897e-05, + "loss": 0.5442, + "step": 985 + }, + { + "epoch": 0.27343316694398223, + "grad_norm": 0.19896754622459412, + "learning_rate": 2.405692828621636e-05, + "loss": 0.5977, + "step": 986 + }, + { + "epoch": 0.2737104825291181, + "grad_norm": 0.18813903629779816, + "learning_rate": 2.4054823135326922e-05, + "loss": 0.5641, + "step": 987 + }, + { + "epoch": 0.273987798114254, + "grad_norm": 0.19095094501972198, + "learning_rate": 2.4052715729785348e-05, + "loss": 0.5427, + "step": 988 + }, + { + "epoch": 0.2742651136993899, + "grad_norm": 0.19185671210289001, + "learning_rate": 2.405060607000285e-05, + "loss": 0.557, + "step": 989 + }, + { + "epoch": 0.27454242928452577, + "grad_norm": 0.19244584441184998, + "learning_rate": 2.4048494156391087e-05, + "loss": 0.5558, + "step": 990 + }, + { + "epoch": 0.27481974486966165, + "grad_norm": 0.20083992183208466, + "learning_rate": 2.404637998936214e-05, + "loss": 0.5635, + "step": 991 + }, + { + "epoch": 0.27509706045479754, + "grad_norm": 0.19767695665359497, + "learning_rate": 2.404426356932854e-05, + "loss": 0.5814, + "step": 992 + }, + { + "epoch": 0.2753743760399334, + "grad_norm": 0.19133426249027252, + "learning_rate": 2.4042144896703256e-05, + "loss": 0.5951, + "step": 993 + }, + { + "epoch": 0.2756516916250693, + "grad_norm": 0.19364149868488312, + "learning_rate": 2.40400239718997e-05, + "loss": 0.5695, + "step": 994 + }, + { + "epoch": 0.2759290072102052, + "grad_norm": 0.19669091701507568, + "learning_rate": 2.4037900795331722e-05, + "loss": 0.5801, + "step": 995 + }, + { + "epoch": 0.2762063227953411, + "grad_norm": 0.2011607140302658, + "learning_rate": 2.403577536741361e-05, + "loss": 0.5736, + "step": 996 + }, + { + "epoch": 0.27648363838047696, + "grad_norm": 0.19536298513412476, + "learning_rate": 2.4033647688560084e-05, + "loss": 0.5404, + "step": 997 + }, + { + "epoch": 0.27676095396561284, + "grad_norm": 0.1903197318315506, + "learning_rate": 2.403151775918632e-05, + "loss": 0.5939, + "step": 998 + }, + { + "epoch": 0.27703826955074873, + "grad_norm": 0.19172310829162598, + "learning_rate": 2.4029385579707916e-05, + "loss": 0.5688, + "step": 999 + }, + { + "epoch": 0.2773155851358846, + "grad_norm": 0.22239771485328674, + "learning_rate": 2.402725115054092e-05, + "loss": 0.5754, + "step": 1000 + }, + { + "epoch": 0.2775929007210205, + "grad_norm": 0.18384471535682678, + "learning_rate": 2.402511447210182e-05, + "loss": 0.5668, + "step": 1001 + }, + { + "epoch": 0.2778702163061564, + "grad_norm": 0.2017565667629242, + "learning_rate": 2.402297554480753e-05, + "loss": 0.5711, + "step": 1002 + }, + { + "epoch": 0.27814753189129227, + "grad_norm": 0.1890055239200592, + "learning_rate": 2.402083436907542e-05, + "loss": 0.5552, + "step": 1003 + }, + { + "epoch": 0.27842484747642815, + "grad_norm": 0.1961050033569336, + "learning_rate": 2.4018690945323284e-05, + "loss": 0.5744, + "step": 1004 + }, + { + "epoch": 0.27870216306156403, + "grad_norm": 0.2047930508852005, + "learning_rate": 2.401654527396936e-05, + "loss": 0.5794, + "step": 1005 + }, + { + "epoch": 0.2789794786466999, + "grad_norm": 0.19990523159503937, + "learning_rate": 2.4014397355432335e-05, + "loss": 0.6046, + "step": 1006 + }, + { + "epoch": 0.2792567942318358, + "grad_norm": 0.19778995215892792, + "learning_rate": 2.401224719013131e-05, + "loss": 0.5621, + "step": 1007 + }, + { + "epoch": 0.2795341098169717, + "grad_norm": 0.1909160017967224, + "learning_rate": 2.4010094778485846e-05, + "loss": 0.5943, + "step": 1008 + }, + { + "epoch": 0.27981142540210757, + "grad_norm": 0.20408110320568085, + "learning_rate": 2.4007940120915946e-05, + "loss": 0.5991, + "step": 1009 + }, + { + "epoch": 0.28008874098724346, + "grad_norm": 0.20102624595165253, + "learning_rate": 2.4005783217842024e-05, + "loss": 0.6022, + "step": 1010 + }, + { + "epoch": 0.28036605657237934, + "grad_norm": 0.1910308301448822, + "learning_rate": 2.4003624069684957e-05, + "loss": 0.5874, + "step": 1011 + }, + { + "epoch": 0.2806433721575152, + "grad_norm": 0.189175084233284, + "learning_rate": 2.4001462676866054e-05, + "loss": 0.5698, + "step": 1012 + }, + { + "epoch": 0.2809206877426511, + "grad_norm": 0.19003014266490936, + "learning_rate": 2.3999299039807055e-05, + "loss": 0.5819, + "step": 1013 + }, + { + "epoch": 0.281198003327787, + "grad_norm": 0.2033187299966812, + "learning_rate": 2.3997133158930145e-05, + "loss": 0.5979, + "step": 1014 + }, + { + "epoch": 0.28147531891292293, + "grad_norm": 0.1880473643541336, + "learning_rate": 2.3994965034657946e-05, + "loss": 0.5472, + "step": 1015 + }, + { + "epoch": 0.2817526344980588, + "grad_norm": 0.19091346859931946, + "learning_rate": 2.3992794667413514e-05, + "loss": 0.5698, + "step": 1016 + }, + { + "epoch": 0.2820299500831947, + "grad_norm": 0.19986368715763092, + "learning_rate": 2.399062205762035e-05, + "loss": 0.5956, + "step": 1017 + }, + { + "epoch": 0.2823072656683306, + "grad_norm": 0.19207067787647247, + "learning_rate": 2.398844720570238e-05, + "loss": 0.549, + "step": 1018 + }, + { + "epoch": 0.28258458125346647, + "grad_norm": 0.19408905506134033, + "learning_rate": 2.398627011208398e-05, + "loss": 0.5938, + "step": 1019 + }, + { + "epoch": 0.28286189683860236, + "grad_norm": 0.20354047417640686, + "learning_rate": 2.398409077718996e-05, + "loss": 0.5488, + "step": 1020 + }, + { + "epoch": 0.28313921242373824, + "grad_norm": 0.19870953261852264, + "learning_rate": 2.3981909201445563e-05, + "loss": 0.565, + "step": 1021 + }, + { + "epoch": 0.2834165280088741, + "grad_norm": 0.19812439382076263, + "learning_rate": 2.3979725385276475e-05, + "loss": 0.5455, + "step": 1022 + }, + { + "epoch": 0.28369384359401, + "grad_norm": 0.1886581927537918, + "learning_rate": 2.3977539329108813e-05, + "loss": 0.5458, + "step": 1023 + }, + { + "epoch": 0.2839711591791459, + "grad_norm": 0.1987125426530838, + "learning_rate": 2.3975351033369138e-05, + "loss": 0.5669, + "step": 1024 + }, + { + "epoch": 0.2842484747642818, + "grad_norm": 0.22528746724128723, + "learning_rate": 2.397316049848444e-05, + "loss": 0.579, + "step": 1025 + }, + { + "epoch": 0.28452579034941766, + "grad_norm": 0.20103305578231812, + "learning_rate": 2.3970967724882154e-05, + "loss": 0.5843, + "step": 1026 + }, + { + "epoch": 0.28480310593455355, + "grad_norm": 0.21639437973499298, + "learning_rate": 2.396877271299015e-05, + "loss": 0.5709, + "step": 1027 + }, + { + "epoch": 0.28508042151968943, + "grad_norm": 0.24263976514339447, + "learning_rate": 2.3966575463236725e-05, + "loss": 0.5926, + "step": 1028 + }, + { + "epoch": 0.2853577371048253, + "grad_norm": 0.19815106689929962, + "learning_rate": 2.396437597605063e-05, + "loss": 0.5722, + "step": 1029 + }, + { + "epoch": 0.2856350526899612, + "grad_norm": 0.19260184466838837, + "learning_rate": 2.396217425186104e-05, + "loss": 0.5621, + "step": 1030 + }, + { + "epoch": 0.2859123682750971, + "grad_norm": 0.20668724179267883, + "learning_rate": 2.3959970291097566e-05, + "loss": 0.5924, + "step": 1031 + }, + { + "epoch": 0.28618968386023297, + "grad_norm": 0.2019844353199005, + "learning_rate": 2.3957764094190265e-05, + "loss": 0.5821, + "step": 1032 + }, + { + "epoch": 0.28646699944536885, + "grad_norm": 0.1919315904378891, + "learning_rate": 2.3955555661569617e-05, + "loss": 0.5985, + "step": 1033 + }, + { + "epoch": 0.28674431503050474, + "grad_norm": 0.19516989588737488, + "learning_rate": 2.3953344993666555e-05, + "loss": 0.5993, + "step": 1034 + }, + { + "epoch": 0.2870216306156406, + "grad_norm": 0.18881164491176605, + "learning_rate": 2.3951132090912432e-05, + "loss": 0.5911, + "step": 1035 + }, + { + "epoch": 0.2872989462007765, + "grad_norm": 0.20282314717769623, + "learning_rate": 2.3948916953739045e-05, + "loss": 0.5939, + "step": 1036 + }, + { + "epoch": 0.2875762617859124, + "grad_norm": 0.19400392472743988, + "learning_rate": 2.394669958257863e-05, + "loss": 0.5993, + "step": 1037 + }, + { + "epoch": 0.2878535773710483, + "grad_norm": 0.21767151355743408, + "learning_rate": 2.3944479977863847e-05, + "loss": 0.5796, + "step": 1038 + }, + { + "epoch": 0.28813089295618416, + "grad_norm": 0.21109527349472046, + "learning_rate": 2.3942258140027805e-05, + "loss": 0.5678, + "step": 1039 + }, + { + "epoch": 0.28840820854132004, + "grad_norm": 0.19805195927619934, + "learning_rate": 2.3940034069504048e-05, + "loss": 0.5645, + "step": 1040 + }, + { + "epoch": 0.28868552412645593, + "grad_norm": 0.18621553480625153, + "learning_rate": 2.3937807766726545e-05, + "loss": 0.5639, + "step": 1041 + }, + { + "epoch": 0.2889628397115918, + "grad_norm": 0.19867920875549316, + "learning_rate": 2.3935579232129705e-05, + "loss": 0.578, + "step": 1042 + }, + { + "epoch": 0.2892401552967277, + "grad_norm": 0.1965349018573761, + "learning_rate": 2.393334846614838e-05, + "loss": 0.5895, + "step": 1043 + }, + { + "epoch": 0.2895174708818636, + "grad_norm": 0.21346546709537506, + "learning_rate": 2.3931115469217848e-05, + "loss": 0.5592, + "step": 1044 + }, + { + "epoch": 0.28979478646699947, + "grad_norm": 0.19675497710704803, + "learning_rate": 2.392888024177382e-05, + "loss": 0.5709, + "step": 1045 + }, + { + "epoch": 0.29007210205213535, + "grad_norm": 0.18906576931476593, + "learning_rate": 2.392664278425246e-05, + "loss": 0.5721, + "step": 1046 + }, + { + "epoch": 0.29034941763727123, + "grad_norm": 0.1987171620130539, + "learning_rate": 2.3924403097090348e-05, + "loss": 0.5796, + "step": 1047 + }, + { + "epoch": 0.2906267332224071, + "grad_norm": 0.20017191767692566, + "learning_rate": 2.392216118072451e-05, + "loss": 0.5803, + "step": 1048 + }, + { + "epoch": 0.290904048807543, + "grad_norm": 0.1904933750629425, + "learning_rate": 2.39199170355924e-05, + "loss": 0.5847, + "step": 1049 + }, + { + "epoch": 0.2911813643926789, + "grad_norm": 0.20761634409427643, + "learning_rate": 2.3917670662131914e-05, + "loss": 0.5678, + "step": 1050 + }, + { + "epoch": 0.29145867997781477, + "grad_norm": 0.1986733376979828, + "learning_rate": 2.391542206078137e-05, + "loss": 0.5886, + "step": 1051 + }, + { + "epoch": 0.29173599556295066, + "grad_norm": 0.2128080427646637, + "learning_rate": 2.3913171231979543e-05, + "loss": 0.5649, + "step": 1052 + }, + { + "epoch": 0.29201331114808654, + "grad_norm": 0.1908857524394989, + "learning_rate": 2.391091817616562e-05, + "loss": 0.5951, + "step": 1053 + }, + { + "epoch": 0.2922906267332224, + "grad_norm": 0.19354763627052307, + "learning_rate": 2.3908662893779228e-05, + "loss": 0.5878, + "step": 1054 + }, + { + "epoch": 0.2925679423183583, + "grad_norm": 0.19191789627075195, + "learning_rate": 2.3906405385260443e-05, + "loss": 0.5842, + "step": 1055 + }, + { + "epoch": 0.2928452579034942, + "grad_norm": 0.19070293009281158, + "learning_rate": 2.3904145651049764e-05, + "loss": 0.5707, + "step": 1056 + }, + { + "epoch": 0.2931225734886301, + "grad_norm": 0.20022979378700256, + "learning_rate": 2.3901883691588116e-05, + "loss": 0.5838, + "step": 1057 + }, + { + "epoch": 0.29339988907376596, + "grad_norm": 0.18544460833072662, + "learning_rate": 2.3899619507316878e-05, + "loss": 0.5732, + "step": 1058 + }, + { + "epoch": 0.29367720465890185, + "grad_norm": 0.19487732648849487, + "learning_rate": 2.3897353098677845e-05, + "loss": 0.5681, + "step": 1059 + }, + { + "epoch": 0.29395452024403773, + "grad_norm": 0.19073253870010376, + "learning_rate": 2.3895084466113253e-05, + "loss": 0.5638, + "step": 1060 + }, + { + "epoch": 0.2942318358291736, + "grad_norm": 0.19357110559940338, + "learning_rate": 2.3892813610065778e-05, + "loss": 0.5617, + "step": 1061 + }, + { + "epoch": 0.2945091514143095, + "grad_norm": 0.19819645583629608, + "learning_rate": 2.3890540530978518e-05, + "loss": 0.5601, + "step": 1062 + }, + { + "epoch": 0.2947864669994454, + "grad_norm": 0.20150421559810638, + "learning_rate": 2.3888265229295014e-05, + "loss": 0.5675, + "step": 1063 + }, + { + "epoch": 0.29506378258458127, + "grad_norm": 0.19517284631729126, + "learning_rate": 2.388598770545924e-05, + "loss": 0.5516, + "step": 1064 + }, + { + "epoch": 0.29534109816971715, + "grad_norm": 0.2073058784008026, + "learning_rate": 2.3883707959915594e-05, + "loss": 0.5729, + "step": 1065 + }, + { + "epoch": 0.29561841375485304, + "grad_norm": 0.190653994679451, + "learning_rate": 2.3881425993108922e-05, + "loss": 0.5932, + "step": 1066 + }, + { + "epoch": 0.2958957293399889, + "grad_norm": 0.19685585796833038, + "learning_rate": 2.3879141805484492e-05, + "loss": 0.5579, + "step": 1067 + }, + { + "epoch": 0.2961730449251248, + "grad_norm": 0.19154155254364014, + "learning_rate": 2.3876855397488014e-05, + "loss": 0.5622, + "step": 1068 + }, + { + "epoch": 0.2964503605102607, + "grad_norm": 0.23048245906829834, + "learning_rate": 2.387456676956562e-05, + "loss": 0.5951, + "step": 1069 + }, + { + "epoch": 0.2967276760953966, + "grad_norm": 0.2001733034849167, + "learning_rate": 2.387227592216389e-05, + "loss": 0.5723, + "step": 1070 + }, + { + "epoch": 0.29700499168053246, + "grad_norm": 0.20289377868175507, + "learning_rate": 2.3869982855729822e-05, + "loss": 0.6023, + "step": 1071 + }, + { + "epoch": 0.29728230726566834, + "grad_norm": 0.18517981469631195, + "learning_rate": 2.386768757071086e-05, + "loss": 0.6075, + "step": 1072 + }, + { + "epoch": 0.29755962285080423, + "grad_norm": 0.18674753606319427, + "learning_rate": 2.3865390067554865e-05, + "loss": 0.5725, + "step": 1073 + }, + { + "epoch": 0.2978369384359401, + "grad_norm": 0.19436419010162354, + "learning_rate": 2.3863090346710153e-05, + "loss": 0.595, + "step": 1074 + }, + { + "epoch": 0.298114254021076, + "grad_norm": 0.1953345537185669, + "learning_rate": 2.3860788408625456e-05, + "loss": 0.5845, + "step": 1075 + }, + { + "epoch": 0.2983915696062119, + "grad_norm": 0.23673701286315918, + "learning_rate": 2.385848425374994e-05, + "loss": 0.5716, + "step": 1076 + }, + { + "epoch": 0.29866888519134777, + "grad_norm": 0.20462685823440552, + "learning_rate": 2.385617788253321e-05, + "loss": 0.5486, + "step": 1077 + }, + { + "epoch": 0.29894620077648365, + "grad_norm": 0.19857102632522583, + "learning_rate": 2.3853869295425296e-05, + "loss": 0.5744, + "step": 1078 + }, + { + "epoch": 0.29922351636161953, + "grad_norm": 0.21950219571590424, + "learning_rate": 2.385155849287667e-05, + "loss": 0.5644, + "step": 1079 + }, + { + "epoch": 0.2995008319467554, + "grad_norm": 0.25962188839912415, + "learning_rate": 2.384924547533823e-05, + "loss": 0.5826, + "step": 1080 + }, + { + "epoch": 0.2997781475318913, + "grad_norm": 0.21206709742546082, + "learning_rate": 2.3846930243261302e-05, + "loss": 0.5726, + "step": 1081 + }, + { + "epoch": 0.3000554631170272, + "grad_norm": 0.20144885778427124, + "learning_rate": 2.384461279709765e-05, + "loss": 0.5736, + "step": 1082 + }, + { + "epoch": 0.3003327787021631, + "grad_norm": 0.19098758697509766, + "learning_rate": 2.3842293137299475e-05, + "loss": 0.57, + "step": 1083 + }, + { + "epoch": 0.30061009428729896, + "grad_norm": 0.20457082986831665, + "learning_rate": 2.38399712643194e-05, + "loss": 0.5527, + "step": 1084 + }, + { + "epoch": 0.30088740987243484, + "grad_norm": 0.22459912300109863, + "learning_rate": 2.3837647178610482e-05, + "loss": 0.5907, + "step": 1085 + }, + { + "epoch": 0.3011647254575707, + "grad_norm": 0.2092091143131256, + "learning_rate": 2.3835320880626216e-05, + "loss": 0.5796, + "step": 1086 + }, + { + "epoch": 0.3014420410427066, + "grad_norm": 0.18621404469013214, + "learning_rate": 2.3832992370820523e-05, + "loss": 0.6085, + "step": 1087 + }, + { + "epoch": 0.3017193566278425, + "grad_norm": 0.1938057541847229, + "learning_rate": 2.3830661649647757e-05, + "loss": 0.5642, + "step": 1088 + }, + { + "epoch": 0.3019966722129784, + "grad_norm": 0.2136821299791336, + "learning_rate": 2.3828328717562704e-05, + "loss": 0.5621, + "step": 1089 + }, + { + "epoch": 0.30227398779811426, + "grad_norm": 0.200357124209404, + "learning_rate": 2.3825993575020577e-05, + "loss": 0.5728, + "step": 1090 + }, + { + "epoch": 0.30255130338325015, + "grad_norm": 0.19953665137290955, + "learning_rate": 2.382365622247703e-05, + "loss": 0.5677, + "step": 1091 + }, + { + "epoch": 0.30282861896838603, + "grad_norm": 0.19972378015518188, + "learning_rate": 2.382131666038814e-05, + "loss": 0.5623, + "step": 1092 + }, + { + "epoch": 0.3031059345535219, + "grad_norm": 0.21744661033153534, + "learning_rate": 2.381897488921041e-05, + "loss": 0.5508, + "step": 1093 + }, + { + "epoch": 0.3033832501386578, + "grad_norm": 0.18918602168560028, + "learning_rate": 2.3816630909400793e-05, + "loss": 0.5805, + "step": 1094 + }, + { + "epoch": 0.3036605657237937, + "grad_norm": 0.1894434541463852, + "learning_rate": 2.3814284721416656e-05, + "loss": 0.563, + "step": 1095 + }, + { + "epoch": 0.30393788130892957, + "grad_norm": 0.19141651690006256, + "learning_rate": 2.3811936325715807e-05, + "loss": 0.5839, + "step": 1096 + }, + { + "epoch": 0.30421519689406545, + "grad_norm": 0.1895507425069809, + "learning_rate": 2.3809585722756472e-05, + "loss": 0.5864, + "step": 1097 + }, + { + "epoch": 0.30449251247920134, + "grad_norm": 0.21037045121192932, + "learning_rate": 2.3807232912997324e-05, + "loss": 0.5806, + "step": 1098 + }, + { + "epoch": 0.3047698280643372, + "grad_norm": 0.20355623960494995, + "learning_rate": 2.3804877896897455e-05, + "loss": 0.5677, + "step": 1099 + }, + { + "epoch": 0.3050471436494731, + "grad_norm": 0.20115728676319122, + "learning_rate": 2.380252067491639e-05, + "loss": 0.5635, + "step": 1100 + }, + { + "epoch": 0.305324459234609, + "grad_norm": 0.19460982084274292, + "learning_rate": 2.3800161247514086e-05, + "loss": 0.5774, + "step": 1101 + }, + { + "epoch": 0.3056017748197449, + "grad_norm": 0.18929602205753326, + "learning_rate": 2.3797799615150934e-05, + "loss": 0.5708, + "step": 1102 + }, + { + "epoch": 0.30587909040488076, + "grad_norm": 0.20288972556591034, + "learning_rate": 2.3795435778287745e-05, + "loss": 0.5852, + "step": 1103 + }, + { + "epoch": 0.30615640599001664, + "grad_norm": 0.20964893698692322, + "learning_rate": 2.379306973738577e-05, + "loss": 0.5731, + "step": 1104 + }, + { + "epoch": 0.30643372157515253, + "grad_norm": 0.2250620722770691, + "learning_rate": 2.379070149290668e-05, + "loss": 0.5741, + "step": 1105 + }, + { + "epoch": 0.3067110371602884, + "grad_norm": 0.19434000551700592, + "learning_rate": 2.3788331045312592e-05, + "loss": 0.5949, + "step": 1106 + }, + { + "epoch": 0.3069883527454243, + "grad_norm": 0.1980692446231842, + "learning_rate": 2.3785958395066037e-05, + "loss": 0.5591, + "step": 1107 + }, + { + "epoch": 0.3072656683305602, + "grad_norm": 0.20279406011104584, + "learning_rate": 2.3783583542629984e-05, + "loss": 0.5925, + "step": 1108 + }, + { + "epoch": 0.30754298391569607, + "grad_norm": 0.19802772998809814, + "learning_rate": 2.378120648846783e-05, + "loss": 0.5756, + "step": 1109 + }, + { + "epoch": 0.30782029950083195, + "grad_norm": 0.19455523788928986, + "learning_rate": 2.37788272330434e-05, + "loss": 0.5846, + "step": 1110 + }, + { + "epoch": 0.30809761508596784, + "grad_norm": 0.19493591785430908, + "learning_rate": 2.3776445776820948e-05, + "loss": 0.5788, + "step": 1111 + }, + { + "epoch": 0.3083749306711037, + "grad_norm": 0.19869014620780945, + "learning_rate": 2.3774062120265163e-05, + "loss": 0.5836, + "step": 1112 + }, + { + "epoch": 0.3086522462562396, + "grad_norm": 0.19301962852478027, + "learning_rate": 2.3771676263841157e-05, + "loss": 0.5689, + "step": 1113 + }, + { + "epoch": 0.3089295618413755, + "grad_norm": 0.2125353217124939, + "learning_rate": 2.3769288208014473e-05, + "loss": 0.579, + "step": 1114 + }, + { + "epoch": 0.3092068774265114, + "grad_norm": 0.20093408226966858, + "learning_rate": 2.376689795325109e-05, + "loss": 0.5489, + "step": 1115 + }, + { + "epoch": 0.30948419301164726, + "grad_norm": 0.20457975566387177, + "learning_rate": 2.37645055000174e-05, + "loss": 0.5734, + "step": 1116 + }, + { + "epoch": 0.30976150859678314, + "grad_norm": 0.2070866972208023, + "learning_rate": 2.376211084878024e-05, + "loss": 0.5711, + "step": 1117 + }, + { + "epoch": 0.310038824181919, + "grad_norm": 0.19698989391326904, + "learning_rate": 2.375971400000687e-05, + "loss": 0.5781, + "step": 1118 + }, + { + "epoch": 0.3103161397670549, + "grad_norm": 0.19639791548252106, + "learning_rate": 2.3757314954164982e-05, + "loss": 0.5496, + "step": 1119 + }, + { + "epoch": 0.3105934553521908, + "grad_norm": 0.19890196621418, + "learning_rate": 2.3754913711722687e-05, + "loss": 0.5658, + "step": 1120 + }, + { + "epoch": 0.3108707709373267, + "grad_norm": 0.2007942795753479, + "learning_rate": 2.3752510273148533e-05, + "loss": 0.5679, + "step": 1121 + }, + { + "epoch": 0.31114808652246256, + "grad_norm": 0.21742717921733856, + "learning_rate": 2.3750104638911493e-05, + "loss": 0.5603, + "step": 1122 + }, + { + "epoch": 0.31142540210759845, + "grad_norm": 0.19708824157714844, + "learning_rate": 2.3747696809480974e-05, + "loss": 0.561, + "step": 1123 + }, + { + "epoch": 0.31170271769273433, + "grad_norm": 0.20064733922481537, + "learning_rate": 2.374528678532681e-05, + "loss": 0.5535, + "step": 1124 + }, + { + "epoch": 0.3119800332778702, + "grad_norm": 0.19142742455005646, + "learning_rate": 2.3742874566919248e-05, + "loss": 0.5621, + "step": 1125 + }, + { + "epoch": 0.3122573488630061, + "grad_norm": 0.20165249705314636, + "learning_rate": 2.3740460154728987e-05, + "loss": 0.5862, + "step": 1126 + }, + { + "epoch": 0.312534664448142, + "grad_norm": 0.19451411068439484, + "learning_rate": 2.373804354922714e-05, + "loss": 0.5611, + "step": 1127 + }, + { + "epoch": 0.31281198003327787, + "grad_norm": 0.20111672580242157, + "learning_rate": 2.373562475088525e-05, + "loss": 0.5629, + "step": 1128 + }, + { + "epoch": 0.31308929561841375, + "grad_norm": 0.21810267865657806, + "learning_rate": 2.3733203760175292e-05, + "loss": 0.5746, + "step": 1129 + }, + { + "epoch": 0.31336661120354964, + "grad_norm": 0.20461341738700867, + "learning_rate": 2.3730780577569654e-05, + "loss": 0.5429, + "step": 1130 + }, + { + "epoch": 0.3136439267886855, + "grad_norm": 0.18515266478061676, + "learning_rate": 2.3728355203541182e-05, + "loss": 0.5547, + "step": 1131 + }, + { + "epoch": 0.3139212423738214, + "grad_norm": 0.1941545456647873, + "learning_rate": 2.3725927638563112e-05, + "loss": 0.5566, + "step": 1132 + }, + { + "epoch": 0.3141985579589573, + "grad_norm": 0.20392583310604095, + "learning_rate": 2.3723497883109137e-05, + "loss": 0.5506, + "step": 1133 + }, + { + "epoch": 0.3144758735440932, + "grad_norm": 0.19422155618667603, + "learning_rate": 2.3721065937653363e-05, + "loss": 0.5826, + "step": 1134 + }, + { + "epoch": 0.31475318912922906, + "grad_norm": 0.19887231290340424, + "learning_rate": 2.3718631802670334e-05, + "loss": 0.5651, + "step": 1135 + }, + { + "epoch": 0.31503050471436495, + "grad_norm": 0.20396895706653595, + "learning_rate": 2.3716195478635e-05, + "loss": 0.5396, + "step": 1136 + }, + { + "epoch": 0.31530782029950083, + "grad_norm": 0.2106340378522873, + "learning_rate": 2.3713756966022766e-05, + "loss": 0.552, + "step": 1137 + }, + { + "epoch": 0.3155851358846367, + "grad_norm": 0.20762008428573608, + "learning_rate": 2.371131626530944e-05, + "loss": 0.5942, + "step": 1138 + }, + { + "epoch": 0.3158624514697726, + "grad_norm": 0.19259460270404816, + "learning_rate": 2.3708873376971277e-05, + "loss": 0.5364, + "step": 1139 + }, + { + "epoch": 0.3161397670549085, + "grad_norm": 0.19473238289356232, + "learning_rate": 2.3706428301484946e-05, + "loss": 0.5862, + "step": 1140 + }, + { + "epoch": 0.31641708264004437, + "grad_norm": 0.20146487653255463, + "learning_rate": 2.370398103932754e-05, + "loss": 0.5924, + "step": 1141 + }, + { + "epoch": 0.31669439822518025, + "grad_norm": 0.19463180005550385, + "learning_rate": 2.370153159097659e-05, + "loss": 0.554, + "step": 1142 + }, + { + "epoch": 0.31697171381031614, + "grad_norm": 0.22867369651794434, + "learning_rate": 2.3699079956910052e-05, + "loss": 0.58, + "step": 1143 + }, + { + "epoch": 0.317249029395452, + "grad_norm": 0.1938031017780304, + "learning_rate": 2.3696626137606297e-05, + "loss": 0.5998, + "step": 1144 + }, + { + "epoch": 0.3175263449805879, + "grad_norm": 0.1907264143228531, + "learning_rate": 2.369417013354413e-05, + "loss": 0.5601, + "step": 1145 + }, + { + "epoch": 0.3178036605657238, + "grad_norm": 0.21385140717029572, + "learning_rate": 2.369171194520279e-05, + "loss": 0.5919, + "step": 1146 + }, + { + "epoch": 0.3180809761508597, + "grad_norm": 0.23154176771640778, + "learning_rate": 2.3689251573061932e-05, + "loss": 0.5901, + "step": 1147 + }, + { + "epoch": 0.31835829173599556, + "grad_norm": 0.19392211735248566, + "learning_rate": 2.3686789017601634e-05, + "loss": 0.5514, + "step": 1148 + }, + { + "epoch": 0.31863560732113144, + "grad_norm": 0.19515374302864075, + "learning_rate": 2.3684324279302418e-05, + "loss": 0.558, + "step": 1149 + }, + { + "epoch": 0.3189129229062673, + "grad_norm": 0.18750956654548645, + "learning_rate": 2.3681857358645205e-05, + "loss": 0.5544, + "step": 1150 + }, + { + "epoch": 0.3191902384914032, + "grad_norm": 0.20247051119804382, + "learning_rate": 2.3679388256111368e-05, + "loss": 0.5724, + "step": 1151 + }, + { + "epoch": 0.3194675540765391, + "grad_norm": 0.19525887072086334, + "learning_rate": 2.3676916972182686e-05, + "loss": 0.583, + "step": 1152 + }, + { + "epoch": 0.319744869661675, + "grad_norm": 0.20457209646701813, + "learning_rate": 2.3674443507341377e-05, + "loss": 0.5592, + "step": 1153 + }, + { + "epoch": 0.32002218524681086, + "grad_norm": 0.19369947910308838, + "learning_rate": 2.367196786207008e-05, + "loss": 0.5889, + "step": 1154 + }, + { + "epoch": 0.32029950083194675, + "grad_norm": 0.18851204216480255, + "learning_rate": 2.3669490036851856e-05, + "loss": 0.5804, + "step": 1155 + }, + { + "epoch": 0.32057681641708263, + "grad_norm": 0.1838391125202179, + "learning_rate": 2.3667010032170196e-05, + "loss": 0.5251, + "step": 1156 + }, + { + "epoch": 0.3208541320022185, + "grad_norm": 0.1949407011270523, + "learning_rate": 2.3664527848509015e-05, + "loss": 0.5719, + "step": 1157 + }, + { + "epoch": 0.3211314475873544, + "grad_norm": 0.2245536744594574, + "learning_rate": 2.3662043486352653e-05, + "loss": 0.5699, + "step": 1158 + }, + { + "epoch": 0.3214087631724903, + "grad_norm": 0.18889588117599487, + "learning_rate": 2.3659556946185875e-05, + "loss": 0.5449, + "step": 1159 + }, + { + "epoch": 0.32168607875762617, + "grad_norm": 0.18529140949249268, + "learning_rate": 2.3657068228493863e-05, + "loss": 0.5469, + "step": 1160 + }, + { + "epoch": 0.32196339434276205, + "grad_norm": 0.19042176008224487, + "learning_rate": 2.3654577333762246e-05, + "loss": 0.5662, + "step": 1161 + }, + { + "epoch": 0.32224070992789794, + "grad_norm": 0.22052356600761414, + "learning_rate": 2.3652084262477055e-05, + "loss": 0.5603, + "step": 1162 + }, + { + "epoch": 0.3225180255130338, + "grad_norm": 0.19664537906646729, + "learning_rate": 2.364958901512475e-05, + "loss": 0.5837, + "step": 1163 + }, + { + "epoch": 0.3227953410981697, + "grad_norm": 0.1836601048707962, + "learning_rate": 2.3647091592192234e-05, + "loss": 0.5626, + "step": 1164 + }, + { + "epoch": 0.3230726566833056, + "grad_norm": 0.19754944741725922, + "learning_rate": 2.3644591994166805e-05, + "loss": 0.5867, + "step": 1165 + }, + { + "epoch": 0.3233499722684415, + "grad_norm": 0.18801425397396088, + "learning_rate": 2.364209022153621e-05, + "loss": 0.5447, + "step": 1166 + }, + { + "epoch": 0.32362728785357736, + "grad_norm": 0.19709810614585876, + "learning_rate": 2.363958627478861e-05, + "loss": 0.5953, + "step": 1167 + }, + { + "epoch": 0.32390460343871325, + "grad_norm": 0.19867432117462158, + "learning_rate": 2.3637080154412588e-05, + "loss": 0.6041, + "step": 1168 + }, + { + "epoch": 0.32418191902384913, + "grad_norm": 0.19217659533023834, + "learning_rate": 2.363457186089716e-05, + "loss": 0.5693, + "step": 1169 + }, + { + "epoch": 0.324459234608985, + "grad_norm": 0.20366674661636353, + "learning_rate": 2.3632061394731753e-05, + "loss": 0.5957, + "step": 1170 + }, + { + "epoch": 0.3247365501941209, + "grad_norm": 0.19761385023593903, + "learning_rate": 2.362954875640623e-05, + "loss": 0.5722, + "step": 1171 + }, + { + "epoch": 0.3250138657792568, + "grad_norm": 0.20664581656455994, + "learning_rate": 2.362703394641087e-05, + "loss": 0.5763, + "step": 1172 + }, + { + "epoch": 0.32529118136439267, + "grad_norm": 0.18691927194595337, + "learning_rate": 2.3624516965236386e-05, + "loss": 0.5707, + "step": 1173 + }, + { + "epoch": 0.32556849694952855, + "grad_norm": 0.18426918983459473, + "learning_rate": 2.36219978133739e-05, + "loss": 0.5337, + "step": 1174 + }, + { + "epoch": 0.32584581253466444, + "grad_norm": 0.19540777802467346, + "learning_rate": 2.3619476491314977e-05, + "loss": 0.5549, + "step": 1175 + }, + { + "epoch": 0.3261231281198003, + "grad_norm": 0.2000686079263687, + "learning_rate": 2.3616952999551576e-05, + "loss": 0.5765, + "step": 1176 + }, + { + "epoch": 0.3264004437049362, + "grad_norm": 0.19267399609088898, + "learning_rate": 2.3614427338576114e-05, + "loss": 0.5585, + "step": 1177 + }, + { + "epoch": 0.3266777592900721, + "grad_norm": 0.21964752674102783, + "learning_rate": 2.3611899508881403e-05, + "loss": 0.5742, + "step": 1178 + }, + { + "epoch": 0.326955074875208, + "grad_norm": 0.21694490313529968, + "learning_rate": 2.3609369510960696e-05, + "loss": 0.5586, + "step": 1179 + }, + { + "epoch": 0.32723239046034386, + "grad_norm": 0.2035355418920517, + "learning_rate": 2.360683734530766e-05, + "loss": 0.5712, + "step": 1180 + }, + { + "epoch": 0.32750970604547974, + "grad_norm": 0.1984809935092926, + "learning_rate": 2.3604303012416383e-05, + "loss": 0.5846, + "step": 1181 + }, + { + "epoch": 0.3277870216306156, + "grad_norm": 0.19246408343315125, + "learning_rate": 2.360176651278139e-05, + "loss": 0.5931, + "step": 1182 + }, + { + "epoch": 0.3280643372157515, + "grad_norm": 0.18938469886779785, + "learning_rate": 2.3599227846897615e-05, + "loss": 0.5722, + "step": 1183 + }, + { + "epoch": 0.3283416528008874, + "grad_norm": 0.1882466822862625, + "learning_rate": 2.359668701526042e-05, + "loss": 0.5693, + "step": 1184 + }, + { + "epoch": 0.3286189683860233, + "grad_norm": 0.18115603923797607, + "learning_rate": 2.3594144018365584e-05, + "loss": 0.5694, + "step": 1185 + }, + { + "epoch": 0.32889628397115916, + "grad_norm": 0.21563617885112762, + "learning_rate": 2.3591598856709317e-05, + "loss": 0.5541, + "step": 1186 + }, + { + "epoch": 0.32917359955629505, + "grad_norm": 0.19328701496124268, + "learning_rate": 2.3589051530788246e-05, + "loss": 0.5469, + "step": 1187 + }, + { + "epoch": 0.32945091514143093, + "grad_norm": 0.19828177988529205, + "learning_rate": 2.358650204109942e-05, + "loss": 0.5785, + "step": 1188 + }, + { + "epoch": 0.3297282307265668, + "grad_norm": 0.1821170300245285, + "learning_rate": 2.358395038814032e-05, + "loss": 0.5727, + "step": 1189 + }, + { + "epoch": 0.3300055463117027, + "grad_norm": 0.19181084632873535, + "learning_rate": 2.3581396572408833e-05, + "loss": 0.5699, + "step": 1190 + }, + { + "epoch": 0.3302828618968386, + "grad_norm": 0.18721237778663635, + "learning_rate": 2.3578840594403275e-05, + "loss": 0.5786, + "step": 1191 + }, + { + "epoch": 0.33056017748197447, + "grad_norm": 0.2080426812171936, + "learning_rate": 2.3576282454622394e-05, + "loss": 0.6082, + "step": 1192 + }, + { + "epoch": 0.33083749306711036, + "grad_norm": 0.19874081015586853, + "learning_rate": 2.3573722153565343e-05, + "loss": 0.5802, + "step": 1193 + }, + { + "epoch": 0.33111480865224624, + "grad_norm": 0.20292651653289795, + "learning_rate": 2.357115969173171e-05, + "loss": 0.5562, + "step": 1194 + }, + { + "epoch": 0.3313921242373821, + "grad_norm": 0.1992533951997757, + "learning_rate": 2.356859506962149e-05, + "loss": 0.5491, + "step": 1195 + }, + { + "epoch": 0.331669439822518, + "grad_norm": 0.19271281361579895, + "learning_rate": 2.356602828773512e-05, + "loss": 0.5713, + "step": 1196 + }, + { + "epoch": 0.3319467554076539, + "grad_norm": 0.19548431038856506, + "learning_rate": 2.356345934657344e-05, + "loss": 0.5455, + "step": 1197 + }, + { + "epoch": 0.3322240709927898, + "grad_norm": 0.19355355203151703, + "learning_rate": 2.3560888246637726e-05, + "loss": 0.576, + "step": 1198 + }, + { + "epoch": 0.33250138657792566, + "grad_norm": 0.19047684967517853, + "learning_rate": 2.3558314988429657e-05, + "loss": 0.5569, + "step": 1199 + }, + { + "epoch": 0.33277870216306155, + "grad_norm": 0.19171833992004395, + "learning_rate": 2.3555739572451353e-05, + "loss": 0.5974, + "step": 1200 + }, + { + "epoch": 0.33305601774819743, + "grad_norm": 0.18777206540107727, + "learning_rate": 2.3553161999205337e-05, + "loss": 0.5614, + "step": 1201 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.1937248855829239, + "learning_rate": 2.355058226919457e-05, + "loss": 0.5725, + "step": 1202 + }, + { + "epoch": 0.3336106489184692, + "grad_norm": 0.19912956655025482, + "learning_rate": 2.3548000382922422e-05, + "loss": 0.5595, + "step": 1203 + }, + { + "epoch": 0.3338879645036051, + "grad_norm": 0.19780032336711884, + "learning_rate": 2.354541634089269e-05, + "loss": 0.5766, + "step": 1204 + }, + { + "epoch": 0.33416528008874097, + "grad_norm": 0.19286835193634033, + "learning_rate": 2.3542830143609584e-05, + "loss": 0.5529, + "step": 1205 + }, + { + "epoch": 0.33444259567387685, + "grad_norm": 0.19875440001487732, + "learning_rate": 2.3540241791577745e-05, + "loss": 0.5616, + "step": 1206 + }, + { + "epoch": 0.33471991125901274, + "grad_norm": 0.20458170771598816, + "learning_rate": 2.3537651285302224e-05, + "loss": 0.5754, + "step": 1207 + }, + { + "epoch": 0.3349972268441486, + "grad_norm": 0.19896847009658813, + "learning_rate": 2.3535058625288503e-05, + "loss": 0.586, + "step": 1208 + }, + { + "epoch": 0.3352745424292845, + "grad_norm": 0.2005191296339035, + "learning_rate": 2.3532463812042478e-05, + "loss": 0.57, + "step": 1209 + }, + { + "epoch": 0.3355518580144204, + "grad_norm": 0.19629941880702972, + "learning_rate": 2.3529866846070457e-05, + "loss": 0.5632, + "step": 1210 + }, + { + "epoch": 0.3358291735995563, + "grad_norm": 0.2009962797164917, + "learning_rate": 2.3527267727879187e-05, + "loss": 0.5926, + "step": 1211 + }, + { + "epoch": 0.33610648918469216, + "grad_norm": 0.18961815536022186, + "learning_rate": 2.3524666457975826e-05, + "loss": 0.566, + "step": 1212 + }, + { + "epoch": 0.33638380476982804, + "grad_norm": 0.19457431137561798, + "learning_rate": 2.3522063036867938e-05, + "loss": 0.5302, + "step": 1213 + }, + { + "epoch": 0.3366611203549639, + "grad_norm": 0.20213200151920319, + "learning_rate": 2.351945746506353e-05, + "loss": 0.5726, + "step": 1214 + }, + { + "epoch": 0.3369384359400998, + "grad_norm": 0.1976713389158249, + "learning_rate": 2.351684974307102e-05, + "loss": 0.5425, + "step": 1215 + }, + { + "epoch": 0.3372157515252357, + "grad_norm": 0.19058452546596527, + "learning_rate": 2.3514239871399235e-05, + "loss": 0.5695, + "step": 1216 + }, + { + "epoch": 0.3374930671103716, + "grad_norm": 0.18397174775600433, + "learning_rate": 2.3511627850557432e-05, + "loss": 0.535, + "step": 1217 + }, + { + "epoch": 0.33777038269550747, + "grad_norm": 0.2004639059305191, + "learning_rate": 2.3509013681055293e-05, + "loss": 0.5698, + "step": 1218 + }, + { + "epoch": 0.33804769828064335, + "grad_norm": 0.1970200389623642, + "learning_rate": 2.3506397363402905e-05, + "loss": 0.5831, + "step": 1219 + }, + { + "epoch": 0.33832501386577923, + "grad_norm": 0.24607634544372559, + "learning_rate": 2.3503778898110782e-05, + "loss": 0.6011, + "step": 1220 + }, + { + "epoch": 0.3386023294509151, + "grad_norm": 0.1974974125623703, + "learning_rate": 2.3501158285689857e-05, + "loss": 0.5402, + "step": 1221 + }, + { + "epoch": 0.338879645036051, + "grad_norm": 0.20804527401924133, + "learning_rate": 2.3498535526651478e-05, + "loss": 0.5804, + "step": 1222 + }, + { + "epoch": 0.3391569606211869, + "grad_norm": 0.21283218264579773, + "learning_rate": 2.349591062150742e-05, + "loss": 0.5498, + "step": 1223 + }, + { + "epoch": 0.33943427620632277, + "grad_norm": 0.20082467794418335, + "learning_rate": 2.3493283570769863e-05, + "loss": 0.5589, + "step": 1224 + }, + { + "epoch": 0.33971159179145866, + "grad_norm": 0.1998477727174759, + "learning_rate": 2.3490654374951426e-05, + "loss": 0.5496, + "step": 1225 + }, + { + "epoch": 0.33998890737659454, + "grad_norm": 0.18160581588745117, + "learning_rate": 2.3488023034565127e-05, + "loss": 0.569, + "step": 1226 + }, + { + "epoch": 0.3402662229617304, + "grad_norm": 0.18750114738941193, + "learning_rate": 2.3485389550124413e-05, + "loss": 0.5768, + "step": 1227 + }, + { + "epoch": 0.3405435385468663, + "grad_norm": 0.19347476959228516, + "learning_rate": 2.3482753922143143e-05, + "loss": 0.5354, + "step": 1228 + }, + { + "epoch": 0.3408208541320022, + "grad_norm": 0.19742092490196228, + "learning_rate": 2.34801161511356e-05, + "loss": 0.5656, + "step": 1229 + }, + { + "epoch": 0.3410981697171381, + "grad_norm": 0.19319242238998413, + "learning_rate": 2.3477476237616487e-05, + "loss": 0.5654, + "step": 1230 + }, + { + "epoch": 0.34137548530227396, + "grad_norm": 0.18535441160202026, + "learning_rate": 2.3474834182100914e-05, + "loss": 0.5419, + "step": 1231 + }, + { + "epoch": 0.34165280088740985, + "grad_norm": 0.2077123373746872, + "learning_rate": 2.347218998510442e-05, + "loss": 0.5719, + "step": 1232 + }, + { + "epoch": 0.34193011647254573, + "grad_norm": 0.1924944818019867, + "learning_rate": 2.3469543647142954e-05, + "loss": 0.5493, + "step": 1233 + }, + { + "epoch": 0.3422074320576816, + "grad_norm": 0.1943088173866272, + "learning_rate": 2.3466895168732894e-05, + "loss": 0.5418, + "step": 1234 + }, + { + "epoch": 0.3424847476428175, + "grad_norm": 0.19199106097221375, + "learning_rate": 2.3464244550391023e-05, + "loss": 0.5463, + "step": 1235 + }, + { + "epoch": 0.3427620632279534, + "grad_norm": 0.19151833653450012, + "learning_rate": 2.3461591792634548e-05, + "loss": 0.5595, + "step": 1236 + }, + { + "epoch": 0.34303937881308927, + "grad_norm": 0.20270255208015442, + "learning_rate": 2.3458936895981093e-05, + "loss": 0.5963, + "step": 1237 + }, + { + "epoch": 0.34331669439822515, + "grad_norm": 0.1867821365594864, + "learning_rate": 2.3456279860948696e-05, + "loss": 0.5716, + "step": 1238 + }, + { + "epoch": 0.34359400998336104, + "grad_norm": 0.20384635031223297, + "learning_rate": 2.3453620688055817e-05, + "loss": 0.5726, + "step": 1239 + }, + { + "epoch": 0.343871325568497, + "grad_norm": 0.19507844746112823, + "learning_rate": 2.3450959377821334e-05, + "loss": 0.5461, + "step": 1240 + }, + { + "epoch": 0.34414864115363286, + "grad_norm": 0.18854713439941406, + "learning_rate": 2.3448295930764536e-05, + "loss": 0.5617, + "step": 1241 + }, + { + "epoch": 0.34442595673876875, + "grad_norm": 0.1924966722726822, + "learning_rate": 2.344563034740513e-05, + "loss": 0.5716, + "step": 1242 + }, + { + "epoch": 0.34470327232390463, + "grad_norm": 0.1878724843263626, + "learning_rate": 2.3442962628263245e-05, + "loss": 0.5588, + "step": 1243 + }, + { + "epoch": 0.3449805879090405, + "grad_norm": 0.20984706282615662, + "learning_rate": 2.3440292773859422e-05, + "loss": 0.5623, + "step": 1244 + }, + { + "epoch": 0.3452579034941764, + "grad_norm": 0.19068847596645355, + "learning_rate": 2.343762078471462e-05, + "loss": 0.5573, + "step": 1245 + }, + { + "epoch": 0.3455352190793123, + "grad_norm": 0.1986820548772812, + "learning_rate": 2.343494666135022e-05, + "loss": 0.5799, + "step": 1246 + }, + { + "epoch": 0.34581253466444817, + "grad_norm": 0.21772533655166626, + "learning_rate": 2.343227040428801e-05, + "loss": 0.5674, + "step": 1247 + }, + { + "epoch": 0.34608985024958405, + "grad_norm": 0.20012469589710236, + "learning_rate": 2.3429592014050198e-05, + "loss": 0.5713, + "step": 1248 + }, + { + "epoch": 0.34636716583471994, + "grad_norm": 0.21233108639717102, + "learning_rate": 2.3426911491159408e-05, + "loss": 0.5232, + "step": 1249 + }, + { + "epoch": 0.3466444814198558, + "grad_norm": 0.19460223615169525, + "learning_rate": 2.3424228836138686e-05, + "loss": 0.5547, + "step": 1250 + }, + { + "epoch": 0.3469217970049917, + "grad_norm": 0.18516409397125244, + "learning_rate": 2.3421544049511484e-05, + "loss": 0.5445, + "step": 1251 + }, + { + "epoch": 0.3471991125901276, + "grad_norm": 0.20658938586711884, + "learning_rate": 2.341885713180168e-05, + "loss": 0.5551, + "step": 1252 + }, + { + "epoch": 0.3474764281752635, + "grad_norm": 0.19541315734386444, + "learning_rate": 2.3416168083533556e-05, + "loss": 0.5608, + "step": 1253 + }, + { + "epoch": 0.34775374376039936, + "grad_norm": 0.22885319590568542, + "learning_rate": 2.3413476905231825e-05, + "loss": 0.5836, + "step": 1254 + }, + { + "epoch": 0.34803105934553524, + "grad_norm": 0.2285439521074295, + "learning_rate": 2.3410783597421597e-05, + "loss": 0.5444, + "step": 1255 + }, + { + "epoch": 0.3483083749306711, + "grad_norm": 0.200783833861351, + "learning_rate": 2.3408088160628422e-05, + "loss": 0.5601, + "step": 1256 + }, + { + "epoch": 0.348585690515807, + "grad_norm": 0.19225940108299255, + "learning_rate": 2.3405390595378236e-05, + "loss": 0.5602, + "step": 1257 + }, + { + "epoch": 0.3488630061009429, + "grad_norm": 0.19936451315879822, + "learning_rate": 2.340269090219741e-05, + "loss": 0.5593, + "step": 1258 + }, + { + "epoch": 0.3491403216860788, + "grad_norm": 0.20055221021175385, + "learning_rate": 2.3399989081612732e-05, + "loss": 0.568, + "step": 1259 + }, + { + "epoch": 0.34941763727121466, + "grad_norm": 0.1941661238670349, + "learning_rate": 2.3397285134151394e-05, + "loss": 0.5688, + "step": 1260 + }, + { + "epoch": 0.34969495285635055, + "grad_norm": 0.19209027290344238, + "learning_rate": 2.3394579060341008e-05, + "loss": 0.5635, + "step": 1261 + }, + { + "epoch": 0.34997226844148643, + "grad_norm": 0.20127955079078674, + "learning_rate": 2.33918708607096e-05, + "loss": 0.5833, + "step": 1262 + }, + { + "epoch": 0.3502495840266223, + "grad_norm": 0.20718038082122803, + "learning_rate": 2.3389160535785612e-05, + "loss": 0.5883, + "step": 1263 + }, + { + "epoch": 0.3505268996117582, + "grad_norm": 0.19489611685276031, + "learning_rate": 2.3386448086097902e-05, + "loss": 0.5618, + "step": 1264 + }, + { + "epoch": 0.3508042151968941, + "grad_norm": 0.19366618990898132, + "learning_rate": 2.338373351217574e-05, + "loss": 0.5813, + "step": 1265 + }, + { + "epoch": 0.35108153078202997, + "grad_norm": 0.19742882251739502, + "learning_rate": 2.3381016814548806e-05, + "loss": 0.5866, + "step": 1266 + }, + { + "epoch": 0.35135884636716586, + "grad_norm": 0.2014235556125641, + "learning_rate": 2.337829799374721e-05, + "loss": 0.5503, + "step": 1267 + }, + { + "epoch": 0.35163616195230174, + "grad_norm": 0.19284577667713165, + "learning_rate": 2.337557705030146e-05, + "loss": 0.5683, + "step": 1268 + }, + { + "epoch": 0.3519134775374376, + "grad_norm": 0.19096529483795166, + "learning_rate": 2.3372853984742482e-05, + "loss": 0.5535, + "step": 1269 + }, + { + "epoch": 0.3521907931225735, + "grad_norm": 0.19602730870246887, + "learning_rate": 2.337012879760162e-05, + "loss": 0.5717, + "step": 1270 + }, + { + "epoch": 0.3524681087077094, + "grad_norm": 0.19658134877681732, + "learning_rate": 2.3367401489410635e-05, + "loss": 0.5945, + "step": 1271 + }, + { + "epoch": 0.3527454242928453, + "grad_norm": 0.20233921706676483, + "learning_rate": 2.3364672060701688e-05, + "loss": 0.5887, + "step": 1272 + }, + { + "epoch": 0.35302273987798116, + "grad_norm": 0.20138201117515564, + "learning_rate": 2.3361940512007368e-05, + "loss": 0.5424, + "step": 1273 + }, + { + "epoch": 0.35330005546311705, + "grad_norm": 0.1879206895828247, + "learning_rate": 2.3359206843860675e-05, + "loss": 0.546, + "step": 1274 + }, + { + "epoch": 0.35357737104825293, + "grad_norm": 0.19074855744838715, + "learning_rate": 2.335647105679502e-05, + "loss": 0.5486, + "step": 1275 + }, + { + "epoch": 0.3538546866333888, + "grad_norm": 0.2010781466960907, + "learning_rate": 2.335373315134422e-05, + "loss": 0.6002, + "step": 1276 + }, + { + "epoch": 0.3541320022185247, + "grad_norm": 0.19462116062641144, + "learning_rate": 2.3350993128042523e-05, + "loss": 0.5937, + "step": 1277 + }, + { + "epoch": 0.3544093178036606, + "grad_norm": 0.21550050377845764, + "learning_rate": 2.3348250987424573e-05, + "loss": 0.5783, + "step": 1278 + }, + { + "epoch": 0.35468663338879647, + "grad_norm": 0.20210538804531097, + "learning_rate": 2.3345506730025434e-05, + "loss": 0.5784, + "step": 1279 + }, + { + "epoch": 0.35496394897393235, + "grad_norm": 0.19225560128688812, + "learning_rate": 2.3342760356380588e-05, + "loss": 0.566, + "step": 1280 + }, + { + "epoch": 0.35524126455906824, + "grad_norm": 0.2126346081495285, + "learning_rate": 2.3340011867025924e-05, + "loss": 0.5369, + "step": 1281 + }, + { + "epoch": 0.3555185801442041, + "grad_norm": 0.19172579050064087, + "learning_rate": 2.333726126249774e-05, + "loss": 0.5736, + "step": 1282 + }, + { + "epoch": 0.35579589572934, + "grad_norm": 0.20066869258880615, + "learning_rate": 2.333450854333276e-05, + "loss": 0.5538, + "step": 1283 + }, + { + "epoch": 0.3560732113144759, + "grad_norm": 0.2043704241514206, + "learning_rate": 2.3331753710068106e-05, + "loss": 0.5682, + "step": 1284 + }, + { + "epoch": 0.3563505268996118, + "grad_norm": 0.1963237076997757, + "learning_rate": 2.3328996763241323e-05, + "loss": 0.5342, + "step": 1285 + }, + { + "epoch": 0.35662784248474766, + "grad_norm": 0.18448245525360107, + "learning_rate": 2.332623770339036e-05, + "loss": 0.5717, + "step": 1286 + }, + { + "epoch": 0.35690515806988354, + "grad_norm": 0.1940702348947525, + "learning_rate": 2.3323476531053587e-05, + "loss": 0.5491, + "step": 1287 + }, + { + "epoch": 0.35718247365501943, + "grad_norm": 0.1945616453886032, + "learning_rate": 2.3320713246769782e-05, + "loss": 0.5464, + "step": 1288 + }, + { + "epoch": 0.3574597892401553, + "grad_norm": 0.19436432421207428, + "learning_rate": 2.331794785107813e-05, + "loss": 0.5475, + "step": 1289 + }, + { + "epoch": 0.3577371048252912, + "grad_norm": 0.19309785962104797, + "learning_rate": 2.3315180344518236e-05, + "loss": 0.5413, + "step": 1290 + }, + { + "epoch": 0.3580144204104271, + "grad_norm": 0.19348269701004028, + "learning_rate": 2.331241072763012e-05, + "loss": 0.5713, + "step": 1291 + }, + { + "epoch": 0.35829173599556297, + "grad_norm": 0.18745654821395874, + "learning_rate": 2.330963900095419e-05, + "loss": 0.5661, + "step": 1292 + }, + { + "epoch": 0.35856905158069885, + "grad_norm": 0.20293056964874268, + "learning_rate": 2.3306865165031305e-05, + "loss": 0.5855, + "step": 1293 + }, + { + "epoch": 0.35884636716583473, + "grad_norm": 0.202471524477005, + "learning_rate": 2.3304089220402702e-05, + "loss": 0.5905, + "step": 1294 + }, + { + "epoch": 0.3591236827509706, + "grad_norm": 0.18649210035800934, + "learning_rate": 2.330131116761004e-05, + "loss": 0.5768, + "step": 1295 + }, + { + "epoch": 0.3594009983361065, + "grad_norm": 0.18877407908439636, + "learning_rate": 2.3298531007195398e-05, + "loss": 0.576, + "step": 1296 + }, + { + "epoch": 0.3596783139212424, + "grad_norm": 0.18760234117507935, + "learning_rate": 2.329574873970125e-05, + "loss": 0.54, + "step": 1297 + }, + { + "epoch": 0.35995562950637827, + "grad_norm": 0.2042498141527176, + "learning_rate": 2.32929643656705e-05, + "loss": 0.5595, + "step": 1298 + }, + { + "epoch": 0.36023294509151416, + "grad_norm": 0.19519071280956268, + "learning_rate": 2.3290177885646448e-05, + "loss": 0.5446, + "step": 1299 + }, + { + "epoch": 0.36051026067665004, + "grad_norm": 0.19533094763755798, + "learning_rate": 2.3287389300172806e-05, + "loss": 0.619, + "step": 1300 + }, + { + "epoch": 0.3607875762617859, + "grad_norm": 0.18442007899284363, + "learning_rate": 2.3284598609793705e-05, + "loss": 0.5651, + "step": 1301 + }, + { + "epoch": 0.3610648918469218, + "grad_norm": 0.1934802085161209, + "learning_rate": 2.3281805815053688e-05, + "loss": 0.5894, + "step": 1302 + }, + { + "epoch": 0.3613422074320577, + "grad_norm": 0.20552469789981842, + "learning_rate": 2.327901091649769e-05, + "loss": 0.5943, + "step": 1303 + }, + { + "epoch": 0.3616195230171936, + "grad_norm": 0.19476066529750824, + "learning_rate": 2.3276213914671084e-05, + "loss": 0.5729, + "step": 1304 + }, + { + "epoch": 0.36189683860232946, + "grad_norm": 0.19621802866458893, + "learning_rate": 2.3273414810119632e-05, + "loss": 0.5616, + "step": 1305 + }, + { + "epoch": 0.36217415418746535, + "grad_norm": 0.192935511469841, + "learning_rate": 2.3270613603389513e-05, + "loss": 0.5507, + "step": 1306 + }, + { + "epoch": 0.36245146977260123, + "grad_norm": 0.18750174343585968, + "learning_rate": 2.3267810295027317e-05, + "loss": 0.5866, + "step": 1307 + }, + { + "epoch": 0.3627287853577371, + "grad_norm": 0.18611189723014832, + "learning_rate": 2.3265004885580047e-05, + "loss": 0.5638, + "step": 1308 + }, + { + "epoch": 0.363006100942873, + "grad_norm": 0.205692857503891, + "learning_rate": 2.3262197375595108e-05, + "loss": 0.5628, + "step": 1309 + }, + { + "epoch": 0.3632834165280089, + "grad_norm": 0.19303978979587555, + "learning_rate": 2.3259387765620322e-05, + "loss": 0.5785, + "step": 1310 + }, + { + "epoch": 0.36356073211314477, + "grad_norm": 0.19016319513320923, + "learning_rate": 2.325657605620392e-05, + "loss": 0.5637, + "step": 1311 + }, + { + "epoch": 0.36383804769828065, + "grad_norm": 0.18235303461551666, + "learning_rate": 2.325376224789454e-05, + "loss": 0.5518, + "step": 1312 + }, + { + "epoch": 0.36411536328341654, + "grad_norm": 0.1889420747756958, + "learning_rate": 2.325094634124123e-05, + "loss": 0.5539, + "step": 1313 + }, + { + "epoch": 0.3643926788685524, + "grad_norm": 0.1905851662158966, + "learning_rate": 2.3248128336793444e-05, + "loss": 0.5892, + "step": 1314 + }, + { + "epoch": 0.3646699944536883, + "grad_norm": 0.18385791778564453, + "learning_rate": 2.324530823510106e-05, + "loss": 0.5746, + "step": 1315 + }, + { + "epoch": 0.3649473100388242, + "grad_norm": 0.18778762221336365, + "learning_rate": 2.3242486036714343e-05, + "loss": 0.5578, + "step": 1316 + }, + { + "epoch": 0.3652246256239601, + "grad_norm": 0.18907684087753296, + "learning_rate": 2.3239661742183984e-05, + "loss": 0.5912, + "step": 1317 + }, + { + "epoch": 0.36550194120909596, + "grad_norm": 0.19764885306358337, + "learning_rate": 2.3236835352061076e-05, + "loss": 0.5719, + "step": 1318 + }, + { + "epoch": 0.36577925679423184, + "grad_norm": 0.1823858916759491, + "learning_rate": 2.3234006866897125e-05, + "loss": 0.5619, + "step": 1319 + }, + { + "epoch": 0.36605657237936773, + "grad_norm": 0.31975796818733215, + "learning_rate": 2.3231176287244044e-05, + "loss": 0.5935, + "step": 1320 + }, + { + "epoch": 0.3663338879645036, + "grad_norm": 0.17738473415374756, + "learning_rate": 2.322834361365415e-05, + "loss": 0.56, + "step": 1321 + }, + { + "epoch": 0.3666112035496395, + "grad_norm": 0.19397957623004913, + "learning_rate": 2.3225508846680173e-05, + "loss": 0.5767, + "step": 1322 + }, + { + "epoch": 0.3668885191347754, + "grad_norm": 0.1985812485218048, + "learning_rate": 2.3222671986875255e-05, + "loss": 0.5322, + "step": 1323 + }, + { + "epoch": 0.36716583471991127, + "grad_norm": 0.19608476758003235, + "learning_rate": 2.3219833034792943e-05, + "loss": 0.5758, + "step": 1324 + }, + { + "epoch": 0.36744315030504715, + "grad_norm": 0.196056067943573, + "learning_rate": 2.3216991990987186e-05, + "loss": 0.5658, + "step": 1325 + }, + { + "epoch": 0.36772046589018303, + "grad_norm": 0.18968339264392853, + "learning_rate": 2.3214148856012354e-05, + "loss": 0.5472, + "step": 1326 + }, + { + "epoch": 0.3679977814753189, + "grad_norm": 0.19283819198608398, + "learning_rate": 2.3211303630423208e-05, + "loss": 0.5554, + "step": 1327 + }, + { + "epoch": 0.3682750970604548, + "grad_norm": 0.19521844387054443, + "learning_rate": 2.320845631477494e-05, + "loss": 0.5252, + "step": 1328 + }, + { + "epoch": 0.3685524126455907, + "grad_norm": 0.19548355042934418, + "learning_rate": 2.3205606909623122e-05, + "loss": 0.569, + "step": 1329 + }, + { + "epoch": 0.3688297282307266, + "grad_norm": 0.18432483077049255, + "learning_rate": 2.3202755415523763e-05, + "loss": 0.5612, + "step": 1330 + }, + { + "epoch": 0.36910704381586246, + "grad_norm": 0.19396983087062836, + "learning_rate": 2.3199901833033255e-05, + "loss": 0.586, + "step": 1331 + }, + { + "epoch": 0.36938435940099834, + "grad_norm": 0.18808341026306152, + "learning_rate": 2.3197046162708413e-05, + "loss": 0.566, + "step": 1332 + }, + { + "epoch": 0.3696616749861342, + "grad_norm": 0.19177477061748505, + "learning_rate": 2.3194188405106453e-05, + "loss": 0.5673, + "step": 1333 + }, + { + "epoch": 0.3699389905712701, + "grad_norm": 0.19362227618694305, + "learning_rate": 2.3191328560784992e-05, + "loss": 0.5367, + "step": 1334 + }, + { + "epoch": 0.370216306156406, + "grad_norm": 0.1922491490840912, + "learning_rate": 2.3188466630302072e-05, + "loss": 0.5466, + "step": 1335 + }, + { + "epoch": 0.3704936217415419, + "grad_norm": 0.19545325636863708, + "learning_rate": 2.3185602614216125e-05, + "loss": 0.5861, + "step": 1336 + }, + { + "epoch": 0.37077093732667776, + "grad_norm": 0.1878175288438797, + "learning_rate": 2.3182736513086002e-05, + "loss": 0.5429, + "step": 1337 + }, + { + "epoch": 0.37104825291181365, + "grad_norm": 0.20909211039543152, + "learning_rate": 2.3179868327470948e-05, + "loss": 0.579, + "step": 1338 + }, + { + "epoch": 0.37132556849694953, + "grad_norm": 0.2049614042043686, + "learning_rate": 2.3176998057930626e-05, + "loss": 0.5877, + "step": 1339 + }, + { + "epoch": 0.3716028840820854, + "grad_norm": 0.20033860206604004, + "learning_rate": 2.3174125705025103e-05, + "loss": 0.5576, + "step": 1340 + }, + { + "epoch": 0.3718801996672213, + "grad_norm": 0.19559535384178162, + "learning_rate": 2.3171251269314846e-05, + "loss": 0.581, + "step": 1341 + }, + { + "epoch": 0.3721575152523572, + "grad_norm": 0.19223865866661072, + "learning_rate": 2.3168374751360737e-05, + "loss": 0.5742, + "step": 1342 + }, + { + "epoch": 0.37243483083749307, + "grad_norm": 0.18235072493553162, + "learning_rate": 2.316549615172406e-05, + "loss": 0.5839, + "step": 1343 + }, + { + "epoch": 0.37271214642262895, + "grad_norm": 0.1987343281507492, + "learning_rate": 2.3162615470966512e-05, + "loss": 0.5531, + "step": 1344 + }, + { + "epoch": 0.37298946200776484, + "grad_norm": 0.20500993728637695, + "learning_rate": 2.3159732709650182e-05, + "loss": 0.5849, + "step": 1345 + }, + { + "epoch": 0.3732667775929007, + "grad_norm": 0.19410103559494019, + "learning_rate": 2.3156847868337574e-05, + "loss": 0.602, + "step": 1346 + }, + { + "epoch": 0.3735440931780366, + "grad_norm": 0.19419234991073608, + "learning_rate": 2.31539609475916e-05, + "loss": 0.5639, + "step": 1347 + }, + { + "epoch": 0.3738214087631725, + "grad_norm": 0.19289056956768036, + "learning_rate": 2.3151071947975578e-05, + "loss": 0.5833, + "step": 1348 + }, + { + "epoch": 0.3740987243483084, + "grad_norm": 0.1862688809633255, + "learning_rate": 2.314818087005322e-05, + "loss": 0.5587, + "step": 1349 + }, + { + "epoch": 0.37437603993344426, + "grad_norm": 0.1773182898759842, + "learning_rate": 2.314528771438866e-05, + "loss": 0.5297, + "step": 1350 + }, + { + "epoch": 0.37465335551858014, + "grad_norm": 0.18808980286121368, + "learning_rate": 2.314239248154642e-05, + "loss": 0.5478, + "step": 1351 + }, + { + "epoch": 0.37493067110371603, + "grad_norm": 0.1850731521844864, + "learning_rate": 2.3139495172091447e-05, + "loss": 0.5631, + "step": 1352 + }, + { + "epoch": 0.3752079866888519, + "grad_norm": 0.233089417219162, + "learning_rate": 2.313659578658907e-05, + "loss": 0.5688, + "step": 1353 + }, + { + "epoch": 0.3754853022739878, + "grad_norm": 0.19679111242294312, + "learning_rate": 2.313369432560505e-05, + "loss": 0.5713, + "step": 1354 + }, + { + "epoch": 0.3757626178591237, + "grad_norm": 0.22906848788261414, + "learning_rate": 2.3130790789705535e-05, + "loss": 0.5727, + "step": 1355 + }, + { + "epoch": 0.37603993344425957, + "grad_norm": 0.20593827962875366, + "learning_rate": 2.3127885179457077e-05, + "loss": 0.5382, + "step": 1356 + }, + { + "epoch": 0.37631724902939545, + "grad_norm": 0.18781960010528564, + "learning_rate": 2.3124977495426637e-05, + "loss": 0.5746, + "step": 1357 + }, + { + "epoch": 0.37659456461453134, + "grad_norm": 0.2294550985097885, + "learning_rate": 2.3122067738181587e-05, + "loss": 0.5783, + "step": 1358 + }, + { + "epoch": 0.3768718801996672, + "grad_norm": 0.2007582187652588, + "learning_rate": 2.311915590828969e-05, + "loss": 0.5586, + "step": 1359 + }, + { + "epoch": 0.3771491957848031, + "grad_norm": 0.20126573741436005, + "learning_rate": 2.3116242006319132e-05, + "loss": 0.5621, + "step": 1360 + }, + { + "epoch": 0.377426511369939, + "grad_norm": 0.19607853889465332, + "learning_rate": 2.3113326032838487e-05, + "loss": 0.5305, + "step": 1361 + }, + { + "epoch": 0.3777038269550749, + "grad_norm": 0.193894624710083, + "learning_rate": 2.3110407988416736e-05, + "loss": 0.5578, + "step": 1362 + }, + { + "epoch": 0.37798114254021076, + "grad_norm": 0.1813306212425232, + "learning_rate": 2.310748787362327e-05, + "loss": 0.5787, + "step": 1363 + }, + { + "epoch": 0.37825845812534664, + "grad_norm": 0.19636552035808563, + "learning_rate": 2.3104565689027875e-05, + "loss": 0.5615, + "step": 1364 + }, + { + "epoch": 0.3785357737104825, + "grad_norm": 0.1957857310771942, + "learning_rate": 2.3101641435200756e-05, + "loss": 0.5821, + "step": 1365 + }, + { + "epoch": 0.3788130892956184, + "grad_norm": 0.18413352966308594, + "learning_rate": 2.3098715112712507e-05, + "loss": 0.5388, + "step": 1366 + }, + { + "epoch": 0.3790904048807543, + "grad_norm": 0.20153377950191498, + "learning_rate": 2.3095786722134133e-05, + "loss": 0.5748, + "step": 1367 + }, + { + "epoch": 0.3793677204658902, + "grad_norm": 0.22044618427753448, + "learning_rate": 2.309285626403704e-05, + "loss": 0.5668, + "step": 1368 + }, + { + "epoch": 0.37964503605102606, + "grad_norm": 0.19130001962184906, + "learning_rate": 2.3089923738993034e-05, + "loss": 0.5752, + "step": 1369 + }, + { + "epoch": 0.37992235163616195, + "grad_norm": 0.18104785680770874, + "learning_rate": 2.3086989147574333e-05, + "loss": 0.5577, + "step": 1370 + }, + { + "epoch": 0.38019966722129783, + "grad_norm": 0.18427520990371704, + "learning_rate": 2.3084052490353553e-05, + "loss": 0.5537, + "step": 1371 + }, + { + "epoch": 0.3804769828064337, + "grad_norm": 0.1886986941099167, + "learning_rate": 2.3081113767903713e-05, + "loss": 0.5646, + "step": 1372 + }, + { + "epoch": 0.3807542983915696, + "grad_norm": 0.19212138652801514, + "learning_rate": 2.3078172980798236e-05, + "loss": 0.5415, + "step": 1373 + }, + { + "epoch": 0.3810316139767055, + "grad_norm": 0.18980136513710022, + "learning_rate": 2.3075230129610946e-05, + "loss": 0.5725, + "step": 1374 + }, + { + "epoch": 0.38130892956184137, + "grad_norm": 0.1848769187927246, + "learning_rate": 2.3072285214916072e-05, + "loss": 0.536, + "step": 1375 + }, + { + "epoch": 0.38158624514697725, + "grad_norm": 0.18508492410182953, + "learning_rate": 2.3069338237288247e-05, + "loss": 0.5753, + "step": 1376 + }, + { + "epoch": 0.38186356073211314, + "grad_norm": 0.1909710019826889, + "learning_rate": 2.30663891973025e-05, + "loss": 0.563, + "step": 1377 + }, + { + "epoch": 0.382140876317249, + "grad_norm": 0.20639832317829132, + "learning_rate": 2.3063438095534272e-05, + "loss": 0.5713, + "step": 1378 + }, + { + "epoch": 0.3824181919023849, + "grad_norm": 0.1900801658630371, + "learning_rate": 2.3060484932559395e-05, + "loss": 0.5579, + "step": 1379 + }, + { + "epoch": 0.3826955074875208, + "grad_norm": 0.19208571314811707, + "learning_rate": 2.305752970895412e-05, + "loss": 0.5571, + "step": 1380 + }, + { + "epoch": 0.3829728230726567, + "grad_norm": 0.1912533938884735, + "learning_rate": 2.3054572425295075e-05, + "loss": 0.5452, + "step": 1381 + }, + { + "epoch": 0.38325013865779256, + "grad_norm": 0.19920608401298523, + "learning_rate": 2.3051613082159313e-05, + "loss": 0.5799, + "step": 1382 + }, + { + "epoch": 0.38352745424292845, + "grad_norm": 0.19880720973014832, + "learning_rate": 2.3048651680124283e-05, + "loss": 0.5504, + "step": 1383 + }, + { + "epoch": 0.38380476982806433, + "grad_norm": 0.1889009028673172, + "learning_rate": 2.3045688219767824e-05, + "loss": 0.5751, + "step": 1384 + }, + { + "epoch": 0.3840820854132002, + "grad_norm": 0.20132310688495636, + "learning_rate": 2.3042722701668194e-05, + "loss": 0.5723, + "step": 1385 + }, + { + "epoch": 0.3843594009983361, + "grad_norm": 0.20646221935749054, + "learning_rate": 2.3039755126404037e-05, + "loss": 0.581, + "step": 1386 + }, + { + "epoch": 0.384636716583472, + "grad_norm": 0.19051332771778107, + "learning_rate": 2.3036785494554415e-05, + "loss": 0.5609, + "step": 1387 + }, + { + "epoch": 0.38491403216860787, + "grad_norm": 0.1971728354692459, + "learning_rate": 2.303381380669877e-05, + "loss": 0.563, + "step": 1388 + }, + { + "epoch": 0.38519134775374375, + "grad_norm": 0.19891361892223358, + "learning_rate": 2.303084006341697e-05, + "loss": 0.571, + "step": 1389 + }, + { + "epoch": 0.38546866333887964, + "grad_norm": 0.18801699578762054, + "learning_rate": 2.302786426528926e-05, + "loss": 0.5772, + "step": 1390 + }, + { + "epoch": 0.3857459789240155, + "grad_norm": 0.1934468299150467, + "learning_rate": 2.3024886412896302e-05, + "loss": 0.5426, + "step": 1391 + }, + { + "epoch": 0.3860232945091514, + "grad_norm": 0.20585696399211884, + "learning_rate": 2.3021906506819152e-05, + "loss": 0.5521, + "step": 1392 + }, + { + "epoch": 0.3863006100942873, + "grad_norm": 0.19114629924297333, + "learning_rate": 2.3018924547639272e-05, + "loss": 0.597, + "step": 1393 + }, + { + "epoch": 0.3865779256794232, + "grad_norm": 0.18134154379367828, + "learning_rate": 2.301594053593852e-05, + "loss": 0.5473, + "step": 1394 + }, + { + "epoch": 0.38685524126455906, + "grad_norm": 0.18706971406936646, + "learning_rate": 2.301295447229915e-05, + "loss": 0.5843, + "step": 1395 + }, + { + "epoch": 0.38713255684969494, + "grad_norm": 0.183674156665802, + "learning_rate": 2.300996635730383e-05, + "loss": 0.5268, + "step": 1396 + }, + { + "epoch": 0.3874098724348308, + "grad_norm": 0.18946042656898499, + "learning_rate": 2.3006976191535616e-05, + "loss": 0.5529, + "step": 1397 + }, + { + "epoch": 0.3876871880199667, + "grad_norm": 0.20170167088508606, + "learning_rate": 2.3003983975577975e-05, + "loss": 0.5781, + "step": 1398 + }, + { + "epoch": 0.3879645036051026, + "grad_norm": 0.1854201704263687, + "learning_rate": 2.300098971001476e-05, + "loss": 0.536, + "step": 1399 + }, + { + "epoch": 0.3882418191902385, + "grad_norm": 0.19159002602100372, + "learning_rate": 2.299799339543023e-05, + "loss": 0.584, + "step": 1400 + }, + { + "epoch": 0.38851913477537436, + "grad_norm": 0.18878300487995148, + "learning_rate": 2.299499503240905e-05, + "loss": 0.5613, + "step": 1401 + }, + { + "epoch": 0.38879645036051025, + "grad_norm": 0.2083284854888916, + "learning_rate": 2.2991994621536283e-05, + "loss": 0.5607, + "step": 1402 + }, + { + "epoch": 0.38907376594564613, + "grad_norm": 0.19140368700027466, + "learning_rate": 2.2988992163397386e-05, + "loss": 0.5723, + "step": 1403 + }, + { + "epoch": 0.389351081530782, + "grad_norm": 0.19407185912132263, + "learning_rate": 2.2985987658578217e-05, + "loss": 0.5592, + "step": 1404 + }, + { + "epoch": 0.3896283971159179, + "grad_norm": 0.202662855386734, + "learning_rate": 2.298298110766503e-05, + "loss": 0.5682, + "step": 1405 + }, + { + "epoch": 0.3899057127010538, + "grad_norm": 0.1902836710214615, + "learning_rate": 2.2979972511244493e-05, + "loss": 0.5626, + "step": 1406 + }, + { + "epoch": 0.39018302828618967, + "grad_norm": 0.19113852083683014, + "learning_rate": 2.2976961869903657e-05, + "loss": 0.5713, + "step": 1407 + }, + { + "epoch": 0.39046034387132555, + "grad_norm": 0.19765017926692963, + "learning_rate": 2.2973949184229975e-05, + "loss": 0.5811, + "step": 1408 + }, + { + "epoch": 0.39073765945646144, + "grad_norm": 0.1867290586233139, + "learning_rate": 2.2970934454811306e-05, + "loss": 0.5094, + "step": 1409 + }, + { + "epoch": 0.3910149750415973, + "grad_norm": 0.19188763201236725, + "learning_rate": 2.2967917682235905e-05, + "loss": 0.5736, + "step": 1410 + }, + { + "epoch": 0.3912922906267332, + "grad_norm": 0.19784973561763763, + "learning_rate": 2.296489886709242e-05, + "loss": 0.5739, + "step": 1411 + }, + { + "epoch": 0.3915696062118691, + "grad_norm": 0.19761349260807037, + "learning_rate": 2.2961878009969904e-05, + "loss": 0.5702, + "step": 1412 + }, + { + "epoch": 0.391846921797005, + "grad_norm": 0.19187502562999725, + "learning_rate": 2.2958855111457804e-05, + "loss": 0.5575, + "step": 1413 + }, + { + "epoch": 0.39212423738214086, + "grad_norm": 0.19723129272460938, + "learning_rate": 2.2955830172145975e-05, + "loss": 0.5757, + "step": 1414 + }, + { + "epoch": 0.39240155296727675, + "grad_norm": 0.1946565806865692, + "learning_rate": 2.2952803192624653e-05, + "loss": 0.5696, + "step": 1415 + }, + { + "epoch": 0.39267886855241263, + "grad_norm": 0.1714453399181366, + "learning_rate": 2.2949774173484488e-05, + "loss": 0.5783, + "step": 1416 + }, + { + "epoch": 0.3929561841375485, + "grad_norm": 0.1907140165567398, + "learning_rate": 2.2946743115316518e-05, + "loss": 0.5642, + "step": 1417 + }, + { + "epoch": 0.3932334997226844, + "grad_norm": 0.18922662734985352, + "learning_rate": 2.294371001871219e-05, + "loss": 0.5403, + "step": 1418 + }, + { + "epoch": 0.3935108153078203, + "grad_norm": 0.18016541004180908, + "learning_rate": 2.294067488426333e-05, + "loss": 0.5428, + "step": 1419 + }, + { + "epoch": 0.39378813089295617, + "grad_norm": 0.18581193685531616, + "learning_rate": 2.293763771256218e-05, + "loss": 0.5671, + "step": 1420 + }, + { + "epoch": 0.39406544647809205, + "grad_norm": 0.19479569792747498, + "learning_rate": 2.293459850420138e-05, + "loss": 0.5471, + "step": 1421 + }, + { + "epoch": 0.39434276206322794, + "grad_norm": 0.19193242490291595, + "learning_rate": 2.2931557259773944e-05, + "loss": 0.5422, + "step": 1422 + }, + { + "epoch": 0.3946200776483638, + "grad_norm": 0.1958051174879074, + "learning_rate": 2.2928513979873312e-05, + "loss": 0.5605, + "step": 1423 + }, + { + "epoch": 0.3948973932334997, + "grad_norm": 0.1831037700176239, + "learning_rate": 2.2925468665093304e-05, + "loss": 0.5632, + "step": 1424 + }, + { + "epoch": 0.3951747088186356, + "grad_norm": 0.19089363515377045, + "learning_rate": 2.2922421316028142e-05, + "loss": 0.5534, + "step": 1425 + }, + { + "epoch": 0.3954520244037715, + "grad_norm": 0.18920765817165375, + "learning_rate": 2.2919371933272445e-05, + "loss": 0.5708, + "step": 1426 + }, + { + "epoch": 0.39572933998890736, + "grad_norm": 0.1790907382965088, + "learning_rate": 2.2916320517421224e-05, + "loss": 0.54, + "step": 1427 + }, + { + "epoch": 0.39600665557404324, + "grad_norm": 0.19448032975196838, + "learning_rate": 2.29132670690699e-05, + "loss": 0.5713, + "step": 1428 + }, + { + "epoch": 0.3962839711591791, + "grad_norm": 0.18597151339054108, + "learning_rate": 2.2910211588814272e-05, + "loss": 0.5619, + "step": 1429 + }, + { + "epoch": 0.396561286744315, + "grad_norm": 0.19088095426559448, + "learning_rate": 2.2907154077250554e-05, + "loss": 0.5567, + "step": 1430 + }, + { + "epoch": 0.3968386023294509, + "grad_norm": 0.2470846325159073, + "learning_rate": 2.290409453497534e-05, + "loss": 0.5567, + "step": 1431 + }, + { + "epoch": 0.3971159179145868, + "grad_norm": 0.189782977104187, + "learning_rate": 2.2901032962585633e-05, + "loss": 0.5689, + "step": 1432 + }, + { + "epoch": 0.39739323349972266, + "grad_norm": 0.18323121964931488, + "learning_rate": 2.289796936067882e-05, + "loss": 0.5695, + "step": 1433 + }, + { + "epoch": 0.39767054908485855, + "grad_norm": 0.19183968007564545, + "learning_rate": 2.28949037298527e-05, + "loss": 0.5527, + "step": 1434 + }, + { + "epoch": 0.39794786466999443, + "grad_norm": 0.2490936517715454, + "learning_rate": 2.2891836070705454e-05, + "loss": 0.5705, + "step": 1435 + }, + { + "epoch": 0.3982251802551303, + "grad_norm": 0.18432928621768951, + "learning_rate": 2.2888766383835664e-05, + "loss": 0.5542, + "step": 1436 + }, + { + "epoch": 0.3985024958402662, + "grad_norm": 0.18222655355930328, + "learning_rate": 2.2885694669842305e-05, + "loss": 0.5693, + "step": 1437 + }, + { + "epoch": 0.3987798114254021, + "grad_norm": 0.18472431600093842, + "learning_rate": 2.2882620929324758e-05, + "loss": 0.5664, + "step": 1438 + }, + { + "epoch": 0.39905712701053797, + "grad_norm": 0.19013690948486328, + "learning_rate": 2.2879545162882782e-05, + "loss": 0.534, + "step": 1439 + }, + { + "epoch": 0.39933444259567386, + "grad_norm": 0.1890943944454193, + "learning_rate": 2.2876467371116546e-05, + "loss": 0.5537, + "step": 1440 + }, + { + "epoch": 0.39961175818080974, + "grad_norm": 0.19494980573654175, + "learning_rate": 2.28733875546266e-05, + "loss": 0.5502, + "step": 1441 + }, + { + "epoch": 0.3998890737659456, + "grad_norm": 0.215170755982399, + "learning_rate": 2.2870305714013908e-05, + "loss": 0.57, + "step": 1442 + }, + { + "epoch": 0.4001663893510815, + "grad_norm": 0.20500238239765167, + "learning_rate": 2.2867221849879816e-05, + "loss": 0.5648, + "step": 1443 + }, + { + "epoch": 0.4004437049362174, + "grad_norm": 0.1898437738418579, + "learning_rate": 2.2864135962826067e-05, + "loss": 0.5685, + "step": 1444 + }, + { + "epoch": 0.4007210205213533, + "grad_norm": 0.1926756501197815, + "learning_rate": 2.2861048053454797e-05, + "loss": 0.5514, + "step": 1445 + }, + { + "epoch": 0.40099833610648916, + "grad_norm": 0.19258753955364227, + "learning_rate": 2.2857958122368545e-05, + "loss": 0.5397, + "step": 1446 + }, + { + "epoch": 0.40127565169162505, + "grad_norm": 0.18636788427829742, + "learning_rate": 2.285486617017023e-05, + "loss": 0.5421, + "step": 1447 + }, + { + "epoch": 0.40155296727676093, + "grad_norm": 0.20043087005615234, + "learning_rate": 2.2851772197463184e-05, + "loss": 0.5241, + "step": 1448 + }, + { + "epoch": 0.4018302828618968, + "grad_norm": 0.2062230408191681, + "learning_rate": 2.284867620485111e-05, + "loss": 0.5702, + "step": 1449 + }, + { + "epoch": 0.4021075984470327, + "grad_norm": 0.1930972784757614, + "learning_rate": 2.284557819293813e-05, + "loss": 0.5533, + "step": 1450 + }, + { + "epoch": 0.4023849140321686, + "grad_norm": 0.20658205449581146, + "learning_rate": 2.284247816232874e-05, + "loss": 0.5613, + "step": 1451 + }, + { + "epoch": 0.40266222961730447, + "grad_norm": 0.2103574126958847, + "learning_rate": 2.2839376113627848e-05, + "loss": 0.5668, + "step": 1452 + }, + { + "epoch": 0.40293954520244035, + "grad_norm": 0.19861139357089996, + "learning_rate": 2.2836272047440733e-05, + "loss": 0.5549, + "step": 1453 + }, + { + "epoch": 0.40321686078757624, + "grad_norm": 0.1885387897491455, + "learning_rate": 2.2833165964373093e-05, + "loss": 0.565, + "step": 1454 + }, + { + "epoch": 0.4034941763727121, + "grad_norm": 0.18594177067279816, + "learning_rate": 2.2830057865030997e-05, + "loss": 0.5129, + "step": 1455 + }, + { + "epoch": 0.403771491957848, + "grad_norm": 0.19150151312351227, + "learning_rate": 2.282694775002092e-05, + "loss": 0.5591, + "step": 1456 + }, + { + "epoch": 0.4040488075429839, + "grad_norm": 0.1973596066236496, + "learning_rate": 2.2823835619949735e-05, + "loss": 0.5795, + "step": 1457 + }, + { + "epoch": 0.4043261231281198, + "grad_norm": 0.21641992032527924, + "learning_rate": 2.2820721475424693e-05, + "loss": 0.5286, + "step": 1458 + }, + { + "epoch": 0.40460343871325566, + "grad_norm": 0.20730352401733398, + "learning_rate": 2.281760531705345e-05, + "loss": 0.578, + "step": 1459 + }, + { + "epoch": 0.40488075429839154, + "grad_norm": 0.19733300805091858, + "learning_rate": 2.281448714544405e-05, + "loss": 0.6048, + "step": 1460 + }, + { + "epoch": 0.4051580698835274, + "grad_norm": 0.2007417231798172, + "learning_rate": 2.281136696120493e-05, + "loss": 0.588, + "step": 1461 + }, + { + "epoch": 0.4054353854686633, + "grad_norm": 0.1785038709640503, + "learning_rate": 2.280824476494492e-05, + "loss": 0.5505, + "step": 1462 + }, + { + "epoch": 0.4057127010537992, + "grad_norm": 0.18670029938220978, + "learning_rate": 2.2805120557273246e-05, + "loss": 0.5533, + "step": 1463 + }, + { + "epoch": 0.4059900166389351, + "grad_norm": 0.18775032460689545, + "learning_rate": 2.2801994338799525e-05, + "loss": 0.5457, + "step": 1464 + }, + { + "epoch": 0.406267332224071, + "grad_norm": 0.18645282089710236, + "learning_rate": 2.2798866110133758e-05, + "loss": 0.5595, + "step": 1465 + }, + { + "epoch": 0.4065446478092069, + "grad_norm": 0.2067873477935791, + "learning_rate": 2.279573587188635e-05, + "loss": 0.5438, + "step": 1466 + }, + { + "epoch": 0.4068219633943428, + "grad_norm": 0.18860718607902527, + "learning_rate": 2.2792603624668097e-05, + "loss": 0.541, + "step": 1467 + }, + { + "epoch": 0.4070992789794787, + "grad_norm": 0.18750540912151337, + "learning_rate": 2.2789469369090173e-05, + "loss": 0.5567, + "step": 1468 + }, + { + "epoch": 0.40737659456461456, + "grad_norm": 0.18958985805511475, + "learning_rate": 2.2786333105764162e-05, + "loss": 0.5413, + "step": 1469 + }, + { + "epoch": 0.40765391014975044, + "grad_norm": 0.27570971846580505, + "learning_rate": 2.2783194835302035e-05, + "loss": 0.5548, + "step": 1470 + }, + { + "epoch": 0.4079312257348863, + "grad_norm": 0.1889680027961731, + "learning_rate": 2.2780054558316146e-05, + "loss": 0.5507, + "step": 1471 + }, + { + "epoch": 0.4082085413200222, + "grad_norm": 0.18410347402095795, + "learning_rate": 2.277691227541925e-05, + "loss": 0.5689, + "step": 1472 + }, + { + "epoch": 0.4084858569051581, + "grad_norm": 0.18935447931289673, + "learning_rate": 2.277376798722448e-05, + "loss": 0.5631, + "step": 1473 + }, + { + "epoch": 0.408763172490294, + "grad_norm": 0.19542036950588226, + "learning_rate": 2.2770621694345385e-05, + "loss": 0.5614, + "step": 1474 + }, + { + "epoch": 0.40904048807542986, + "grad_norm": 0.2004593461751938, + "learning_rate": 2.2767473397395876e-05, + "loss": 0.5707, + "step": 1475 + }, + { + "epoch": 0.40931780366056575, + "grad_norm": 0.19551704823970795, + "learning_rate": 2.276432309699028e-05, + "loss": 0.5656, + "step": 1476 + }, + { + "epoch": 0.40959511924570163, + "grad_norm": 0.18432258069515228, + "learning_rate": 2.27611707937433e-05, + "loss": 0.5696, + "step": 1477 + }, + { + "epoch": 0.4098724348308375, + "grad_norm": 0.20155014097690582, + "learning_rate": 2.2758016488270033e-05, + "loss": 0.581, + "step": 1478 + }, + { + "epoch": 0.4101497504159734, + "grad_norm": 0.1944228559732437, + "learning_rate": 2.2754860181185967e-05, + "loss": 0.565, + "step": 1479 + }, + { + "epoch": 0.4104270660011093, + "grad_norm": 0.20183973014354706, + "learning_rate": 2.2751701873106983e-05, + "loss": 0.5743, + "step": 1480 + }, + { + "epoch": 0.41070438158624517, + "grad_norm": 0.18106213212013245, + "learning_rate": 2.274854156464935e-05, + "loss": 0.5726, + "step": 1481 + }, + { + "epoch": 0.41098169717138106, + "grad_norm": 0.19121475517749786, + "learning_rate": 2.2745379256429728e-05, + "loss": 0.5856, + "step": 1482 + }, + { + "epoch": 0.41125901275651694, + "grad_norm": 0.1916564702987671, + "learning_rate": 2.2742214949065166e-05, + "loss": 0.5494, + "step": 1483 + }, + { + "epoch": 0.4115363283416528, + "grad_norm": 0.2309359312057495, + "learning_rate": 2.2739048643173105e-05, + "loss": 0.575, + "step": 1484 + }, + { + "epoch": 0.4118136439267887, + "grad_norm": 0.3025970757007599, + "learning_rate": 2.2735880339371373e-05, + "loss": 0.5717, + "step": 1485 + }, + { + "epoch": 0.4120909595119246, + "grad_norm": 0.19141024351119995, + "learning_rate": 2.27327100382782e-05, + "loss": 0.5743, + "step": 1486 + }, + { + "epoch": 0.4123682750970605, + "grad_norm": 0.19072705507278442, + "learning_rate": 2.272953774051218e-05, + "loss": 0.5498, + "step": 1487 + }, + { + "epoch": 0.41264559068219636, + "grad_norm": 0.23282243311405182, + "learning_rate": 2.2726363446692324e-05, + "loss": 0.5653, + "step": 1488 + }, + { + "epoch": 0.41292290626733225, + "grad_norm": 0.1904718577861786, + "learning_rate": 2.2723187157438015e-05, + "loss": 0.5557, + "step": 1489 + }, + { + "epoch": 0.41320022185246813, + "grad_norm": 0.18210548162460327, + "learning_rate": 2.2720008873369036e-05, + "loss": 0.5648, + "step": 1490 + }, + { + "epoch": 0.413477537437604, + "grad_norm": 0.1940220296382904, + "learning_rate": 2.271682859510555e-05, + "loss": 0.5824, + "step": 1491 + }, + { + "epoch": 0.4137548530227399, + "grad_norm": 0.18632103502750397, + "learning_rate": 2.2713646323268113e-05, + "loss": 0.5694, + "step": 1492 + }, + { + "epoch": 0.4140321686078758, + "grad_norm": 0.19417639076709747, + "learning_rate": 2.2710462058477676e-05, + "loss": 0.5385, + "step": 1493 + }, + { + "epoch": 0.41430948419301167, + "grad_norm": 0.1876698136329651, + "learning_rate": 2.270727580135557e-05, + "loss": 0.5441, + "step": 1494 + }, + { + "epoch": 0.41458679977814755, + "grad_norm": 0.19295859336853027, + "learning_rate": 2.270408755252352e-05, + "loss": 0.5587, + "step": 1495 + }, + { + "epoch": 0.41486411536328344, + "grad_norm": 0.1876155138015747, + "learning_rate": 2.2700897312603635e-05, + "loss": 0.5597, + "step": 1496 + }, + { + "epoch": 0.4151414309484193, + "grad_norm": 0.18888817727565765, + "learning_rate": 2.2697705082218417e-05, + "loss": 0.58, + "step": 1497 + }, + { + "epoch": 0.4154187465335552, + "grad_norm": 0.1931556612253189, + "learning_rate": 2.2694510861990755e-05, + "loss": 0.5195, + "step": 1498 + }, + { + "epoch": 0.4156960621186911, + "grad_norm": 0.20737329125404358, + "learning_rate": 2.2691314652543922e-05, + "loss": 0.5742, + "step": 1499 + }, + { + "epoch": 0.415973377703827, + "grad_norm": 0.18405389785766602, + "learning_rate": 2.268811645450159e-05, + "loss": 0.5572, + "step": 1500 + }, + { + "epoch": 0.41625069328896286, + "grad_norm": 0.2047012746334076, + "learning_rate": 2.2684916268487805e-05, + "loss": 0.5682, + "step": 1501 + }, + { + "epoch": 0.41652800887409874, + "grad_norm": 0.19159288704395294, + "learning_rate": 2.2681714095127016e-05, + "loss": 0.5642, + "step": 1502 + }, + { + "epoch": 0.4168053244592346, + "grad_norm": 0.19872340559959412, + "learning_rate": 2.2678509935044046e-05, + "loss": 0.5801, + "step": 1503 + }, + { + "epoch": 0.4170826400443705, + "grad_norm": 0.22284448146820068, + "learning_rate": 2.267530378886411e-05, + "loss": 0.5468, + "step": 1504 + }, + { + "epoch": 0.4173599556295064, + "grad_norm": 0.18655410408973694, + "learning_rate": 2.2672095657212822e-05, + "loss": 0.5557, + "step": 1505 + }, + { + "epoch": 0.4176372712146423, + "grad_norm": 0.1884729266166687, + "learning_rate": 2.266888554071616e-05, + "loss": 0.5641, + "step": 1506 + }, + { + "epoch": 0.41791458679977816, + "grad_norm": 0.1828029453754425, + "learning_rate": 2.2665673440000512e-05, + "loss": 0.5295, + "step": 1507 + }, + { + "epoch": 0.41819190238491405, + "grad_norm": 0.19014927744865417, + "learning_rate": 2.2662459355692645e-05, + "loss": 0.5385, + "step": 1508 + }, + { + "epoch": 0.41846921797004993, + "grad_norm": 0.20038922131061554, + "learning_rate": 2.26592432884197e-05, + "loss": 0.5763, + "step": 1509 + }, + { + "epoch": 0.4187465335551858, + "grad_norm": 0.19587905704975128, + "learning_rate": 2.2656025238809233e-05, + "loss": 0.5642, + "step": 1510 + }, + { + "epoch": 0.4190238491403217, + "grad_norm": 0.18529456853866577, + "learning_rate": 2.265280520748916e-05, + "loss": 0.5467, + "step": 1511 + }, + { + "epoch": 0.4193011647254576, + "grad_norm": 0.19396060705184937, + "learning_rate": 2.26495831950878e-05, + "loss": 0.5756, + "step": 1512 + }, + { + "epoch": 0.41957848031059347, + "grad_norm": 0.19712162017822266, + "learning_rate": 2.2646359202233848e-05, + "loss": 0.5856, + "step": 1513 + }, + { + "epoch": 0.41985579589572936, + "grad_norm": 0.1929578334093094, + "learning_rate": 2.264313322955639e-05, + "loss": 0.5677, + "step": 1514 + }, + { + "epoch": 0.42013311148086524, + "grad_norm": 0.19393184781074524, + "learning_rate": 2.263990527768491e-05, + "loss": 0.5409, + "step": 1515 + }, + { + "epoch": 0.4204104270660011, + "grad_norm": 0.19869175553321838, + "learning_rate": 2.2636675347249252e-05, + "loss": 0.5344, + "step": 1516 + }, + { + "epoch": 0.420687742651137, + "grad_norm": 0.19600443542003632, + "learning_rate": 2.263344343887967e-05, + "loss": 0.5627, + "step": 1517 + }, + { + "epoch": 0.4209650582362729, + "grad_norm": 0.19098101556301117, + "learning_rate": 2.263020955320679e-05, + "loss": 0.5704, + "step": 1518 + }, + { + "epoch": 0.4212423738214088, + "grad_norm": 0.20919351279735565, + "learning_rate": 2.2626973690861635e-05, + "loss": 0.55, + "step": 1519 + }, + { + "epoch": 0.42151968940654466, + "grad_norm": 0.21177563071250916, + "learning_rate": 2.2623735852475602e-05, + "loss": 0.5723, + "step": 1520 + }, + { + "epoch": 0.42179700499168055, + "grad_norm": 0.19463296234607697, + "learning_rate": 2.262049603868048e-05, + "loss": 0.5601, + "step": 1521 + }, + { + "epoch": 0.42207432057681643, + "grad_norm": 0.20798444747924805, + "learning_rate": 2.2617254250108445e-05, + "loss": 0.5606, + "step": 1522 + }, + { + "epoch": 0.4223516361619523, + "grad_norm": 0.20048515498638153, + "learning_rate": 2.2614010487392053e-05, + "loss": 0.5628, + "step": 1523 + }, + { + "epoch": 0.4226289517470882, + "grad_norm": 0.1931045949459076, + "learning_rate": 2.2610764751164253e-05, + "loss": 0.5662, + "step": 1524 + }, + { + "epoch": 0.4229062673322241, + "grad_norm": 0.19520334899425507, + "learning_rate": 2.2607517042058367e-05, + "loss": 0.5552, + "step": 1525 + }, + { + "epoch": 0.42318358291735997, + "grad_norm": 0.1984749138355255, + "learning_rate": 2.2604267360708113e-05, + "loss": 0.5672, + "step": 1526 + }, + { + "epoch": 0.42346089850249585, + "grad_norm": 0.19392019510269165, + "learning_rate": 2.2601015707747585e-05, + "loss": 0.5689, + "step": 1527 + }, + { + "epoch": 0.42373821408763174, + "grad_norm": 0.1880037933588028, + "learning_rate": 2.2597762083811276e-05, + "loss": 0.5606, + "step": 1528 + }, + { + "epoch": 0.4240155296727676, + "grad_norm": 0.19556698203086853, + "learning_rate": 2.259450648953405e-05, + "loss": 0.5626, + "step": 1529 + }, + { + "epoch": 0.4242928452579035, + "grad_norm": 0.18473385274410248, + "learning_rate": 2.2591248925551156e-05, + "loss": 0.541, + "step": 1530 + }, + { + "epoch": 0.4245701608430394, + "grad_norm": 0.2530158460140228, + "learning_rate": 2.2587989392498237e-05, + "loss": 0.5429, + "step": 1531 + }, + { + "epoch": 0.4248474764281753, + "grad_norm": 0.22607286274433136, + "learning_rate": 2.258472789101131e-05, + "loss": 0.5578, + "step": 1532 + }, + { + "epoch": 0.42512479201331116, + "grad_norm": 0.19067604839801788, + "learning_rate": 2.258146442172678e-05, + "loss": 0.5474, + "step": 1533 + }, + { + "epoch": 0.42540210759844704, + "grad_norm": 0.18085655570030212, + "learning_rate": 2.257819898528144e-05, + "loss": 0.5506, + "step": 1534 + }, + { + "epoch": 0.42567942318358293, + "grad_norm": 0.18807153403759003, + "learning_rate": 2.257493158231246e-05, + "loss": 0.5461, + "step": 1535 + }, + { + "epoch": 0.4259567387687188, + "grad_norm": 0.18786919116973877, + "learning_rate": 2.25716622134574e-05, + "loss": 0.5781, + "step": 1536 + }, + { + "epoch": 0.4262340543538547, + "grad_norm": 0.19066756963729858, + "learning_rate": 2.2568390879354195e-05, + "loss": 0.5378, + "step": 1537 + }, + { + "epoch": 0.4265113699389906, + "grad_norm": 0.1895960569381714, + "learning_rate": 2.2565117580641175e-05, + "loss": 0.5661, + "step": 1538 + }, + { + "epoch": 0.42678868552412647, + "grad_norm": 0.21506737172603607, + "learning_rate": 2.2561842317957045e-05, + "loss": 0.661, + "step": 1539 + }, + { + "epoch": 0.42706600110926235, + "grad_norm": 0.1904166042804718, + "learning_rate": 2.2558565091940895e-05, + "loss": 0.5643, + "step": 1540 + }, + { + "epoch": 0.42734331669439823, + "grad_norm": 0.18517723679542542, + "learning_rate": 2.2555285903232197e-05, + "loss": 0.5509, + "step": 1541 + }, + { + "epoch": 0.4276206322795341, + "grad_norm": 0.2003047913312912, + "learning_rate": 2.2552004752470814e-05, + "loss": 0.5487, + "step": 1542 + }, + { + "epoch": 0.42789794786467, + "grad_norm": 0.19767479598522186, + "learning_rate": 2.2548721640296976e-05, + "loss": 0.5534, + "step": 1543 + }, + { + "epoch": 0.4281752634498059, + "grad_norm": 0.1890031397342682, + "learning_rate": 2.2545436567351312e-05, + "loss": 0.5762, + "step": 1544 + }, + { + "epoch": 0.42845257903494177, + "grad_norm": 0.22215117514133453, + "learning_rate": 2.2542149534274827e-05, + "loss": 0.5433, + "step": 1545 + }, + { + "epoch": 0.42872989462007766, + "grad_norm": 0.17825603485107422, + "learning_rate": 2.2538860541708902e-05, + "loss": 0.5724, + "step": 1546 + }, + { + "epoch": 0.42900721020521354, + "grad_norm": 0.1967187076807022, + "learning_rate": 2.2535569590295313e-05, + "loss": 0.5632, + "step": 1547 + }, + { + "epoch": 0.4292845257903494, + "grad_norm": 0.18680186569690704, + "learning_rate": 2.253227668067621e-05, + "loss": 0.5704, + "step": 1548 + }, + { + "epoch": 0.4295618413754853, + "grad_norm": 0.17610576748847961, + "learning_rate": 2.2528981813494127e-05, + "loss": 0.5295, + "step": 1549 + }, + { + "epoch": 0.4298391569606212, + "grad_norm": 0.19055548310279846, + "learning_rate": 2.2525684989391975e-05, + "loss": 0.5651, + "step": 1550 + }, + { + "epoch": 0.4301164725457571, + "grad_norm": 0.19191017746925354, + "learning_rate": 2.2522386209013062e-05, + "loss": 0.5366, + "step": 1551 + }, + { + "epoch": 0.43039378813089296, + "grad_norm": 0.20109041035175323, + "learning_rate": 2.2519085473001055e-05, + "loss": 0.5508, + "step": 1552 + }, + { + "epoch": 0.43067110371602885, + "grad_norm": 0.18693628907203674, + "learning_rate": 2.2515782782000027e-05, + "loss": 0.5603, + "step": 1553 + }, + { + "epoch": 0.43094841930116473, + "grad_norm": 0.18805643916130066, + "learning_rate": 2.2512478136654412e-05, + "loss": 0.5197, + "step": 1554 + }, + { + "epoch": 0.4312257348863006, + "grad_norm": 0.19016428291797638, + "learning_rate": 2.2509171537609042e-05, + "loss": 0.5719, + "step": 1555 + }, + { + "epoch": 0.4315030504714365, + "grad_norm": 0.18083550035953522, + "learning_rate": 2.2505862985509112e-05, + "loss": 0.5502, + "step": 1556 + }, + { + "epoch": 0.4317803660565724, + "grad_norm": 0.18382135033607483, + "learning_rate": 2.2502552481000218e-05, + "loss": 0.5437, + "step": 1557 + }, + { + "epoch": 0.43205768164170827, + "grad_norm": 0.20305176079273224, + "learning_rate": 2.2499240024728316e-05, + "loss": 0.5559, + "step": 1558 + }, + { + "epoch": 0.43233499722684415, + "grad_norm": 0.19593636691570282, + "learning_rate": 2.2495925617339765e-05, + "loss": 0.5695, + "step": 1559 + }, + { + "epoch": 0.43261231281198004, + "grad_norm": 0.18708528578281403, + "learning_rate": 2.2492609259481283e-05, + "loss": 0.5828, + "step": 1560 + }, + { + "epoch": 0.4328896283971159, + "grad_norm": 0.20516738295555115, + "learning_rate": 2.248929095179999e-05, + "loss": 0.5691, + "step": 1561 + }, + { + "epoch": 0.4331669439822518, + "grad_norm": 0.19292466342449188, + "learning_rate": 2.248597069494337e-05, + "loss": 0.5567, + "step": 1562 + }, + { + "epoch": 0.4334442595673877, + "grad_norm": 0.18612238764762878, + "learning_rate": 2.2482648489559296e-05, + "loss": 0.5497, + "step": 1563 + }, + { + "epoch": 0.4337215751525236, + "grad_norm": 0.18284805119037628, + "learning_rate": 2.2479324336296016e-05, + "loss": 0.5305, + "step": 1564 + }, + { + "epoch": 0.43399889073765946, + "grad_norm": 0.19658204913139343, + "learning_rate": 2.247599823580216e-05, + "loss": 0.5631, + "step": 1565 + }, + { + "epoch": 0.43427620632279534, + "grad_norm": 0.19752554595470428, + "learning_rate": 2.2472670188726737e-05, + "loss": 0.5647, + "step": 1566 + }, + { + "epoch": 0.43455352190793123, + "grad_norm": 0.18531207740306854, + "learning_rate": 2.246934019571914e-05, + "loss": 0.5667, + "step": 1567 + }, + { + "epoch": 0.4348308374930671, + "grad_norm": 0.18967878818511963, + "learning_rate": 2.2466008257429142e-05, + "loss": 0.5572, + "step": 1568 + }, + { + "epoch": 0.435108153078203, + "grad_norm": 0.20551453530788422, + "learning_rate": 2.2462674374506886e-05, + "loss": 0.5554, + "step": 1569 + }, + { + "epoch": 0.4353854686633389, + "grad_norm": 0.1888595074415207, + "learning_rate": 2.2459338547602905e-05, + "loss": 0.5986, + "step": 1570 + }, + { + "epoch": 0.43566278424847477, + "grad_norm": 0.1824692189693451, + "learning_rate": 2.2456000777368102e-05, + "loss": 0.5223, + "step": 1571 + }, + { + "epoch": 0.43594009983361065, + "grad_norm": 0.189620703458786, + "learning_rate": 2.245266106445377e-05, + "loss": 0.5465, + "step": 1572 + }, + { + "epoch": 0.43621741541874653, + "grad_norm": 0.20219853520393372, + "learning_rate": 2.2449319409511574e-05, + "loss": 0.5655, + "step": 1573 + }, + { + "epoch": 0.4364947310038824, + "grad_norm": 0.18667539954185486, + "learning_rate": 2.244597581319356e-05, + "loss": 0.5307, + "step": 1574 + }, + { + "epoch": 0.4367720465890183, + "grad_norm": 0.19490988552570343, + "learning_rate": 2.2442630276152148e-05, + "loss": 0.5666, + "step": 1575 + }, + { + "epoch": 0.4370493621741542, + "grad_norm": 0.18786631524562836, + "learning_rate": 2.2439282799040146e-05, + "loss": 0.535, + "step": 1576 + }, + { + "epoch": 0.4373266777592901, + "grad_norm": 0.1902604103088379, + "learning_rate": 2.2435933382510735e-05, + "loss": 0.5362, + "step": 1577 + }, + { + "epoch": 0.43760399334442596, + "grad_norm": 0.17974701523780823, + "learning_rate": 2.2432582027217473e-05, + "loss": 0.5538, + "step": 1578 + }, + { + "epoch": 0.43788130892956184, + "grad_norm": 0.1953812539577484, + "learning_rate": 2.2429228733814294e-05, + "loss": 0.5584, + "step": 1579 + }, + { + "epoch": 0.4381586245146977, + "grad_norm": 0.17894363403320312, + "learning_rate": 2.2425873502955524e-05, + "loss": 0.5667, + "step": 1580 + }, + { + "epoch": 0.4384359400998336, + "grad_norm": 0.2096855342388153, + "learning_rate": 2.2422516335295852e-05, + "loss": 0.5634, + "step": 1581 + }, + { + "epoch": 0.4387132556849695, + "grad_norm": 0.1870642900466919, + "learning_rate": 2.241915723149035e-05, + "loss": 0.5691, + "step": 1582 + }, + { + "epoch": 0.4389905712701054, + "grad_norm": 0.2048119157552719, + "learning_rate": 2.241579619219447e-05, + "loss": 0.5645, + "step": 1583 + }, + { + "epoch": 0.43926788685524126, + "grad_norm": 0.19508272409439087, + "learning_rate": 2.2412433218064037e-05, + "loss": 0.5543, + "step": 1584 + }, + { + "epoch": 0.43954520244037715, + "grad_norm": 0.1895490139722824, + "learning_rate": 2.240906830975526e-05, + "loss": 0.5522, + "step": 1585 + }, + { + "epoch": 0.43982251802551303, + "grad_norm": 0.18243685364723206, + "learning_rate": 2.240570146792472e-05, + "loss": 0.5695, + "step": 1586 + }, + { + "epoch": 0.4400998336106489, + "grad_norm": 0.18209905922412872, + "learning_rate": 2.2402332693229377e-05, + "loss": 0.5447, + "step": 1587 + }, + { + "epoch": 0.4403771491957848, + "grad_norm": 0.1850498467683792, + "learning_rate": 2.2398961986326567e-05, + "loss": 0.5289, + "step": 1588 + }, + { + "epoch": 0.4406544647809207, + "grad_norm": 0.19210520386695862, + "learning_rate": 2.2395589347874005e-05, + "loss": 0.5792, + "step": 1589 + }, + { + "epoch": 0.44093178036605657, + "grad_norm": 0.19306769967079163, + "learning_rate": 2.239221477852978e-05, + "loss": 0.5771, + "step": 1590 + }, + { + "epoch": 0.44120909595119245, + "grad_norm": 0.18809685111045837, + "learning_rate": 2.2388838278952367e-05, + "loss": 0.5648, + "step": 1591 + }, + { + "epoch": 0.44148641153632834, + "grad_norm": 0.19211184978485107, + "learning_rate": 2.2385459849800606e-05, + "loss": 0.5867, + "step": 1592 + }, + { + "epoch": 0.4417637271214642, + "grad_norm": 0.2126152366399765, + "learning_rate": 2.2382079491733715e-05, + "loss": 0.5705, + "step": 1593 + }, + { + "epoch": 0.4420410427066001, + "grad_norm": 0.18234777450561523, + "learning_rate": 2.23786972054113e-05, + "loss": 0.5373, + "step": 1594 + }, + { + "epoch": 0.442318358291736, + "grad_norm": 0.18619130551815033, + "learning_rate": 2.2375312991493324e-05, + "loss": 0.5525, + "step": 1595 + }, + { + "epoch": 0.4425956738768719, + "grad_norm": 0.19054090976715088, + "learning_rate": 2.237192685064014e-05, + "loss": 0.5606, + "step": 1596 + }, + { + "epoch": 0.44287298946200776, + "grad_norm": 0.19798876345157623, + "learning_rate": 2.236853878351248e-05, + "loss": 0.5389, + "step": 1597 + }, + { + "epoch": 0.44315030504714364, + "grad_norm": 0.20198500156402588, + "learning_rate": 2.2365148790771442e-05, + "loss": 0.577, + "step": 1598 + }, + { + "epoch": 0.44342762063227953, + "grad_norm": 0.22898751497268677, + "learning_rate": 2.2361756873078502e-05, + "loss": 0.5516, + "step": 1599 + }, + { + "epoch": 0.4437049362174154, + "grad_norm": 0.1942586600780487, + "learning_rate": 2.2358363031095513e-05, + "loss": 0.5583, + "step": 1600 + }, + { + "epoch": 0.4439822518025513, + "grad_norm": 0.18665020167827606, + "learning_rate": 2.23549672654847e-05, + "loss": 0.562, + "step": 1601 + }, + { + "epoch": 0.4442595673876872, + "grad_norm": 0.20378893613815308, + "learning_rate": 2.2351569576908675e-05, + "loss": 0.5783, + "step": 1602 + }, + { + "epoch": 0.44453688297282307, + "grad_norm": 0.21167699992656708, + "learning_rate": 2.2348169966030416e-05, + "loss": 0.5453, + "step": 1603 + }, + { + "epoch": 0.44481419855795895, + "grad_norm": 0.19410103559494019, + "learning_rate": 2.234476843351327e-05, + "loss": 0.5527, + "step": 1604 + }, + { + "epoch": 0.44509151414309484, + "grad_norm": 0.17932045459747314, + "learning_rate": 2.2341364980020973e-05, + "loss": 0.568, + "step": 1605 + }, + { + "epoch": 0.4453688297282307, + "grad_norm": 0.2007569819688797, + "learning_rate": 2.2337959606217624e-05, + "loss": 0.5699, + "step": 1606 + }, + { + "epoch": 0.4456461453133666, + "grad_norm": 0.18869513273239136, + "learning_rate": 2.2334552312767705e-05, + "loss": 0.5506, + "step": 1607 + }, + { + "epoch": 0.4459234608985025, + "grad_norm": 0.1867865025997162, + "learning_rate": 2.2331143100336072e-05, + "loss": 0.5758, + "step": 1608 + }, + { + "epoch": 0.4462007764836384, + "grad_norm": 0.19275221228599548, + "learning_rate": 2.2327731969587947e-05, + "loss": 0.5408, + "step": 1609 + }, + { + "epoch": 0.44647809206877426, + "grad_norm": 0.19611743092536926, + "learning_rate": 2.2324318921188932e-05, + "loss": 0.5974, + "step": 1610 + }, + { + "epoch": 0.44675540765391014, + "grad_norm": 0.20041632652282715, + "learning_rate": 2.2320903955805e-05, + "loss": 0.5598, + "step": 1611 + }, + { + "epoch": 0.447032723239046, + "grad_norm": 0.18512395024299622, + "learning_rate": 2.2317487074102514e-05, + "loss": 0.5661, + "step": 1612 + }, + { + "epoch": 0.4473100388241819, + "grad_norm": 0.1952909678220749, + "learning_rate": 2.2314068276748188e-05, + "loss": 0.5635, + "step": 1613 + }, + { + "epoch": 0.4475873544093178, + "grad_norm": 0.1981881707906723, + "learning_rate": 2.231064756440912e-05, + "loss": 0.5601, + "step": 1614 + }, + { + "epoch": 0.4478646699944537, + "grad_norm": 0.1989242285490036, + "learning_rate": 2.230722493775279e-05, + "loss": 0.5635, + "step": 1615 + }, + { + "epoch": 0.44814198557958956, + "grad_norm": 0.18769480288028717, + "learning_rate": 2.2303800397447034e-05, + "loss": 0.5589, + "step": 1616 + }, + { + "epoch": 0.44841930116472545, + "grad_norm": 0.19833675026893616, + "learning_rate": 2.230037394416007e-05, + "loss": 0.5622, + "step": 1617 + }, + { + "epoch": 0.44869661674986133, + "grad_norm": 0.19801415503025055, + "learning_rate": 2.2296945578560498e-05, + "loss": 0.5862, + "step": 1618 + }, + { + "epoch": 0.4489739323349972, + "grad_norm": 0.1936810463666916, + "learning_rate": 2.2293515301317274e-05, + "loss": 0.5452, + "step": 1619 + }, + { + "epoch": 0.4492512479201331, + "grad_norm": 0.20558279752731323, + "learning_rate": 2.2290083113099748e-05, + "loss": 0.5573, + "step": 1620 + }, + { + "epoch": 0.449528563505269, + "grad_norm": 0.2063780575990677, + "learning_rate": 2.2286649014577615e-05, + "loss": 0.6017, + "step": 1621 + }, + { + "epoch": 0.44980587909040487, + "grad_norm": 0.18592418730258942, + "learning_rate": 2.2283213006420973e-05, + "loss": 0.5421, + "step": 1622 + }, + { + "epoch": 0.45008319467554075, + "grad_norm": 0.19346946477890015, + "learning_rate": 2.227977508930027e-05, + "loss": 0.53, + "step": 1623 + }, + { + "epoch": 0.45036051026067664, + "grad_norm": 0.2029414027929306, + "learning_rate": 2.2276335263886336e-05, + "loss": 0.5808, + "step": 1624 + }, + { + "epoch": 0.4506378258458125, + "grad_norm": 0.21214988827705383, + "learning_rate": 2.2272893530850373e-05, + "loss": 0.564, + "step": 1625 + }, + { + "epoch": 0.4509151414309484, + "grad_norm": 0.19748808443546295, + "learning_rate": 2.2269449890863956e-05, + "loss": 0.5562, + "step": 1626 + }, + { + "epoch": 0.4511924570160843, + "grad_norm": 0.19283835589885712, + "learning_rate": 2.2266004344599028e-05, + "loss": 0.5511, + "step": 1627 + }, + { + "epoch": 0.4514697726012202, + "grad_norm": 0.1917610466480255, + "learning_rate": 2.2262556892727904e-05, + "loss": 0.5744, + "step": 1628 + }, + { + "epoch": 0.45174708818635606, + "grad_norm": 0.20705603063106537, + "learning_rate": 2.225910753592328e-05, + "loss": 0.5786, + "step": 1629 + }, + { + "epoch": 0.45202440377149194, + "grad_norm": 0.19348299503326416, + "learning_rate": 2.225565627485821e-05, + "loss": 0.5899, + "step": 1630 + }, + { + "epoch": 0.45230171935662783, + "grad_norm": 0.19649489223957062, + "learning_rate": 2.2252203110206134e-05, + "loss": 0.5317, + "step": 1631 + }, + { + "epoch": 0.4525790349417637, + "grad_norm": 0.18169252574443817, + "learning_rate": 2.224874804264085e-05, + "loss": 0.5243, + "step": 1632 + }, + { + "epoch": 0.4528563505268996, + "grad_norm": 0.188733771443367, + "learning_rate": 2.224529107283653e-05, + "loss": 0.5376, + "step": 1633 + }, + { + "epoch": 0.4531336661120355, + "grad_norm": 0.18059036135673523, + "learning_rate": 2.2241832201467727e-05, + "loss": 0.5603, + "step": 1634 + }, + { + "epoch": 0.45341098169717137, + "grad_norm": 0.19139643013477325, + "learning_rate": 2.223837142920936e-05, + "loss": 0.5603, + "step": 1635 + }, + { + "epoch": 0.45368829728230725, + "grad_norm": 0.19182981550693512, + "learning_rate": 2.2234908756736712e-05, + "loss": 0.5805, + "step": 1636 + }, + { + "epoch": 0.45396561286744314, + "grad_norm": 0.187539204955101, + "learning_rate": 2.223144418472544e-05, + "loss": 0.5546, + "step": 1637 + }, + { + "epoch": 0.454242928452579, + "grad_norm": 0.19611942768096924, + "learning_rate": 2.2227977713851587e-05, + "loss": 0.5349, + "step": 1638 + }, + { + "epoch": 0.4545202440377149, + "grad_norm": 0.19672465324401855, + "learning_rate": 2.2224509344791536e-05, + "loss": 0.5342, + "step": 1639 + }, + { + "epoch": 0.4547975596228508, + "grad_norm": 0.17679478228092194, + "learning_rate": 2.222103907822207e-05, + "loss": 0.5473, + "step": 1640 + }, + { + "epoch": 0.4550748752079867, + "grad_norm": 0.1936340481042862, + "learning_rate": 2.2217566914820322e-05, + "loss": 0.5543, + "step": 1641 + }, + { + "epoch": 0.45535219079312256, + "grad_norm": 0.18610352277755737, + "learning_rate": 2.2214092855263813e-05, + "loss": 0.5412, + "step": 1642 + }, + { + "epoch": 0.45562950637825844, + "grad_norm": 0.18969598412513733, + "learning_rate": 2.2210616900230412e-05, + "loss": 0.5707, + "step": 1643 + }, + { + "epoch": 0.4559068219633943, + "grad_norm": 0.18808507919311523, + "learning_rate": 2.220713905039838e-05, + "loss": 0.5288, + "step": 1644 + }, + { + "epoch": 0.4561841375485302, + "grad_norm": 0.19103705883026123, + "learning_rate": 2.220365930644633e-05, + "loss": 0.5925, + "step": 1645 + }, + { + "epoch": 0.4564614531336661, + "grad_norm": 0.1837342530488968, + "learning_rate": 2.2200177669053258e-05, + "loss": 0.5893, + "step": 1646 + }, + { + "epoch": 0.456738768718802, + "grad_norm": 0.1928233504295349, + "learning_rate": 2.2196694138898517e-05, + "loss": 0.5445, + "step": 1647 + }, + { + "epoch": 0.45701608430393786, + "grad_norm": 0.195438414812088, + "learning_rate": 2.2193208716661846e-05, + "loss": 0.5561, + "step": 1648 + }, + { + "epoch": 0.45729339988907375, + "grad_norm": 0.18883578479290009, + "learning_rate": 2.2189721403023334e-05, + "loss": 0.5463, + "step": 1649 + }, + { + "epoch": 0.45757071547420963, + "grad_norm": 0.19664356112480164, + "learning_rate": 2.2186232198663455e-05, + "loss": 0.576, + "step": 1650 + }, + { + "epoch": 0.4578480310593455, + "grad_norm": 0.5387895703315735, + "learning_rate": 2.218274110426304e-05, + "loss": 0.562, + "step": 1651 + }, + { + "epoch": 0.4581253466444814, + "grad_norm": 0.18329556286334991, + "learning_rate": 2.21792481205033e-05, + "loss": 0.5737, + "step": 1652 + }, + { + "epoch": 0.4584026622296173, + "grad_norm": 0.20161500573158264, + "learning_rate": 2.21757532480658e-05, + "loss": 0.5711, + "step": 1653 + }, + { + "epoch": 0.45867997781475317, + "grad_norm": 0.20053423941135406, + "learning_rate": 2.2172256487632488e-05, + "loss": 0.5842, + "step": 1654 + }, + { + "epoch": 0.45895729339988905, + "grad_norm": 0.1900150030851364, + "learning_rate": 2.2168757839885672e-05, + "loss": 0.5702, + "step": 1655 + }, + { + "epoch": 0.45923460898502494, + "grad_norm": 0.19815103709697723, + "learning_rate": 2.2165257305508035e-05, + "loss": 0.5661, + "step": 1656 + }, + { + "epoch": 0.4595119245701608, + "grad_norm": 0.18877148628234863, + "learning_rate": 2.2161754885182623e-05, + "loss": 0.5278, + "step": 1657 + }, + { + "epoch": 0.4597892401552967, + "grad_norm": 0.1913890838623047, + "learning_rate": 2.215825057959285e-05, + "loss": 0.5342, + "step": 1658 + }, + { + "epoch": 0.4600665557404326, + "grad_norm": 0.18911883234977722, + "learning_rate": 2.2154744389422493e-05, + "loss": 0.5473, + "step": 1659 + }, + { + "epoch": 0.4603438713255685, + "grad_norm": 0.2087319940328598, + "learning_rate": 2.2151236315355714e-05, + "loss": 0.5839, + "step": 1660 + }, + { + "epoch": 0.46062118691070436, + "grad_norm": 0.19037386775016785, + "learning_rate": 2.214772635807702e-05, + "loss": 0.5518, + "step": 1661 + }, + { + "epoch": 0.46089850249584025, + "grad_norm": 0.1889956146478653, + "learning_rate": 2.2144214518271307e-05, + "loss": 0.5527, + "step": 1662 + }, + { + "epoch": 0.46117581808097613, + "grad_norm": 0.24889707565307617, + "learning_rate": 2.214070079662382e-05, + "loss": 0.5766, + "step": 1663 + }, + { + "epoch": 0.461453133666112, + "grad_norm": 0.19265350699424744, + "learning_rate": 2.213718519382018e-05, + "loss": 0.5721, + "step": 1664 + }, + { + "epoch": 0.4617304492512479, + "grad_norm": 0.19039899110794067, + "learning_rate": 2.213366771054638e-05, + "loss": 0.5678, + "step": 1665 + }, + { + "epoch": 0.4620077648363838, + "grad_norm": 0.203168585896492, + "learning_rate": 2.2130148347488773e-05, + "loss": 0.5486, + "step": 1666 + }, + { + "epoch": 0.46228508042151967, + "grad_norm": 0.19051045179367065, + "learning_rate": 2.2126627105334073e-05, + "loss": 0.544, + "step": 1667 + }, + { + "epoch": 0.46256239600665555, + "grad_norm": 0.18888403475284576, + "learning_rate": 2.212310398476937e-05, + "loss": 0.5436, + "step": 1668 + }, + { + "epoch": 0.46283971159179144, + "grad_norm": 0.1800607144832611, + "learning_rate": 2.2119578986482127e-05, + "loss": 0.5659, + "step": 1669 + }, + { + "epoch": 0.4631170271769273, + "grad_norm": 0.19722715020179749, + "learning_rate": 2.211605211116015e-05, + "loss": 0.5582, + "step": 1670 + }, + { + "epoch": 0.4633943427620632, + "grad_norm": 0.19088581204414368, + "learning_rate": 2.2112523359491637e-05, + "loss": 0.5471, + "step": 1671 + }, + { + "epoch": 0.4636716583471991, + "grad_norm": 0.19284148514270782, + "learning_rate": 2.210899273216514e-05, + "loss": 0.5408, + "step": 1672 + }, + { + "epoch": 0.463948973932335, + "grad_norm": 0.1880495250225067, + "learning_rate": 2.2105460229869574e-05, + "loss": 0.561, + "step": 1673 + }, + { + "epoch": 0.46422628951747086, + "grad_norm": 0.186056986451149, + "learning_rate": 2.2101925853294226e-05, + "loss": 0.5811, + "step": 1674 + }, + { + "epoch": 0.46450360510260674, + "grad_norm": 0.1980651617050171, + "learning_rate": 2.2098389603128744e-05, + "loss": 0.5456, + "step": 1675 + }, + { + "epoch": 0.4647809206877426, + "grad_norm": 0.2039021998643875, + "learning_rate": 2.2094851480063143e-05, + "loss": 0.6018, + "step": 1676 + }, + { + "epoch": 0.4650582362728785, + "grad_norm": 0.19567370414733887, + "learning_rate": 2.2091311484787815e-05, + "loss": 0.5499, + "step": 1677 + }, + { + "epoch": 0.4653355518580144, + "grad_norm": 0.19234254956245422, + "learning_rate": 2.208776961799349e-05, + "loss": 0.5522, + "step": 1678 + }, + { + "epoch": 0.4656128674431503, + "grad_norm": 0.1954323649406433, + "learning_rate": 2.20842258803713e-05, + "loss": 0.5702, + "step": 1679 + }, + { + "epoch": 0.46589018302828616, + "grad_norm": 0.1974237710237503, + "learning_rate": 2.20806802726127e-05, + "loss": 0.5604, + "step": 1680 + }, + { + "epoch": 0.46616749861342205, + "grad_norm": 0.19580432772636414, + "learning_rate": 2.2077132795409552e-05, + "loss": 0.5184, + "step": 1681 + }, + { + "epoch": 0.46644481419855793, + "grad_norm": 0.19890138506889343, + "learning_rate": 2.207358344945405e-05, + "loss": 0.5767, + "step": 1682 + }, + { + "epoch": 0.4667221297836938, + "grad_norm": 0.18458129465579987, + "learning_rate": 2.2070032235438776e-05, + "loss": 0.5316, + "step": 1683 + }, + { + "epoch": 0.4669994453688297, + "grad_norm": 0.1938208043575287, + "learning_rate": 2.206647915405665e-05, + "loss": 0.5809, + "step": 1684 + }, + { + "epoch": 0.4672767609539656, + "grad_norm": 0.20221275091171265, + "learning_rate": 2.206292420600099e-05, + "loss": 0.5731, + "step": 1685 + }, + { + "epoch": 0.46755407653910147, + "grad_norm": 0.194106787443161, + "learning_rate": 2.205936739196545e-05, + "loss": 0.5624, + "step": 1686 + }, + { + "epoch": 0.46783139212423736, + "grad_norm": 0.1918519288301468, + "learning_rate": 2.205580871264406e-05, + "loss": 0.535, + "step": 1687 + }, + { + "epoch": 0.46810870770937324, + "grad_norm": 0.19345992803573608, + "learning_rate": 2.2052248168731216e-05, + "loss": 0.5686, + "step": 1688 + }, + { + "epoch": 0.4683860232945091, + "grad_norm": 0.20070746541023254, + "learning_rate": 2.2048685760921674e-05, + "loss": 0.567, + "step": 1689 + }, + { + "epoch": 0.468663338879645, + "grad_norm": 0.1972619593143463, + "learning_rate": 2.204512148991055e-05, + "loss": 0.555, + "step": 1690 + }, + { + "epoch": 0.46894065446478095, + "grad_norm": 0.19085273146629333, + "learning_rate": 2.2041555356393327e-05, + "loss": 0.5985, + "step": 1691 + }, + { + "epoch": 0.46921797004991683, + "grad_norm": 0.20294739305973053, + "learning_rate": 2.2037987361065855e-05, + "loss": 0.5762, + "step": 1692 + }, + { + "epoch": 0.4694952856350527, + "grad_norm": 0.1896994560956955, + "learning_rate": 2.203441750462435e-05, + "loss": 0.5857, + "step": 1693 + }, + { + "epoch": 0.4697726012201886, + "grad_norm": 0.19331398606300354, + "learning_rate": 2.2030845787765377e-05, + "loss": 0.5654, + "step": 1694 + }, + { + "epoch": 0.4700499168053245, + "grad_norm": 0.20668423175811768, + "learning_rate": 2.2027272211185875e-05, + "loss": 0.5812, + "step": 1695 + }, + { + "epoch": 0.47032723239046037, + "grad_norm": 0.1910814493894577, + "learning_rate": 2.2023696775583146e-05, + "loss": 0.5479, + "step": 1696 + }, + { + "epoch": 0.47060454797559625, + "grad_norm": 0.1902536004781723, + "learning_rate": 2.2020119481654848e-05, + "loss": 0.5647, + "step": 1697 + }, + { + "epoch": 0.47088186356073214, + "grad_norm": 0.1955711394548416, + "learning_rate": 2.201654033009901e-05, + "loss": 0.581, + "step": 1698 + }, + { + "epoch": 0.471159179145868, + "grad_norm": 0.1895892173051834, + "learning_rate": 2.2012959321614018e-05, + "loss": 0.5658, + "step": 1699 + }, + { + "epoch": 0.4714364947310039, + "grad_norm": 0.1803213357925415, + "learning_rate": 2.2009376456898622e-05, + "loss": 0.558, + "step": 1700 + }, + { + "epoch": 0.4717138103161398, + "grad_norm": 0.19255508482456207, + "learning_rate": 2.200579173665193e-05, + "loss": 0.5649, + "step": 1701 + }, + { + "epoch": 0.4719911259012757, + "grad_norm": 0.20155562460422516, + "learning_rate": 2.2002205161573426e-05, + "loss": 0.5592, + "step": 1702 + }, + { + "epoch": 0.47226844148641156, + "grad_norm": 0.20530745387077332, + "learning_rate": 2.1998616732362935e-05, + "loss": 0.5677, + "step": 1703 + }, + { + "epoch": 0.47254575707154745, + "grad_norm": 0.2158524990081787, + "learning_rate": 2.1995026449720657e-05, + "loss": 0.5476, + "step": 1704 + }, + { + "epoch": 0.47282307265668333, + "grad_norm": 0.2383381724357605, + "learning_rate": 2.1991434314347155e-05, + "loss": 0.5413, + "step": 1705 + }, + { + "epoch": 0.4731003882418192, + "grad_norm": 0.19787725806236267, + "learning_rate": 2.1987840326943343e-05, + "loss": 0.5637, + "step": 1706 + }, + { + "epoch": 0.4733777038269551, + "grad_norm": 0.18500499427318573, + "learning_rate": 2.1984244488210508e-05, + "loss": 0.5333, + "step": 1707 + }, + { + "epoch": 0.473655019412091, + "grad_norm": 0.19429421424865723, + "learning_rate": 2.1980646798850295e-05, + "loss": 0.5611, + "step": 1708 + }, + { + "epoch": 0.47393233499722687, + "grad_norm": 0.18553559482097626, + "learning_rate": 2.197704725956471e-05, + "loss": 0.5516, + "step": 1709 + }, + { + "epoch": 0.47420965058236275, + "grad_norm": 0.1934727132320404, + "learning_rate": 2.197344587105611e-05, + "loss": 0.5464, + "step": 1710 + }, + { + "epoch": 0.47448696616749864, + "grad_norm": 0.20638912916183472, + "learning_rate": 2.1969842634027233e-05, + "loss": 0.5664, + "step": 1711 + }, + { + "epoch": 0.4747642817526345, + "grad_norm": 0.19581542909145355, + "learning_rate": 2.196623754918115e-05, + "loss": 0.5597, + "step": 1712 + }, + { + "epoch": 0.4750415973377704, + "grad_norm": 0.19786013662815094, + "learning_rate": 2.1962630617221325e-05, + "loss": 0.5729, + "step": 1713 + }, + { + "epoch": 0.4753189129229063, + "grad_norm": 0.19448676705360413, + "learning_rate": 2.1959021838851556e-05, + "loss": 0.5573, + "step": 1714 + }, + { + "epoch": 0.4755962285080422, + "grad_norm": 0.2131812423467636, + "learning_rate": 2.1955411214776015e-05, + "loss": 0.5862, + "step": 1715 + }, + { + "epoch": 0.47587354409317806, + "grad_norm": 0.2251943200826645, + "learning_rate": 2.195179874569923e-05, + "loss": 0.5847, + "step": 1716 + }, + { + "epoch": 0.47615085967831394, + "grad_norm": 0.20042872428894043, + "learning_rate": 2.1948184432326084e-05, + "loss": 0.5742, + "step": 1717 + }, + { + "epoch": 0.4764281752634498, + "grad_norm": 0.193080872297287, + "learning_rate": 2.1944568275361838e-05, + "loss": 0.5441, + "step": 1718 + }, + { + "epoch": 0.4767054908485857, + "grad_norm": 0.20040108263492584, + "learning_rate": 2.194095027551209e-05, + "loss": 0.5635, + "step": 1719 + }, + { + "epoch": 0.4769828064337216, + "grad_norm": 0.19599542021751404, + "learning_rate": 2.193733043348281e-05, + "loss": 0.5852, + "step": 1720 + }, + { + "epoch": 0.4772601220188575, + "grad_norm": 0.1938834935426712, + "learning_rate": 2.1933708749980324e-05, + "loss": 0.5644, + "step": 1721 + }, + { + "epoch": 0.47753743760399336, + "grad_norm": 0.19517837464809418, + "learning_rate": 2.1930085225711317e-05, + "loss": 0.5568, + "step": 1722 + }, + { + "epoch": 0.47781475318912925, + "grad_norm": 0.1954992413520813, + "learning_rate": 2.1926459861382843e-05, + "loss": 0.5459, + "step": 1723 + }, + { + "epoch": 0.47809206877426513, + "grad_norm": 0.17570015788078308, + "learning_rate": 2.1922832657702297e-05, + "loss": 0.539, + "step": 1724 + }, + { + "epoch": 0.478369384359401, + "grad_norm": 0.24834416806697845, + "learning_rate": 2.1919203615377442e-05, + "loss": 0.5401, + "step": 1725 + }, + { + "epoch": 0.4786466999445369, + "grad_norm": 0.19633722305297852, + "learning_rate": 2.1915572735116413e-05, + "loss": 0.5469, + "step": 1726 + }, + { + "epoch": 0.4789240155296728, + "grad_norm": 0.19104620814323425, + "learning_rate": 2.1911940017627676e-05, + "loss": 0.5472, + "step": 1727 + }, + { + "epoch": 0.47920133111480867, + "grad_norm": 0.18845802545547485, + "learning_rate": 2.1908305463620084e-05, + "loss": 0.5742, + "step": 1728 + }, + { + "epoch": 0.47947864669994456, + "grad_norm": 0.20180946588516235, + "learning_rate": 2.190466907380282e-05, + "loss": 0.5402, + "step": 1729 + }, + { + "epoch": 0.47975596228508044, + "grad_norm": 0.19500130414962769, + "learning_rate": 2.190103084888545e-05, + "loss": 0.5476, + "step": 1730 + }, + { + "epoch": 0.4800332778702163, + "grad_norm": 0.1933142989873886, + "learning_rate": 2.1897390789577887e-05, + "loss": 0.5426, + "step": 1731 + }, + { + "epoch": 0.4803105934553522, + "grad_norm": 0.1977783739566803, + "learning_rate": 2.1893748896590404e-05, + "loss": 0.5614, + "step": 1732 + }, + { + "epoch": 0.4805879090404881, + "grad_norm": 0.2301134616136551, + "learning_rate": 2.1890105170633624e-05, + "loss": 0.564, + "step": 1733 + }, + { + "epoch": 0.480865224625624, + "grad_norm": 0.219647616147995, + "learning_rate": 2.1886459612418542e-05, + "loss": 0.5289, + "step": 1734 + }, + { + "epoch": 0.48114254021075986, + "grad_norm": 0.19821031391620636, + "learning_rate": 2.18828122226565e-05, + "loss": 0.5903, + "step": 1735 + }, + { + "epoch": 0.48141985579589575, + "grad_norm": 0.24226263165473938, + "learning_rate": 2.18791630020592e-05, + "loss": 0.5795, + "step": 1736 + }, + { + "epoch": 0.48169717138103163, + "grad_norm": 0.200203076004982, + "learning_rate": 2.18755119513387e-05, + "loss": 0.568, + "step": 1737 + }, + { + "epoch": 0.4819744869661675, + "grad_norm": 0.19729411602020264, + "learning_rate": 2.1871859071207425e-05, + "loss": 0.5633, + "step": 1738 + }, + { + "epoch": 0.4822518025513034, + "grad_norm": 0.19362856447696686, + "learning_rate": 2.1868204362378136e-05, + "loss": 0.5709, + "step": 1739 + }, + { + "epoch": 0.4825291181364393, + "grad_norm": 0.21311257779598236, + "learning_rate": 2.1864547825563968e-05, + "loss": 0.5481, + "step": 1740 + }, + { + "epoch": 0.48280643372157517, + "grad_norm": 0.1957651972770691, + "learning_rate": 2.1860889461478416e-05, + "loss": 0.5481, + "step": 1741 + }, + { + "epoch": 0.48308374930671105, + "grad_norm": 0.20088225603103638, + "learning_rate": 2.1857229270835316e-05, + "loss": 0.5774, + "step": 1742 + }, + { + "epoch": 0.48336106489184694, + "grad_norm": 0.19715815782546997, + "learning_rate": 2.1853567254348873e-05, + "loss": 0.5737, + "step": 1743 + }, + { + "epoch": 0.4836383804769828, + "grad_norm": 0.2071049064397812, + "learning_rate": 2.184990341273364e-05, + "loss": 0.5745, + "step": 1744 + }, + { + "epoch": 0.4839156960621187, + "grad_norm": 0.19320468604564667, + "learning_rate": 2.1846237746704526e-05, + "loss": 0.5662, + "step": 1745 + }, + { + "epoch": 0.4841930116472546, + "grad_norm": 0.18520797789096832, + "learning_rate": 2.1842570256976807e-05, + "loss": 0.5808, + "step": 1746 + }, + { + "epoch": 0.4844703272323905, + "grad_norm": 0.20254836976528168, + "learning_rate": 2.18389009442661e-05, + "loss": 0.5558, + "step": 1747 + }, + { + "epoch": 0.48474764281752636, + "grad_norm": 0.2153664231300354, + "learning_rate": 2.1835229809288393e-05, + "loss": 0.5661, + "step": 1748 + }, + { + "epoch": 0.48502495840266224, + "grad_norm": 0.1884908676147461, + "learning_rate": 2.183155685276002e-05, + "loss": 0.5577, + "step": 1749 + }, + { + "epoch": 0.4853022739877981, + "grad_norm": 0.19069699943065643, + "learning_rate": 2.1827882075397664e-05, + "loss": 0.5417, + "step": 1750 + }, + { + "epoch": 0.485579589572934, + "grad_norm": 0.2055320143699646, + "learning_rate": 2.182420547791838e-05, + "loss": 0.5887, + "step": 1751 + }, + { + "epoch": 0.4858569051580699, + "grad_norm": 0.19550320506095886, + "learning_rate": 2.182052706103957e-05, + "loss": 0.5348, + "step": 1752 + }, + { + "epoch": 0.4861342207432058, + "grad_norm": 0.19508466124534607, + "learning_rate": 2.1816846825478988e-05, + "loss": 0.5506, + "step": 1753 + }, + { + "epoch": 0.48641153632834166, + "grad_norm": 0.18409934639930725, + "learning_rate": 2.181316477195474e-05, + "loss": 0.5629, + "step": 1754 + }, + { + "epoch": 0.48668885191347755, + "grad_norm": 0.18185651302337646, + "learning_rate": 2.1809480901185302e-05, + "loss": 0.5471, + "step": 1755 + }, + { + "epoch": 0.48696616749861343, + "grad_norm": 0.18371707201004028, + "learning_rate": 2.180579521388949e-05, + "loss": 0.5747, + "step": 1756 + }, + { + "epoch": 0.4872434830837493, + "grad_norm": 0.1873805671930313, + "learning_rate": 2.1802107710786476e-05, + "loss": 0.5606, + "step": 1757 + }, + { + "epoch": 0.4875207986688852, + "grad_norm": 0.19243435561656952, + "learning_rate": 2.1798418392595794e-05, + "loss": 0.5638, + "step": 1758 + }, + { + "epoch": 0.4877981142540211, + "grad_norm": 0.184648796916008, + "learning_rate": 2.179472726003733e-05, + "loss": 0.5584, + "step": 1759 + }, + { + "epoch": 0.48807542983915697, + "grad_norm": 0.17338646948337555, + "learning_rate": 2.1791034313831316e-05, + "loss": 0.556, + "step": 1760 + }, + { + "epoch": 0.48835274542429286, + "grad_norm": 0.18127377331256866, + "learning_rate": 2.1787339554698344e-05, + "loss": 0.5631, + "step": 1761 + }, + { + "epoch": 0.48863006100942874, + "grad_norm": 0.1888059824705124, + "learning_rate": 2.1783642983359364e-05, + "loss": 0.5611, + "step": 1762 + }, + { + "epoch": 0.4889073765945646, + "grad_norm": 0.17712418735027313, + "learning_rate": 2.1779944600535672e-05, + "loss": 0.5462, + "step": 1763 + }, + { + "epoch": 0.4891846921797005, + "grad_norm": 0.19322241842746735, + "learning_rate": 2.177624440694892e-05, + "loss": 0.5511, + "step": 1764 + }, + { + "epoch": 0.4894620077648364, + "grad_norm": 0.18574179708957672, + "learning_rate": 2.1772542403321118e-05, + "loss": 0.5531, + "step": 1765 + }, + { + "epoch": 0.4897393233499723, + "grad_norm": 0.18718282878398895, + "learning_rate": 2.1768838590374617e-05, + "loss": 0.5683, + "step": 1766 + }, + { + "epoch": 0.49001663893510816, + "grad_norm": 0.18598803877830505, + "learning_rate": 2.1765132968832135e-05, + "loss": 0.5488, + "step": 1767 + }, + { + "epoch": 0.49029395452024405, + "grad_norm": 0.18899311125278473, + "learning_rate": 2.1761425539416737e-05, + "loss": 0.5449, + "step": 1768 + }, + { + "epoch": 0.49057127010537993, + "grad_norm": 0.1895790696144104, + "learning_rate": 2.175771630285184e-05, + "loss": 0.58, + "step": 1769 + }, + { + "epoch": 0.4908485856905158, + "grad_norm": 0.17997822165489197, + "learning_rate": 2.1754005259861217e-05, + "loss": 0.5734, + "step": 1770 + }, + { + "epoch": 0.4911259012756517, + "grad_norm": 0.19107869267463684, + "learning_rate": 2.175029241116898e-05, + "loss": 0.5707, + "step": 1771 + }, + { + "epoch": 0.4914032168607876, + "grad_norm": 0.22478626668453217, + "learning_rate": 2.1746577757499613e-05, + "loss": 0.5667, + "step": 1772 + }, + { + "epoch": 0.49168053244592347, + "grad_norm": 0.2036799043416977, + "learning_rate": 2.1742861299577947e-05, + "loss": 0.5505, + "step": 1773 + }, + { + "epoch": 0.49195784803105935, + "grad_norm": 0.1856662929058075, + "learning_rate": 2.1739143038129152e-05, + "loss": 0.538, + "step": 1774 + }, + { + "epoch": 0.49223516361619524, + "grad_norm": 0.18573297560214996, + "learning_rate": 2.1735422973878766e-05, + "loss": 0.5507, + "step": 1775 + }, + { + "epoch": 0.4925124792013311, + "grad_norm": 0.19560420513153076, + "learning_rate": 2.1731701107552673e-05, + "loss": 0.5395, + "step": 1776 + }, + { + "epoch": 0.492789794786467, + "grad_norm": 0.18675924837589264, + "learning_rate": 2.1727977439877094e-05, + "loss": 0.5523, + "step": 1777 + }, + { + "epoch": 0.4930671103716029, + "grad_norm": 0.19126398861408234, + "learning_rate": 2.1724251971578636e-05, + "loss": 0.5736, + "step": 1778 + }, + { + "epoch": 0.4933444259567388, + "grad_norm": 0.17955103516578674, + "learning_rate": 2.1720524703384222e-05, + "loss": 0.5398, + "step": 1779 + }, + { + "epoch": 0.49362174154187466, + "grad_norm": 0.1961311250925064, + "learning_rate": 2.1716795636021148e-05, + "loss": 0.5565, + "step": 1780 + }, + { + "epoch": 0.49389905712701054, + "grad_norm": 0.1914500594139099, + "learning_rate": 2.171306477021705e-05, + "loss": 0.5296, + "step": 1781 + }, + { + "epoch": 0.49417637271214643, + "grad_norm": 0.2172551453113556, + "learning_rate": 2.170933210669992e-05, + "loss": 0.5711, + "step": 1782 + }, + { + "epoch": 0.4944536882972823, + "grad_norm": 0.18967878818511963, + "learning_rate": 2.1705597646198098e-05, + "loss": 0.5719, + "step": 1783 + }, + { + "epoch": 0.4947310038824182, + "grad_norm": 0.19041703641414642, + "learning_rate": 2.1701861389440277e-05, + "loss": 0.5431, + "step": 1784 + }, + { + "epoch": 0.4950083194675541, + "grad_norm": 0.19202065467834473, + "learning_rate": 2.1698123337155503e-05, + "loss": 0.5392, + "step": 1785 + }, + { + "epoch": 0.49528563505268997, + "grad_norm": 0.1918521374464035, + "learning_rate": 2.1694383490073162e-05, + "loss": 0.5268, + "step": 1786 + }, + { + "epoch": 0.49556295063782585, + "grad_norm": 0.19112561643123627, + "learning_rate": 2.1690641848923004e-05, + "loss": 0.5741, + "step": 1787 + }, + { + "epoch": 0.49584026622296173, + "grad_norm": 0.28441137075424194, + "learning_rate": 2.168689841443512e-05, + "loss": 0.5628, + "step": 1788 + }, + { + "epoch": 0.4961175818080976, + "grad_norm": 0.1971031278371811, + "learning_rate": 2.1683153187339955e-05, + "loss": 0.5336, + "step": 1789 + }, + { + "epoch": 0.4963948973932335, + "grad_norm": 0.18874448537826538, + "learning_rate": 2.16794061683683e-05, + "loss": 0.5717, + "step": 1790 + }, + { + "epoch": 0.4966722129783694, + "grad_norm": 0.19406016170978546, + "learning_rate": 2.1675657358251293e-05, + "loss": 0.5641, + "step": 1791 + }, + { + "epoch": 0.49694952856350527, + "grad_norm": 0.19491691887378693, + "learning_rate": 2.1671906757720433e-05, + "loss": 0.5598, + "step": 1792 + }, + { + "epoch": 0.49722684414864116, + "grad_norm": 0.1898011416196823, + "learning_rate": 2.166815436750756e-05, + "loss": 0.5748, + "step": 1793 + }, + { + "epoch": 0.49750415973377704, + "grad_norm": 0.18792784214019775, + "learning_rate": 2.1664400188344863e-05, + "loss": 0.5383, + "step": 1794 + }, + { + "epoch": 0.4977814753189129, + "grad_norm": 0.1921299546957016, + "learning_rate": 2.1660644220964886e-05, + "loss": 0.5649, + "step": 1795 + }, + { + "epoch": 0.4980587909040488, + "grad_norm": 0.1881396770477295, + "learning_rate": 2.1656886466100514e-05, + "loss": 0.5525, + "step": 1796 + }, + { + "epoch": 0.4983361064891847, + "grad_norm": 0.19252420961856842, + "learning_rate": 2.1653126924484985e-05, + "loss": 0.5308, + "step": 1797 + }, + { + "epoch": 0.4986134220743206, + "grad_norm": 0.2611597180366516, + "learning_rate": 2.1649365596851884e-05, + "loss": 0.5664, + "step": 1798 + }, + { + "epoch": 0.49889073765945646, + "grad_norm": 0.18851755559444427, + "learning_rate": 2.164560248393515e-05, + "loss": 0.5314, + "step": 1799 + }, + { + "epoch": 0.49916805324459235, + "grad_norm": 0.19385330379009247, + "learning_rate": 2.164183758646906e-05, + "loss": 0.5615, + "step": 1800 + }, + { + "epoch": 0.49944536882972823, + "grad_norm": 0.20486874878406525, + "learning_rate": 2.163807090518825e-05, + "loss": 0.5833, + "step": 1801 + }, + { + "epoch": 0.4997226844148641, + "grad_norm": 0.21984978020191193, + "learning_rate": 2.16343024408277e-05, + "loss": 0.5494, + "step": 1802 + }, + { + "epoch": 0.5, + "grad_norm": 0.1872473657131195, + "learning_rate": 2.1630532194122733e-05, + "loss": 0.5388, + "step": 1803 + }, + { + "epoch": 0.5002773155851359, + "grad_norm": 0.19348090887069702, + "learning_rate": 2.1626760165809022e-05, + "loss": 0.5615, + "step": 1804 + }, + { + "epoch": 0.5005546311702718, + "grad_norm": 0.18968060612678528, + "learning_rate": 2.16229863566226e-05, + "loss": 0.5401, + "step": 1805 + }, + { + "epoch": 0.5008319467554077, + "grad_norm": 0.1913541853427887, + "learning_rate": 2.161921076729983e-05, + "loss": 0.5797, + "step": 1806 + }, + { + "epoch": 0.5011092623405435, + "grad_norm": 0.18872712552547455, + "learning_rate": 2.1615433398577428e-05, + "loss": 0.5385, + "step": 1807 + }, + { + "epoch": 0.5013865779256794, + "grad_norm": 0.19591103494167328, + "learning_rate": 2.1611654251192465e-05, + "loss": 0.5568, + "step": 1808 + }, + { + "epoch": 0.5016638935108153, + "grad_norm": 0.19541212916374207, + "learning_rate": 2.1607873325882343e-05, + "loss": 0.5679, + "step": 1809 + }, + { + "epoch": 0.5019412090959512, + "grad_norm": 0.1972798854112625, + "learning_rate": 2.160409062338483e-05, + "loss": 0.5513, + "step": 1810 + }, + { + "epoch": 0.5022185246810871, + "grad_norm": 0.19507858157157898, + "learning_rate": 2.1600306144438027e-05, + "loss": 0.5509, + "step": 1811 + }, + { + "epoch": 0.502495840266223, + "grad_norm": 0.18637265264987946, + "learning_rate": 2.1596519889780387e-05, + "loss": 0.5632, + "step": 1812 + }, + { + "epoch": 0.5027731558513588, + "grad_norm": 0.17938151955604553, + "learning_rate": 2.159273186015071e-05, + "loss": 0.5583, + "step": 1813 + }, + { + "epoch": 0.5030504714364947, + "grad_norm": 0.19754791259765625, + "learning_rate": 2.158894205628814e-05, + "loss": 0.5705, + "step": 1814 + }, + { + "epoch": 0.5033277870216306, + "grad_norm": 0.19835114479064941, + "learning_rate": 2.1585150478932165e-05, + "loss": 0.6031, + "step": 1815 + }, + { + "epoch": 0.5036051026067665, + "grad_norm": 0.1887637823820114, + "learning_rate": 2.1581357128822627e-05, + "loss": 0.5551, + "step": 1816 + }, + { + "epoch": 0.5038824181919024, + "grad_norm": 0.18568859994411469, + "learning_rate": 2.157756200669971e-05, + "loss": 0.5384, + "step": 1817 + }, + { + "epoch": 0.5041597337770383, + "grad_norm": 0.1866898089647293, + "learning_rate": 2.1573765113303936e-05, + "loss": 0.5474, + "step": 1818 + }, + { + "epoch": 0.5044370493621742, + "grad_norm": 0.18115058541297913, + "learning_rate": 2.156996644937618e-05, + "loss": 0.5459, + "step": 1819 + }, + { + "epoch": 0.50471436494731, + "grad_norm": 0.188395157456398, + "learning_rate": 2.1566166015657672e-05, + "loss": 0.5379, + "step": 1820 + }, + { + "epoch": 0.5049916805324459, + "grad_norm": 0.2012917846441269, + "learning_rate": 2.156236381288997e-05, + "loss": 0.5771, + "step": 1821 + }, + { + "epoch": 0.5052689961175818, + "grad_norm": 0.18014481663703918, + "learning_rate": 2.1558559841814986e-05, + "loss": 0.5508, + "step": 1822 + }, + { + "epoch": 0.5055463117027177, + "grad_norm": 0.1956920027732849, + "learning_rate": 2.1554754103174972e-05, + "loss": 0.5625, + "step": 1823 + }, + { + "epoch": 0.5058236272878536, + "grad_norm": 0.1985912024974823, + "learning_rate": 2.1550946597712536e-05, + "loss": 0.532, + "step": 1824 + }, + { + "epoch": 0.5061009428729895, + "grad_norm": 0.1955011934041977, + "learning_rate": 2.1547137326170613e-05, + "loss": 0.5641, + "step": 1825 + }, + { + "epoch": 0.5063782584581253, + "grad_norm": 0.1937127411365509, + "learning_rate": 2.1543326289292497e-05, + "loss": 0.5369, + "step": 1826 + }, + { + "epoch": 0.5066555740432612, + "grad_norm": 0.22294695675373077, + "learning_rate": 2.153951348782183e-05, + "loss": 0.5754, + "step": 1827 + }, + { + "epoch": 0.5069328896283971, + "grad_norm": 0.1839090883731842, + "learning_rate": 2.1535698922502582e-05, + "loss": 0.5344, + "step": 1828 + }, + { + "epoch": 0.507210205213533, + "grad_norm": 0.18940389156341553, + "learning_rate": 2.1531882594079074e-05, + "loss": 0.5399, + "step": 1829 + }, + { + "epoch": 0.5074875207986689, + "grad_norm": 0.19242316484451294, + "learning_rate": 2.152806450329598e-05, + "loss": 0.5473, + "step": 1830 + }, + { + "epoch": 0.5077648363838048, + "grad_norm": 0.19500425457954407, + "learning_rate": 2.1524244650898308e-05, + "loss": 0.5812, + "step": 1831 + }, + { + "epoch": 0.5080421519689406, + "grad_norm": 0.19228143990039825, + "learning_rate": 2.1520423037631408e-05, + "loss": 0.5518, + "step": 1832 + }, + { + "epoch": 0.5083194675540765, + "grad_norm": 0.1868668794631958, + "learning_rate": 2.1516599664240985e-05, + "loss": 0.5534, + "step": 1833 + }, + { + "epoch": 0.5085967831392124, + "grad_norm": 0.1996205449104309, + "learning_rate": 2.151277453147308e-05, + "loss": 0.5283, + "step": 1834 + }, + { + "epoch": 0.5088740987243483, + "grad_norm": 0.18492724001407623, + "learning_rate": 2.150894764007407e-05, + "loss": 0.563, + "step": 1835 + }, + { + "epoch": 0.5091514143094842, + "grad_norm": 0.1847442388534546, + "learning_rate": 2.150511899079069e-05, + "loss": 0.5478, + "step": 1836 + }, + { + "epoch": 0.5094287298946201, + "grad_norm": 0.19469550251960754, + "learning_rate": 2.1501288584370006e-05, + "loss": 0.5388, + "step": 1837 + }, + { + "epoch": 0.509706045479756, + "grad_norm": 0.18692530691623688, + "learning_rate": 2.1497456421559436e-05, + "loss": 0.523, + "step": 1838 + }, + { + "epoch": 0.5099833610648918, + "grad_norm": 0.18751968443393707, + "learning_rate": 2.1493622503106736e-05, + "loss": 0.561, + "step": 1839 + }, + { + "epoch": 0.5102606766500277, + "grad_norm": 0.18232478201389313, + "learning_rate": 2.1489786829760005e-05, + "loss": 0.5579, + "step": 1840 + }, + { + "epoch": 0.5105379922351636, + "grad_norm": 0.20106928050518036, + "learning_rate": 2.1485949402267684e-05, + "loss": 0.5445, + "step": 1841 + }, + { + "epoch": 0.5108153078202995, + "grad_norm": 0.19342289865016937, + "learning_rate": 2.1482110221378555e-05, + "loss": 0.5627, + "step": 1842 + }, + { + "epoch": 0.5110926234054354, + "grad_norm": 0.1977401226758957, + "learning_rate": 2.1478269287841747e-05, + "loss": 0.5949, + "step": 1843 + }, + { + "epoch": 0.5113699389905713, + "grad_norm": 0.18564966320991516, + "learning_rate": 2.1474426602406722e-05, + "loss": 0.5598, + "step": 1844 + }, + { + "epoch": 0.5116472545757071, + "grad_norm": 0.19453705847263336, + "learning_rate": 2.1470582165823296e-05, + "loss": 0.5876, + "step": 1845 + }, + { + "epoch": 0.511924570160843, + "grad_norm": 0.1874392032623291, + "learning_rate": 2.146673597884162e-05, + "loss": 0.5465, + "step": 1846 + }, + { + "epoch": 0.5122018857459789, + "grad_norm": 0.19269202649593353, + "learning_rate": 2.1462888042212183e-05, + "loss": 0.5333, + "step": 1847 + }, + { + "epoch": 0.5124792013311148, + "grad_norm": 0.21227677166461945, + "learning_rate": 2.1459038356685824e-05, + "loss": 0.5772, + "step": 1848 + }, + { + "epoch": 0.5127565169162507, + "grad_norm": 0.18281084299087524, + "learning_rate": 2.1455186923013716e-05, + "loss": 0.5732, + "step": 1849 + }, + { + "epoch": 0.5130338325013866, + "grad_norm": 0.18322256207466125, + "learning_rate": 2.1451333741947373e-05, + "loss": 0.5367, + "step": 1850 + }, + { + "epoch": 0.5133111480865225, + "grad_norm": 0.19677676260471344, + "learning_rate": 2.1447478814238658e-05, + "loss": 0.5797, + "step": 1851 + }, + { + "epoch": 0.5135884636716583, + "grad_norm": 0.2173527330160141, + "learning_rate": 2.1443622140639768e-05, + "loss": 0.5688, + "step": 1852 + }, + { + "epoch": 0.5138657792567942, + "grad_norm": 0.240644633769989, + "learning_rate": 2.143976372190324e-05, + "loss": 0.5754, + "step": 1853 + }, + { + "epoch": 0.5141430948419301, + "grad_norm": 0.19872795045375824, + "learning_rate": 2.1435903558781954e-05, + "loss": 0.5752, + "step": 1854 + }, + { + "epoch": 0.514420410427066, + "grad_norm": 0.18520157039165497, + "learning_rate": 2.143204165202914e-05, + "loss": 0.5564, + "step": 1855 + }, + { + "epoch": 0.5146977260122019, + "grad_norm": 0.1843482255935669, + "learning_rate": 2.1428178002398342e-05, + "loss": 0.536, + "step": 1856 + }, + { + "epoch": 0.5149750415973378, + "grad_norm": 0.18434906005859375, + "learning_rate": 2.1424312610643467e-05, + "loss": 0.5722, + "step": 1857 + }, + { + "epoch": 0.5152523571824736, + "grad_norm": 0.18243710696697235, + "learning_rate": 2.1420445477518756e-05, + "loss": 0.5134, + "step": 1858 + }, + { + "epoch": 0.5155296727676095, + "grad_norm": 0.1954345405101776, + "learning_rate": 2.14165766037788e-05, + "loss": 0.5465, + "step": 1859 + }, + { + "epoch": 0.5158069883527454, + "grad_norm": 0.1804470270872116, + "learning_rate": 2.1412705990178496e-05, + "loss": 0.5529, + "step": 1860 + }, + { + "epoch": 0.5160843039378813, + "grad_norm": 0.19080010056495667, + "learning_rate": 2.140883363747312e-05, + "loss": 0.54, + "step": 1861 + }, + { + "epoch": 0.5163616195230172, + "grad_norm": 0.18515829741954803, + "learning_rate": 2.1404959546418268e-05, + "loss": 0.5409, + "step": 1862 + }, + { + "epoch": 0.5166389351081531, + "grad_norm": 0.19277918338775635, + "learning_rate": 2.1401083717769876e-05, + "loss": 0.5703, + "step": 1863 + }, + { + "epoch": 0.516916250693289, + "grad_norm": 0.2017827183008194, + "learning_rate": 2.139720615228422e-05, + "loss": 0.5545, + "step": 1864 + }, + { + "epoch": 0.5171935662784248, + "grad_norm": 0.19409088790416718, + "learning_rate": 2.1393326850717915e-05, + "loss": 0.5613, + "step": 1865 + }, + { + "epoch": 0.5174708818635607, + "grad_norm": 0.18852104246616364, + "learning_rate": 2.138944581382792e-05, + "loss": 0.5784, + "step": 1866 + }, + { + "epoch": 0.5177481974486966, + "grad_norm": 0.18418540060520172, + "learning_rate": 2.1385563042371525e-05, + "loss": 0.5291, + "step": 1867 + }, + { + "epoch": 0.5180255130338325, + "grad_norm": 0.20222961902618408, + "learning_rate": 2.138167853710636e-05, + "loss": 0.5432, + "step": 1868 + }, + { + "epoch": 0.5183028286189684, + "grad_norm": 0.1890316754579544, + "learning_rate": 2.1377792298790396e-05, + "loss": 0.5859, + "step": 1869 + }, + { + "epoch": 0.5185801442041043, + "grad_norm": 0.19611713290214539, + "learning_rate": 2.1373904328181946e-05, + "loss": 0.5468, + "step": 1870 + }, + { + "epoch": 0.5188574597892401, + "grad_norm": 0.18812295794487, + "learning_rate": 2.1370014626039648e-05, + "loss": 0.5356, + "step": 1871 + }, + { + "epoch": 0.519134775374376, + "grad_norm": 0.18963585793972015, + "learning_rate": 2.136612319312249e-05, + "loss": 0.5646, + "step": 1872 + }, + { + "epoch": 0.5194120909595119, + "grad_norm": 0.1885354369878769, + "learning_rate": 2.1362230030189795e-05, + "loss": 0.5578, + "step": 1873 + }, + { + "epoch": 0.5196894065446478, + "grad_norm": 0.1933208405971527, + "learning_rate": 2.1358335138001224e-05, + "loss": 0.5379, + "step": 1874 + }, + { + "epoch": 0.5199667221297837, + "grad_norm": 0.1998060792684555, + "learning_rate": 2.1354438517316767e-05, + "loss": 0.5741, + "step": 1875 + }, + { + "epoch": 0.5202440377149196, + "grad_norm": 0.18762660026550293, + "learning_rate": 2.135054016889676e-05, + "loss": 0.5618, + "step": 1876 + }, + { + "epoch": 0.5205213533000554, + "grad_norm": 0.2009068727493286, + "learning_rate": 2.1346640093501872e-05, + "loss": 0.5499, + "step": 1877 + }, + { + "epoch": 0.5207986688851913, + "grad_norm": 0.1818576157093048, + "learning_rate": 2.1342738291893122e-05, + "loss": 0.5276, + "step": 1878 + }, + { + "epoch": 0.5210759844703272, + "grad_norm": 0.2910788357257843, + "learning_rate": 2.1338834764831845e-05, + "loss": 0.5333, + "step": 1879 + }, + { + "epoch": 0.5213533000554631, + "grad_norm": 0.20181319117546082, + "learning_rate": 2.1334929513079722e-05, + "loss": 0.567, + "step": 1880 + }, + { + "epoch": 0.521630615640599, + "grad_norm": 0.19496895372867584, + "learning_rate": 2.133102253739878e-05, + "loss": 0.5369, + "step": 1881 + }, + { + "epoch": 0.5219079312257349, + "grad_norm": 0.18957465887069702, + "learning_rate": 2.1327113838551362e-05, + "loss": 0.5359, + "step": 1882 + }, + { + "epoch": 0.5221852468108708, + "grad_norm": 0.19057804346084595, + "learning_rate": 2.132320341730017e-05, + "loss": 0.5595, + "step": 1883 + }, + { + "epoch": 0.5224625623960066, + "grad_norm": 0.18731848895549774, + "learning_rate": 2.131929127440822e-05, + "loss": 0.5258, + "step": 1884 + }, + { + "epoch": 0.5227398779811425, + "grad_norm": 0.19227994978427887, + "learning_rate": 2.131537741063888e-05, + "loss": 0.5927, + "step": 1885 + }, + { + "epoch": 0.5230171935662784, + "grad_norm": 0.18374542891979218, + "learning_rate": 2.1311461826755847e-05, + "loss": 0.543, + "step": 1886 + }, + { + "epoch": 0.5232945091514143, + "grad_norm": 0.1936596930027008, + "learning_rate": 2.1307544523523156e-05, + "loss": 0.5677, + "step": 1887 + }, + { + "epoch": 0.5235718247365502, + "grad_norm": 0.18285861611366272, + "learning_rate": 2.1303625501705183e-05, + "loss": 0.5738, + "step": 1888 + }, + { + "epoch": 0.5238491403216861, + "grad_norm": 0.23504310846328735, + "learning_rate": 2.1299704762066618e-05, + "loss": 0.5785, + "step": 1889 + }, + { + "epoch": 0.5241264559068219, + "grad_norm": 0.1995551437139511, + "learning_rate": 2.129578230537252e-05, + "loss": 0.5637, + "step": 1890 + }, + { + "epoch": 0.5244037714919578, + "grad_norm": 0.18941858410835266, + "learning_rate": 2.1291858132388248e-05, + "loss": 0.5523, + "step": 1891 + }, + { + "epoch": 0.5246810870770937, + "grad_norm": 0.18262585997581482, + "learning_rate": 2.1287932243879523e-05, + "loss": 0.5587, + "step": 1892 + }, + { + "epoch": 0.5249584026622296, + "grad_norm": 0.1891373097896576, + "learning_rate": 2.1284004640612376e-05, + "loss": 0.5494, + "step": 1893 + }, + { + "epoch": 0.5252357182473655, + "grad_norm": 0.1890500783920288, + "learning_rate": 2.1280075323353206e-05, + "loss": 0.5411, + "step": 1894 + }, + { + "epoch": 0.5255130338325014, + "grad_norm": 0.182157963514328, + "learning_rate": 2.127614429286871e-05, + "loss": 0.5265, + "step": 1895 + }, + { + "epoch": 0.5257903494176372, + "grad_norm": 0.18624311685562134, + "learning_rate": 2.1272211549925946e-05, + "loss": 0.5309, + "step": 1896 + }, + { + "epoch": 0.5260676650027731, + "grad_norm": 0.19566656649112701, + "learning_rate": 2.1268277095292292e-05, + "loss": 0.5694, + "step": 1897 + }, + { + "epoch": 0.526344980587909, + "grad_norm": 0.19160917401313782, + "learning_rate": 2.1264340929735467e-05, + "loss": 0.5324, + "step": 1898 + }, + { + "epoch": 0.5266222961730449, + "grad_norm": 0.18548499047756195, + "learning_rate": 2.126040305402352e-05, + "loss": 0.5326, + "step": 1899 + }, + { + "epoch": 0.5268996117581808, + "grad_norm": 0.18145646154880524, + "learning_rate": 2.1256463468924837e-05, + "loss": 0.5491, + "step": 1900 + }, + { + "epoch": 0.5271769273433167, + "grad_norm": 0.1829940527677536, + "learning_rate": 2.125252217520813e-05, + "loss": 0.5235, + "step": 1901 + }, + { + "epoch": 0.5274542429284526, + "grad_norm": 0.19107188284397125, + "learning_rate": 2.1248579173642453e-05, + "loss": 0.539, + "step": 1902 + }, + { + "epoch": 0.5277315585135884, + "grad_norm": 0.18394434452056885, + "learning_rate": 2.1244634464997188e-05, + "loss": 0.5672, + "step": 1903 + }, + { + "epoch": 0.5280088740987243, + "grad_norm": 0.19489826261997223, + "learning_rate": 2.1240688050042058e-05, + "loss": 0.5514, + "step": 1904 + }, + { + "epoch": 0.5282861896838602, + "grad_norm": 0.18582908809185028, + "learning_rate": 2.1236739929547105e-05, + "loss": 0.5484, + "step": 1905 + }, + { + "epoch": 0.5285635052689961, + "grad_norm": 0.19354532659053802, + "learning_rate": 2.123279010428272e-05, + "loss": 0.5542, + "step": 1906 + }, + { + "epoch": 0.528840820854132, + "grad_norm": 0.19419358670711517, + "learning_rate": 2.1228838575019612e-05, + "loss": 0.5349, + "step": 1907 + }, + { + "epoch": 0.5291181364392679, + "grad_norm": 0.18452230095863342, + "learning_rate": 2.1224885342528834e-05, + "loss": 0.5676, + "step": 1908 + }, + { + "epoch": 0.5293954520244037, + "grad_norm": 0.19760510325431824, + "learning_rate": 2.1220930407581762e-05, + "loss": 0.5439, + "step": 1909 + }, + { + "epoch": 0.5296727676095396, + "grad_norm": 0.18620994687080383, + "learning_rate": 2.121697377095011e-05, + "loss": 0.5612, + "step": 1910 + }, + { + "epoch": 0.5299500831946755, + "grad_norm": 0.1856573075056076, + "learning_rate": 2.121301543340593e-05, + "loss": 0.5267, + "step": 1911 + }, + { + "epoch": 0.5302273987798114, + "grad_norm": 0.19919690489768982, + "learning_rate": 2.1209055395721586e-05, + "loss": 0.5499, + "step": 1912 + }, + { + "epoch": 0.5305047143649473, + "grad_norm": 0.18541640043258667, + "learning_rate": 2.1205093658669793e-05, + "loss": 0.5693, + "step": 1913 + }, + { + "epoch": 0.5307820299500832, + "grad_norm": 0.18908998370170593, + "learning_rate": 2.120113022302359e-05, + "loss": 0.5421, + "step": 1914 + }, + { + "epoch": 0.531059345535219, + "grad_norm": 0.19005346298217773, + "learning_rate": 2.119716508955635e-05, + "loss": 0.5475, + "step": 1915 + }, + { + "epoch": 0.5313366611203549, + "grad_norm": 0.19358742237091064, + "learning_rate": 2.1193198259041774e-05, + "loss": 0.5671, + "step": 1916 + }, + { + "epoch": 0.5316139767054908, + "grad_norm": 0.19891729950904846, + "learning_rate": 2.1189229732253894e-05, + "loss": 0.5623, + "step": 1917 + }, + { + "epoch": 0.5318912922906267, + "grad_norm": 0.1928594410419464, + "learning_rate": 2.1185259509967082e-05, + "loss": 0.5467, + "step": 1918 + }, + { + "epoch": 0.5321686078757626, + "grad_norm": 0.21051675081253052, + "learning_rate": 2.118128759295602e-05, + "loss": 0.5504, + "step": 1919 + }, + { + "epoch": 0.5324459234608985, + "grad_norm": 0.1916693150997162, + "learning_rate": 2.1177313981995745e-05, + "loss": 0.5376, + "step": 1920 + }, + { + "epoch": 0.5327232390460344, + "grad_norm": 0.18833374977111816, + "learning_rate": 2.1173338677861616e-05, + "loss": 0.526, + "step": 1921 + }, + { + "epoch": 0.5330005546311702, + "grad_norm": 0.1901186853647232, + "learning_rate": 2.116936168132931e-05, + "loss": 0.5658, + "step": 1922 + }, + { + "epoch": 0.5332778702163061, + "grad_norm": 0.19015184044837952, + "learning_rate": 2.1165382993174848e-05, + "loss": 0.5703, + "step": 1923 + }, + { + "epoch": 0.533555185801442, + "grad_norm": 0.17683614790439606, + "learning_rate": 2.116140261417458e-05, + "loss": 0.5384, + "step": 1924 + }, + { + "epoch": 0.5338325013865779, + "grad_norm": 0.18865369260311127, + "learning_rate": 2.1157420545105187e-05, + "loss": 0.5473, + "step": 1925 + }, + { + "epoch": 0.5341098169717138, + "grad_norm": 0.1918121576309204, + "learning_rate": 2.1153436786743668e-05, + "loss": 0.5587, + "step": 1926 + }, + { + "epoch": 0.5343871325568497, + "grad_norm": 0.20620866119861603, + "learning_rate": 2.1149451339867363e-05, + "loss": 0.5358, + "step": 1927 + }, + { + "epoch": 0.5346644481419855, + "grad_norm": 0.19819065928459167, + "learning_rate": 2.114546420525394e-05, + "loss": 0.5681, + "step": 1928 + }, + { + "epoch": 0.5349417637271214, + "grad_norm": 0.19372405111789703, + "learning_rate": 2.114147538368139e-05, + "loss": 0.5615, + "step": 1929 + }, + { + "epoch": 0.5352190793122573, + "grad_norm": 0.19177958369255066, + "learning_rate": 2.1137484875928048e-05, + "loss": 0.5336, + "step": 1930 + }, + { + "epoch": 0.5354963948973932, + "grad_norm": 0.1978660523891449, + "learning_rate": 2.1133492682772556e-05, + "loss": 0.5465, + "step": 1931 + }, + { + "epoch": 0.5357737104825291, + "grad_norm": 0.19225792586803436, + "learning_rate": 2.1129498804993902e-05, + "loss": 0.5548, + "step": 1932 + }, + { + "epoch": 0.536051026067665, + "grad_norm": 0.19737133383750916, + "learning_rate": 2.1125503243371398e-05, + "loss": 0.5636, + "step": 1933 + }, + { + "epoch": 0.5363283416528009, + "grad_norm": 0.19044062495231628, + "learning_rate": 2.112150599868468e-05, + "loss": 0.5595, + "step": 1934 + }, + { + "epoch": 0.5366056572379367, + "grad_norm": 0.18862438201904297, + "learning_rate": 2.1117507071713724e-05, + "loss": 0.5594, + "step": 1935 + }, + { + "epoch": 0.5368829728230726, + "grad_norm": 0.20726455748081207, + "learning_rate": 2.111350646323882e-05, + "loss": 0.5678, + "step": 1936 + }, + { + "epoch": 0.5371602884082085, + "grad_norm": 0.1967361867427826, + "learning_rate": 2.1109504174040594e-05, + "loss": 0.5518, + "step": 1937 + }, + { + "epoch": 0.5374376039933444, + "grad_norm": 0.19529032707214355, + "learning_rate": 2.1105500204899997e-05, + "loss": 0.5775, + "step": 1938 + }, + { + "epoch": 0.5377149195784803, + "grad_norm": 0.19610485434532166, + "learning_rate": 2.110149455659831e-05, + "loss": 0.563, + "step": 1939 + }, + { + "epoch": 0.5379922351636162, + "grad_norm": 0.18098145723342896, + "learning_rate": 2.109748722991715e-05, + "loss": 0.5401, + "step": 1940 + }, + { + "epoch": 0.538269550748752, + "grad_norm": 0.21725578606128693, + "learning_rate": 2.109347822563844e-05, + "loss": 0.5671, + "step": 1941 + }, + { + "epoch": 0.5385468663338879, + "grad_norm": 0.20272882282733917, + "learning_rate": 2.108946754454445e-05, + "loss": 0.5699, + "step": 1942 + }, + { + "epoch": 0.5388241819190238, + "grad_norm": 0.18334084749221802, + "learning_rate": 2.108545518741777e-05, + "loss": 0.5508, + "step": 1943 + }, + { + "epoch": 0.5391014975041597, + "grad_norm": 0.20137275755405426, + "learning_rate": 2.1081441155041314e-05, + "loss": 0.5315, + "step": 1944 + }, + { + "epoch": 0.5393788130892956, + "grad_norm": 0.21601121127605438, + "learning_rate": 2.1077425448198327e-05, + "loss": 0.5415, + "step": 1945 + }, + { + "epoch": 0.5396561286744315, + "grad_norm": 0.19099989533424377, + "learning_rate": 2.107340806767238e-05, + "loss": 0.5492, + "step": 1946 + }, + { + "epoch": 0.5399334442595674, + "grad_norm": 0.18626771867275238, + "learning_rate": 2.106938901424737e-05, + "loss": 0.5622, + "step": 1947 + }, + { + "epoch": 0.5402107598447032, + "grad_norm": 0.1979578286409378, + "learning_rate": 2.1065368288707523e-05, + "loss": 0.5659, + "step": 1948 + }, + { + "epoch": 0.5404880754298391, + "grad_norm": 0.18457596004009247, + "learning_rate": 2.1061345891837393e-05, + "loss": 0.554, + "step": 1949 + }, + { + "epoch": 0.540765391014975, + "grad_norm": 0.1944621354341507, + "learning_rate": 2.1057321824421843e-05, + "loss": 0.5574, + "step": 1950 + }, + { + "epoch": 0.5410427066001109, + "grad_norm": 0.20393145084381104, + "learning_rate": 2.1053296087246087e-05, + "loss": 0.5557, + "step": 1951 + }, + { + "epoch": 0.5413200221852468, + "grad_norm": 0.19211652874946594, + "learning_rate": 2.1049268681095647e-05, + "loss": 0.5626, + "step": 1952 + }, + { + "epoch": 0.5415973377703827, + "grad_norm": 0.18954886496067047, + "learning_rate": 2.1045239606756378e-05, + "loss": 0.5481, + "step": 1953 + }, + { + "epoch": 0.5418746533555185, + "grad_norm": 0.19707705080509186, + "learning_rate": 2.1041208865014464e-05, + "loss": 0.5435, + "step": 1954 + }, + { + "epoch": 0.5421519689406544, + "grad_norm": 0.20506185293197632, + "learning_rate": 2.10371764566564e-05, + "loss": 0.5322, + "step": 1955 + }, + { + "epoch": 0.5424292845257903, + "grad_norm": 0.19055700302124023, + "learning_rate": 2.103314238246903e-05, + "loss": 0.5513, + "step": 1956 + }, + { + "epoch": 0.5427066001109262, + "grad_norm": 0.2124052196741104, + "learning_rate": 2.102910664323949e-05, + "loss": 0.574, + "step": 1957 + }, + { + "epoch": 0.5429839156960621, + "grad_norm": 0.19025616347789764, + "learning_rate": 2.1025069239755273e-05, + "loss": 0.5342, + "step": 1958 + }, + { + "epoch": 0.543261231281198, + "grad_norm": 0.18279728293418884, + "learning_rate": 2.102103017280418e-05, + "loss": 0.5542, + "step": 1959 + }, + { + "epoch": 0.5435385468663338, + "grad_norm": 0.18862898647785187, + "learning_rate": 2.101698944317434e-05, + "loss": 0.5743, + "step": 1960 + }, + { + "epoch": 0.5438158624514697, + "grad_norm": 0.18205633759498596, + "learning_rate": 2.101294705165421e-05, + "loss": 0.5597, + "step": 1961 + }, + { + "epoch": 0.5440931780366056, + "grad_norm": 0.1904565840959549, + "learning_rate": 2.100890299903256e-05, + "loss": 0.5789, + "step": 1962 + }, + { + "epoch": 0.5443704936217415, + "grad_norm": 0.19704897701740265, + "learning_rate": 2.1004857286098495e-05, + "loss": 0.5626, + "step": 1963 + }, + { + "epoch": 0.5446478092068774, + "grad_norm": 0.1878540813922882, + "learning_rate": 2.1000809913641445e-05, + "loss": 0.5713, + "step": 1964 + }, + { + "epoch": 0.5449251247920133, + "grad_norm": 0.2131820023059845, + "learning_rate": 2.0996760882451148e-05, + "loss": 0.5632, + "step": 1965 + }, + { + "epoch": 0.5452024403771492, + "grad_norm": 0.19665499031543732, + "learning_rate": 2.0992710193317693e-05, + "loss": 0.5585, + "step": 1966 + }, + { + "epoch": 0.545479755962285, + "grad_norm": 0.18704085052013397, + "learning_rate": 2.0988657847031467e-05, + "loss": 0.5534, + "step": 1967 + }, + { + "epoch": 0.5457570715474209, + "grad_norm": 0.1849927306175232, + "learning_rate": 2.0984603844383195e-05, + "loss": 0.5654, + "step": 1968 + }, + { + "epoch": 0.5460343871325568, + "grad_norm": 0.18812572956085205, + "learning_rate": 2.0980548186163918e-05, + "loss": 0.5412, + "step": 1969 + }, + { + "epoch": 0.5463117027176927, + "grad_norm": 0.18602755665779114, + "learning_rate": 2.0976490873165e-05, + "loss": 0.5503, + "step": 1970 + }, + { + "epoch": 0.5465890183028286, + "grad_norm": 0.19630911946296692, + "learning_rate": 2.097243190617813e-05, + "loss": 0.5687, + "step": 1971 + }, + { + "epoch": 0.5468663338879645, + "grad_norm": 0.18516132235527039, + "learning_rate": 2.0968371285995323e-05, + "loss": 0.5561, + "step": 1972 + }, + { + "epoch": 0.5471436494731003, + "grad_norm": 0.18923065066337585, + "learning_rate": 2.0964309013408914e-05, + "loss": 0.5477, + "step": 1973 + }, + { + "epoch": 0.5474209650582362, + "grad_norm": 0.18436457216739655, + "learning_rate": 2.096024508921156e-05, + "loss": 0.56, + "step": 1974 + }, + { + "epoch": 0.5476982806433721, + "grad_norm": 0.18705391883850098, + "learning_rate": 2.095617951419624e-05, + "loss": 0.536, + "step": 1975 + }, + { + "epoch": 0.547975596228508, + "grad_norm": 0.18911142647266388, + "learning_rate": 2.095211228915625e-05, + "loss": 0.5431, + "step": 1976 + }, + { + "epoch": 0.5482529118136439, + "grad_norm": 0.19025933742523193, + "learning_rate": 2.0948043414885222e-05, + "loss": 0.5352, + "step": 1977 + }, + { + "epoch": 0.5485302273987798, + "grad_norm": 0.18421220779418945, + "learning_rate": 2.0943972892177094e-05, + "loss": 0.536, + "step": 1978 + }, + { + "epoch": 0.5488075429839157, + "grad_norm": 0.19545422494411469, + "learning_rate": 2.0939900721826132e-05, + "loss": 0.5563, + "step": 1979 + }, + { + "epoch": 0.5490848585690515, + "grad_norm": 0.19028547406196594, + "learning_rate": 2.0935826904626937e-05, + "loss": 0.5081, + "step": 1980 + }, + { + "epoch": 0.5493621741541874, + "grad_norm": 0.20487383008003235, + "learning_rate": 2.0931751441374406e-05, + "loss": 0.5426, + "step": 1981 + }, + { + "epoch": 0.5496394897393233, + "grad_norm": 0.20360392332077026, + "learning_rate": 2.0927674332863774e-05, + "loss": 0.5686, + "step": 1982 + }, + { + "epoch": 0.5499168053244592, + "grad_norm": 0.19089289009571075, + "learning_rate": 2.092359557989059e-05, + "loss": 0.5574, + "step": 1983 + }, + { + "epoch": 0.5501941209095951, + "grad_norm": 0.1797301024198532, + "learning_rate": 2.0919515183250736e-05, + "loss": 0.5666, + "step": 1984 + }, + { + "epoch": 0.550471436494731, + "grad_norm": 0.19856330752372742, + "learning_rate": 2.0915433143740393e-05, + "loss": 0.5373, + "step": 1985 + }, + { + "epoch": 0.5507487520798668, + "grad_norm": 0.19353127479553223, + "learning_rate": 2.0911349462156082e-05, + "loss": 0.5454, + "step": 1986 + }, + { + "epoch": 0.5510260676650027, + "grad_norm": 0.19544550776481628, + "learning_rate": 2.090726413929464e-05, + "loss": 0.5705, + "step": 1987 + }, + { + "epoch": 0.5513033832501386, + "grad_norm": 0.199398934841156, + "learning_rate": 2.0903177175953216e-05, + "loss": 0.5431, + "step": 1988 + }, + { + "epoch": 0.5515806988352745, + "grad_norm": 0.19894284009933472, + "learning_rate": 2.0899088572929286e-05, + "loss": 0.5658, + "step": 1989 + }, + { + "epoch": 0.5518580144204104, + "grad_norm": 0.1806151121854782, + "learning_rate": 2.0894998331020645e-05, + "loss": 0.5748, + "step": 1990 + }, + { + "epoch": 0.5521353300055463, + "grad_norm": 0.19128967821598053, + "learning_rate": 2.089090645102541e-05, + "loss": 0.5576, + "step": 1991 + }, + { + "epoch": 0.5524126455906821, + "grad_norm": 0.20147933065891266, + "learning_rate": 2.0886812933742013e-05, + "loss": 0.5738, + "step": 1992 + }, + { + "epoch": 0.552689961175818, + "grad_norm": 0.17531876266002655, + "learning_rate": 2.0882717779969207e-05, + "loss": 0.544, + "step": 1993 + }, + { + "epoch": 0.5529672767609539, + "grad_norm": 0.19415581226348877, + "learning_rate": 2.087862099050607e-05, + "loss": 0.5315, + "step": 1994 + }, + { + "epoch": 0.5532445923460898, + "grad_norm": 0.22229987382888794, + "learning_rate": 2.087452256615199e-05, + "loss": 0.5324, + "step": 1995 + }, + { + "epoch": 0.5535219079312257, + "grad_norm": 0.18078093230724335, + "learning_rate": 2.0870422507706676e-05, + "loss": 0.5378, + "step": 1996 + }, + { + "epoch": 0.5537992235163616, + "grad_norm": 0.18564875423908234, + "learning_rate": 2.0866320815970157e-05, + "loss": 0.5238, + "step": 1997 + }, + { + "epoch": 0.5540765391014975, + "grad_norm": 0.19045381247997284, + "learning_rate": 2.086221749174279e-05, + "loss": 0.5527, + "step": 1998 + }, + { + "epoch": 0.5543538546866333, + "grad_norm": 0.18606480956077576, + "learning_rate": 2.0858112535825242e-05, + "loss": 0.5132, + "step": 1999 + }, + { + "epoch": 0.5546311702717692, + "grad_norm": 0.19520308077335358, + "learning_rate": 2.0854005949018487e-05, + "loss": 0.554, + "step": 2000 + }, + { + "epoch": 0.5549084858569051, + "grad_norm": 0.19881142675876617, + "learning_rate": 2.0849897732123838e-05, + "loss": 0.5327, + "step": 2001 + }, + { + "epoch": 0.555185801442041, + "grad_norm": 0.18831515312194824, + "learning_rate": 2.0845787885942917e-05, + "loss": 0.5541, + "step": 2002 + }, + { + "epoch": 0.5554631170271769, + "grad_norm": 0.1954246610403061, + "learning_rate": 2.0841676411277662e-05, + "loss": 0.5744, + "step": 2003 + }, + { + "epoch": 0.5557404326123128, + "grad_norm": 0.20168912410736084, + "learning_rate": 2.0837563308930325e-05, + "loss": 0.5704, + "step": 2004 + }, + { + "epoch": 0.5560177481974486, + "grad_norm": 0.19541358947753906, + "learning_rate": 2.0833448579703492e-05, + "loss": 0.5555, + "step": 2005 + }, + { + "epoch": 0.5562950637825845, + "grad_norm": 0.20079733431339264, + "learning_rate": 2.082933222440005e-05, + "loss": 0.5394, + "step": 2006 + }, + { + "epoch": 0.5565723793677204, + "grad_norm": 0.1951141506433487, + "learning_rate": 2.082521424382321e-05, + "loss": 0.5314, + "step": 2007 + }, + { + "epoch": 0.5568496949528563, + "grad_norm": 0.1864672750234604, + "learning_rate": 2.0821094638776497e-05, + "loss": 0.5365, + "step": 2008 + }, + { + "epoch": 0.5571270105379922, + "grad_norm": 0.18978318572044373, + "learning_rate": 2.0816973410063754e-05, + "loss": 0.5199, + "step": 2009 + }, + { + "epoch": 0.5574043261231281, + "grad_norm": 0.1931847780942917, + "learning_rate": 2.0812850558489153e-05, + "loss": 0.5701, + "step": 2010 + }, + { + "epoch": 0.557681641708264, + "grad_norm": 0.18549029529094696, + "learning_rate": 2.0808726084857157e-05, + "loss": 0.4933, + "step": 2011 + }, + { + "epoch": 0.5579589572933998, + "grad_norm": 0.18897327780723572, + "learning_rate": 2.0804599989972567e-05, + "loss": 0.58, + "step": 2012 + }, + { + "epoch": 0.5582362728785357, + "grad_norm": 0.20237743854522705, + "learning_rate": 2.0800472274640494e-05, + "loss": 0.5577, + "step": 2013 + }, + { + "epoch": 0.5585135884636716, + "grad_norm": 0.18905304372310638, + "learning_rate": 2.0796342939666362e-05, + "loss": 0.5763, + "step": 2014 + }, + { + "epoch": 0.5587909040488075, + "grad_norm": 0.2023850679397583, + "learning_rate": 2.079221198585592e-05, + "loss": 0.5502, + "step": 2015 + }, + { + "epoch": 0.5590682196339434, + "grad_norm": 0.19735179841518402, + "learning_rate": 2.0788079414015215e-05, + "loss": 0.5293, + "step": 2016 + }, + { + "epoch": 0.5593455352190793, + "grad_norm": 0.18779948353767395, + "learning_rate": 2.078394522495063e-05, + "loss": 0.5524, + "step": 2017 + }, + { + "epoch": 0.5596228508042151, + "grad_norm": 0.20868045091629028, + "learning_rate": 2.0779809419468854e-05, + "loss": 0.5724, + "step": 2018 + }, + { + "epoch": 0.559900166389351, + "grad_norm": 0.21078519523143768, + "learning_rate": 2.077567199837689e-05, + "loss": 0.5398, + "step": 2019 + }, + { + "epoch": 0.5601774819744869, + "grad_norm": 0.18992477655410767, + "learning_rate": 2.0771532962482057e-05, + "loss": 0.5665, + "step": 2020 + }, + { + "epoch": 0.5604547975596228, + "grad_norm": 0.1859736144542694, + "learning_rate": 2.0767392312591992e-05, + "loss": 0.5429, + "step": 2021 + }, + { + "epoch": 0.5607321131447587, + "grad_norm": 0.18972904980182648, + "learning_rate": 2.0763250049514654e-05, + "loss": 0.5595, + "step": 2022 + }, + { + "epoch": 0.5610094287298946, + "grad_norm": 0.189973384141922, + "learning_rate": 2.0759106174058293e-05, + "loss": 0.5669, + "step": 2023 + }, + { + "epoch": 0.5612867443150305, + "grad_norm": 0.19088168442249298, + "learning_rate": 2.07549606870315e-05, + "loss": 0.5654, + "step": 2024 + }, + { + "epoch": 0.5615640599001663, + "grad_norm": 0.1857694834470749, + "learning_rate": 2.075081358924317e-05, + "loss": 0.5318, + "step": 2025 + }, + { + "epoch": 0.5618413754853022, + "grad_norm": 0.19243502616882324, + "learning_rate": 2.0746664881502496e-05, + "loss": 0.549, + "step": 2026 + }, + { + "epoch": 0.5621186910704381, + "grad_norm": 0.18214242160320282, + "learning_rate": 2.0742514564619022e-05, + "loss": 0.5318, + "step": 2027 + }, + { + "epoch": 0.562396006655574, + "grad_norm": 0.18616682291030884, + "learning_rate": 2.0738362639402574e-05, + "loss": 0.5397, + "step": 2028 + }, + { + "epoch": 0.56267332224071, + "grad_norm": 0.18527580797672272, + "learning_rate": 2.07342091066633e-05, + "loss": 0.5513, + "step": 2029 + }, + { + "epoch": 0.5629506378258459, + "grad_norm": 0.20299869775772095, + "learning_rate": 2.073005396721167e-05, + "loss": 0.5437, + "step": 2030 + }, + { + "epoch": 0.5632279534109818, + "grad_norm": 0.18229471147060394, + "learning_rate": 2.072589722185846e-05, + "loss": 0.5411, + "step": 2031 + }, + { + "epoch": 0.5635052689961176, + "grad_norm": 0.19586963951587677, + "learning_rate": 2.0721738871414763e-05, + "loss": 0.5674, + "step": 2032 + }, + { + "epoch": 0.5637825845812535, + "grad_norm": 0.21511641144752502, + "learning_rate": 2.0717578916691977e-05, + "loss": 0.5762, + "step": 2033 + }, + { + "epoch": 0.5640599001663894, + "grad_norm": 0.18836332857608795, + "learning_rate": 2.071341735850183e-05, + "loss": 0.5599, + "step": 2034 + }, + { + "epoch": 0.5643372157515253, + "grad_norm": 0.1830950826406479, + "learning_rate": 2.070925419765634e-05, + "loss": 0.539, + "step": 2035 + }, + { + "epoch": 0.5646145313366612, + "grad_norm": 0.18749170005321503, + "learning_rate": 2.070508943496786e-05, + "loss": 0.5516, + "step": 2036 + }, + { + "epoch": 0.5648918469217971, + "grad_norm": 0.18844857811927795, + "learning_rate": 2.070092307124904e-05, + "loss": 0.5301, + "step": 2037 + }, + { + "epoch": 0.5651691625069329, + "grad_norm": 0.1957472264766693, + "learning_rate": 2.0696755107312845e-05, + "loss": 0.5612, + "step": 2038 + }, + { + "epoch": 0.5654464780920688, + "grad_norm": 0.1901545226573944, + "learning_rate": 2.0692585543972566e-05, + "loss": 0.5529, + "step": 2039 + }, + { + "epoch": 0.5657237936772047, + "grad_norm": 0.1877562254667282, + "learning_rate": 2.0688414382041788e-05, + "loss": 0.5324, + "step": 2040 + }, + { + "epoch": 0.5660011092623406, + "grad_norm": 0.18741729855537415, + "learning_rate": 2.068424162233441e-05, + "loss": 0.5522, + "step": 2041 + }, + { + "epoch": 0.5662784248474765, + "grad_norm": 0.18096144497394562, + "learning_rate": 2.068006726566466e-05, + "loss": 0.5382, + "step": 2042 + }, + { + "epoch": 0.5665557404326124, + "grad_norm": 0.18831691145896912, + "learning_rate": 2.0675891312847064e-05, + "loss": 0.5547, + "step": 2043 + }, + { + "epoch": 0.5668330560177482, + "grad_norm": 0.19189482927322388, + "learning_rate": 2.0671713764696445e-05, + "loss": 0.5699, + "step": 2044 + }, + { + "epoch": 0.5671103716028841, + "grad_norm": 0.183754563331604, + "learning_rate": 2.0667534622027974e-05, + "loss": 0.5256, + "step": 2045 + }, + { + "epoch": 0.56738768718802, + "grad_norm": 0.19887655973434448, + "learning_rate": 2.0663353885657098e-05, + "loss": 0.5792, + "step": 2046 + }, + { + "epoch": 0.5676650027731559, + "grad_norm": 0.18769381940364838, + "learning_rate": 2.0659171556399596e-05, + "loss": 0.5679, + "step": 2047 + }, + { + "epoch": 0.5679423183582918, + "grad_norm": 0.20135840773582458, + "learning_rate": 2.0654987635071554e-05, + "loss": 0.5766, + "step": 2048 + }, + { + "epoch": 0.5682196339434277, + "grad_norm": 0.20204971730709076, + "learning_rate": 2.065080212248936e-05, + "loss": 0.602, + "step": 2049 + }, + { + "epoch": 0.5684969495285636, + "grad_norm": 0.18772201240062714, + "learning_rate": 2.0646615019469724e-05, + "loss": 0.5584, + "step": 2050 + }, + { + "epoch": 0.5687742651136994, + "grad_norm": 0.18956024944782257, + "learning_rate": 2.064242632682965e-05, + "loss": 0.573, + "step": 2051 + }, + { + "epoch": 0.5690515806988353, + "grad_norm": 0.20082207024097443, + "learning_rate": 2.0638236045386472e-05, + "loss": 0.5478, + "step": 2052 + }, + { + "epoch": 0.5693288962839712, + "grad_norm": 0.1947973519563675, + "learning_rate": 2.063404417595783e-05, + "loss": 0.5543, + "step": 2053 + }, + { + "epoch": 0.5696062118691071, + "grad_norm": 0.19126874208450317, + "learning_rate": 2.0629850719361654e-05, + "loss": 0.5704, + "step": 2054 + }, + { + "epoch": 0.569883527454243, + "grad_norm": 0.3870353698730469, + "learning_rate": 2.062565567641621e-05, + "loss": 0.5323, + "step": 2055 + }, + { + "epoch": 0.5701608430393789, + "grad_norm": 0.20603255927562714, + "learning_rate": 2.0621459047940056e-05, + "loss": 0.574, + "step": 2056 + }, + { + "epoch": 0.5704381586245147, + "grad_norm": 0.18535418808460236, + "learning_rate": 2.0617260834752068e-05, + "loss": 0.5358, + "step": 2057 + }, + { + "epoch": 0.5707154742096506, + "grad_norm": 0.24727782607078552, + "learning_rate": 2.061306103767143e-05, + "loss": 0.5744, + "step": 2058 + }, + { + "epoch": 0.5709927897947865, + "grad_norm": 0.19518110156059265, + "learning_rate": 2.0608859657517633e-05, + "loss": 0.5655, + "step": 2059 + }, + { + "epoch": 0.5712701053799224, + "grad_norm": 0.19751004874706268, + "learning_rate": 2.0604656695110476e-05, + "loss": 0.5555, + "step": 2060 + }, + { + "epoch": 0.5715474209650583, + "grad_norm": 0.18788260221481323, + "learning_rate": 2.0600452151270068e-05, + "loss": 0.5463, + "step": 2061 + }, + { + "epoch": 0.5718247365501942, + "grad_norm": 0.23272345960140228, + "learning_rate": 2.0596246026816826e-05, + "loss": 0.5601, + "step": 2062 + }, + { + "epoch": 0.57210205213533, + "grad_norm": 0.19826073944568634, + "learning_rate": 2.059203832257148e-05, + "loss": 0.5654, + "step": 2063 + }, + { + "epoch": 0.5723793677204659, + "grad_norm": 0.19848833978176117, + "learning_rate": 2.058782903935506e-05, + "loss": 0.5666, + "step": 2064 + }, + { + "epoch": 0.5726566833056018, + "grad_norm": 0.21063697338104248, + "learning_rate": 2.0583618177988917e-05, + "loss": 0.5421, + "step": 2065 + }, + { + "epoch": 0.5729339988907377, + "grad_norm": 0.19619859755039215, + "learning_rate": 2.0579405739294695e-05, + "loss": 0.5343, + "step": 2066 + }, + { + "epoch": 0.5732113144758736, + "grad_norm": 0.18998843431472778, + "learning_rate": 2.057519172409435e-05, + "loss": 0.5486, + "step": 2067 + }, + { + "epoch": 0.5734886300610095, + "grad_norm": 0.19432705640792847, + "learning_rate": 2.0570976133210152e-05, + "loss": 0.528, + "step": 2068 + }, + { + "epoch": 0.5737659456461454, + "grad_norm": 0.18903441727161407, + "learning_rate": 2.0566758967464677e-05, + "loss": 0.5529, + "step": 2069 + }, + { + "epoch": 0.5740432612312812, + "grad_norm": 0.19154904782772064, + "learning_rate": 2.05625402276808e-05, + "loss": 0.5519, + "step": 2070 + }, + { + "epoch": 0.5743205768164171, + "grad_norm": 0.19544386863708496, + "learning_rate": 2.0558319914681713e-05, + "loss": 0.5444, + "step": 2071 + }, + { + "epoch": 0.574597892401553, + "grad_norm": 0.1918288618326187, + "learning_rate": 2.055409802929091e-05, + "loss": 0.5646, + "step": 2072 + }, + { + "epoch": 0.5748752079866889, + "grad_norm": 0.19810417294502258, + "learning_rate": 2.054987457233219e-05, + "loss": 0.5799, + "step": 2073 + }, + { + "epoch": 0.5751525235718248, + "grad_norm": 0.20202040672302246, + "learning_rate": 2.0545649544629665e-05, + "loss": 0.5555, + "step": 2074 + }, + { + "epoch": 0.5754298391569607, + "grad_norm": 0.19962945580482483, + "learning_rate": 2.0541422947007748e-05, + "loss": 0.5245, + "step": 2075 + }, + { + "epoch": 0.5757071547420965, + "grad_norm": 0.19925406575202942, + "learning_rate": 2.053719478029116e-05, + "loss": 0.5755, + "step": 2076 + }, + { + "epoch": 0.5759844703272324, + "grad_norm": 0.1866733878850937, + "learning_rate": 2.0532965045304932e-05, + "loss": 0.5339, + "step": 2077 + }, + { + "epoch": 0.5762617859123683, + "grad_norm": 0.22320881485939026, + "learning_rate": 2.052873374287439e-05, + "loss": 0.5111, + "step": 2078 + }, + { + "epoch": 0.5765391014975042, + "grad_norm": 0.19458182156085968, + "learning_rate": 2.0524500873825182e-05, + "loss": 0.5625, + "step": 2079 + }, + { + "epoch": 0.5768164170826401, + "grad_norm": 0.18524032831192017, + "learning_rate": 2.0520266438983242e-05, + "loss": 0.5876, + "step": 2080 + }, + { + "epoch": 0.577093732667776, + "grad_norm": 0.17904391884803772, + "learning_rate": 2.0516030439174833e-05, + "loss": 0.5541, + "step": 2081 + }, + { + "epoch": 0.5773710482529119, + "grad_norm": 0.1819324642419815, + "learning_rate": 2.05117928752265e-05, + "loss": 0.5326, + "step": 2082 + }, + { + "epoch": 0.5776483638380477, + "grad_norm": 0.18749113380908966, + "learning_rate": 2.0507553747965114e-05, + "loss": 0.5502, + "step": 2083 + }, + { + "epoch": 0.5779256794231836, + "grad_norm": 0.1892707794904709, + "learning_rate": 2.050331305821783e-05, + "loss": 0.5533, + "step": 2084 + }, + { + "epoch": 0.5782029950083195, + "grad_norm": 0.1937221884727478, + "learning_rate": 2.0499070806812126e-05, + "loss": 0.5615, + "step": 2085 + }, + { + "epoch": 0.5784803105934554, + "grad_norm": 0.20081382989883423, + "learning_rate": 2.0494826994575777e-05, + "loss": 0.5424, + "step": 2086 + }, + { + "epoch": 0.5787576261785913, + "grad_norm": 0.18762163817882538, + "learning_rate": 2.0490581622336863e-05, + "loss": 0.5621, + "step": 2087 + }, + { + "epoch": 0.5790349417637272, + "grad_norm": 0.18031221628189087, + "learning_rate": 2.048633469092377e-05, + "loss": 0.5045, + "step": 2088 + }, + { + "epoch": 0.579312257348863, + "grad_norm": 0.21820896863937378, + "learning_rate": 2.048208620116518e-05, + "loss": 0.5676, + "step": 2089 + }, + { + "epoch": 0.5795895729339989, + "grad_norm": 0.19855897128582, + "learning_rate": 2.0477836153890095e-05, + "loss": 0.5461, + "step": 2090 + }, + { + "epoch": 0.5798668885191348, + "grad_norm": 0.1899833232164383, + "learning_rate": 2.0473584549927806e-05, + "loss": 0.5519, + "step": 2091 + }, + { + "epoch": 0.5801442041042707, + "grad_norm": 0.1892709583044052, + "learning_rate": 2.0469331390107914e-05, + "loss": 0.5255, + "step": 2092 + }, + { + "epoch": 0.5804215196894066, + "grad_norm": 0.19004952907562256, + "learning_rate": 2.0465076675260326e-05, + "loss": 0.5468, + "step": 2093 + }, + { + "epoch": 0.5806988352745425, + "grad_norm": 0.18664851784706116, + "learning_rate": 2.0460820406215247e-05, + "loss": 0.5333, + "step": 2094 + }, + { + "epoch": 0.5809761508596784, + "grad_norm": 0.18904832005500793, + "learning_rate": 2.045656258380319e-05, + "loss": 0.5653, + "step": 2095 + }, + { + "epoch": 0.5812534664448142, + "grad_norm": 0.19123364984989166, + "learning_rate": 2.0452303208854966e-05, + "loss": 0.5368, + "step": 2096 + }, + { + "epoch": 0.5815307820299501, + "grad_norm": 0.1906225085258484, + "learning_rate": 2.0448042282201694e-05, + "loss": 0.5672, + "step": 2097 + }, + { + "epoch": 0.581808097615086, + "grad_norm": 0.19360537827014923, + "learning_rate": 2.0443779804674796e-05, + "loss": 0.548, + "step": 2098 + }, + { + "epoch": 0.5820854132002219, + "grad_norm": 0.21694041788578033, + "learning_rate": 2.0439515777105987e-05, + "loss": 0.5724, + "step": 2099 + }, + { + "epoch": 0.5823627287853578, + "grad_norm": 0.191674143075943, + "learning_rate": 2.04352502003273e-05, + "loss": 0.5295, + "step": 2100 + }, + { + "epoch": 0.5826400443704937, + "grad_norm": 0.19594216346740723, + "learning_rate": 2.0430983075171055e-05, + "loss": 0.5412, + "step": 2101 + }, + { + "epoch": 0.5829173599556295, + "grad_norm": 0.19115525484085083, + "learning_rate": 2.0426714402469887e-05, + "loss": 0.5368, + "step": 2102 + }, + { + "epoch": 0.5831946755407654, + "grad_norm": 0.19171211123466492, + "learning_rate": 2.042244418305673e-05, + "loss": 0.57, + "step": 2103 + }, + { + "epoch": 0.5834719911259013, + "grad_norm": 0.18373924493789673, + "learning_rate": 2.0418172417764802e-05, + "loss": 0.5648, + "step": 2104 + }, + { + "epoch": 0.5837493067110372, + "grad_norm": 0.20192669332027435, + "learning_rate": 2.0413899107427652e-05, + "loss": 0.5699, + "step": 2105 + }, + { + "epoch": 0.5840266222961731, + "grad_norm": 0.18735186755657196, + "learning_rate": 2.0409624252879112e-05, + "loss": 0.551, + "step": 2106 + }, + { + "epoch": 0.584303937881309, + "grad_norm": 0.19031678140163422, + "learning_rate": 2.0405347854953316e-05, + "loss": 0.5313, + "step": 2107 + }, + { + "epoch": 0.5845812534664449, + "grad_norm": 0.19912661612033844, + "learning_rate": 2.0401069914484707e-05, + "loss": 0.5815, + "step": 2108 + }, + { + "epoch": 0.5848585690515807, + "grad_norm": 0.1985718458890915, + "learning_rate": 2.0396790432308025e-05, + "loss": 0.5364, + "step": 2109 + }, + { + "epoch": 0.5851358846367166, + "grad_norm": 0.20383597910404205, + "learning_rate": 2.0392509409258303e-05, + "loss": 0.5747, + "step": 2110 + }, + { + "epoch": 0.5854132002218525, + "grad_norm": 0.19357682764530182, + "learning_rate": 2.038822684617089e-05, + "loss": 0.5508, + "step": 2111 + }, + { + "epoch": 0.5856905158069884, + "grad_norm": 0.18165504932403564, + "learning_rate": 2.0383942743881425e-05, + "loss": 0.5234, + "step": 2112 + }, + { + "epoch": 0.5859678313921243, + "grad_norm": 0.18874984979629517, + "learning_rate": 2.0379657103225852e-05, + "loss": 0.5691, + "step": 2113 + }, + { + "epoch": 0.5862451469772602, + "grad_norm": 0.18956932425498962, + "learning_rate": 2.0375369925040406e-05, + "loss": 0.5664, + "step": 2114 + }, + { + "epoch": 0.586522462562396, + "grad_norm": 0.17788472771644592, + "learning_rate": 2.0371081210161634e-05, + "loss": 0.5473, + "step": 2115 + }, + { + "epoch": 0.5867997781475319, + "grad_norm": 0.22669924795627594, + "learning_rate": 2.0366790959426378e-05, + "loss": 0.5603, + "step": 2116 + }, + { + "epoch": 0.5870770937326678, + "grad_norm": 0.1913762092590332, + "learning_rate": 2.0362499173671784e-05, + "loss": 0.5698, + "step": 2117 + }, + { + "epoch": 0.5873544093178037, + "grad_norm": 0.19173979759216309, + "learning_rate": 2.0358205853735287e-05, + "loss": 0.5735, + "step": 2118 + }, + { + "epoch": 0.5876317249029396, + "grad_norm": 0.18526272475719452, + "learning_rate": 2.035391100045462e-05, + "loss": 0.5462, + "step": 2119 + }, + { + "epoch": 0.5879090404880755, + "grad_norm": 0.20534314215183258, + "learning_rate": 2.034961461466784e-05, + "loss": 0.5643, + "step": 2120 + }, + { + "epoch": 0.5881863560732113, + "grad_norm": 0.19565671682357788, + "learning_rate": 2.0345316697213273e-05, + "loss": 0.5599, + "step": 2121 + }, + { + "epoch": 0.5884636716583472, + "grad_norm": 0.19610898196697235, + "learning_rate": 2.034101724892956e-05, + "loss": 0.5597, + "step": 2122 + }, + { + "epoch": 0.5887409872434831, + "grad_norm": 0.19708271324634552, + "learning_rate": 2.033671627065564e-05, + "loss": 0.5494, + "step": 2123 + }, + { + "epoch": 0.589018302828619, + "grad_norm": 0.285875141620636, + "learning_rate": 2.033241376323075e-05, + "loss": 0.5553, + "step": 2124 + }, + { + "epoch": 0.5892956184137549, + "grad_norm": 0.1879926174879074, + "learning_rate": 2.0328109727494417e-05, + "loss": 0.5464, + "step": 2125 + }, + { + "epoch": 0.5895729339988908, + "grad_norm": 0.20556902885437012, + "learning_rate": 2.032380416428647e-05, + "loss": 0.5533, + "step": 2126 + }, + { + "epoch": 0.5898502495840267, + "grad_norm": 0.1847870945930481, + "learning_rate": 2.0319497074447043e-05, + "loss": 0.5629, + "step": 2127 + }, + { + "epoch": 0.5901275651691625, + "grad_norm": 0.19035299122333527, + "learning_rate": 2.0315188458816567e-05, + "loss": 0.5491, + "step": 2128 + }, + { + "epoch": 0.5904048807542984, + "grad_norm": 0.17980261147022247, + "learning_rate": 2.031087831823576e-05, + "loss": 0.5281, + "step": 2129 + }, + { + "epoch": 0.5906821963394343, + "grad_norm": 0.19888748228549957, + "learning_rate": 2.030656665354565e-05, + "loss": 0.5443, + "step": 2130 + }, + { + "epoch": 0.5909595119245702, + "grad_norm": 0.17473064363002777, + "learning_rate": 2.0302253465587555e-05, + "loss": 0.5263, + "step": 2131 + }, + { + "epoch": 0.5912368275097061, + "grad_norm": 0.1915203481912613, + "learning_rate": 2.0297938755203088e-05, + "loss": 0.5609, + "step": 2132 + }, + { + "epoch": 0.591514143094842, + "grad_norm": 0.1980845183134079, + "learning_rate": 2.029362252323417e-05, + "loss": 0.5549, + "step": 2133 + }, + { + "epoch": 0.5917914586799778, + "grad_norm": 0.18366824090480804, + "learning_rate": 2.028930477052301e-05, + "loss": 0.5333, + "step": 2134 + }, + { + "epoch": 0.5920687742651137, + "grad_norm": 0.1946541965007782, + "learning_rate": 2.0284985497912118e-05, + "loss": 0.588, + "step": 2135 + }, + { + "epoch": 0.5923460898502496, + "grad_norm": 0.1904669553041458, + "learning_rate": 2.028066470624429e-05, + "loss": 0.5602, + "step": 2136 + }, + { + "epoch": 0.5926234054353855, + "grad_norm": 0.1851152926683426, + "learning_rate": 2.0276342396362636e-05, + "loss": 0.5461, + "step": 2137 + }, + { + "epoch": 0.5929007210205214, + "grad_norm": 0.18607455492019653, + "learning_rate": 2.0272018569110552e-05, + "loss": 0.542, + "step": 2138 + }, + { + "epoch": 0.5931780366056573, + "grad_norm": 0.19023166596889496, + "learning_rate": 2.0267693225331726e-05, + "loss": 0.5564, + "step": 2139 + }, + { + "epoch": 0.5934553521907932, + "grad_norm": 0.18977974355220795, + "learning_rate": 2.0263366365870152e-05, + "loss": 0.5778, + "step": 2140 + }, + { + "epoch": 0.593732667775929, + "grad_norm": 0.2097538560628891, + "learning_rate": 2.0259037991570116e-05, + "loss": 0.5822, + "step": 2141 + }, + { + "epoch": 0.5940099833610649, + "grad_norm": 0.18402041494846344, + "learning_rate": 2.0254708103276193e-05, + "loss": 0.5207, + "step": 2142 + }, + { + "epoch": 0.5942872989462008, + "grad_norm": 0.196335569024086, + "learning_rate": 2.025037670183326e-05, + "loss": 0.5509, + "step": 2143 + }, + { + "epoch": 0.5945646145313367, + "grad_norm": 0.18417152762413025, + "learning_rate": 2.0246043788086498e-05, + "loss": 0.5418, + "step": 2144 + }, + { + "epoch": 0.5948419301164726, + "grad_norm": 0.1963946372270584, + "learning_rate": 2.024170936288136e-05, + "loss": 0.5717, + "step": 2145 + }, + { + "epoch": 0.5951192457016085, + "grad_norm": 0.20180796086788177, + "learning_rate": 2.023737342706361e-05, + "loss": 0.5457, + "step": 2146 + }, + { + "epoch": 0.5953965612867443, + "grad_norm": 0.18826082348823547, + "learning_rate": 2.0233035981479316e-05, + "loss": 0.545, + "step": 2147 + }, + { + "epoch": 0.5956738768718802, + "grad_norm": 0.1920921951532364, + "learning_rate": 2.0228697026974808e-05, + "loss": 0.5584, + "step": 2148 + }, + { + "epoch": 0.5959511924570161, + "grad_norm": 0.20350618660449982, + "learning_rate": 2.0224356564396747e-05, + "loss": 0.5343, + "step": 2149 + }, + { + "epoch": 0.596228508042152, + "grad_norm": 0.19070284068584442, + "learning_rate": 2.0220014594592068e-05, + "loss": 0.5573, + "step": 2150 + }, + { + "epoch": 0.5965058236272879, + "grad_norm": 0.1937059462070465, + "learning_rate": 2.0215671118408004e-05, + "loss": 0.5548, + "step": 2151 + }, + { + "epoch": 0.5967831392124238, + "grad_norm": 0.18935304880142212, + "learning_rate": 2.021132613669208e-05, + "loss": 0.5746, + "step": 2152 + }, + { + "epoch": 0.5970604547975596, + "grad_norm": 0.20879191160202026, + "learning_rate": 2.0206979650292117e-05, + "loss": 0.5602, + "step": 2153 + }, + { + "epoch": 0.5973377703826955, + "grad_norm": 0.19923923909664154, + "learning_rate": 2.020263166005624e-05, + "loss": 0.5441, + "step": 2154 + }, + { + "epoch": 0.5976150859678314, + "grad_norm": 0.1885758638381958, + "learning_rate": 2.019828216683284e-05, + "loss": 0.5362, + "step": 2155 + }, + { + "epoch": 0.5978924015529673, + "grad_norm": 0.18912041187286377, + "learning_rate": 2.019393117147063e-05, + "loss": 0.5488, + "step": 2156 + }, + { + "epoch": 0.5981697171381032, + "grad_norm": 0.18454459309577942, + "learning_rate": 2.0189578674818603e-05, + "loss": 0.5386, + "step": 2157 + }, + { + "epoch": 0.5984470327232391, + "grad_norm": 0.1803792268037796, + "learning_rate": 2.018522467772604e-05, + "loss": 0.5444, + "step": 2158 + }, + { + "epoch": 0.598724348308375, + "grad_norm": 0.18980631232261658, + "learning_rate": 2.0180869181042532e-05, + "loss": 0.5521, + "step": 2159 + }, + { + "epoch": 0.5990016638935108, + "grad_norm": 0.18810968101024628, + "learning_rate": 2.0176512185617945e-05, + "loss": 0.5342, + "step": 2160 + }, + { + "epoch": 0.5992789794786467, + "grad_norm": 0.24775730073451996, + "learning_rate": 2.0172153692302445e-05, + "loss": 0.533, + "step": 2161 + }, + { + "epoch": 0.5995562950637826, + "grad_norm": 0.19899478554725647, + "learning_rate": 2.0167793701946488e-05, + "loss": 0.5394, + "step": 2162 + }, + { + "epoch": 0.5998336106489185, + "grad_norm": 0.19542957842350006, + "learning_rate": 2.0163432215400822e-05, + "loss": 0.5379, + "step": 2163 + }, + { + "epoch": 0.6001109262340544, + "grad_norm": 0.2002883106470108, + "learning_rate": 2.0159069233516504e-05, + "loss": 0.5672, + "step": 2164 + }, + { + "epoch": 0.6003882418191903, + "grad_norm": 0.18904021382331848, + "learning_rate": 2.0154704757144845e-05, + "loss": 0.5483, + "step": 2165 + }, + { + "epoch": 0.6006655574043261, + "grad_norm": 0.19162911176681519, + "learning_rate": 2.0150338787137486e-05, + "loss": 0.5552, + "step": 2166 + }, + { + "epoch": 0.600942872989462, + "grad_norm": 0.18318617343902588, + "learning_rate": 2.014597132434633e-05, + "loss": 0.5548, + "step": 2167 + }, + { + "epoch": 0.6012201885745979, + "grad_norm": 0.1864987015724182, + "learning_rate": 2.01416023696236e-05, + "loss": 0.5401, + "step": 2168 + }, + { + "epoch": 0.6014975041597338, + "grad_norm": 0.18252375721931458, + "learning_rate": 2.0137231923821785e-05, + "loss": 0.5547, + "step": 2169 + }, + { + "epoch": 0.6017748197448697, + "grad_norm": 0.196000874042511, + "learning_rate": 2.013285998779367e-05, + "loss": 0.5518, + "step": 2170 + }, + { + "epoch": 0.6020521353300056, + "grad_norm": 0.17955927550792694, + "learning_rate": 2.0128486562392354e-05, + "loss": 0.5312, + "step": 2171 + }, + { + "epoch": 0.6023294509151415, + "grad_norm": 0.1867658495903015, + "learning_rate": 2.0124111648471192e-05, + "loss": 0.5374, + "step": 2172 + }, + { + "epoch": 0.6026067665002773, + "grad_norm": 0.20001055300235748, + "learning_rate": 2.0119735246883852e-05, + "loss": 0.5501, + "step": 2173 + }, + { + "epoch": 0.6028840820854132, + "grad_norm": 0.19283756613731384, + "learning_rate": 2.011535735848428e-05, + "loss": 0.5505, + "step": 2174 + }, + { + "epoch": 0.6031613976705491, + "grad_norm": 0.18642939627170563, + "learning_rate": 2.011097798412673e-05, + "loss": 0.5246, + "step": 2175 + }, + { + "epoch": 0.603438713255685, + "grad_norm": 0.1911175698041916, + "learning_rate": 2.0106597124665716e-05, + "loss": 0.5134, + "step": 2176 + }, + { + "epoch": 0.6037160288408209, + "grad_norm": 0.18297746777534485, + "learning_rate": 2.0102214780956073e-05, + "loss": 0.5598, + "step": 2177 + }, + { + "epoch": 0.6039933444259568, + "grad_norm": 0.19483189284801483, + "learning_rate": 2.0097830953852914e-05, + "loss": 0.5502, + "step": 2178 + }, + { + "epoch": 0.6042706600110926, + "grad_norm": 0.18785777688026428, + "learning_rate": 2.009344564421163e-05, + "loss": 0.5384, + "step": 2179 + }, + { + "epoch": 0.6045479755962285, + "grad_norm": 0.19045081734657288, + "learning_rate": 2.0089058852887923e-05, + "loss": 0.5311, + "step": 2180 + }, + { + "epoch": 0.6048252911813644, + "grad_norm": 0.19081301987171173, + "learning_rate": 2.0084670580737758e-05, + "loss": 0.5593, + "step": 2181 + }, + { + "epoch": 0.6051026067665003, + "grad_norm": 0.19843098521232605, + "learning_rate": 2.0080280828617414e-05, + "loss": 0.557, + "step": 2182 + }, + { + "epoch": 0.6053799223516362, + "grad_norm": 0.1962948590517044, + "learning_rate": 2.0075889597383446e-05, + "loss": 0.5591, + "step": 2183 + }, + { + "epoch": 0.6056572379367721, + "grad_norm": 0.193936288356781, + "learning_rate": 2.0071496887892693e-05, + "loss": 0.5721, + "step": 2184 + }, + { + "epoch": 0.605934553521908, + "grad_norm": 0.19298399984836578, + "learning_rate": 2.00671027010023e-05, + "loss": 0.5734, + "step": 2185 + }, + { + "epoch": 0.6062118691070438, + "grad_norm": 0.20237858593463898, + "learning_rate": 2.006270703756968e-05, + "loss": 0.548, + "step": 2186 + }, + { + "epoch": 0.6064891846921797, + "grad_norm": 0.19070473313331604, + "learning_rate": 2.0058309898452552e-05, + "loss": 0.5187, + "step": 2187 + }, + { + "epoch": 0.6067665002773156, + "grad_norm": 0.18649962544441223, + "learning_rate": 2.0053911284508902e-05, + "loss": 0.5465, + "step": 2188 + }, + { + "epoch": 0.6070438158624515, + "grad_norm": 0.21800090372562408, + "learning_rate": 2.0049511196597027e-05, + "loss": 0.5665, + "step": 2189 + }, + { + "epoch": 0.6073211314475874, + "grad_norm": 0.18259546160697937, + "learning_rate": 2.00451096355755e-05, + "loss": 0.5456, + "step": 2190 + }, + { + "epoch": 0.6075984470327233, + "grad_norm": 0.19721095263957977, + "learning_rate": 2.0040706602303173e-05, + "loss": 0.5752, + "step": 2191 + }, + { + "epoch": 0.6078757626178591, + "grad_norm": 0.19083106517791748, + "learning_rate": 2.0036302097639204e-05, + "loss": 0.552, + "step": 2192 + }, + { + "epoch": 0.608153078202995, + "grad_norm": 0.18962670862674713, + "learning_rate": 2.0031896122443023e-05, + "loss": 0.5729, + "step": 2193 + }, + { + "epoch": 0.6084303937881309, + "grad_norm": 0.19744956493377686, + "learning_rate": 2.0027488677574358e-05, + "loss": 0.5494, + "step": 2194 + }, + { + "epoch": 0.6087077093732668, + "grad_norm": 0.19325025379657745, + "learning_rate": 2.0023079763893208e-05, + "loss": 0.5389, + "step": 2195 + }, + { + "epoch": 0.6089850249584027, + "grad_norm": 0.1866646558046341, + "learning_rate": 2.0018669382259885e-05, + "loss": 0.5703, + "step": 2196 + }, + { + "epoch": 0.6092623405435386, + "grad_norm": 0.21793098747730255, + "learning_rate": 2.001425753353496e-05, + "loss": 0.5585, + "step": 2197 + }, + { + "epoch": 0.6095396561286744, + "grad_norm": 0.18613992631435394, + "learning_rate": 2.0009844218579298e-05, + "loss": 0.544, + "step": 2198 + }, + { + "epoch": 0.6098169717138103, + "grad_norm": 0.1990228146314621, + "learning_rate": 2.0005429438254063e-05, + "loss": 0.5681, + "step": 2199 + }, + { + "epoch": 0.6100942872989462, + "grad_norm": 0.19253648817539215, + "learning_rate": 2.000101319342069e-05, + "loss": 0.5451, + "step": 2200 + }, + { + "epoch": 0.6103716028840821, + "grad_norm": 0.18924476206302643, + "learning_rate": 1.9996595484940915e-05, + "loss": 0.5684, + "step": 2201 + }, + { + "epoch": 0.610648918469218, + "grad_norm": 0.18552450835704803, + "learning_rate": 1.9992176313676737e-05, + "loss": 0.5307, + "step": 2202 + }, + { + "epoch": 0.6109262340543539, + "grad_norm": 0.19528694450855255, + "learning_rate": 1.9987755680490456e-05, + "loss": 0.5598, + "step": 2203 + }, + { + "epoch": 0.6112035496394898, + "grad_norm": 0.1909715086221695, + "learning_rate": 1.998333358624466e-05, + "loss": 0.5647, + "step": 2204 + }, + { + "epoch": 0.6114808652246256, + "grad_norm": 0.19347867369651794, + "learning_rate": 1.9978910031802218e-05, + "loss": 0.5726, + "step": 2205 + }, + { + "epoch": 0.6117581808097615, + "grad_norm": 0.22693443298339844, + "learning_rate": 1.9974485018026273e-05, + "loss": 0.5403, + "step": 2206 + }, + { + "epoch": 0.6120354963948974, + "grad_norm": 0.1854747086763382, + "learning_rate": 1.997005854578027e-05, + "loss": 0.5733, + "step": 2207 + }, + { + "epoch": 0.6123128119800333, + "grad_norm": 0.2657473683357239, + "learning_rate": 1.9965630615927932e-05, + "loss": 0.5516, + "step": 2208 + }, + { + "epoch": 0.6125901275651692, + "grad_norm": 0.19607201218605042, + "learning_rate": 1.996120122933326e-05, + "loss": 0.5525, + "step": 2209 + }, + { + "epoch": 0.6128674431503051, + "grad_norm": 0.17779147624969482, + "learning_rate": 1.9956770386860547e-05, + "loss": 0.5377, + "step": 2210 + }, + { + "epoch": 0.6131447587354409, + "grad_norm": 0.2034800499677658, + "learning_rate": 1.9952338089374366e-05, + "loss": 0.5434, + "step": 2211 + }, + { + "epoch": 0.6134220743205768, + "grad_norm": 0.18624994158744812, + "learning_rate": 1.9947904337739582e-05, + "loss": 0.5301, + "step": 2212 + }, + { + "epoch": 0.6136993899057127, + "grad_norm": 0.18844860792160034, + "learning_rate": 1.9943469132821334e-05, + "loss": 0.5508, + "step": 2213 + }, + { + "epoch": 0.6139767054908486, + "grad_norm": 0.20432956516742706, + "learning_rate": 1.9939032475485043e-05, + "loss": 0.5565, + "step": 2214 + }, + { + "epoch": 0.6142540210759845, + "grad_norm": 0.18925762176513672, + "learning_rate": 1.9934594366596423e-05, + "loss": 0.5745, + "step": 2215 + }, + { + "epoch": 0.6145313366611204, + "grad_norm": 0.20550455152988434, + "learning_rate": 1.993015480702147e-05, + "loss": 0.5689, + "step": 2216 + }, + { + "epoch": 0.6148086522462562, + "grad_norm": 0.18953469395637512, + "learning_rate": 1.992571379762645e-05, + "loss": 0.5393, + "step": 2217 + }, + { + "epoch": 0.6150859678313921, + "grad_norm": 0.19926683604717255, + "learning_rate": 1.9921271339277935e-05, + "loss": 0.5563, + "step": 2218 + }, + { + "epoch": 0.615363283416528, + "grad_norm": 0.18314692378044128, + "learning_rate": 1.9916827432842756e-05, + "loss": 0.5353, + "step": 2219 + }, + { + "epoch": 0.6156405990016639, + "grad_norm": 0.20692428946495056, + "learning_rate": 1.991238207918804e-05, + "loss": 0.57, + "step": 2220 + }, + { + "epoch": 0.6159179145867998, + "grad_norm": 0.202706441283226, + "learning_rate": 1.990793527918119e-05, + "loss": 0.5496, + "step": 2221 + }, + { + "epoch": 0.6161952301719357, + "grad_norm": 0.18222178518772125, + "learning_rate": 1.99034870336899e-05, + "loss": 0.5553, + "step": 2222 + }, + { + "epoch": 0.6164725457570716, + "grad_norm": 0.20487068593502045, + "learning_rate": 1.9899037343582135e-05, + "loss": 0.5611, + "step": 2223 + }, + { + "epoch": 0.6167498613422074, + "grad_norm": 0.18165314197540283, + "learning_rate": 1.989458620972615e-05, + "loss": 0.5561, + "step": 2224 + }, + { + "epoch": 0.6170271769273433, + "grad_norm": 0.1925044059753418, + "learning_rate": 1.9890133632990488e-05, + "loss": 0.5866, + "step": 2225 + }, + { + "epoch": 0.6173044925124792, + "grad_norm": 0.19239196181297302, + "learning_rate": 1.988567961424395e-05, + "loss": 0.5629, + "step": 2226 + }, + { + "epoch": 0.6175818080976151, + "grad_norm": 0.19511006772518158, + "learning_rate": 1.9881224154355638e-05, + "loss": 0.5484, + "step": 2227 + }, + { + "epoch": 0.617859123682751, + "grad_norm": 0.18884329497814178, + "learning_rate": 1.9876767254194932e-05, + "loss": 0.5541, + "step": 2228 + }, + { + "epoch": 0.6181364392678869, + "grad_norm": 0.1917329877614975, + "learning_rate": 1.9872308914631494e-05, + "loss": 0.5535, + "step": 2229 + }, + { + "epoch": 0.6184137548530227, + "grad_norm": 0.19358281791210175, + "learning_rate": 1.986784913653526e-05, + "loss": 0.5445, + "step": 2230 + }, + { + "epoch": 0.6186910704381586, + "grad_norm": 0.1858266144990921, + "learning_rate": 1.9863387920776454e-05, + "loss": 0.5446, + "step": 2231 + }, + { + "epoch": 0.6189683860232945, + "grad_norm": 0.1879933476448059, + "learning_rate": 1.985892526822557e-05, + "loss": 0.5511, + "step": 2232 + }, + { + "epoch": 0.6192457016084304, + "grad_norm": 0.19498036801815033, + "learning_rate": 1.9854461179753396e-05, + "loss": 0.5535, + "step": 2233 + }, + { + "epoch": 0.6195230171935663, + "grad_norm": 0.1921449601650238, + "learning_rate": 1.9849995656231e-05, + "loss": 0.563, + "step": 2234 + }, + { + "epoch": 0.6198003327787022, + "grad_norm": 0.1764960139989853, + "learning_rate": 1.984552869852971e-05, + "loss": 0.5604, + "step": 2235 + }, + { + "epoch": 0.620077648363838, + "grad_norm": 0.18862247467041016, + "learning_rate": 1.984106030752116e-05, + "loss": 0.5507, + "step": 2236 + }, + { + "epoch": 0.6203549639489739, + "grad_norm": 0.19972002506256104, + "learning_rate": 1.9836590484077244e-05, + "loss": 0.5443, + "step": 2237 + }, + { + "epoch": 0.6206322795341098, + "grad_norm": 0.18754172325134277, + "learning_rate": 1.983211922907014e-05, + "loss": 0.5361, + "step": 2238 + }, + { + "epoch": 0.6209095951192457, + "grad_norm": 0.20183882117271423, + "learning_rate": 1.9827646543372322e-05, + "loss": 0.5537, + "step": 2239 + }, + { + "epoch": 0.6211869107043816, + "grad_norm": 0.18864542245864868, + "learning_rate": 1.9823172427856518e-05, + "loss": 0.5515, + "step": 2240 + }, + { + "epoch": 0.6214642262895175, + "grad_norm": 0.20876039564609528, + "learning_rate": 1.981869688339575e-05, + "loss": 0.5353, + "step": 2241 + }, + { + "epoch": 0.6217415418746534, + "grad_norm": 0.20126941800117493, + "learning_rate": 1.9814219910863313e-05, + "loss": 0.5549, + "step": 2242 + }, + { + "epoch": 0.6220188574597892, + "grad_norm": 0.1895267367362976, + "learning_rate": 1.9809741511132786e-05, + "loss": 0.5427, + "step": 2243 + }, + { + "epoch": 0.6222961730449251, + "grad_norm": 0.1944306641817093, + "learning_rate": 1.980526168507802e-05, + "loss": 0.5627, + "step": 2244 + }, + { + "epoch": 0.622573488630061, + "grad_norm": 0.19707219302654266, + "learning_rate": 1.980078043357315e-05, + "loss": 0.573, + "step": 2245 + }, + { + "epoch": 0.6228508042151969, + "grad_norm": 0.1903533786535263, + "learning_rate": 1.9796297757492587e-05, + "loss": 0.5534, + "step": 2246 + }, + { + "epoch": 0.6231281198003328, + "grad_norm": 0.1989421546459198, + "learning_rate": 1.9791813657711022e-05, + "loss": 0.5343, + "step": 2247 + }, + { + "epoch": 0.6234054353854687, + "grad_norm": 0.1919817328453064, + "learning_rate": 1.9787328135103418e-05, + "loss": 0.5493, + "step": 2248 + }, + { + "epoch": 0.6236827509706045, + "grad_norm": 0.18703347444534302, + "learning_rate": 1.9782841190545024e-05, + "loss": 0.572, + "step": 2249 + }, + { + "epoch": 0.6239600665557404, + "grad_norm": 0.18850034475326538, + "learning_rate": 1.9778352824911356e-05, + "loss": 0.5358, + "step": 2250 + }, + { + "epoch": 0.6242373821408763, + "grad_norm": 0.22748717665672302, + "learning_rate": 1.9773863039078217e-05, + "loss": 0.5569, + "step": 2251 + }, + { + "epoch": 0.6245146977260122, + "grad_norm": 0.1906166821718216, + "learning_rate": 1.976937183392168e-05, + "loss": 0.5226, + "step": 2252 + }, + { + "epoch": 0.6247920133111481, + "grad_norm": 0.1964375227689743, + "learning_rate": 1.9764879210318098e-05, + "loss": 0.5467, + "step": 2253 + }, + { + "epoch": 0.625069328896284, + "grad_norm": 0.27289459109306335, + "learning_rate": 1.9760385169144108e-05, + "loss": 0.5597, + "step": 2254 + }, + { + "epoch": 0.6253466444814199, + "grad_norm": 0.1879124641418457, + "learning_rate": 1.9755889711276603e-05, + "loss": 0.5509, + "step": 2255 + }, + { + "epoch": 0.6256239600665557, + "grad_norm": 0.1970473676919937, + "learning_rate": 1.9751392837592782e-05, + "loss": 0.53, + "step": 2256 + }, + { + "epoch": 0.6259012756516916, + "grad_norm": 0.20250020921230316, + "learning_rate": 1.9746894548970092e-05, + "loss": 0.5458, + "step": 2257 + }, + { + "epoch": 0.6261785912368275, + "grad_norm": 0.20085811614990234, + "learning_rate": 1.9742394846286277e-05, + "loss": 0.5483, + "step": 2258 + }, + { + "epoch": 0.6264559068219634, + "grad_norm": 0.21487122774124146, + "learning_rate": 1.9737893730419337e-05, + "loss": 0.5522, + "step": 2259 + }, + { + "epoch": 0.6267332224070993, + "grad_norm": 0.1989215910434723, + "learning_rate": 1.9733391202247577e-05, + "loss": 0.5269, + "step": 2260 + }, + { + "epoch": 0.6270105379922352, + "grad_norm": 0.1842491775751114, + "learning_rate": 1.9728887262649536e-05, + "loss": 0.543, + "step": 2261 + }, + { + "epoch": 0.627287853577371, + "grad_norm": 0.1977192461490631, + "learning_rate": 1.972438191250407e-05, + "loss": 0.5457, + "step": 2262 + }, + { + "epoch": 0.6275651691625069, + "grad_norm": 0.18922410905361176, + "learning_rate": 1.9719875152690288e-05, + "loss": 0.5508, + "step": 2263 + }, + { + "epoch": 0.6278424847476428, + "grad_norm": 0.19666342437267303, + "learning_rate": 1.9715366984087575e-05, + "loss": 0.5359, + "step": 2264 + }, + { + "epoch": 0.6281198003327787, + "grad_norm": 0.19078543782234192, + "learning_rate": 1.9710857407575595e-05, + "loss": 0.5349, + "step": 2265 + }, + { + "epoch": 0.6283971159179146, + "grad_norm": 0.18407784402370453, + "learning_rate": 1.970634642403429e-05, + "loss": 0.5554, + "step": 2266 + }, + { + "epoch": 0.6286744315030505, + "grad_norm": 0.1921215057373047, + "learning_rate": 1.9701834034343864e-05, + "loss": 0.583, + "step": 2267 + }, + { + "epoch": 0.6289517470881864, + "grad_norm": 0.2015913873910904, + "learning_rate": 1.969732023938481e-05, + "loss": 0.5656, + "step": 2268 + }, + { + "epoch": 0.6292290626733222, + "grad_norm": 0.19959089159965515, + "learning_rate": 1.969280504003789e-05, + "loss": 0.5404, + "step": 2269 + }, + { + "epoch": 0.6295063782584581, + "grad_norm": 0.18149082362651825, + "learning_rate": 1.968828843718414e-05, + "loss": 0.5716, + "step": 2270 + }, + { + "epoch": 0.629783693843594, + "grad_norm": 0.20897513628005981, + "learning_rate": 1.9683770431704857e-05, + "loss": 0.5566, + "step": 2271 + }, + { + "epoch": 0.6300610094287299, + "grad_norm": 0.18247532844543457, + "learning_rate": 1.9679251024481636e-05, + "loss": 0.5463, + "step": 2272 + }, + { + "epoch": 0.6303383250138658, + "grad_norm": 0.18694794178009033, + "learning_rate": 1.9674730216396334e-05, + "loss": 0.5611, + "step": 2273 + }, + { + "epoch": 0.6306156405990017, + "grad_norm": 0.1819789707660675, + "learning_rate": 1.9670208008331073e-05, + "loss": 0.5336, + "step": 2274 + }, + { + "epoch": 0.6308929561841375, + "grad_norm": 0.18136881291866302, + "learning_rate": 1.9665684401168258e-05, + "loss": 0.5496, + "step": 2275 + }, + { + "epoch": 0.6311702717692734, + "grad_norm": 0.1916753500699997, + "learning_rate": 1.9661159395790563e-05, + "loss": 0.56, + "step": 2276 + }, + { + "epoch": 0.6314475873544093, + "grad_norm": 0.18821988999843597, + "learning_rate": 1.965663299308094e-05, + "loss": 0.5507, + "step": 2277 + }, + { + "epoch": 0.6317249029395452, + "grad_norm": 0.2045837789773941, + "learning_rate": 1.965210519392261e-05, + "loss": 0.5495, + "step": 2278 + }, + { + "epoch": 0.6320022185246811, + "grad_norm": 0.17228901386260986, + "learning_rate": 1.964757599919907e-05, + "loss": 0.5505, + "step": 2279 + }, + { + "epoch": 0.632279534109817, + "grad_norm": 0.19306735694408417, + "learning_rate": 1.9643045409794074e-05, + "loss": 0.5257, + "step": 2280 + }, + { + "epoch": 0.6325568496949528, + "grad_norm": 0.19630911946296692, + "learning_rate": 1.9638513426591668e-05, + "loss": 0.5324, + "step": 2281 + }, + { + "epoch": 0.6328341652800887, + "grad_norm": 0.19862103462219238, + "learning_rate": 1.9633980050476164e-05, + "loss": 0.5556, + "step": 2282 + }, + { + "epoch": 0.6331114808652246, + "grad_norm": 0.20335890352725983, + "learning_rate": 1.9629445282332136e-05, + "loss": 0.5583, + "step": 2283 + }, + { + "epoch": 0.6333887964503605, + "grad_norm": 0.19751910865306854, + "learning_rate": 1.9624909123044448e-05, + "loss": 0.5844, + "step": 2284 + }, + { + "epoch": 0.6336661120354964, + "grad_norm": 0.18284855782985687, + "learning_rate": 1.9620371573498212e-05, + "loss": 0.5393, + "step": 2285 + }, + { + "epoch": 0.6339434276206323, + "grad_norm": 0.18531352281570435, + "learning_rate": 1.961583263457884e-05, + "loss": 0.5591, + "step": 2286 + }, + { + "epoch": 0.6342207432057682, + "grad_norm": 0.18705891072750092, + "learning_rate": 1.9611292307171987e-05, + "loss": 0.5309, + "step": 2287 + }, + { + "epoch": 0.634498058790904, + "grad_norm": 0.18091407418251038, + "learning_rate": 1.9606750592163593e-05, + "loss": 0.5041, + "step": 2288 + }, + { + "epoch": 0.6347753743760399, + "grad_norm": 0.19142916798591614, + "learning_rate": 1.960220749043987e-05, + "loss": 0.5601, + "step": 2289 + }, + { + "epoch": 0.6350526899611758, + "grad_norm": 0.18897870182991028, + "learning_rate": 1.9597663002887294e-05, + "loss": 0.5541, + "step": 2290 + }, + { + "epoch": 0.6353300055463117, + "grad_norm": 0.19178354740142822, + "learning_rate": 1.959311713039262e-05, + "loss": 0.5217, + "step": 2291 + }, + { + "epoch": 0.6356073211314476, + "grad_norm": 0.18749533593654633, + "learning_rate": 1.9588569873842864e-05, + "loss": 0.5291, + "step": 2292 + }, + { + "epoch": 0.6358846367165835, + "grad_norm": 0.20121093094348907, + "learning_rate": 1.9584021234125323e-05, + "loss": 0.5414, + "step": 2293 + }, + { + "epoch": 0.6361619523017193, + "grad_norm": 0.18628259003162384, + "learning_rate": 1.957947121212754e-05, + "loss": 0.511, + "step": 2294 + }, + { + "epoch": 0.6364392678868552, + "grad_norm": 0.18616369366645813, + "learning_rate": 1.9574919808737364e-05, + "loss": 0.5496, + "step": 2295 + }, + { + "epoch": 0.6367165834719911, + "grad_norm": 0.20249204337596893, + "learning_rate": 1.9570367024842888e-05, + "loss": 0.5585, + "step": 2296 + }, + { + "epoch": 0.636993899057127, + "grad_norm": 0.23987984657287598, + "learning_rate": 1.9565812861332477e-05, + "loss": 0.5659, + "step": 2297 + }, + { + "epoch": 0.6372712146422629, + "grad_norm": 0.1872101128101349, + "learning_rate": 1.956125731909477e-05, + "loss": 0.5419, + "step": 2298 + }, + { + "epoch": 0.6375485302273988, + "grad_norm": 0.18463543057441711, + "learning_rate": 1.955670039901868e-05, + "loss": 0.5302, + "step": 2299 + }, + { + "epoch": 0.6378258458125347, + "grad_norm": 0.20480939745903015, + "learning_rate": 1.955214210199338e-05, + "loss": 0.5343, + "step": 2300 + }, + { + "epoch": 0.6381031613976705, + "grad_norm": 0.18215136229991913, + "learning_rate": 1.9547582428908306e-05, + "loss": 0.546, + "step": 2301 + }, + { + "epoch": 0.6383804769828064, + "grad_norm": 0.17943714559078217, + "learning_rate": 1.954302138065318e-05, + "loss": 0.5334, + "step": 2302 + }, + { + "epoch": 0.6386577925679423, + "grad_norm": 0.19718489050865173, + "learning_rate": 1.9538458958117982e-05, + "loss": 0.5621, + "step": 2303 + }, + { + "epoch": 0.6389351081530782, + "grad_norm": 0.19121624529361725, + "learning_rate": 1.953389516219296e-05, + "loss": 0.5523, + "step": 2304 + }, + { + "epoch": 0.6392124237382141, + "grad_norm": 0.1928092986345291, + "learning_rate": 1.9529329993768634e-05, + "loss": 0.5455, + "step": 2305 + }, + { + "epoch": 0.63948973932335, + "grad_norm": 0.1785450428724289, + "learning_rate": 1.952476345373579e-05, + "loss": 0.5643, + "step": 2306 + }, + { + "epoch": 0.6397670549084858, + "grad_norm": 0.17965517938137054, + "learning_rate": 1.9520195542985476e-05, + "loss": 0.5266, + "step": 2307 + }, + { + "epoch": 0.6400443704936217, + "grad_norm": 0.21014821529388428, + "learning_rate": 1.9515626262409016e-05, + "loss": 0.5327, + "step": 2308 + }, + { + "epoch": 0.6403216860787576, + "grad_norm": 0.18984338641166687, + "learning_rate": 1.951105561289799e-05, + "loss": 0.5678, + "step": 2309 + }, + { + "epoch": 0.6405990016638935, + "grad_norm": 0.18631823360919952, + "learning_rate": 1.9506483595344267e-05, + "loss": 0.5469, + "step": 2310 + }, + { + "epoch": 0.6408763172490294, + "grad_norm": 0.18727704882621765, + "learning_rate": 1.9501910210639958e-05, + "loss": 0.5657, + "step": 2311 + }, + { + "epoch": 0.6411536328341653, + "grad_norm": 0.17807155847549438, + "learning_rate": 1.9497335459677458e-05, + "loss": 0.5044, + "step": 2312 + }, + { + "epoch": 0.6414309484193012, + "grad_norm": 0.18843533098697662, + "learning_rate": 1.9492759343349415e-05, + "loss": 0.553, + "step": 2313 + }, + { + "epoch": 0.641708264004437, + "grad_norm": 0.1941610723733902, + "learning_rate": 1.9488181862548753e-05, + "loss": 0.587, + "step": 2314 + }, + { + "epoch": 0.6419855795895729, + "grad_norm": 0.1894078403711319, + "learning_rate": 1.9483603018168666e-05, + "loss": 0.5285, + "step": 2315 + }, + { + "epoch": 0.6422628951747088, + "grad_norm": 0.19420726597309113, + "learning_rate": 1.9479022811102604e-05, + "loss": 0.5302, + "step": 2316 + }, + { + "epoch": 0.6425402107598447, + "grad_norm": 0.205157071352005, + "learning_rate": 1.9474441242244284e-05, + "loss": 0.5539, + "step": 2317 + }, + { + "epoch": 0.6428175263449806, + "grad_norm": 0.1943119317293167, + "learning_rate": 1.9469858312487693e-05, + "loss": 0.5465, + "step": 2318 + }, + { + "epoch": 0.6430948419301165, + "grad_norm": 0.19968454539775848, + "learning_rate": 1.946527402272708e-05, + "loss": 0.5438, + "step": 2319 + }, + { + "epoch": 0.6433721575152523, + "grad_norm": 0.18584848940372467, + "learning_rate": 1.9460688373856967e-05, + "loss": 0.546, + "step": 2320 + }, + { + "epoch": 0.6436494731003882, + "grad_norm": 0.17821067571640015, + "learning_rate": 1.945610136677213e-05, + "loss": 0.5148, + "step": 2321 + }, + { + "epoch": 0.6439267886855241, + "grad_norm": 0.19228345155715942, + "learning_rate": 1.945151300236762e-05, + "loss": 0.5368, + "step": 2322 + }, + { + "epoch": 0.64420410427066, + "grad_norm": 0.18330131471157074, + "learning_rate": 1.9446923281538747e-05, + "loss": 0.5611, + "step": 2323 + }, + { + "epoch": 0.6444814198557959, + "grad_norm": 0.18893574178218842, + "learning_rate": 1.9442332205181086e-05, + "loss": 0.54, + "step": 2324 + }, + { + "epoch": 0.6447587354409318, + "grad_norm": 0.19229231774806976, + "learning_rate": 1.943773977419047e-05, + "loss": 0.5236, + "step": 2325 + }, + { + "epoch": 0.6450360510260676, + "grad_norm": 0.20103448629379272, + "learning_rate": 1.9433145989463027e-05, + "loss": 0.554, + "step": 2326 + }, + { + "epoch": 0.6453133666112035, + "grad_norm": 0.1895090788602829, + "learning_rate": 1.9428550851895098e-05, + "loss": 0.5676, + "step": 2327 + }, + { + "epoch": 0.6455906821963394, + "grad_norm": 0.18887649476528168, + "learning_rate": 1.9423954362383334e-05, + "loss": 0.5601, + "step": 2328 + }, + { + "epoch": 0.6458679977814753, + "grad_norm": 0.2108272910118103, + "learning_rate": 1.941935652182463e-05, + "loss": 0.5748, + "step": 2329 + }, + { + "epoch": 0.6461453133666112, + "grad_norm": 0.20968154072761536, + "learning_rate": 1.941475733111614e-05, + "loss": 0.5306, + "step": 2330 + }, + { + "epoch": 0.6464226289517471, + "grad_norm": 0.18780824542045593, + "learning_rate": 1.9410156791155297e-05, + "loss": 0.5326, + "step": 2331 + }, + { + "epoch": 0.646699944536883, + "grad_norm": 0.19030767679214478, + "learning_rate": 1.9405554902839778e-05, + "loss": 0.5533, + "step": 2332 + }, + { + "epoch": 0.6469772601220188, + "grad_norm": 0.1849377304315567, + "learning_rate": 1.9400951667067542e-05, + "loss": 0.571, + "step": 2333 + }, + { + "epoch": 0.6472545757071547, + "grad_norm": 0.1965888887643814, + "learning_rate": 1.9396347084736794e-05, + "loss": 0.5591, + "step": 2334 + }, + { + "epoch": 0.6475318912922906, + "grad_norm": 0.1921495646238327, + "learning_rate": 1.9391741156746013e-05, + "loss": 0.5723, + "step": 2335 + }, + { + "epoch": 0.6478092068774265, + "grad_norm": 0.20073464512825012, + "learning_rate": 1.9387133883993948e-05, + "loss": 0.535, + "step": 2336 + }, + { + "epoch": 0.6480865224625624, + "grad_norm": 0.18834145367145538, + "learning_rate": 1.938252526737958e-05, + "loss": 0.5256, + "step": 2337 + }, + { + "epoch": 0.6483638380476983, + "grad_norm": 0.1929401457309723, + "learning_rate": 1.9377915307802192e-05, + "loss": 0.5204, + "step": 2338 + }, + { + "epoch": 0.6486411536328341, + "grad_norm": 0.19703806936740875, + "learning_rate": 1.9373304006161298e-05, + "loss": 0.5603, + "step": 2339 + }, + { + "epoch": 0.64891846921797, + "grad_norm": 0.205661803483963, + "learning_rate": 1.9368691363356682e-05, + "loss": 0.522, + "step": 2340 + }, + { + "epoch": 0.6491957848031059, + "grad_norm": 0.19013790786266327, + "learning_rate": 1.9364077380288408e-05, + "loss": 0.549, + "step": 2341 + }, + { + "epoch": 0.6494731003882418, + "grad_norm": 0.18775691092014313, + "learning_rate": 1.935946205785677e-05, + "loss": 0.5389, + "step": 2342 + }, + { + "epoch": 0.6497504159733777, + "grad_norm": 0.18783038854599, + "learning_rate": 1.9354845396962353e-05, + "loss": 0.5378, + "step": 2343 + }, + { + "epoch": 0.6500277315585136, + "grad_norm": 0.19113753736019135, + "learning_rate": 1.9350227398505976e-05, + "loss": 0.5461, + "step": 2344 + }, + { + "epoch": 0.6503050471436495, + "grad_norm": 0.17977707087993622, + "learning_rate": 1.9345608063388742e-05, + "loss": 0.5512, + "step": 2345 + }, + { + "epoch": 0.6505823627287853, + "grad_norm": 0.20450885593891144, + "learning_rate": 1.9340987392512006e-05, + "loss": 0.5501, + "step": 2346 + }, + { + "epoch": 0.6508596783139212, + "grad_norm": 0.2484101504087448, + "learning_rate": 1.9336365386777376e-05, + "loss": 0.5223, + "step": 2347 + }, + { + "epoch": 0.6511369938990571, + "grad_norm": 0.20386487245559692, + "learning_rate": 1.9331742047086743e-05, + "loss": 0.5262, + "step": 2348 + }, + { + "epoch": 0.651414309484193, + "grad_norm": 0.18846935033798218, + "learning_rate": 1.9327117374342223e-05, + "loss": 0.5347, + "step": 2349 + }, + { + "epoch": 0.6516916250693289, + "grad_norm": 0.18970006704330444, + "learning_rate": 1.932249136944623e-05, + "loss": 0.5683, + "step": 2350 + }, + { + "epoch": 0.6519689406544648, + "grad_norm": 0.1962558776140213, + "learning_rate": 1.9317864033301407e-05, + "loss": 0.5791, + "step": 2351 + }, + { + "epoch": 0.6522462562396006, + "grad_norm": 0.20231659710407257, + "learning_rate": 1.9313235366810676e-05, + "loss": 0.5627, + "step": 2352 + }, + { + "epoch": 0.6525235718247365, + "grad_norm": 0.1958416849374771, + "learning_rate": 1.9308605370877215e-05, + "loss": 0.5375, + "step": 2353 + }, + { + "epoch": 0.6528008874098724, + "grad_norm": 0.20812073349952698, + "learning_rate": 1.9303974046404455e-05, + "loss": 0.5299, + "step": 2354 + }, + { + "epoch": 0.6530782029950083, + "grad_norm": 0.1921248584985733, + "learning_rate": 1.929934139429609e-05, + "loss": 0.5544, + "step": 2355 + }, + { + "epoch": 0.6533555185801442, + "grad_norm": 0.20390520989894867, + "learning_rate": 1.929470741545607e-05, + "loss": 0.581, + "step": 2356 + }, + { + "epoch": 0.6536328341652801, + "grad_norm": 0.18480364978313446, + "learning_rate": 1.9290072110788616e-05, + "loss": 0.5599, + "step": 2357 + }, + { + "epoch": 0.653910149750416, + "grad_norm": 0.1953095942735672, + "learning_rate": 1.928543548119819e-05, + "loss": 0.5961, + "step": 2358 + }, + { + "epoch": 0.6541874653355518, + "grad_norm": 0.19727711379528046, + "learning_rate": 1.9280797527589527e-05, + "loss": 0.5585, + "step": 2359 + }, + { + "epoch": 0.6544647809206877, + "grad_norm": 0.19928644597530365, + "learning_rate": 1.927615825086761e-05, + "loss": 0.5522, + "step": 2360 + }, + { + "epoch": 0.6547420965058236, + "grad_norm": 0.1891396939754486, + "learning_rate": 1.9271517651937688e-05, + "loss": 0.5586, + "step": 2361 + }, + { + "epoch": 0.6550194120909595, + "grad_norm": 0.1949121057987213, + "learning_rate": 1.9266875731705266e-05, + "loss": 0.5307, + "step": 2362 + }, + { + "epoch": 0.6552967276760954, + "grad_norm": 0.21069341897964478, + "learning_rate": 1.9262232491076104e-05, + "loss": 0.5546, + "step": 2363 + }, + { + "epoch": 0.6555740432612313, + "grad_norm": 0.18490912020206451, + "learning_rate": 1.925758793095622e-05, + "loss": 0.5613, + "step": 2364 + }, + { + "epoch": 0.6558513588463671, + "grad_norm": 0.1878899782896042, + "learning_rate": 1.9252942052251892e-05, + "loss": 0.5497, + "step": 2365 + }, + { + "epoch": 0.656128674431503, + "grad_norm": 0.19967246055603027, + "learning_rate": 1.9248294855869653e-05, + "loss": 0.5327, + "step": 2366 + }, + { + "epoch": 0.6564059900166389, + "grad_norm": 0.19277790188789368, + "learning_rate": 1.9243646342716296e-05, + "loss": 0.5545, + "step": 2367 + }, + { + "epoch": 0.6566833056017748, + "grad_norm": 0.19045887887477875, + "learning_rate": 1.9238996513698864e-05, + "loss": 0.5071, + "step": 2368 + }, + { + "epoch": 0.6569606211869107, + "grad_norm": 0.19380688667297363, + "learning_rate": 1.923434536972467e-05, + "loss": 0.5437, + "step": 2369 + }, + { + "epoch": 0.6572379367720466, + "grad_norm": 0.18202729523181915, + "learning_rate": 1.9229692911701275e-05, + "loss": 0.5356, + "step": 2370 + }, + { + "epoch": 0.6575152523571824, + "grad_norm": 0.18908429145812988, + "learning_rate": 1.9225039140536488e-05, + "loss": 0.5526, + "step": 2371 + }, + { + "epoch": 0.6577925679423183, + "grad_norm": 0.21290616691112518, + "learning_rate": 1.9220384057138386e-05, + "loss": 0.5306, + "step": 2372 + }, + { + "epoch": 0.6580698835274542, + "grad_norm": 0.18612539768218994, + "learning_rate": 1.9215727662415303e-05, + "loss": 0.5387, + "step": 2373 + }, + { + "epoch": 0.6583471991125901, + "grad_norm": 0.182894766330719, + "learning_rate": 1.9211069957275822e-05, + "loss": 0.5365, + "step": 2374 + }, + { + "epoch": 0.658624514697726, + "grad_norm": 0.1848146617412567, + "learning_rate": 1.920641094262879e-05, + "loss": 0.5423, + "step": 2375 + }, + { + "epoch": 0.6589018302828619, + "grad_norm": 0.18521788716316223, + "learning_rate": 1.92017506193833e-05, + "loss": 0.5534, + "step": 2376 + }, + { + "epoch": 0.6591791458679978, + "grad_norm": 0.1895231306552887, + "learning_rate": 1.9197088988448703e-05, + "loss": 0.5447, + "step": 2377 + }, + { + "epoch": 0.6594564614531336, + "grad_norm": 0.18669599294662476, + "learning_rate": 1.9192426050734608e-05, + "loss": 0.5446, + "step": 2378 + }, + { + "epoch": 0.6597337770382695, + "grad_norm": 0.1867615282535553, + "learning_rate": 1.9187761807150878e-05, + "loss": 0.5269, + "step": 2379 + }, + { + "epoch": 0.6600110926234054, + "grad_norm": 0.19190800189971924, + "learning_rate": 1.918309625860763e-05, + "loss": 0.5833, + "step": 2380 + }, + { + "epoch": 0.6602884082085413, + "grad_norm": 0.18087397515773773, + "learning_rate": 1.917842940601524e-05, + "loss": 0.5379, + "step": 2381 + }, + { + "epoch": 0.6605657237936772, + "grad_norm": 0.18190105259418488, + "learning_rate": 1.9173761250284324e-05, + "loss": 0.5489, + "step": 2382 + }, + { + "epoch": 0.6608430393788131, + "grad_norm": 0.19485372304916382, + "learning_rate": 1.9169091792325777e-05, + "loss": 0.5687, + "step": 2383 + }, + { + "epoch": 0.6611203549639489, + "grad_norm": 0.19502972066402435, + "learning_rate": 1.9164421033050724e-05, + "loss": 0.5445, + "step": 2384 + }, + { + "epoch": 0.6613976705490848, + "grad_norm": 0.18996240198612213, + "learning_rate": 1.915974897337056e-05, + "loss": 0.5621, + "step": 2385 + }, + { + "epoch": 0.6616749861342207, + "grad_norm": 0.19751591980457306, + "learning_rate": 1.915507561419692e-05, + "loss": 0.5468, + "step": 2386 + }, + { + "epoch": 0.6619523017193566, + "grad_norm": 0.2202579826116562, + "learning_rate": 1.915040095644171e-05, + "loss": 0.5515, + "step": 2387 + }, + { + "epoch": 0.6622296173044925, + "grad_norm": 0.1802307367324829, + "learning_rate": 1.914572500101707e-05, + "loss": 0.5449, + "step": 2388 + }, + { + "epoch": 0.6625069328896284, + "grad_norm": 0.18632298707962036, + "learning_rate": 1.914104774883541e-05, + "loss": 0.5612, + "step": 2389 + }, + { + "epoch": 0.6627842484747642, + "grad_norm": 0.19929082691669464, + "learning_rate": 1.9136369200809378e-05, + "loss": 0.5789, + "step": 2390 + }, + { + "epoch": 0.6630615640599001, + "grad_norm": 0.18650726974010468, + "learning_rate": 1.913168935785189e-05, + "loss": 0.5618, + "step": 2391 + }, + { + "epoch": 0.663338879645036, + "grad_norm": 0.1912173330783844, + "learning_rate": 1.912700822087611e-05, + "loss": 0.5487, + "step": 2392 + }, + { + "epoch": 0.6636161952301719, + "grad_norm": 0.19155828654766083, + "learning_rate": 1.912232579079544e-05, + "loss": 0.5421, + "step": 2393 + }, + { + "epoch": 0.6638935108153078, + "grad_norm": 0.21673326194286346, + "learning_rate": 1.9117642068523556e-05, + "loss": 0.5302, + "step": 2394 + }, + { + "epoch": 0.6641708264004437, + "grad_norm": 0.1941951960325241, + "learning_rate": 1.9112957054974373e-05, + "loss": 0.5558, + "step": 2395 + }, + { + "epoch": 0.6644481419855796, + "grad_norm": 0.18575075268745422, + "learning_rate": 1.9108270751062064e-05, + "loss": 0.5737, + "step": 2396 + }, + { + "epoch": 0.6647254575707154, + "grad_norm": 0.19413797557353973, + "learning_rate": 1.9103583157701046e-05, + "loss": 0.5559, + "step": 2397 + }, + { + "epoch": 0.6650027731558513, + "grad_norm": 0.19030508399009705, + "learning_rate": 1.9098894275805994e-05, + "loss": 0.5519, + "step": 2398 + }, + { + "epoch": 0.6652800887409872, + "grad_norm": 0.18635134398937225, + "learning_rate": 1.9094204106291842e-05, + "loss": 0.5203, + "step": 2399 + }, + { + "epoch": 0.6655574043261231, + "grad_norm": 0.1943938434123993, + "learning_rate": 1.908951265007375e-05, + "loss": 0.5664, + "step": 2400 + }, + { + "epoch": 0.665834719911259, + "grad_norm": 0.2069421112537384, + "learning_rate": 1.9084819908067156e-05, + "loss": 0.5561, + "step": 2401 + }, + { + "epoch": 0.6661120354963949, + "grad_norm": 0.18940883874893188, + "learning_rate": 1.9080125881187737e-05, + "loss": 0.5833, + "step": 2402 + }, + { + "epoch": 0.6663893510815307, + "grad_norm": 0.1828288733959198, + "learning_rate": 1.907543057035142e-05, + "loss": 0.5478, + "step": 2403 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.18825417757034302, + "learning_rate": 1.907073397647439e-05, + "loss": 0.5362, + "step": 2404 + }, + { + "epoch": 0.6669439822518025, + "grad_norm": 0.19599126279354095, + "learning_rate": 1.906603610047307e-05, + "loss": 0.5425, + "step": 2405 + }, + { + "epoch": 0.6672212978369384, + "grad_norm": 0.1776115894317627, + "learning_rate": 1.9061336943264145e-05, + "loss": 0.5395, + "step": 2406 + }, + { + "epoch": 0.6674986134220743, + "grad_norm": 0.20096100866794586, + "learning_rate": 1.905663650576454e-05, + "loss": 0.547, + "step": 2407 + }, + { + "epoch": 0.6677759290072102, + "grad_norm": 0.1988787204027176, + "learning_rate": 1.9051934788891443e-05, + "loss": 0.5336, + "step": 2408 + }, + { + "epoch": 0.668053244592346, + "grad_norm": 0.19204209744930267, + "learning_rate": 1.9047231793562276e-05, + "loss": 0.5449, + "step": 2409 + }, + { + "epoch": 0.6683305601774819, + "grad_norm": 0.1779128760099411, + "learning_rate": 1.904252752069472e-05, + "loss": 0.5023, + "step": 2410 + }, + { + "epoch": 0.6686078757626178, + "grad_norm": 0.2877557575702667, + "learning_rate": 1.9037821971206703e-05, + "loss": 0.5446, + "step": 2411 + }, + { + "epoch": 0.6688851913477537, + "grad_norm": 0.19600874185562134, + "learning_rate": 1.90331151460164e-05, + "loss": 0.5275, + "step": 2412 + }, + { + "epoch": 0.6691625069328896, + "grad_norm": 0.28246966004371643, + "learning_rate": 1.9028407046042246e-05, + "loss": 0.5321, + "step": 2413 + }, + { + "epoch": 0.6694398225180255, + "grad_norm": 0.1964629739522934, + "learning_rate": 1.9023697672202905e-05, + "loss": 0.5727, + "step": 2414 + }, + { + "epoch": 0.6697171381031614, + "grad_norm": 0.17945913970470428, + "learning_rate": 1.901898702541731e-05, + "loss": 0.5237, + "step": 2415 + }, + { + "epoch": 0.6699944536882972, + "grad_norm": 0.190501868724823, + "learning_rate": 1.901427510660463e-05, + "loss": 0.5491, + "step": 2416 + }, + { + "epoch": 0.6702717692734331, + "grad_norm": 0.19791793823242188, + "learning_rate": 1.9009561916684282e-05, + "loss": 0.5586, + "step": 2417 + }, + { + "epoch": 0.670549084858569, + "grad_norm": 0.18914659321308136, + "learning_rate": 1.900484745657594e-05, + "loss": 0.5302, + "step": 2418 + }, + { + "epoch": 0.6708264004437049, + "grad_norm": 0.1981426477432251, + "learning_rate": 1.9000131727199513e-05, + "loss": 0.5609, + "step": 2419 + }, + { + "epoch": 0.6711037160288408, + "grad_norm": 0.1939757615327835, + "learning_rate": 1.8995414729475165e-05, + "loss": 0.5749, + "step": 2420 + }, + { + "epoch": 0.6713810316139767, + "grad_norm": 0.19931401312351227, + "learning_rate": 1.899069646432332e-05, + "loss": 0.547, + "step": 2421 + }, + { + "epoch": 0.6716583471991125, + "grad_norm": 0.19219137728214264, + "learning_rate": 1.898597693266462e-05, + "loss": 0.5426, + "step": 2422 + }, + { + "epoch": 0.6719356627842484, + "grad_norm": 0.199588343501091, + "learning_rate": 1.898125613541998e-05, + "loss": 0.5411, + "step": 2423 + }, + { + "epoch": 0.6722129783693843, + "grad_norm": 0.19906532764434814, + "learning_rate": 1.897653407351055e-05, + "loss": 0.5707, + "step": 2424 + }, + { + "epoch": 0.6724902939545202, + "grad_norm": 0.18722088634967804, + "learning_rate": 1.8971810747857726e-05, + "loss": 0.5623, + "step": 2425 + }, + { + "epoch": 0.6727676095396561, + "grad_norm": 0.18101942539215088, + "learning_rate": 1.8967086159383162e-05, + "loss": 0.5519, + "step": 2426 + }, + { + "epoch": 0.673044925124792, + "grad_norm": 0.19272929430007935, + "learning_rate": 1.8962360309008746e-05, + "loss": 0.5413, + "step": 2427 + }, + { + "epoch": 0.6733222407099279, + "grad_norm": 0.19695578515529633, + "learning_rate": 1.8957633197656615e-05, + "loss": 0.5299, + "step": 2428 + }, + { + "epoch": 0.6735995562950637, + "grad_norm": 0.19021473824977875, + "learning_rate": 1.8952904826249158e-05, + "loss": 0.5453, + "step": 2429 + }, + { + "epoch": 0.6738768718801996, + "grad_norm": 0.1905011683702469, + "learning_rate": 1.8948175195709e-05, + "loss": 0.5458, + "step": 2430 + }, + { + "epoch": 0.6741541874653355, + "grad_norm": 0.1979636549949646, + "learning_rate": 1.8943444306959017e-05, + "loss": 0.5379, + "step": 2431 + }, + { + "epoch": 0.6744315030504714, + "grad_norm": 0.19846384227275848, + "learning_rate": 1.8938712160922343e-05, + "loss": 0.5164, + "step": 2432 + }, + { + "epoch": 0.6747088186356073, + "grad_norm": 0.18356280028820038, + "learning_rate": 1.893397875852233e-05, + "loss": 0.5429, + "step": 2433 + }, + { + "epoch": 0.6749861342207432, + "grad_norm": 0.1836164891719818, + "learning_rate": 1.8929244100682597e-05, + "loss": 0.5569, + "step": 2434 + }, + { + "epoch": 0.675263449805879, + "grad_norm": 0.20087088644504547, + "learning_rate": 1.8924508188327e-05, + "loss": 0.5676, + "step": 2435 + }, + { + "epoch": 0.6755407653910149, + "grad_norm": 0.1871204376220703, + "learning_rate": 1.891977102237964e-05, + "loss": 0.5314, + "step": 2436 + }, + { + "epoch": 0.6758180809761508, + "grad_norm": 0.19372668862342834, + "learning_rate": 1.891503260376487e-05, + "loss": 0.543, + "step": 2437 + }, + { + "epoch": 0.6760953965612867, + "grad_norm": 0.20033282041549683, + "learning_rate": 1.891029293340727e-05, + "loss": 0.5395, + "step": 2438 + }, + { + "epoch": 0.6763727121464226, + "grad_norm": 0.1941455453634262, + "learning_rate": 1.8905552012231684e-05, + "loss": 0.5775, + "step": 2439 + }, + { + "epoch": 0.6766500277315585, + "grad_norm": 0.18717962503433228, + "learning_rate": 1.890080984116319e-05, + "loss": 0.5305, + "step": 2440 + }, + { + "epoch": 0.6769273433166944, + "grad_norm": 0.1877082884311676, + "learning_rate": 1.8896066421127106e-05, + "loss": 0.5438, + "step": 2441 + }, + { + "epoch": 0.6772046589018302, + "grad_norm": 0.19558964669704437, + "learning_rate": 1.8891321753049008e-05, + "loss": 0.5452, + "step": 2442 + }, + { + "epoch": 0.6774819744869661, + "grad_norm": 0.19550803303718567, + "learning_rate": 1.8886575837854696e-05, + "loss": 0.5409, + "step": 2443 + }, + { + "epoch": 0.677759290072102, + "grad_norm": 0.18728572130203247, + "learning_rate": 1.888182867647023e-05, + "loss": 0.5572, + "step": 2444 + }, + { + "epoch": 0.6780366056572379, + "grad_norm": 0.184623122215271, + "learning_rate": 1.8877080269821906e-05, + "loss": 0.565, + "step": 2445 + }, + { + "epoch": 0.6783139212423738, + "grad_norm": 0.19215163588523865, + "learning_rate": 1.8872330618836265e-05, + "loss": 0.532, + "step": 2446 + }, + { + "epoch": 0.6785912368275097, + "grad_norm": 0.19430597126483917, + "learning_rate": 1.886757972444009e-05, + "loss": 0.5409, + "step": 2447 + }, + { + "epoch": 0.6788685524126455, + "grad_norm": 0.1996561586856842, + "learning_rate": 1.88628275875604e-05, + "loss": 0.5514, + "step": 2448 + }, + { + "epoch": 0.6791458679977814, + "grad_norm": 0.20458458364009857, + "learning_rate": 1.8858074209124473e-05, + "loss": 0.5566, + "step": 2449 + }, + { + "epoch": 0.6794231835829173, + "grad_norm": 0.20288583636283875, + "learning_rate": 1.885331959005981e-05, + "loss": 0.5237, + "step": 2450 + }, + { + "epoch": 0.6797004991680532, + "grad_norm": 0.18585625290870667, + "learning_rate": 1.8848563731294172e-05, + "loss": 0.53, + "step": 2451 + }, + { + "epoch": 0.6799778147531891, + "grad_norm": 0.22308149933815002, + "learning_rate": 1.8843806633755544e-05, + "loss": 0.5513, + "step": 2452 + }, + { + "epoch": 0.680255130338325, + "grad_norm": 0.18869346380233765, + "learning_rate": 1.8839048298372165e-05, + "loss": 0.5527, + "step": 2453 + }, + { + "epoch": 0.6805324459234608, + "grad_norm": 0.1881789267063141, + "learning_rate": 1.8834288726072513e-05, + "loss": 0.5368, + "step": 2454 + }, + { + "epoch": 0.6808097615085967, + "grad_norm": 0.18957830965518951, + "learning_rate": 1.882952791778531e-05, + "loss": 0.5235, + "step": 2455 + }, + { + "epoch": 0.6810870770937326, + "grad_norm": 0.1874406933784485, + "learning_rate": 1.882476587443951e-05, + "loss": 0.5164, + "step": 2456 + }, + { + "epoch": 0.6813643926788685, + "grad_norm": 0.19134515523910522, + "learning_rate": 1.8820002596964316e-05, + "loss": 0.5606, + "step": 2457 + }, + { + "epoch": 0.6816417082640044, + "grad_norm": 0.18497633934020996, + "learning_rate": 1.881523808628917e-05, + "loss": 0.5681, + "step": 2458 + }, + { + "epoch": 0.6819190238491403, + "grad_norm": 0.18682947754859924, + "learning_rate": 1.881047234334376e-05, + "loss": 0.55, + "step": 2459 + }, + { + "epoch": 0.6821963394342762, + "grad_norm": 0.1982649862766266, + "learning_rate": 1.8805705369057993e-05, + "loss": 0.5321, + "step": 2460 + }, + { + "epoch": 0.682473655019412, + "grad_norm": 0.19017384946346283, + "learning_rate": 1.880093716436205e-05, + "loss": 0.5295, + "step": 2461 + }, + { + "epoch": 0.6827509706045479, + "grad_norm": 0.20334112644195557, + "learning_rate": 1.8796167730186322e-05, + "loss": 0.5601, + "step": 2462 + }, + { + "epoch": 0.6830282861896838, + "grad_norm": 0.1974753588438034, + "learning_rate": 1.8791397067461457e-05, + "loss": 0.5572, + "step": 2463 + }, + { + "epoch": 0.6833056017748197, + "grad_norm": 0.17885488271713257, + "learning_rate": 1.878662517711834e-05, + "loss": 0.5245, + "step": 2464 + }, + { + "epoch": 0.6835829173599556, + "grad_norm": 0.18409696221351624, + "learning_rate": 1.8781852060088083e-05, + "loss": 0.5321, + "step": 2465 + }, + { + "epoch": 0.6838602329450915, + "grad_norm": 0.19201841950416565, + "learning_rate": 1.877707771730206e-05, + "loss": 0.5563, + "step": 2466 + }, + { + "epoch": 0.6841375485302273, + "grad_norm": 0.1807066947221756, + "learning_rate": 1.8772302149691866e-05, + "loss": 0.5253, + "step": 2467 + }, + { + "epoch": 0.6844148641153632, + "grad_norm": 0.18462277948856354, + "learning_rate": 1.8767525358189343e-05, + "loss": 0.5315, + "step": 2468 + }, + { + "epoch": 0.6846921797004991, + "grad_norm": 0.18250201642513275, + "learning_rate": 1.876274734372656e-05, + "loss": 0.5383, + "step": 2469 + }, + { + "epoch": 0.684969495285635, + "grad_norm": 0.1986282765865326, + "learning_rate": 1.8757968107235853e-05, + "loss": 0.5282, + "step": 2470 + }, + { + "epoch": 0.6852468108707709, + "grad_norm": 0.26469552516937256, + "learning_rate": 1.8753187649649757e-05, + "loss": 0.5564, + "step": 2471 + }, + { + "epoch": 0.6855241264559068, + "grad_norm": 0.19594305753707886, + "learning_rate": 1.874840597190108e-05, + "loss": 0.5546, + "step": 2472 + }, + { + "epoch": 0.6858014420410427, + "grad_norm": 0.18754026293754578, + "learning_rate": 1.8743623074922843e-05, + "loss": 0.5309, + "step": 2473 + }, + { + "epoch": 0.6860787576261785, + "grad_norm": 0.1846148520708084, + "learning_rate": 1.873883895964833e-05, + "loss": 0.5288, + "step": 2474 + }, + { + "epoch": 0.6863560732113144, + "grad_norm": 0.18642264604568481, + "learning_rate": 1.873405362701104e-05, + "loss": 0.5316, + "step": 2475 + }, + { + "epoch": 0.6866333887964503, + "grad_norm": 0.20203615725040436, + "learning_rate": 1.8729267077944717e-05, + "loss": 0.5235, + "step": 2476 + }, + { + "epoch": 0.6869107043815862, + "grad_norm": 0.18540050089359283, + "learning_rate": 1.872447931338335e-05, + "loss": 0.5488, + "step": 2477 + }, + { + "epoch": 0.6871880199667221, + "grad_norm": 0.19175854325294495, + "learning_rate": 1.8719690334261148e-05, + "loss": 0.5529, + "step": 2478 + }, + { + "epoch": 0.687465335551858, + "grad_norm": 0.18168555200099945, + "learning_rate": 1.8714900141512574e-05, + "loss": 0.5119, + "step": 2479 + }, + { + "epoch": 0.687742651136994, + "grad_norm": 0.1855335384607315, + "learning_rate": 1.871010873607233e-05, + "loss": 0.5448, + "step": 2480 + }, + { + "epoch": 0.6880199667221298, + "grad_norm": 0.1834007054567337, + "learning_rate": 1.870531611887533e-05, + "loss": 0.5583, + "step": 2481 + }, + { + "epoch": 0.6882972823072657, + "grad_norm": 0.1926104575395584, + "learning_rate": 1.870052229085675e-05, + "loss": 0.5549, + "step": 2482 + }, + { + "epoch": 0.6885745978924016, + "grad_norm": 0.19106236100196838, + "learning_rate": 1.8695727252951995e-05, + "loss": 0.5146, + "step": 2483 + }, + { + "epoch": 0.6888519134775375, + "grad_norm": 0.18813811242580414, + "learning_rate": 1.8690931006096695e-05, + "loss": 0.5773, + "step": 2484 + }, + { + "epoch": 0.6891292290626734, + "grad_norm": 0.19836729764938354, + "learning_rate": 1.8686133551226735e-05, + "loss": 0.5793, + "step": 2485 + }, + { + "epoch": 0.6894065446478093, + "grad_norm": 0.1816731095314026, + "learning_rate": 1.8681334889278217e-05, + "loss": 0.5205, + "step": 2486 + }, + { + "epoch": 0.6896838602329451, + "grad_norm": 0.1861170530319214, + "learning_rate": 1.8676535021187495e-05, + "loss": 0.5589, + "step": 2487 + }, + { + "epoch": 0.689961175818081, + "grad_norm": 0.18680687248706818, + "learning_rate": 1.867173394789114e-05, + "loss": 0.5283, + "step": 2488 + }, + { + "epoch": 0.6902384914032169, + "grad_norm": 0.19478975236415863, + "learning_rate": 1.866693167032598e-05, + "loss": 0.5559, + "step": 2489 + }, + { + "epoch": 0.6905158069883528, + "grad_norm": 0.19333085417747498, + "learning_rate": 1.8662128189429058e-05, + "loss": 0.516, + "step": 2490 + }, + { + "epoch": 0.6907931225734887, + "grad_norm": 0.19102855026721954, + "learning_rate": 1.8657323506137668e-05, + "loss": 0.5587, + "step": 2491 + }, + { + "epoch": 0.6910704381586246, + "grad_norm": 0.19927440583705902, + "learning_rate": 1.8652517621389324e-05, + "loss": 0.5475, + "step": 2492 + }, + { + "epoch": 0.6913477537437605, + "grad_norm": 0.19673167169094086, + "learning_rate": 1.8647710536121784e-05, + "loss": 0.5504, + "step": 2493 + }, + { + "epoch": 0.6916250693288963, + "grad_norm": 0.19291207194328308, + "learning_rate": 1.8642902251273038e-05, + "loss": 0.555, + "step": 2494 + }, + { + "epoch": 0.6919023849140322, + "grad_norm": 0.22686271369457245, + "learning_rate": 1.863809276778131e-05, + "loss": 0.5169, + "step": 2495 + }, + { + "epoch": 0.6921797004991681, + "grad_norm": 0.19432714581489563, + "learning_rate": 1.8633282086585057e-05, + "loss": 0.5287, + "step": 2496 + }, + { + "epoch": 0.692457016084304, + "grad_norm": 0.18512091040611267, + "learning_rate": 1.8628470208622972e-05, + "loss": 0.5452, + "step": 2497 + }, + { + "epoch": 0.6927343316694399, + "grad_norm": 0.18711940944194794, + "learning_rate": 1.8623657134833976e-05, + "loss": 0.5489, + "step": 2498 + }, + { + "epoch": 0.6930116472545758, + "grad_norm": 0.19019177556037903, + "learning_rate": 1.8618842866157234e-05, + "loss": 0.5188, + "step": 2499 + }, + { + "epoch": 0.6932889628397116, + "grad_norm": 0.18888919055461884, + "learning_rate": 1.861402740353213e-05, + "loss": 0.5563, + "step": 2500 + }, + { + "epoch": 0.6935662784248475, + "grad_norm": 0.19487237930297852, + "learning_rate": 1.8609210747898293e-05, + "loss": 0.5627, + "step": 2501 + }, + { + "epoch": 0.6938435940099834, + "grad_norm": 0.18301233649253845, + "learning_rate": 1.8604392900195573e-05, + "loss": 0.5539, + "step": 2502 + }, + { + "epoch": 0.6941209095951193, + "grad_norm": 0.18450453877449036, + "learning_rate": 1.8599573861364074e-05, + "loss": 0.5238, + "step": 2503 + }, + { + "epoch": 0.6943982251802552, + "grad_norm": 0.19734638929367065, + "learning_rate": 1.8594753632344104e-05, + "loss": 0.5619, + "step": 2504 + }, + { + "epoch": 0.6946755407653911, + "grad_norm": 0.1974724531173706, + "learning_rate": 1.858993221407622e-05, + "loss": 0.5805, + "step": 2505 + }, + { + "epoch": 0.694952856350527, + "grad_norm": 0.20074540376663208, + "learning_rate": 1.858510960750122e-05, + "loss": 0.5515, + "step": 2506 + }, + { + "epoch": 0.6952301719356628, + "grad_norm": 0.18881772458553314, + "learning_rate": 1.8580285813560104e-05, + "loss": 0.5371, + "step": 2507 + }, + { + "epoch": 0.6955074875207987, + "grad_norm": 0.19910936057567596, + "learning_rate": 1.8575460833194142e-05, + "loss": 0.5288, + "step": 2508 + }, + { + "epoch": 0.6957848031059346, + "grad_norm": 0.18827465176582336, + "learning_rate": 1.8570634667344795e-05, + "loss": 0.5591, + "step": 2509 + }, + { + "epoch": 0.6960621186910705, + "grad_norm": 0.18730634450912476, + "learning_rate": 1.8565807316953796e-05, + "loss": 0.5126, + "step": 2510 + }, + { + "epoch": 0.6963394342762064, + "grad_norm": 0.18552148342132568, + "learning_rate": 1.856097878296307e-05, + "loss": 0.5208, + "step": 2511 + }, + { + "epoch": 0.6966167498613423, + "grad_norm": 0.18370574712753296, + "learning_rate": 1.8556149066314803e-05, + "loss": 0.5122, + "step": 2512 + }, + { + "epoch": 0.6968940654464781, + "grad_norm": 0.19410766661167145, + "learning_rate": 1.8551318167951403e-05, + "loss": 0.5062, + "step": 2513 + }, + { + "epoch": 0.697171381031614, + "grad_norm": 0.1899997889995575, + "learning_rate": 1.85464860888155e-05, + "loss": 0.5355, + "step": 2514 + }, + { + "epoch": 0.6974486966167499, + "grad_norm": 0.2276785969734192, + "learning_rate": 1.854165282984996e-05, + "loss": 0.5434, + "step": 2515 + }, + { + "epoch": 0.6977260122018858, + "grad_norm": 0.20629100501537323, + "learning_rate": 1.8536818391997884e-05, + "loss": 0.5434, + "step": 2516 + }, + { + "epoch": 0.6980033277870217, + "grad_norm": 0.1940404623746872, + "learning_rate": 1.8531982776202598e-05, + "loss": 0.5566, + "step": 2517 + }, + { + "epoch": 0.6982806433721576, + "grad_norm": 0.18856097757816315, + "learning_rate": 1.8527145983407658e-05, + "loss": 0.5414, + "step": 2518 + }, + { + "epoch": 0.6985579589572934, + "grad_norm": 0.18844252824783325, + "learning_rate": 1.8522308014556843e-05, + "loss": 0.5535, + "step": 2519 + }, + { + "epoch": 0.6988352745424293, + "grad_norm": 0.19054186344146729, + "learning_rate": 1.8517468870594188e-05, + "loss": 0.5436, + "step": 2520 + }, + { + "epoch": 0.6991125901275652, + "grad_norm": 0.1948595494031906, + "learning_rate": 1.8512628552463917e-05, + "loss": 0.5589, + "step": 2521 + }, + { + "epoch": 0.6993899057127011, + "grad_norm": 0.2020605355501175, + "learning_rate": 1.850778706111052e-05, + "loss": 0.5586, + "step": 2522 + }, + { + "epoch": 0.699667221297837, + "grad_norm": 0.1909698247909546, + "learning_rate": 1.8502944397478693e-05, + "loss": 0.5231, + "step": 2523 + }, + { + "epoch": 0.6999445368829729, + "grad_norm": 0.19061705470085144, + "learning_rate": 1.849810056251337e-05, + "loss": 0.5436, + "step": 2524 + }, + { + "epoch": 0.7002218524681088, + "grad_norm": 0.18788163363933563, + "learning_rate": 1.8493255557159704e-05, + "loss": 0.5223, + "step": 2525 + }, + { + "epoch": 0.7004991680532446, + "grad_norm": 0.18889035284519196, + "learning_rate": 1.8488409382363095e-05, + "loss": 0.5178, + "step": 2526 + }, + { + "epoch": 0.7007764836383805, + "grad_norm": 0.18843677639961243, + "learning_rate": 1.8483562039069157e-05, + "loss": 0.5535, + "step": 2527 + }, + { + "epoch": 0.7010537992235164, + "grad_norm": 0.195985808968544, + "learning_rate": 1.847871352822373e-05, + "loss": 0.5592, + "step": 2528 + }, + { + "epoch": 0.7013311148086523, + "grad_norm": 0.1854049116373062, + "learning_rate": 1.8473863850772897e-05, + "loss": 0.5272, + "step": 2529 + }, + { + "epoch": 0.7016084303937882, + "grad_norm": 0.17917431890964508, + "learning_rate": 1.8469013007662946e-05, + "loss": 0.5239, + "step": 2530 + }, + { + "epoch": 0.7018857459789241, + "grad_norm": 0.18059198558330536, + "learning_rate": 1.8464160999840417e-05, + "loss": 0.565, + "step": 2531 + }, + { + "epoch": 0.7021630615640599, + "grad_norm": 0.18871940672397614, + "learning_rate": 1.8459307828252052e-05, + "loss": 0.5422, + "step": 2532 + }, + { + "epoch": 0.7024403771491958, + "grad_norm": 0.1852055937051773, + "learning_rate": 1.845445349384485e-05, + "loss": 0.5211, + "step": 2533 + }, + { + "epoch": 0.7027176927343317, + "grad_norm": 0.1867264062166214, + "learning_rate": 1.8449597997566005e-05, + "loss": 0.5318, + "step": 2534 + }, + { + "epoch": 0.7029950083194676, + "grad_norm": 0.1850994974374771, + "learning_rate": 1.844474134036296e-05, + "loss": 0.5375, + "step": 2535 + }, + { + "epoch": 0.7032723239046035, + "grad_norm": 0.188653826713562, + "learning_rate": 1.8439883523183377e-05, + "loss": 0.5223, + "step": 2536 + }, + { + "epoch": 0.7035496394897394, + "grad_norm": 0.19549886882305145, + "learning_rate": 1.8435024546975142e-05, + "loss": 0.5946, + "step": 2537 + }, + { + "epoch": 0.7038269550748752, + "grad_norm": 0.19603100419044495, + "learning_rate": 1.8430164412686375e-05, + "loss": 0.5579, + "step": 2538 + }, + { + "epoch": 0.7041042706600111, + "grad_norm": 0.27463892102241516, + "learning_rate": 1.8425303121265414e-05, + "loss": 0.561, + "step": 2539 + }, + { + "epoch": 0.704381586245147, + "grad_norm": 0.1948249191045761, + "learning_rate": 1.842044067366082e-05, + "loss": 0.5715, + "step": 2540 + }, + { + "epoch": 0.7046589018302829, + "grad_norm": 0.191276416182518, + "learning_rate": 1.8415577070821398e-05, + "loss": 0.5448, + "step": 2541 + }, + { + "epoch": 0.7049362174154188, + "grad_norm": 0.19777309894561768, + "learning_rate": 1.841071231369616e-05, + "loss": 0.5333, + "step": 2542 + }, + { + "epoch": 0.7052135330005547, + "grad_norm": 0.1844799816608429, + "learning_rate": 1.8405846403234346e-05, + "loss": 0.5455, + "step": 2543 + }, + { + "epoch": 0.7054908485856906, + "grad_norm": 0.17811760306358337, + "learning_rate": 1.840097934038543e-05, + "loss": 0.5307, + "step": 2544 + }, + { + "epoch": 0.7057681641708264, + "grad_norm": 0.1874844878911972, + "learning_rate": 1.8396111126099094e-05, + "loss": 0.5755, + "step": 2545 + }, + { + "epoch": 0.7060454797559623, + "grad_norm": 0.20174047350883484, + "learning_rate": 1.839124176132527e-05, + "loss": 0.5422, + "step": 2546 + }, + { + "epoch": 0.7063227953410982, + "grad_norm": 0.1862955093383789, + "learning_rate": 1.838637124701409e-05, + "loss": 0.5469, + "step": 2547 + }, + { + "epoch": 0.7066001109262341, + "grad_norm": 0.18790222704410553, + "learning_rate": 1.8381499584115924e-05, + "loss": 0.5472, + "step": 2548 + }, + { + "epoch": 0.70687742651137, + "grad_norm": 0.17207522690296173, + "learning_rate": 1.8376626773581358e-05, + "loss": 0.5074, + "step": 2549 + }, + { + "epoch": 0.7071547420965059, + "grad_norm": 0.19320419430732727, + "learning_rate": 1.8371752816361215e-05, + "loss": 0.5416, + "step": 2550 + }, + { + "epoch": 0.7074320576816417, + "grad_norm": 0.19113574922084808, + "learning_rate": 1.8366877713406526e-05, + "loss": 0.533, + "step": 2551 + }, + { + "epoch": 0.7077093732667776, + "grad_norm": 0.19141121208667755, + "learning_rate": 1.8362001465668554e-05, + "loss": 0.5435, + "step": 2552 + }, + { + "epoch": 0.7079866888519135, + "grad_norm": 0.17392635345458984, + "learning_rate": 1.8357124074098788e-05, + "loss": 0.5126, + "step": 2553 + }, + { + "epoch": 0.7082640044370494, + "grad_norm": 0.18224339187145233, + "learning_rate": 1.8352245539648933e-05, + "loss": 0.5369, + "step": 2554 + }, + { + "epoch": 0.7085413200221853, + "grad_norm": 0.193458691239357, + "learning_rate": 1.834736586327092e-05, + "loss": 0.5581, + "step": 2555 + }, + { + "epoch": 0.7088186356073212, + "grad_norm": 0.19396451115608215, + "learning_rate": 1.8342485045916902e-05, + "loss": 0.546, + "step": 2556 + }, + { + "epoch": 0.709095951192457, + "grad_norm": 0.2662739157676697, + "learning_rate": 1.8337603088539263e-05, + "loss": 0.5557, + "step": 2557 + }, + { + "epoch": 0.7093732667775929, + "grad_norm": 0.18841521441936493, + "learning_rate": 1.8332719992090592e-05, + "loss": 0.5359, + "step": 2558 + }, + { + "epoch": 0.7096505823627288, + "grad_norm": 0.20101507008075714, + "learning_rate": 1.8327835757523716e-05, + "loss": 0.54, + "step": 2559 + }, + { + "epoch": 0.7099278979478647, + "grad_norm": 0.17808422446250916, + "learning_rate": 1.832295038579168e-05, + "loss": 0.5263, + "step": 2560 + }, + { + "epoch": 0.7102052135330006, + "grad_norm": 0.19543784856796265, + "learning_rate": 1.8318063877847747e-05, + "loss": 0.5342, + "step": 2561 + }, + { + "epoch": 0.7104825291181365, + "grad_norm": 0.1944831758737564, + "learning_rate": 1.8313176234645406e-05, + "loss": 0.4973, + "step": 2562 + }, + { + "epoch": 0.7107598447032724, + "grad_norm": 0.19557087123394012, + "learning_rate": 1.8308287457138362e-05, + "loss": 0.533, + "step": 2563 + }, + { + "epoch": 0.7110371602884082, + "grad_norm": 0.19200831651687622, + "learning_rate": 1.8303397546280547e-05, + "loss": 0.5417, + "step": 2564 + }, + { + "epoch": 0.7113144758735441, + "grad_norm": 0.1837347149848938, + "learning_rate": 1.829850650302612e-05, + "loss": 0.5594, + "step": 2565 + }, + { + "epoch": 0.71159179145868, + "grad_norm": 0.1859401911497116, + "learning_rate": 1.8293614328329437e-05, + "loss": 0.5249, + "step": 2566 + }, + { + "epoch": 0.7118691070438159, + "grad_norm": 0.18670018017292023, + "learning_rate": 1.8288721023145105e-05, + "loss": 0.5339, + "step": 2567 + }, + { + "epoch": 0.7121464226289518, + "grad_norm": 0.19364280998706818, + "learning_rate": 1.8283826588427927e-05, + "loss": 0.5729, + "step": 2568 + }, + { + "epoch": 0.7124237382140877, + "grad_norm": 0.19787278771400452, + "learning_rate": 1.827893102513295e-05, + "loss": 0.5544, + "step": 2569 + }, + { + "epoch": 0.7127010537992235, + "grad_norm": 0.19540858268737793, + "learning_rate": 1.827403433421541e-05, + "loss": 0.5233, + "step": 2570 + }, + { + "epoch": 0.7129783693843594, + "grad_norm": 0.20480972528457642, + "learning_rate": 1.8269136516630798e-05, + "loss": 0.545, + "step": 2571 + }, + { + "epoch": 0.7132556849694953, + "grad_norm": 0.20005930960178375, + "learning_rate": 1.82642375733348e-05, + "loss": 0.561, + "step": 2572 + }, + { + "epoch": 0.7135330005546312, + "grad_norm": 0.18213188648223877, + "learning_rate": 1.825933750528333e-05, + "loss": 0.5277, + "step": 2573 + }, + { + "epoch": 0.7138103161397671, + "grad_norm": 0.19505798816680908, + "learning_rate": 1.8254436313432522e-05, + "loss": 0.5283, + "step": 2574 + }, + { + "epoch": 0.714087631724903, + "grad_norm": 0.1885930299758911, + "learning_rate": 1.824953399873873e-05, + "loss": 0.5189, + "step": 2575 + }, + { + "epoch": 0.7143649473100389, + "grad_norm": 0.19202375411987305, + "learning_rate": 1.824463056215852e-05, + "loss": 0.5197, + "step": 2576 + }, + { + "epoch": 0.7146422628951747, + "grad_norm": 0.19426311552524567, + "learning_rate": 1.823972600464869e-05, + "loss": 0.5243, + "step": 2577 + }, + { + "epoch": 0.7149195784803106, + "grad_norm": 0.18350745737552643, + "learning_rate": 1.8234820327166244e-05, + "loss": 0.5283, + "step": 2578 + }, + { + "epoch": 0.7151968940654465, + "grad_norm": 0.1888923943042755, + "learning_rate": 1.822991353066841e-05, + "loss": 0.5298, + "step": 2579 + }, + { + "epoch": 0.7154742096505824, + "grad_norm": 0.19120195508003235, + "learning_rate": 1.8225005616112636e-05, + "loss": 0.5583, + "step": 2580 + }, + { + "epoch": 0.7157515252357183, + "grad_norm": 0.18390871584415436, + "learning_rate": 1.8220096584456587e-05, + "loss": 0.5174, + "step": 2581 + }, + { + "epoch": 0.7160288408208542, + "grad_norm": 0.19268232583999634, + "learning_rate": 1.8215186436658142e-05, + "loss": 0.5706, + "step": 2582 + }, + { + "epoch": 0.71630615640599, + "grad_norm": 0.2088870108127594, + "learning_rate": 1.82102751736754e-05, + "loss": 0.5467, + "step": 2583 + }, + { + "epoch": 0.7165834719911259, + "grad_norm": 0.18982863426208496, + "learning_rate": 1.8205362796466682e-05, + "loss": 0.547, + "step": 2584 + }, + { + "epoch": 0.7168607875762618, + "grad_norm": 0.20448660850524902, + "learning_rate": 1.820044930599052e-05, + "loss": 0.5523, + "step": 2585 + }, + { + "epoch": 0.7171381031613977, + "grad_norm": 0.19304388761520386, + "learning_rate": 1.8195534703205674e-05, + "loss": 0.5766, + "step": 2586 + }, + { + "epoch": 0.7174154187465336, + "grad_norm": 0.20076531171798706, + "learning_rate": 1.81906189890711e-05, + "loss": 0.5327, + "step": 2587 + }, + { + "epoch": 0.7176927343316695, + "grad_norm": 0.24745801091194153, + "learning_rate": 1.8185702164546e-05, + "loss": 0.5386, + "step": 2588 + }, + { + "epoch": 0.7179700499168054, + "grad_norm": 0.1958095282316208, + "learning_rate": 1.8180784230589758e-05, + "loss": 0.5592, + "step": 2589 + }, + { + "epoch": 0.7182473655019412, + "grad_norm": 0.1962784081697464, + "learning_rate": 1.8175865188162007e-05, + "loss": 0.5547, + "step": 2590 + }, + { + "epoch": 0.7185246810870771, + "grad_norm": 0.21214796602725983, + "learning_rate": 1.8170945038222577e-05, + "loss": 0.5789, + "step": 2591 + }, + { + "epoch": 0.718801996672213, + "grad_norm": 0.18657919764518738, + "learning_rate": 1.8166023781731523e-05, + "loss": 0.5909, + "step": 2592 + }, + { + "epoch": 0.7190793122573489, + "grad_norm": 0.1949455291032791, + "learning_rate": 1.816110141964911e-05, + "loss": 0.5804, + "step": 2593 + }, + { + "epoch": 0.7193566278424848, + "grad_norm": 0.20440496504306793, + "learning_rate": 1.8156177952935824e-05, + "loss": 0.5819, + "step": 2594 + }, + { + "epoch": 0.7196339434276207, + "grad_norm": 0.19375431537628174, + "learning_rate": 1.815125338255236e-05, + "loss": 0.5707, + "step": 2595 + }, + { + "epoch": 0.7199112590127565, + "grad_norm": 0.2573017477989197, + "learning_rate": 1.8146327709459635e-05, + "loss": 0.5622, + "step": 2596 + }, + { + "epoch": 0.7201885745978924, + "grad_norm": 0.19676506519317627, + "learning_rate": 1.8141400934618775e-05, + "loss": 0.5668, + "step": 2597 + }, + { + "epoch": 0.7204658901830283, + "grad_norm": 0.19473743438720703, + "learning_rate": 1.8136473058991126e-05, + "loss": 0.5654, + "step": 2598 + }, + { + "epoch": 0.7207432057681642, + "grad_norm": 0.18709680438041687, + "learning_rate": 1.8131544083538253e-05, + "loss": 0.5283, + "step": 2599 + }, + { + "epoch": 0.7210205213533001, + "grad_norm": 0.19263465702533722, + "learning_rate": 1.812661400922192e-05, + "loss": 0.5379, + "step": 2600 + }, + { + "epoch": 0.721297836938436, + "grad_norm": 0.19263778626918793, + "learning_rate": 1.8121682837004118e-05, + "loss": 0.5678, + "step": 2601 + }, + { + "epoch": 0.7215751525235718, + "grad_norm": 0.19861197471618652, + "learning_rate": 1.8116750567847058e-05, + "loss": 0.5456, + "step": 2602 + }, + { + "epoch": 0.7218524681087077, + "grad_norm": 0.1754927933216095, + "learning_rate": 1.8111817202713143e-05, + "loss": 0.5164, + "step": 2603 + }, + { + "epoch": 0.7221297836938436, + "grad_norm": 0.19112901389598846, + "learning_rate": 1.8106882742565008e-05, + "loss": 0.5362, + "step": 2604 + }, + { + "epoch": 0.7224070992789795, + "grad_norm": 0.1980351209640503, + "learning_rate": 1.8101947188365503e-05, + "loss": 0.5687, + "step": 2605 + }, + { + "epoch": 0.7226844148641154, + "grad_norm": 0.19299866259098053, + "learning_rate": 1.8097010541077678e-05, + "loss": 0.5589, + "step": 2606 + }, + { + "epoch": 0.7229617304492513, + "grad_norm": 0.19257931411266327, + "learning_rate": 1.809207280166481e-05, + "loss": 0.5701, + "step": 2607 + }, + { + "epoch": 0.7232390460343872, + "grad_norm": 0.1912074089050293, + "learning_rate": 1.8087133971090374e-05, + "loss": 0.544, + "step": 2608 + }, + { + "epoch": 0.723516361619523, + "grad_norm": 0.19741860032081604, + "learning_rate": 1.808219405031808e-05, + "loss": 0.5527, + "step": 2609 + }, + { + "epoch": 0.7237936772046589, + "grad_norm": 0.18676535785198212, + "learning_rate": 1.807725304031182e-05, + "loss": 0.5539, + "step": 2610 + }, + { + "epoch": 0.7240709927897948, + "grad_norm": 0.17912089824676514, + "learning_rate": 1.807231094203573e-05, + "loss": 0.5342, + "step": 2611 + }, + { + "epoch": 0.7243483083749307, + "grad_norm": 0.18593581020832062, + "learning_rate": 1.806736775645414e-05, + "loss": 0.5366, + "step": 2612 + }, + { + "epoch": 0.7246256239600666, + "grad_norm": 0.20467206835746765, + "learning_rate": 1.8062423484531592e-05, + "loss": 0.527, + "step": 2613 + }, + { + "epoch": 0.7249029395452025, + "grad_norm": 0.18463392555713654, + "learning_rate": 1.8057478127232854e-05, + "loss": 0.545, + "step": 2614 + }, + { + "epoch": 0.7251802551303383, + "grad_norm": 0.20011630654335022, + "learning_rate": 1.805253168552289e-05, + "loss": 0.5242, + "step": 2615 + }, + { + "epoch": 0.7254575707154742, + "grad_norm": 0.18936677277088165, + "learning_rate": 1.804758416036688e-05, + "loss": 0.5285, + "step": 2616 + }, + { + "epoch": 0.7257348863006101, + "grad_norm": 0.19428247213363647, + "learning_rate": 1.804263555273022e-05, + "loss": 0.5448, + "step": 2617 + }, + { + "epoch": 0.726012201885746, + "grad_norm": 0.18848338723182678, + "learning_rate": 1.8037685863578514e-05, + "loss": 0.5673, + "step": 2618 + }, + { + "epoch": 0.7262895174708819, + "grad_norm": 0.19170448184013367, + "learning_rate": 1.803273509387758e-05, + "loss": 0.5234, + "step": 2619 + }, + { + "epoch": 0.7265668330560178, + "grad_norm": 0.18249256908893585, + "learning_rate": 1.8027783244593443e-05, + "loss": 0.5377, + "step": 2620 + }, + { + "epoch": 0.7268441486411537, + "grad_norm": 0.18726296722888947, + "learning_rate": 1.8022830316692336e-05, + "loss": 0.5381, + "step": 2621 + }, + { + "epoch": 0.7271214642262895, + "grad_norm": 0.1881718784570694, + "learning_rate": 1.801787631114071e-05, + "loss": 0.5453, + "step": 2622 + }, + { + "epoch": 0.7273987798114254, + "grad_norm": 0.19469492137432098, + "learning_rate": 1.8012921228905225e-05, + "loss": 0.5231, + "step": 2623 + }, + { + "epoch": 0.7276760953965613, + "grad_norm": 0.18475371599197388, + "learning_rate": 1.8007965070952743e-05, + "loss": 0.5323, + "step": 2624 + }, + { + "epoch": 0.7279534109816972, + "grad_norm": 0.18544836342334747, + "learning_rate": 1.8003007838250343e-05, + "loss": 0.5345, + "step": 2625 + }, + { + "epoch": 0.7282307265668331, + "grad_norm": 0.19587865471839905, + "learning_rate": 1.799804953176532e-05, + "loss": 0.5168, + "step": 2626 + }, + { + "epoch": 0.728508042151969, + "grad_norm": 0.23171131312847137, + "learning_rate": 1.7993090152465163e-05, + "loss": 0.5235, + "step": 2627 + }, + { + "epoch": 0.7287853577371048, + "grad_norm": 0.19391484558582306, + "learning_rate": 1.7988129701317582e-05, + "loss": 0.5345, + "step": 2628 + }, + { + "epoch": 0.7290626733222407, + "grad_norm": 0.19040954113006592, + "learning_rate": 1.7983168179290488e-05, + "loss": 0.5226, + "step": 2629 + }, + { + "epoch": 0.7293399889073766, + "grad_norm": 0.1814422607421875, + "learning_rate": 1.797820558735201e-05, + "loss": 0.5291, + "step": 2630 + }, + { + "epoch": 0.7296173044925125, + "grad_norm": 0.19977906346321106, + "learning_rate": 1.797324192647048e-05, + "loss": 0.5641, + "step": 2631 + }, + { + "epoch": 0.7298946200776484, + "grad_norm": 0.19043037295341492, + "learning_rate": 1.796827719761444e-05, + "loss": 0.5686, + "step": 2632 + }, + { + "epoch": 0.7301719356627843, + "grad_norm": 0.19778837263584137, + "learning_rate": 1.7963311401752638e-05, + "loss": 0.5648, + "step": 2633 + }, + { + "epoch": 0.7304492512479202, + "grad_norm": 0.45009469985961914, + "learning_rate": 1.7958344539854034e-05, + "loss": 0.5244, + "step": 2634 + }, + { + "epoch": 0.730726566833056, + "grad_norm": 0.1936669498682022, + "learning_rate": 1.7953376612887793e-05, + "loss": 0.5296, + "step": 2635 + }, + { + "epoch": 0.7310038824181919, + "grad_norm": 0.19970230758190155, + "learning_rate": 1.7948407621823287e-05, + "loss": 0.5832, + "step": 2636 + }, + { + "epoch": 0.7312811980033278, + "grad_norm": 0.19142326712608337, + "learning_rate": 1.794343756763011e-05, + "loss": 0.5478, + "step": 2637 + }, + { + "epoch": 0.7315585135884637, + "grad_norm": 0.18312163650989532, + "learning_rate": 1.7938466451278034e-05, + "loss": 0.5382, + "step": 2638 + }, + { + "epoch": 0.7318358291735996, + "grad_norm": 0.19482283294200897, + "learning_rate": 1.793349427373707e-05, + "loss": 0.5392, + "step": 2639 + }, + { + "epoch": 0.7321131447587355, + "grad_norm": 0.20726899802684784, + "learning_rate": 1.7928521035977413e-05, + "loss": 0.5597, + "step": 2640 + }, + { + "epoch": 0.7323904603438713, + "grad_norm": 0.20082899928092957, + "learning_rate": 1.7923546738969478e-05, + "loss": 0.5293, + "step": 2641 + }, + { + "epoch": 0.7326677759290072, + "grad_norm": 0.19428935647010803, + "learning_rate": 1.791857138368388e-05, + "loss": 0.5428, + "step": 2642 + }, + { + "epoch": 0.7329450915141431, + "grad_norm": 0.19222451746463776, + "learning_rate": 1.791359497109144e-05, + "loss": 0.5417, + "step": 2643 + }, + { + "epoch": 0.733222407099279, + "grad_norm": 0.1904270201921463, + "learning_rate": 1.7908617502163188e-05, + "loss": 0.5368, + "step": 2644 + }, + { + "epoch": 0.7334997226844149, + "grad_norm": 0.1973213404417038, + "learning_rate": 1.7903638977870372e-05, + "loss": 0.5347, + "step": 2645 + }, + { + "epoch": 0.7337770382695508, + "grad_norm": 0.1838080883026123, + "learning_rate": 1.7898659399184415e-05, + "loss": 0.5239, + "step": 2646 + }, + { + "epoch": 0.7340543538546866, + "grad_norm": 0.18665340542793274, + "learning_rate": 1.7893678767076982e-05, + "loss": 0.5469, + "step": 2647 + }, + { + "epoch": 0.7343316694398225, + "grad_norm": 0.18644295632839203, + "learning_rate": 1.788869708251991e-05, + "loss": 0.5423, + "step": 2648 + }, + { + "epoch": 0.7346089850249584, + "grad_norm": 0.18912896513938904, + "learning_rate": 1.788371434648528e-05, + "loss": 0.5289, + "step": 2649 + }, + { + "epoch": 0.7348863006100943, + "grad_norm": 0.1896572709083557, + "learning_rate": 1.7878730559945327e-05, + "loss": 0.544, + "step": 2650 + }, + { + "epoch": 0.7351636161952302, + "grad_norm": 0.18456673622131348, + "learning_rate": 1.7873745723872545e-05, + "loss": 0.5437, + "step": 2651 + }, + { + "epoch": 0.7354409317803661, + "grad_norm": 0.18309368193149567, + "learning_rate": 1.7868759839239596e-05, + "loss": 0.5079, + "step": 2652 + }, + { + "epoch": 0.735718247365502, + "grad_norm": 0.18934939801692963, + "learning_rate": 1.7863772907019356e-05, + "loss": 0.5473, + "step": 2653 + }, + { + "epoch": 0.7359955629506378, + "grad_norm": 0.19800709187984467, + "learning_rate": 1.7858784928184916e-05, + "loss": 0.5386, + "step": 2654 + }, + { + "epoch": 0.7362728785357737, + "grad_norm": 0.20616304874420166, + "learning_rate": 1.7853795903709556e-05, + "loss": 0.5388, + "step": 2655 + }, + { + "epoch": 0.7365501941209096, + "grad_norm": 0.18855001032352448, + "learning_rate": 1.7848805834566768e-05, + "loss": 0.5499, + "step": 2656 + }, + { + "epoch": 0.7368275097060455, + "grad_norm": 0.18595397472381592, + "learning_rate": 1.7843814721730244e-05, + "loss": 0.5599, + "step": 2657 + }, + { + "epoch": 0.7371048252911814, + "grad_norm": 0.1929646134376526, + "learning_rate": 1.7838822566173894e-05, + "loss": 0.5412, + "step": 2658 + }, + { + "epoch": 0.7373821408763173, + "grad_norm": 0.19367007911205292, + "learning_rate": 1.7833829368871808e-05, + "loss": 0.5328, + "step": 2659 + }, + { + "epoch": 0.7376594564614531, + "grad_norm": 0.1873459815979004, + "learning_rate": 1.7828835130798296e-05, + "loss": 0.5444, + "step": 2660 + }, + { + "epoch": 0.737936772046589, + "grad_norm": 0.20193496346473694, + "learning_rate": 1.7823839852927867e-05, + "loss": 0.5287, + "step": 2661 + }, + { + "epoch": 0.7382140876317249, + "grad_norm": 0.18771541118621826, + "learning_rate": 1.7818843536235224e-05, + "loss": 0.5404, + "step": 2662 + }, + { + "epoch": 0.7384914032168608, + "grad_norm": 0.1829247921705246, + "learning_rate": 1.781384618169529e-05, + "loss": 0.5512, + "step": 2663 + }, + { + "epoch": 0.7387687188019967, + "grad_norm": 0.2635699212551117, + "learning_rate": 1.7808847790283183e-05, + "loss": 0.5678, + "step": 2664 + }, + { + "epoch": 0.7390460343871326, + "grad_norm": 0.1860508918762207, + "learning_rate": 1.780384836297421e-05, + "loss": 0.5348, + "step": 2665 + }, + { + "epoch": 0.7393233499722685, + "grad_norm": 0.18465931713581085, + "learning_rate": 1.7798847900743904e-05, + "loss": 0.5043, + "step": 2666 + }, + { + "epoch": 0.7396006655574043, + "grad_norm": 0.18881580233573914, + "learning_rate": 1.779384640456798e-05, + "loss": 0.5251, + "step": 2667 + }, + { + "epoch": 0.7398779811425402, + "grad_norm": 0.18016085028648376, + "learning_rate": 1.7788843875422367e-05, + "loss": 0.5585, + "step": 2668 + }, + { + "epoch": 0.7401552967276761, + "grad_norm": 0.19220809638500214, + "learning_rate": 1.7783840314283183e-05, + "loss": 0.5263, + "step": 2669 + }, + { + "epoch": 0.740432612312812, + "grad_norm": 0.18954598903656006, + "learning_rate": 1.7778835722126764e-05, + "loss": 0.542, + "step": 2670 + }, + { + "epoch": 0.7407099278979479, + "grad_norm": 0.18674500286579132, + "learning_rate": 1.7773830099929635e-05, + "loss": 0.5247, + "step": 2671 + }, + { + "epoch": 0.7409872434830838, + "grad_norm": 0.18231706321239471, + "learning_rate": 1.776882344866853e-05, + "loss": 0.5115, + "step": 2672 + }, + { + "epoch": 0.7412645590682196, + "grad_norm": 0.18846355378627777, + "learning_rate": 1.776381576932037e-05, + "loss": 0.5186, + "step": 2673 + }, + { + "epoch": 0.7415418746533555, + "grad_norm": 0.18721552193164825, + "learning_rate": 1.7758807062862292e-05, + "loss": 0.5313, + "step": 2674 + }, + { + "epoch": 0.7418191902384914, + "grad_norm": 0.191980242729187, + "learning_rate": 1.775379733027163e-05, + "loss": 0.5505, + "step": 2675 + }, + { + "epoch": 0.7420965058236273, + "grad_norm": 0.18773001432418823, + "learning_rate": 1.7748786572525907e-05, + "loss": 0.5334, + "step": 2676 + }, + { + "epoch": 0.7423738214087632, + "grad_norm": 0.19638672471046448, + "learning_rate": 1.7743774790602864e-05, + "loss": 0.5718, + "step": 2677 + }, + { + "epoch": 0.7426511369938991, + "grad_norm": 0.19333893060684204, + "learning_rate": 1.7738761985480425e-05, + "loss": 0.5479, + "step": 2678 + }, + { + "epoch": 0.742928452579035, + "grad_norm": 0.20004448294639587, + "learning_rate": 1.7733748158136725e-05, + "loss": 0.5331, + "step": 2679 + }, + { + "epoch": 0.7432057681641708, + "grad_norm": 0.19896887242794037, + "learning_rate": 1.7728733309550097e-05, + "loss": 0.5484, + "step": 2680 + }, + { + "epoch": 0.7434830837493067, + "grad_norm": 0.1829969435930252, + "learning_rate": 1.7723717440699066e-05, + "loss": 0.5459, + "step": 2681 + }, + { + "epoch": 0.7437603993344426, + "grad_norm": 0.1870088130235672, + "learning_rate": 1.771870055256236e-05, + "loss": 0.5628, + "step": 2682 + }, + { + "epoch": 0.7440377149195785, + "grad_norm": 0.18835529685020447, + "learning_rate": 1.7713682646118914e-05, + "loss": 0.5439, + "step": 2683 + }, + { + "epoch": 0.7443150305047144, + "grad_norm": 0.20633459091186523, + "learning_rate": 1.7708663722347845e-05, + "loss": 0.5677, + "step": 2684 + }, + { + "epoch": 0.7445923460898503, + "grad_norm": 0.18712858855724335, + "learning_rate": 1.7703643782228488e-05, + "loss": 0.5666, + "step": 2685 + }, + { + "epoch": 0.7448696616749861, + "grad_norm": 0.18475639820098877, + "learning_rate": 1.769862282674036e-05, + "loss": 0.5548, + "step": 2686 + }, + { + "epoch": 0.745146977260122, + "grad_norm": 0.19311586022377014, + "learning_rate": 1.769360085686318e-05, + "loss": 0.5428, + "step": 2687 + }, + { + "epoch": 0.7454242928452579, + "grad_norm": 0.19635801017284393, + "learning_rate": 1.7688577873576872e-05, + "loss": 0.5228, + "step": 2688 + }, + { + "epoch": 0.7457016084303938, + "grad_norm": 0.18878091871738434, + "learning_rate": 1.7683553877861554e-05, + "loss": 0.4995, + "step": 2689 + }, + { + "epoch": 0.7459789240155297, + "grad_norm": 0.1772637963294983, + "learning_rate": 1.7678528870697537e-05, + "loss": 0.5241, + "step": 2690 + }, + { + "epoch": 0.7462562396006656, + "grad_norm": 0.18778811395168304, + "learning_rate": 1.7673502853065335e-05, + "loss": 0.5247, + "step": 2691 + }, + { + "epoch": 0.7465335551858014, + "grad_norm": 0.20334573090076447, + "learning_rate": 1.7668475825945656e-05, + "loss": 0.5369, + "step": 2692 + }, + { + "epoch": 0.7468108707709373, + "grad_norm": 0.18709522485733032, + "learning_rate": 1.766344779031941e-05, + "loss": 0.5701, + "step": 2693 + }, + { + "epoch": 0.7470881863560732, + "grad_norm": 0.18577025830745697, + "learning_rate": 1.7658418747167694e-05, + "loss": 0.5409, + "step": 2694 + }, + { + "epoch": 0.7473655019412091, + "grad_norm": 0.19616863131523132, + "learning_rate": 1.765338869747181e-05, + "loss": 0.5533, + "step": 2695 + }, + { + "epoch": 0.747642817526345, + "grad_norm": 0.20094148814678192, + "learning_rate": 1.764835764221326e-05, + "loss": 0.5252, + "step": 2696 + }, + { + "epoch": 0.7479201331114809, + "grad_norm": 0.19073578715324402, + "learning_rate": 1.7643325582373728e-05, + "loss": 0.52, + "step": 2697 + }, + { + "epoch": 0.7481974486966168, + "grad_norm": 0.19425810873508453, + "learning_rate": 1.7638292518935103e-05, + "loss": 0.5612, + "step": 2698 + }, + { + "epoch": 0.7484747642817526, + "grad_norm": 0.1896180808544159, + "learning_rate": 1.7633258452879475e-05, + "loss": 0.5504, + "step": 2699 + }, + { + "epoch": 0.7487520798668885, + "grad_norm": 0.19124028086662292, + "learning_rate": 1.762822338518912e-05, + "loss": 0.5335, + "step": 2700 + }, + { + "epoch": 0.7490293954520244, + "grad_norm": 0.19054248929023743, + "learning_rate": 1.762318731684651e-05, + "loss": 0.552, + "step": 2701 + }, + { + "epoch": 0.7493067110371603, + "grad_norm": 0.18848967552185059, + "learning_rate": 1.761815024883432e-05, + "loss": 0.557, + "step": 2702 + }, + { + "epoch": 0.7495840266222962, + "grad_norm": 0.1953321397304535, + "learning_rate": 1.7613112182135406e-05, + "loss": 0.5916, + "step": 2703 + }, + { + "epoch": 0.7498613422074321, + "grad_norm": 0.18123508989810944, + "learning_rate": 1.7608073117732848e-05, + "loss": 0.5643, + "step": 2704 + }, + { + "epoch": 0.7501386577925679, + "grad_norm": 0.19151508808135986, + "learning_rate": 1.760303305660988e-05, + "loss": 0.5512, + "step": 2705 + }, + { + "epoch": 0.7504159733777038, + "grad_norm": 0.18512828648090363, + "learning_rate": 1.7597991999749967e-05, + "loss": 0.5627, + "step": 2706 + }, + { + "epoch": 0.7506932889628397, + "grad_norm": 0.20286305248737335, + "learning_rate": 1.7592949948136737e-05, + "loss": 0.568, + "step": 2707 + }, + { + "epoch": 0.7509706045479756, + "grad_norm": 0.20783564448356628, + "learning_rate": 1.758790690275405e-05, + "loss": 0.5563, + "step": 2708 + }, + { + "epoch": 0.7512479201331115, + "grad_norm": 0.18659134209156036, + "learning_rate": 1.7582862864585913e-05, + "loss": 0.5285, + "step": 2709 + }, + { + "epoch": 0.7515252357182474, + "grad_norm": 0.17614908516407013, + "learning_rate": 1.757781783461657e-05, + "loss": 0.4952, + "step": 2710 + }, + { + "epoch": 0.7518025513033832, + "grad_norm": 0.18984104692935944, + "learning_rate": 1.757277181383043e-05, + "loss": 0.5564, + "step": 2711 + }, + { + "epoch": 0.7520798668885191, + "grad_norm": 0.19199056923389435, + "learning_rate": 1.756772480321211e-05, + "loss": 0.5815, + "step": 2712 + }, + { + "epoch": 0.752357182473655, + "grad_norm": 0.5123929381370544, + "learning_rate": 1.7562676803746414e-05, + "loss": 0.537, + "step": 2713 + }, + { + "epoch": 0.7526344980587909, + "grad_norm": 0.1854097694158554, + "learning_rate": 1.7557627816418337e-05, + "loss": 0.5275, + "step": 2714 + }, + { + "epoch": 0.7529118136439268, + "grad_norm": 0.1879512071609497, + "learning_rate": 1.755257784221308e-05, + "loss": 0.5292, + "step": 2715 + }, + { + "epoch": 0.7531891292290627, + "grad_norm": 0.18993008136749268, + "learning_rate": 1.7547526882116014e-05, + "loss": 0.5282, + "step": 2716 + }, + { + "epoch": 0.7534664448141986, + "grad_norm": 0.18528947234153748, + "learning_rate": 1.7542474937112725e-05, + "loss": 0.5457, + "step": 2717 + }, + { + "epoch": 0.7537437603993344, + "grad_norm": 0.1859760731458664, + "learning_rate": 1.753742200818898e-05, + "loss": 0.5448, + "step": 2718 + }, + { + "epoch": 0.7540210759844703, + "grad_norm": 0.17713625729084015, + "learning_rate": 1.753236809633073e-05, + "loss": 0.5479, + "step": 2719 + }, + { + "epoch": 0.7542983915696062, + "grad_norm": 0.17718899250030518, + "learning_rate": 1.7527313202524144e-05, + "loss": 0.5378, + "step": 2720 + }, + { + "epoch": 0.7545757071547421, + "grad_norm": 0.19346462190151215, + "learning_rate": 1.752225732775555e-05, + "loss": 0.552, + "step": 2721 + }, + { + "epoch": 0.754853022739878, + "grad_norm": 0.17717614769935608, + "learning_rate": 1.7517200473011488e-05, + "loss": 0.5348, + "step": 2722 + }, + { + "epoch": 0.7551303383250139, + "grad_norm": 0.1940585970878601, + "learning_rate": 1.751214263927869e-05, + "loss": 0.5405, + "step": 2723 + }, + { + "epoch": 0.7554076539101497, + "grad_norm": 0.19863441586494446, + "learning_rate": 1.7507083827544065e-05, + "loss": 0.5357, + "step": 2724 + }, + { + "epoch": 0.7556849694952856, + "grad_norm": 0.18913887441158295, + "learning_rate": 1.7502024038794727e-05, + "loss": 0.5658, + "step": 2725 + }, + { + "epoch": 0.7559622850804215, + "grad_norm": 0.18873843550682068, + "learning_rate": 1.7496963274017975e-05, + "loss": 0.5774, + "step": 2726 + }, + { + "epoch": 0.7562396006655574, + "grad_norm": 0.1888992041349411, + "learning_rate": 1.7491901534201295e-05, + "loss": 0.5319, + "step": 2727 + }, + { + "epoch": 0.7565169162506933, + "grad_norm": 0.18841078877449036, + "learning_rate": 1.7486838820332362e-05, + "loss": 0.5542, + "step": 2728 + }, + { + "epoch": 0.7567942318358292, + "grad_norm": 0.19633720815181732, + "learning_rate": 1.7481775133399057e-05, + "loss": 0.5615, + "step": 2729 + }, + { + "epoch": 0.757071547420965, + "grad_norm": 0.19098395109176636, + "learning_rate": 1.7476710474389434e-05, + "loss": 0.5642, + "step": 2730 + }, + { + "epoch": 0.7573488630061009, + "grad_norm": 0.1895277500152588, + "learning_rate": 1.747164484429174e-05, + "loss": 0.557, + "step": 2731 + }, + { + "epoch": 0.7576261785912368, + "grad_norm": 0.18626920878887177, + "learning_rate": 1.7466578244094417e-05, + "loss": 0.5314, + "step": 2732 + }, + { + "epoch": 0.7579034941763727, + "grad_norm": 0.1883586198091507, + "learning_rate": 1.746151067478609e-05, + "loss": 0.5457, + "step": 2733 + }, + { + "epoch": 0.7581808097615086, + "grad_norm": 0.18349739909172058, + "learning_rate": 1.745644213735558e-05, + "loss": 0.5467, + "step": 2734 + }, + { + "epoch": 0.7584581253466445, + "grad_norm": 0.17938470840454102, + "learning_rate": 1.7451372632791888e-05, + "loss": 0.5444, + "step": 2735 + }, + { + "epoch": 0.7587354409317804, + "grad_norm": 0.19910098612308502, + "learning_rate": 1.7446302162084215e-05, + "loss": 0.5777, + "step": 2736 + }, + { + "epoch": 0.7590127565169162, + "grad_norm": 0.20055991411209106, + "learning_rate": 1.7441230726221936e-05, + "loss": 0.5546, + "step": 2737 + }, + { + "epoch": 0.7592900721020521, + "grad_norm": 0.18521596491336823, + "learning_rate": 1.743615832619463e-05, + "loss": 0.5239, + "step": 2738 + }, + { + "epoch": 0.759567387687188, + "grad_norm": 0.19067248702049255, + "learning_rate": 1.7431084962992052e-05, + "loss": 0.5288, + "step": 2739 + }, + { + "epoch": 0.7598447032723239, + "grad_norm": 0.18803349137306213, + "learning_rate": 1.7426010637604152e-05, + "loss": 0.5397, + "step": 2740 + }, + { + "epoch": 0.7601220188574598, + "grad_norm": 0.1846192330121994, + "learning_rate": 1.7420935351021062e-05, + "loss": 0.5379, + "step": 2741 + }, + { + "epoch": 0.7603993344425957, + "grad_norm": 0.1797967106103897, + "learning_rate": 1.7415859104233108e-05, + "loss": 0.5528, + "step": 2742 + }, + { + "epoch": 0.7606766500277315, + "grad_norm": 0.18609130382537842, + "learning_rate": 1.7410781898230797e-05, + "loss": 0.5473, + "step": 2743 + }, + { + "epoch": 0.7609539656128674, + "grad_norm": 0.23831035196781158, + "learning_rate": 1.7405703734004837e-05, + "loss": 0.5114, + "step": 2744 + }, + { + "epoch": 0.7612312811980033, + "grad_norm": 0.16559574007987976, + "learning_rate": 1.74006246125461e-05, + "loss": 0.501, + "step": 2745 + }, + { + "epoch": 0.7615085967831392, + "grad_norm": 0.1880342662334442, + "learning_rate": 1.7395544534845663e-05, + "loss": 0.5344, + "step": 2746 + }, + { + "epoch": 0.7617859123682751, + "grad_norm": 0.1892349123954773, + "learning_rate": 1.7390463501894778e-05, + "loss": 0.5745, + "step": 2747 + }, + { + "epoch": 0.762063227953411, + "grad_norm": 0.2029358148574829, + "learning_rate": 1.7385381514684896e-05, + "loss": 0.5206, + "step": 2748 + }, + { + "epoch": 0.7623405435385469, + "grad_norm": 0.2009795904159546, + "learning_rate": 1.7380298574207645e-05, + "loss": 0.5765, + "step": 2749 + }, + { + "epoch": 0.7626178591236827, + "grad_norm": 0.18493135273456573, + "learning_rate": 1.737521468145484e-05, + "loss": 0.567, + "step": 2750 + }, + { + "epoch": 0.7628951747088186, + "grad_norm": 0.2014453411102295, + "learning_rate": 1.7370129837418487e-05, + "loss": 0.564, + "step": 2751 + }, + { + "epoch": 0.7631724902939545, + "grad_norm": 0.19111500680446625, + "learning_rate": 1.7365044043090766e-05, + "loss": 0.5589, + "step": 2752 + }, + { + "epoch": 0.7634498058790904, + "grad_norm": 0.19902296364307404, + "learning_rate": 1.7359957299464062e-05, + "loss": 0.5543, + "step": 2753 + }, + { + "epoch": 0.7637271214642263, + "grad_norm": 0.19618061184883118, + "learning_rate": 1.7354869607530923e-05, + "loss": 0.5639, + "step": 2754 + }, + { + "epoch": 0.7640044370493622, + "grad_norm": 0.17767266929149628, + "learning_rate": 1.7349780968284094e-05, + "loss": 0.5041, + "step": 2755 + }, + { + "epoch": 0.764281752634498, + "grad_norm": 0.19212745130062103, + "learning_rate": 1.7344691382716508e-05, + "loss": 0.5416, + "step": 2756 + }, + { + "epoch": 0.7645590682196339, + "grad_norm": 0.19896987080574036, + "learning_rate": 1.7339600851821274e-05, + "loss": 0.5505, + "step": 2757 + }, + { + "epoch": 0.7648363838047698, + "grad_norm": 0.1840539574623108, + "learning_rate": 1.7334509376591695e-05, + "loss": 0.5373, + "step": 2758 + }, + { + "epoch": 0.7651136993899057, + "grad_norm": 0.20208869874477386, + "learning_rate": 1.7329416958021247e-05, + "loss": 0.5553, + "step": 2759 + }, + { + "epoch": 0.7653910149750416, + "grad_norm": 0.18945381045341492, + "learning_rate": 1.7324323597103597e-05, + "loss": 0.536, + "step": 2760 + }, + { + "epoch": 0.7656683305601775, + "grad_norm": 0.19363372027873993, + "learning_rate": 1.7319229294832597e-05, + "loss": 0.5607, + "step": 2761 + }, + { + "epoch": 0.7659456461453134, + "grad_norm": 0.20634472370147705, + "learning_rate": 1.7314134052202272e-05, + "loss": 0.5451, + "step": 2762 + }, + { + "epoch": 0.7662229617304492, + "grad_norm": 0.19053952395915985, + "learning_rate": 1.730903787020685e-05, + "loss": 0.5661, + "step": 2763 + }, + { + "epoch": 0.7665002773155851, + "grad_norm": 0.18681201338768005, + "learning_rate": 1.7303940749840726e-05, + "loss": 0.5312, + "step": 2764 + }, + { + "epoch": 0.766777592900721, + "grad_norm": 0.19541014730930328, + "learning_rate": 1.7298842692098488e-05, + "loss": 0.5366, + "step": 2765 + }, + { + "epoch": 0.7670549084858569, + "grad_norm": 0.19399814307689667, + "learning_rate": 1.729374369797489e-05, + "loss": 0.547, + "step": 2766 + }, + { + "epoch": 0.7673322240709928, + "grad_norm": 0.19590160250663757, + "learning_rate": 1.7288643768464892e-05, + "loss": 0.5286, + "step": 2767 + }, + { + "epoch": 0.7676095396561287, + "grad_norm": 0.18889807164669037, + "learning_rate": 1.7283542904563625e-05, + "loss": 0.5357, + "step": 2768 + }, + { + "epoch": 0.7678868552412645, + "grad_norm": 0.1905566155910492, + "learning_rate": 1.7278441107266395e-05, + "loss": 0.5663, + "step": 2769 + }, + { + "epoch": 0.7681641708264004, + "grad_norm": 0.19881530106067657, + "learning_rate": 1.7273338377568707e-05, + "loss": 0.5829, + "step": 2770 + }, + { + "epoch": 0.7684414864115363, + "grad_norm": 0.17877154052257538, + "learning_rate": 1.726823471646623e-05, + "loss": 0.5607, + "step": 2771 + }, + { + "epoch": 0.7687188019966722, + "grad_norm": 0.1801327019929886, + "learning_rate": 1.7263130124954832e-05, + "loss": 0.5608, + "step": 2772 + }, + { + "epoch": 0.7689961175818081, + "grad_norm": 0.1976090669631958, + "learning_rate": 1.7258024604030547e-05, + "loss": 0.4987, + "step": 2773 + }, + { + "epoch": 0.769273433166944, + "grad_norm": 0.18582318723201752, + "learning_rate": 1.72529181546896e-05, + "loss": 0.5422, + "step": 2774 + }, + { + "epoch": 0.7695507487520798, + "grad_norm": 0.1935378760099411, + "learning_rate": 1.7247810777928396e-05, + "loss": 0.5464, + "step": 2775 + }, + { + "epoch": 0.7698280643372157, + "grad_norm": 0.187955841422081, + "learning_rate": 1.7242702474743517e-05, + "loss": 0.514, + "step": 2776 + }, + { + "epoch": 0.7701053799223516, + "grad_norm": 0.18268531560897827, + "learning_rate": 1.7237593246131735e-05, + "loss": 0.517, + "step": 2777 + }, + { + "epoch": 0.7703826955074875, + "grad_norm": 0.18831866979599, + "learning_rate": 1.7232483093089986e-05, + "loss": 0.579, + "step": 2778 + }, + { + "epoch": 0.7706600110926234, + "grad_norm": 0.20355799794197083, + "learning_rate": 1.7227372016615402e-05, + "loss": 0.5656, + "step": 2779 + }, + { + "epoch": 0.7709373266777593, + "grad_norm": 0.1870361566543579, + "learning_rate": 1.7222260017705286e-05, + "loss": 0.5528, + "step": 2780 + }, + { + "epoch": 0.7712146422628952, + "grad_norm": 0.2032066434621811, + "learning_rate": 1.7217147097357127e-05, + "loss": 0.5684, + "step": 2781 + }, + { + "epoch": 0.771491957848031, + "grad_norm": 0.18692703545093536, + "learning_rate": 1.7212033256568595e-05, + "loss": 0.5654, + "step": 2782 + }, + { + "epoch": 0.7717692734331669, + "grad_norm": 0.1919548362493515, + "learning_rate": 1.7206918496337525e-05, + "loss": 0.5801, + "step": 2783 + }, + { + "epoch": 0.7720465890183028, + "grad_norm": 0.19089831411838531, + "learning_rate": 1.7201802817661955e-05, + "loss": 0.5617, + "step": 2784 + }, + { + "epoch": 0.7723239046034387, + "grad_norm": 0.19473430514335632, + "learning_rate": 1.7196686221540077e-05, + "loss": 0.5376, + "step": 2785 + }, + { + "epoch": 0.7726012201885746, + "grad_norm": 0.1860806941986084, + "learning_rate": 1.7191568708970286e-05, + "loss": 0.5541, + "step": 2786 + }, + { + "epoch": 0.7728785357737105, + "grad_norm": 0.18971897661685944, + "learning_rate": 1.7186450280951137e-05, + "loss": 0.5514, + "step": 2787 + }, + { + "epoch": 0.7731558513588463, + "grad_norm": 0.18644990026950836, + "learning_rate": 1.7181330938481375e-05, + "loss": 0.5504, + "step": 2788 + }, + { + "epoch": 0.7734331669439822, + "grad_norm": 0.18728910386562347, + "learning_rate": 1.717621068255992e-05, + "loss": 0.5528, + "step": 2789 + }, + { + "epoch": 0.7737104825291181, + "grad_norm": 0.19061507284641266, + "learning_rate": 1.7171089514185857e-05, + "loss": 0.5356, + "step": 2790 + }, + { + "epoch": 0.773987798114254, + "grad_norm": 0.18289197981357574, + "learning_rate": 1.7165967434358483e-05, + "loss": 0.5487, + "step": 2791 + }, + { + "epoch": 0.7742651136993899, + "grad_norm": 0.19676977396011353, + "learning_rate": 1.716084444407723e-05, + "loss": 0.5625, + "step": 2792 + }, + { + "epoch": 0.7745424292845258, + "grad_norm": 0.18007270991802216, + "learning_rate": 1.7155720544341746e-05, + "loss": 0.543, + "step": 2793 + }, + { + "epoch": 0.7748197448696617, + "grad_norm": 0.19169475138187408, + "learning_rate": 1.715059573615183e-05, + "loss": 0.5378, + "step": 2794 + }, + { + "epoch": 0.7750970604547975, + "grad_norm": 0.1876417100429535, + "learning_rate": 1.714547002050747e-05, + "loss": 0.5304, + "step": 2795 + }, + { + "epoch": 0.7753743760399334, + "grad_norm": 0.18966078758239746, + "learning_rate": 1.714034339840883e-05, + "loss": 0.5302, + "step": 2796 + }, + { + "epoch": 0.7756516916250693, + "grad_norm": 0.1796397715806961, + "learning_rate": 1.7135215870856253e-05, + "loss": 0.5388, + "step": 2797 + }, + { + "epoch": 0.7759290072102052, + "grad_norm": 0.19350911676883698, + "learning_rate": 1.7130087438850252e-05, + "loss": 0.553, + "step": 2798 + }, + { + "epoch": 0.7762063227953411, + "grad_norm": 0.19393737614154816, + "learning_rate": 1.7124958103391516e-05, + "loss": 0.514, + "step": 2799 + }, + { + "epoch": 0.776483638380477, + "grad_norm": 0.2177378535270691, + "learning_rate": 1.711982786548092e-05, + "loss": 0.5187, + "step": 2800 + }, + { + "epoch": 0.7767609539656128, + "grad_norm": 0.18803465366363525, + "learning_rate": 1.7114696726119505e-05, + "loss": 0.5661, + "step": 2801 + }, + { + "epoch": 0.7770382695507487, + "grad_norm": 0.19588807225227356, + "learning_rate": 1.7109564686308498e-05, + "loss": 0.5463, + "step": 2802 + }, + { + "epoch": 0.7773155851358846, + "grad_norm": 0.18581606447696686, + "learning_rate": 1.710443174704929e-05, + "loss": 0.548, + "step": 2803 + }, + { + "epoch": 0.7775929007210205, + "grad_norm": 0.18562249839305878, + "learning_rate": 1.7099297909343455e-05, + "loss": 0.5313, + "step": 2804 + }, + { + "epoch": 0.7778702163061564, + "grad_norm": 0.17982806265354156, + "learning_rate": 1.7094163174192744e-05, + "loss": 0.5499, + "step": 2805 + }, + { + "epoch": 0.7781475318912923, + "grad_norm": 0.18412619829177856, + "learning_rate": 1.708902754259908e-05, + "loss": 0.5495, + "step": 2806 + }, + { + "epoch": 0.7784248474764282, + "grad_norm": 0.19657637178897858, + "learning_rate": 1.7083891015564555e-05, + "loss": 0.5565, + "step": 2807 + }, + { + "epoch": 0.778702163061564, + "grad_norm": 0.17935718595981598, + "learning_rate": 1.7078753594091445e-05, + "loss": 0.5446, + "step": 2808 + }, + { + "epoch": 0.7789794786466999, + "grad_norm": 0.18647761642932892, + "learning_rate": 1.7073615279182198e-05, + "loss": 0.5416, + "step": 2809 + }, + { + "epoch": 0.7792567942318358, + "grad_norm": 0.19295147061347961, + "learning_rate": 1.7068476071839434e-05, + "loss": 0.5498, + "step": 2810 + }, + { + "epoch": 0.7795341098169717, + "grad_norm": 0.19305361807346344, + "learning_rate": 1.706333597306595e-05, + "loss": 0.5613, + "step": 2811 + }, + { + "epoch": 0.7798114254021076, + "grad_norm": 0.1887744814157486, + "learning_rate": 1.7058194983864715e-05, + "loss": 0.5493, + "step": 2812 + }, + { + "epoch": 0.7800887409872435, + "grad_norm": 0.18676388263702393, + "learning_rate": 1.7053053105238866e-05, + "loss": 0.527, + "step": 2813 + }, + { + "epoch": 0.7803660565723793, + "grad_norm": 0.1888217329978943, + "learning_rate": 1.7047910338191732e-05, + "loss": 0.5543, + "step": 2814 + }, + { + "epoch": 0.7806433721575152, + "grad_norm": 0.19341908395290375, + "learning_rate": 1.7042766683726793e-05, + "loss": 0.5188, + "step": 2815 + }, + { + "epoch": 0.7809206877426511, + "grad_norm": 0.18204385042190552, + "learning_rate": 1.7037622142847717e-05, + "loss": 0.541, + "step": 2816 + }, + { + "epoch": 0.781198003327787, + "grad_norm": 0.20170709490776062, + "learning_rate": 1.7032476716558338e-05, + "loss": 0.5925, + "step": 2817 + }, + { + "epoch": 0.7814753189129229, + "grad_norm": 0.18066342175006866, + "learning_rate": 1.7027330405862668e-05, + "loss": 0.5537, + "step": 2818 + }, + { + "epoch": 0.7817526344980588, + "grad_norm": 0.18021415174007416, + "learning_rate": 1.7022183211764886e-05, + "loss": 0.5098, + "step": 2819 + }, + { + "epoch": 0.7820299500831946, + "grad_norm": 0.19226256012916565, + "learning_rate": 1.7017035135269345e-05, + "loss": 0.5626, + "step": 2820 + }, + { + "epoch": 0.7823072656683305, + "grad_norm": 0.1880798488855362, + "learning_rate": 1.7011886177380572e-05, + "loss": 0.5601, + "step": 2821 + }, + { + "epoch": 0.7825845812534664, + "grad_norm": 0.19588159024715424, + "learning_rate": 1.7006736339103267e-05, + "loss": 0.56, + "step": 2822 + }, + { + "epoch": 0.7828618968386023, + "grad_norm": 0.21878387033939362, + "learning_rate": 1.7001585621442295e-05, + "loss": 0.5683, + "step": 2823 + }, + { + "epoch": 0.7831392124237382, + "grad_norm": 0.18837064504623413, + "learning_rate": 1.6996434025402706e-05, + "loss": 0.5827, + "step": 2824 + }, + { + "epoch": 0.7834165280088741, + "grad_norm": 0.1791253536939621, + "learning_rate": 1.6991281551989704e-05, + "loss": 0.5125, + "step": 2825 + }, + { + "epoch": 0.78369384359401, + "grad_norm": 0.19195421040058136, + "learning_rate": 1.698612820220868e-05, + "loss": 0.5348, + "step": 2826 + }, + { + "epoch": 0.7839711591791458, + "grad_norm": 0.1831100434064865, + "learning_rate": 1.6980973977065185e-05, + "loss": 0.5354, + "step": 2827 + }, + { + "epoch": 0.7842484747642817, + "grad_norm": 0.19085463881492615, + "learning_rate": 1.6975818877564945e-05, + "loss": 0.5724, + "step": 2828 + }, + { + "epoch": 0.7845257903494176, + "grad_norm": 0.18546035885810852, + "learning_rate": 1.6970662904713857e-05, + "loss": 0.5433, + "step": 2829 + }, + { + "epoch": 0.7848031059345535, + "grad_norm": 0.17541223764419556, + "learning_rate": 1.6965506059517988e-05, + "loss": 0.5348, + "step": 2830 + }, + { + "epoch": 0.7850804215196894, + "grad_norm": 0.18578267097473145, + "learning_rate": 1.696034834298358e-05, + "loss": 0.5355, + "step": 2831 + }, + { + "epoch": 0.7853577371048253, + "grad_norm": 0.19077135622501373, + "learning_rate": 1.6955189756117028e-05, + "loss": 0.5387, + "step": 2832 + }, + { + "epoch": 0.7856350526899611, + "grad_norm": 0.19363151490688324, + "learning_rate": 1.6950030299924925e-05, + "loss": 0.54, + "step": 2833 + }, + { + "epoch": 0.785912368275097, + "grad_norm": 0.19038641452789307, + "learning_rate": 1.6944869975414e-05, + "loss": 0.5365, + "step": 2834 + }, + { + "epoch": 0.7861896838602329, + "grad_norm": 0.18864920735359192, + "learning_rate": 1.6939708783591184e-05, + "loss": 0.5272, + "step": 2835 + }, + { + "epoch": 0.7864669994453688, + "grad_norm": 0.195342555642128, + "learning_rate": 1.6934546725463558e-05, + "loss": 0.5459, + "step": 2836 + }, + { + "epoch": 0.7867443150305047, + "grad_norm": 0.1781705915927887, + "learning_rate": 1.6929383802038372e-05, + "loss": 0.525, + "step": 2837 + }, + { + "epoch": 0.7870216306156406, + "grad_norm": 0.18777886033058167, + "learning_rate": 1.6924220014323054e-05, + "loss": 0.5254, + "step": 2838 + }, + { + "epoch": 0.7872989462007765, + "grad_norm": 0.19651533663272858, + "learning_rate": 1.6919055363325193e-05, + "loss": 0.5388, + "step": 2839 + }, + { + "epoch": 0.7875762617859123, + "grad_norm": 0.19021400809288025, + "learning_rate": 1.6913889850052546e-05, + "loss": 0.5362, + "step": 2840 + }, + { + "epoch": 0.7878535773710482, + "grad_norm": 0.18892168998718262, + "learning_rate": 1.690872347551305e-05, + "loss": 0.5446, + "step": 2841 + }, + { + "epoch": 0.7881308929561841, + "grad_norm": 0.1848224401473999, + "learning_rate": 1.6903556240714795e-05, + "loss": 0.5353, + "step": 2842 + }, + { + "epoch": 0.78840820854132, + "grad_norm": 0.19968490302562714, + "learning_rate": 1.6898388146666046e-05, + "loss": 0.5173, + "step": 2843 + }, + { + "epoch": 0.7886855241264559, + "grad_norm": 0.19547419250011444, + "learning_rate": 1.689321919437524e-05, + "loss": 0.5421, + "step": 2844 + }, + { + "epoch": 0.7889628397115918, + "grad_norm": 0.18620312213897705, + "learning_rate": 1.688804938485097e-05, + "loss": 0.5297, + "step": 2845 + }, + { + "epoch": 0.7892401552967276, + "grad_norm": 0.18695352971553802, + "learning_rate": 1.6882878719102007e-05, + "loss": 0.5386, + "step": 2846 + }, + { + "epoch": 0.7895174708818635, + "grad_norm": 0.18656794726848602, + "learning_rate": 1.6877707198137285e-05, + "loss": 0.557, + "step": 2847 + }, + { + "epoch": 0.7897947864669994, + "grad_norm": 0.18816913664340973, + "learning_rate": 1.6872534822965903e-05, + "loss": 0.5638, + "step": 2848 + }, + { + "epoch": 0.7900721020521353, + "grad_norm": 0.18742120265960693, + "learning_rate": 1.6867361594597126e-05, + "loss": 0.5482, + "step": 2849 + }, + { + "epoch": 0.7903494176372712, + "grad_norm": 0.1863267421722412, + "learning_rate": 1.6862187514040396e-05, + "loss": 0.5471, + "step": 2850 + }, + { + "epoch": 0.7906267332224071, + "grad_norm": 0.24633349478244781, + "learning_rate": 1.6857012582305303e-05, + "loss": 0.5251, + "step": 2851 + }, + { + "epoch": 0.790904048807543, + "grad_norm": 0.1922018826007843, + "learning_rate": 1.6851836800401624e-05, + "loss": 0.5706, + "step": 2852 + }, + { + "epoch": 0.7911813643926788, + "grad_norm": 0.18678666651248932, + "learning_rate": 1.684666016933928e-05, + "loss": 0.55, + "step": 2853 + }, + { + "epoch": 0.7914586799778147, + "grad_norm": 0.1959305703639984, + "learning_rate": 1.6841482690128376e-05, + "loss": 0.5414, + "step": 2854 + }, + { + "epoch": 0.7917359955629506, + "grad_norm": 0.19318552315235138, + "learning_rate": 1.6836304363779178e-05, + "loss": 0.5383, + "step": 2855 + }, + { + "epoch": 0.7920133111480865, + "grad_norm": 0.18508724868297577, + "learning_rate": 1.6831125191302104e-05, + "loss": 0.5371, + "step": 2856 + }, + { + "epoch": 0.7922906267332224, + "grad_norm": 0.18992586433887482, + "learning_rate": 1.682594517370776e-05, + "loss": 0.499, + "step": 2857 + }, + { + "epoch": 0.7925679423183583, + "grad_norm": 0.19711445271968842, + "learning_rate": 1.682076431200689e-05, + "loss": 0.5777, + "step": 2858 + }, + { + "epoch": 0.7928452579034941, + "grad_norm": 0.19418618083000183, + "learning_rate": 1.6815582607210435e-05, + "loss": 0.5459, + "step": 2859 + }, + { + "epoch": 0.79312257348863, + "grad_norm": 0.19879227876663208, + "learning_rate": 1.6810400060329472e-05, + "loss": 0.5578, + "step": 2860 + }, + { + "epoch": 0.7933998890737659, + "grad_norm": 0.1837853342294693, + "learning_rate": 1.680521667237525e-05, + "loss": 0.534, + "step": 2861 + }, + { + "epoch": 0.7936772046589018, + "grad_norm": 0.18517963588237762, + "learning_rate": 1.68000324443592e-05, + "loss": 0.5368, + "step": 2862 + }, + { + "epoch": 0.7939545202440377, + "grad_norm": 0.18853336572647095, + "learning_rate": 1.6794847377292885e-05, + "loss": 0.5728, + "step": 2863 + }, + { + "epoch": 0.7942318358291736, + "grad_norm": 0.1775895059108734, + "learning_rate": 1.678966147218806e-05, + "loss": 0.5283, + "step": 2864 + }, + { + "epoch": 0.7945091514143094, + "grad_norm": 0.18305779993534088, + "learning_rate": 1.678447473005663e-05, + "loss": 0.531, + "step": 2865 + }, + { + "epoch": 0.7947864669994453, + "grad_norm": 0.19156721234321594, + "learning_rate": 1.6779287151910665e-05, + "loss": 0.5485, + "step": 2866 + }, + { + "epoch": 0.7950637825845812, + "grad_norm": 0.18802639842033386, + "learning_rate": 1.6774098738762398e-05, + "loss": 0.548, + "step": 2867 + }, + { + "epoch": 0.7953410981697171, + "grad_norm": 0.19210045039653778, + "learning_rate": 1.6768909491624224e-05, + "loss": 0.548, + "step": 2868 + }, + { + "epoch": 0.795618413754853, + "grad_norm": 0.18298813700675964, + "learning_rate": 1.6763719411508713e-05, + "loss": 0.5385, + "step": 2869 + }, + { + "epoch": 0.7958957293399889, + "grad_norm": 0.206680029630661, + "learning_rate": 1.675852849942857e-05, + "loss": 0.5443, + "step": 2870 + }, + { + "epoch": 0.7961730449251248, + "grad_norm": 0.19752557575702667, + "learning_rate": 1.67533367563967e-05, + "loss": 0.541, + "step": 2871 + }, + { + "epoch": 0.7964503605102606, + "grad_norm": 0.1768447309732437, + "learning_rate": 1.674814418342613e-05, + "loss": 0.5125, + "step": 2872 + }, + { + "epoch": 0.7967276760953965, + "grad_norm": 0.16779829561710358, + "learning_rate": 1.6742950781530086e-05, + "loss": 0.5446, + "step": 2873 + }, + { + "epoch": 0.7970049916805324, + "grad_norm": 0.18453587591648102, + "learning_rate": 1.6737756551721924e-05, + "loss": 0.538, + "step": 2874 + }, + { + "epoch": 0.7972823072656683, + "grad_norm": 0.18118412792682648, + "learning_rate": 1.673256149501518e-05, + "loss": 0.538, + "step": 2875 + }, + { + "epoch": 0.7975596228508042, + "grad_norm": 0.18454188108444214, + "learning_rate": 1.672736561242355e-05, + "loss": 0.5208, + "step": 2876 + }, + { + "epoch": 0.7978369384359401, + "grad_norm": 0.18930627405643463, + "learning_rate": 1.672216890496089e-05, + "loss": 0.5545, + "step": 2877 + }, + { + "epoch": 0.7981142540210759, + "grad_norm": 0.18117345869541168, + "learning_rate": 1.6716971373641212e-05, + "loss": 0.5444, + "step": 2878 + }, + { + "epoch": 0.7983915696062118, + "grad_norm": 0.20891335606575012, + "learning_rate": 1.671177301947869e-05, + "loss": 0.5583, + "step": 2879 + }, + { + "epoch": 0.7986688851913477, + "grad_norm": 0.18599353730678558, + "learning_rate": 1.670657384348766e-05, + "loss": 0.56, + "step": 2880 + }, + { + "epoch": 0.7989462007764836, + "grad_norm": 0.23091299831867218, + "learning_rate": 1.6701373846682626e-05, + "loss": 0.5332, + "step": 2881 + }, + { + "epoch": 0.7992235163616195, + "grad_norm": 0.18974825739860535, + "learning_rate": 1.6696173030078242e-05, + "loss": 0.56, + "step": 2882 + }, + { + "epoch": 0.7995008319467554, + "grad_norm": 0.1794738471508026, + "learning_rate": 1.6690971394689324e-05, + "loss": 0.5671, + "step": 2883 + }, + { + "epoch": 0.7997781475318912, + "grad_norm": 0.24422964453697205, + "learning_rate": 1.6685768941530848e-05, + "loss": 0.5261, + "step": 2884 + }, + { + "epoch": 0.8000554631170271, + "grad_norm": 0.18428778648376465, + "learning_rate": 1.6680565671617955e-05, + "loss": 0.5315, + "step": 2885 + }, + { + "epoch": 0.800332778702163, + "grad_norm": 0.18910729885101318, + "learning_rate": 1.667536158596593e-05, + "loss": 0.5447, + "step": 2886 + }, + { + "epoch": 0.8006100942872989, + "grad_norm": 0.1889326423406601, + "learning_rate": 1.667015668559024e-05, + "loss": 0.5313, + "step": 2887 + }, + { + "epoch": 0.8008874098724348, + "grad_norm": 0.19478273391723633, + "learning_rate": 1.66649509715065e-05, + "loss": 0.5358, + "step": 2888 + }, + { + "epoch": 0.8011647254575707, + "grad_norm": 0.1966940313577652, + "learning_rate": 1.6659744444730467e-05, + "loss": 0.5367, + "step": 2889 + }, + { + "epoch": 0.8014420410427066, + "grad_norm": 0.19594216346740723, + "learning_rate": 1.665453710627809e-05, + "loss": 0.5571, + "step": 2890 + }, + { + "epoch": 0.8017193566278424, + "grad_norm": 0.19384315609931946, + "learning_rate": 1.6649328957165448e-05, + "loss": 0.5478, + "step": 2891 + }, + { + "epoch": 0.8019966722129783, + "grad_norm": 0.18817134201526642, + "learning_rate": 1.6644119998408795e-05, + "loss": 0.5533, + "step": 2892 + }, + { + "epoch": 0.8022739877981142, + "grad_norm": 0.19234129786491394, + "learning_rate": 1.6638910231024528e-05, + "loss": 0.55, + "step": 2893 + }, + { + "epoch": 0.8025513033832501, + "grad_norm": 0.19165417551994324, + "learning_rate": 1.6633699656029224e-05, + "loss": 0.5372, + "step": 2894 + }, + { + "epoch": 0.802828618968386, + "grad_norm": 0.20091277360916138, + "learning_rate": 1.6628488274439592e-05, + "loss": 0.5571, + "step": 2895 + }, + { + "epoch": 0.8031059345535219, + "grad_norm": 0.1875421106815338, + "learning_rate": 1.6623276087272517e-05, + "loss": 0.5346, + "step": 2896 + }, + { + "epoch": 0.8033832501386577, + "grad_norm": 0.24504978954792023, + "learning_rate": 1.661806309554503e-05, + "loss": 0.5277, + "step": 2897 + }, + { + "epoch": 0.8036605657237936, + "grad_norm": 0.188764289021492, + "learning_rate": 1.661284930027433e-05, + "loss": 0.5354, + "step": 2898 + }, + { + "epoch": 0.8039378813089295, + "grad_norm": 0.1898432970046997, + "learning_rate": 1.6607634702477765e-05, + "loss": 0.5349, + "step": 2899 + }, + { + "epoch": 0.8042151968940654, + "grad_norm": 0.18189935386180878, + "learning_rate": 1.6602419303172835e-05, + "loss": 0.5146, + "step": 2900 + }, + { + "epoch": 0.8044925124792013, + "grad_norm": 0.19001318514347076, + "learning_rate": 1.659720310337721e-05, + "loss": 0.5438, + "step": 2901 + }, + { + "epoch": 0.8047698280643372, + "grad_norm": 0.19630925357341766, + "learning_rate": 1.6591986104108706e-05, + "loss": 0.5644, + "step": 2902 + }, + { + "epoch": 0.805047143649473, + "grad_norm": 0.1834658980369568, + "learning_rate": 1.65867683063853e-05, + "loss": 0.5588, + "step": 2903 + }, + { + "epoch": 0.8053244592346089, + "grad_norm": 0.198238343000412, + "learning_rate": 1.658154971122512e-05, + "loss": 0.5471, + "step": 2904 + }, + { + "epoch": 0.8056017748197448, + "grad_norm": 0.18793722987174988, + "learning_rate": 1.657633031964645e-05, + "loss": 0.5416, + "step": 2905 + }, + { + "epoch": 0.8058790904048807, + "grad_norm": 0.1919628083705902, + "learning_rate": 1.657111013266774e-05, + "loss": 0.5489, + "step": 2906 + }, + { + "epoch": 0.8061564059900166, + "grad_norm": 0.18970000743865967, + "learning_rate": 1.6565889151307576e-05, + "loss": 0.5374, + "step": 2907 + }, + { + "epoch": 0.8064337215751525, + "grad_norm": 0.19195859134197235, + "learning_rate": 1.656066737658471e-05, + "loss": 0.5696, + "step": 2908 + }, + { + "epoch": 0.8067110371602884, + "grad_norm": 0.18715813755989075, + "learning_rate": 1.6555444809518066e-05, + "loss": 0.5365, + "step": 2909 + }, + { + "epoch": 0.8069883527454242, + "grad_norm": 0.19223229587078094, + "learning_rate": 1.6550221451126682e-05, + "loss": 0.5148, + "step": 2910 + }, + { + "epoch": 0.8072656683305601, + "grad_norm": 0.2136392742395401, + "learning_rate": 1.6544997302429794e-05, + "loss": 0.5269, + "step": 2911 + }, + { + "epoch": 0.807542983915696, + "grad_norm": 0.19737182557582855, + "learning_rate": 1.6539772364446755e-05, + "loss": 0.5592, + "step": 2912 + }, + { + "epoch": 0.8078202995008319, + "grad_norm": 0.20777487754821777, + "learning_rate": 1.6534546638197098e-05, + "loss": 0.5124, + "step": 2913 + }, + { + "epoch": 0.8080976150859678, + "grad_norm": 0.1888158619403839, + "learning_rate": 1.6529320124700495e-05, + "loss": 0.5433, + "step": 2914 + }, + { + "epoch": 0.8083749306711037, + "grad_norm": 0.24319934844970703, + "learning_rate": 1.6524092824976787e-05, + "loss": 0.5763, + "step": 2915 + }, + { + "epoch": 0.8086522462562395, + "grad_norm": 0.19144728779792786, + "learning_rate": 1.6518864740045947e-05, + "loss": 0.5245, + "step": 2916 + }, + { + "epoch": 0.8089295618413754, + "grad_norm": 0.19189846515655518, + "learning_rate": 1.6513635870928122e-05, + "loss": 0.5435, + "step": 2917 + }, + { + "epoch": 0.8092068774265113, + "grad_norm": 0.24125546216964722, + "learning_rate": 1.6508406218643597e-05, + "loss": 0.5599, + "step": 2918 + }, + { + "epoch": 0.8094841930116472, + "grad_norm": 0.2077561616897583, + "learning_rate": 1.650317578421282e-05, + "loss": 0.5466, + "step": 2919 + }, + { + "epoch": 0.8097615085967831, + "grad_norm": 0.2178879827260971, + "learning_rate": 1.6497944568656383e-05, + "loss": 0.5306, + "step": 2920 + }, + { + "epoch": 0.810038824181919, + "grad_norm": 0.24422143399715424, + "learning_rate": 1.649271257299504e-05, + "loss": 0.5388, + "step": 2921 + }, + { + "epoch": 0.8103161397670549, + "grad_norm": 0.18929331004619598, + "learning_rate": 1.6487479798249687e-05, + "loss": 0.5236, + "step": 2922 + }, + { + "epoch": 0.8105934553521907, + "grad_norm": 0.17720143496990204, + "learning_rate": 1.648224624544138e-05, + "loss": 0.5172, + "step": 2923 + }, + { + "epoch": 0.8108707709373266, + "grad_norm": 0.18783891201019287, + "learning_rate": 1.6477011915591325e-05, + "loss": 0.5389, + "step": 2924 + }, + { + "epoch": 0.8111480865224625, + "grad_norm": 0.19297097623348236, + "learning_rate": 1.6471776809720873e-05, + "loss": 0.543, + "step": 2925 + }, + { + "epoch": 0.8114254021075984, + "grad_norm": 0.18489378690719604, + "learning_rate": 1.6466540928851538e-05, + "loss": 0.5512, + "step": 2926 + }, + { + "epoch": 0.8117027176927343, + "grad_norm": 0.18271875381469727, + "learning_rate": 1.6461304274004972e-05, + "loss": 0.5417, + "step": 2927 + }, + { + "epoch": 0.8119800332778702, + "grad_norm": 0.1847040057182312, + "learning_rate": 1.6456066846202994e-05, + "loss": 0.5387, + "step": 2928 + }, + { + "epoch": 0.812257348863006, + "grad_norm": 0.20353619754314423, + "learning_rate": 1.6450828646467555e-05, + "loss": 0.5454, + "step": 2929 + }, + { + "epoch": 0.812534664448142, + "grad_norm": 0.2704160809516907, + "learning_rate": 1.644558967582078e-05, + "loss": 0.5598, + "step": 2930 + }, + { + "epoch": 0.8128119800332779, + "grad_norm": 0.2357567995786667, + "learning_rate": 1.6440349935284917e-05, + "loss": 0.5724, + "step": 2931 + }, + { + "epoch": 0.8130892956184138, + "grad_norm": 0.18530336022377014, + "learning_rate": 1.6435109425882385e-05, + "loss": 0.5365, + "step": 2932 + }, + { + "epoch": 0.8133666112035497, + "grad_norm": 0.18962644040584564, + "learning_rate": 1.6429868148635745e-05, + "loss": 0.5429, + "step": 2933 + }, + { + "epoch": 0.8136439267886856, + "grad_norm": 0.1827327460050583, + "learning_rate": 1.6424626104567708e-05, + "loss": 0.5368, + "step": 2934 + }, + { + "epoch": 0.8139212423738215, + "grad_norm": 0.19838617742061615, + "learning_rate": 1.641938329470114e-05, + "loss": 0.5366, + "step": 2935 + }, + { + "epoch": 0.8141985579589573, + "grad_norm": 0.18842142820358276, + "learning_rate": 1.6414139720059045e-05, + "loss": 0.546, + "step": 2936 + }, + { + "epoch": 0.8144758735440932, + "grad_norm": 0.1957085281610489, + "learning_rate": 1.6408895381664594e-05, + "loss": 0.5144, + "step": 2937 + }, + { + "epoch": 0.8147531891292291, + "grad_norm": 0.18413354456424713, + "learning_rate": 1.6403650280541087e-05, + "loss": 0.5441, + "step": 2938 + }, + { + "epoch": 0.815030504714365, + "grad_norm": 0.18572686612606049, + "learning_rate": 1.6398404417711984e-05, + "loss": 0.543, + "step": 2939 + }, + { + "epoch": 0.8153078202995009, + "grad_norm": 0.18513022363185883, + "learning_rate": 1.639315779420089e-05, + "loss": 0.5416, + "step": 2940 + }, + { + "epoch": 0.8155851358846368, + "grad_norm": 0.1878947764635086, + "learning_rate": 1.6387910411031564e-05, + "loss": 0.5487, + "step": 2941 + }, + { + "epoch": 0.8158624514697727, + "grad_norm": 0.20760974287986755, + "learning_rate": 1.6382662269227912e-05, + "loss": 0.5861, + "step": 2942 + }, + { + "epoch": 0.8161397670549085, + "grad_norm": 0.1904260218143463, + "learning_rate": 1.637741336981398e-05, + "loss": 0.549, + "step": 2943 + }, + { + "epoch": 0.8164170826400444, + "grad_norm": 0.1811297982931137, + "learning_rate": 1.637216371381397e-05, + "loss": 0.5556, + "step": 2944 + }, + { + "epoch": 0.8166943982251803, + "grad_norm": 0.18820282816886902, + "learning_rate": 1.6366913302252228e-05, + "loss": 0.5262, + "step": 2945 + }, + { + "epoch": 0.8169717138103162, + "grad_norm": 0.18562357127666473, + "learning_rate": 1.636166213615325e-05, + "loss": 0.5353, + "step": 2946 + }, + { + "epoch": 0.8172490293954521, + "grad_norm": 0.19570721685886383, + "learning_rate": 1.6356410216541675e-05, + "loss": 0.524, + "step": 2947 + }, + { + "epoch": 0.817526344980588, + "grad_norm": 0.19673757255077362, + "learning_rate": 1.635115754444229e-05, + "loss": 0.5532, + "step": 2948 + }, + { + "epoch": 0.8178036605657238, + "grad_norm": 0.1866602897644043, + "learning_rate": 1.6345904120880045e-05, + "loss": 0.5407, + "step": 2949 + }, + { + "epoch": 0.8180809761508597, + "grad_norm": 0.1872847080230713, + "learning_rate": 1.634064994688e-05, + "loss": 0.522, + "step": 2950 + }, + { + "epoch": 0.8183582917359956, + "grad_norm": 0.18448910117149353, + "learning_rate": 1.63353950234674e-05, + "loss": 0.5506, + "step": 2951 + }, + { + "epoch": 0.8186356073211315, + "grad_norm": 0.19117306172847748, + "learning_rate": 1.6330139351667607e-05, + "loss": 0.538, + "step": 2952 + }, + { + "epoch": 0.8189129229062674, + "grad_norm": 0.19007255136966705, + "learning_rate": 1.6324882932506152e-05, + "loss": 0.545, + "step": 2953 + }, + { + "epoch": 0.8191902384914033, + "grad_norm": 0.1907234787940979, + "learning_rate": 1.63196257670087e-05, + "loss": 0.5216, + "step": 2954 + }, + { + "epoch": 0.8194675540765392, + "grad_norm": 0.19023658335208893, + "learning_rate": 1.6314367856201063e-05, + "loss": 0.528, + "step": 2955 + }, + { + "epoch": 0.819744869661675, + "grad_norm": 0.19689738750457764, + "learning_rate": 1.6309109201109197e-05, + "loss": 0.5579, + "step": 2956 + }, + { + "epoch": 0.8200221852468109, + "grad_norm": 0.18936869502067566, + "learning_rate": 1.63038498027592e-05, + "loss": 0.5171, + "step": 2957 + }, + { + "epoch": 0.8202995008319468, + "grad_norm": 0.18959283828735352, + "learning_rate": 1.6298589662177334e-05, + "loss": 0.5109, + "step": 2958 + }, + { + "epoch": 0.8205768164170827, + "grad_norm": 0.18975764513015747, + "learning_rate": 1.6293328780389976e-05, + "loss": 0.5241, + "step": 2959 + }, + { + "epoch": 0.8208541320022186, + "grad_norm": 0.1963840126991272, + "learning_rate": 1.6288067158423676e-05, + "loss": 0.5311, + "step": 2960 + }, + { + "epoch": 0.8211314475873545, + "grad_norm": 0.1869080811738968, + "learning_rate": 1.6282804797305107e-05, + "loss": 0.5268, + "step": 2961 + }, + { + "epoch": 0.8214087631724903, + "grad_norm": 0.19208255410194397, + "learning_rate": 1.62775416980611e-05, + "loss": 0.5561, + "step": 2962 + }, + { + "epoch": 0.8216860787576262, + "grad_norm": 0.21065904200077057, + "learning_rate": 1.6272277861718622e-05, + "loss": 0.5612, + "step": 2963 + }, + { + "epoch": 0.8219633943427621, + "grad_norm": 0.19388340413570404, + "learning_rate": 1.626701328930479e-05, + "loss": 0.5448, + "step": 2964 + }, + { + "epoch": 0.822240709927898, + "grad_norm": 0.19354204833507538, + "learning_rate": 1.626174798184686e-05, + "loss": 0.5521, + "step": 2965 + }, + { + "epoch": 0.8225180255130339, + "grad_norm": 0.1918734908103943, + "learning_rate": 1.6256481940372235e-05, + "loss": 0.5737, + "step": 2966 + }, + { + "epoch": 0.8227953410981698, + "grad_norm": 0.19950683414936066, + "learning_rate": 1.625121516590845e-05, + "loss": 0.5251, + "step": 2967 + }, + { + "epoch": 0.8230726566833056, + "grad_norm": 0.1884751170873642, + "learning_rate": 1.624594765948321e-05, + "loss": 0.5487, + "step": 2968 + }, + { + "epoch": 0.8233499722684415, + "grad_norm": 0.20038394629955292, + "learning_rate": 1.624067942212433e-05, + "loss": 0.5366, + "step": 2969 + }, + { + "epoch": 0.8236272878535774, + "grad_norm": 0.18730241060256958, + "learning_rate": 1.6235410454859784e-05, + "loss": 0.5574, + "step": 2970 + }, + { + "epoch": 0.8239046034387133, + "grad_norm": 0.1982334703207016, + "learning_rate": 1.6230140758717692e-05, + "loss": 0.5472, + "step": 2971 + }, + { + "epoch": 0.8241819190238492, + "grad_norm": 0.19432491064071655, + "learning_rate": 1.6224870334726315e-05, + "loss": 0.54, + "step": 2972 + }, + { + "epoch": 0.8244592346089851, + "grad_norm": 0.1818057745695114, + "learning_rate": 1.6219599183914038e-05, + "loss": 0.5594, + "step": 2973 + }, + { + "epoch": 0.824736550194121, + "grad_norm": 0.18658170104026794, + "learning_rate": 1.6214327307309417e-05, + "loss": 0.537, + "step": 2974 + }, + { + "epoch": 0.8250138657792568, + "grad_norm": 0.19775459170341492, + "learning_rate": 1.620905470594113e-05, + "loss": 0.5423, + "step": 2975 + }, + { + "epoch": 0.8252911813643927, + "grad_norm": 0.1877543330192566, + "learning_rate": 1.6203781380837997e-05, + "loss": 0.5337, + "step": 2976 + }, + { + "epoch": 0.8255684969495286, + "grad_norm": 0.1873265951871872, + "learning_rate": 1.619850733302899e-05, + "loss": 0.5153, + "step": 2977 + }, + { + "epoch": 0.8258458125346645, + "grad_norm": 0.1832212209701538, + "learning_rate": 1.619323256354321e-05, + "loss": 0.5291, + "step": 2978 + }, + { + "epoch": 0.8261231281198004, + "grad_norm": 0.1939922571182251, + "learning_rate": 1.6187957073409907e-05, + "loss": 0.5428, + "step": 2979 + }, + { + "epoch": 0.8264004437049363, + "grad_norm": 0.2593255639076233, + "learning_rate": 1.6182680863658468e-05, + "loss": 0.5448, + "step": 2980 + }, + { + "epoch": 0.8266777592900721, + "grad_norm": 0.18556508421897888, + "learning_rate": 1.6177403935318422e-05, + "loss": 0.529, + "step": 2981 + }, + { + "epoch": 0.826955074875208, + "grad_norm": 0.20643781125545502, + "learning_rate": 1.6172126289419437e-05, + "loss": 0.535, + "step": 2982 + }, + { + "epoch": 0.8272323904603439, + "grad_norm": 0.192849263548851, + "learning_rate": 1.6166847926991324e-05, + "loss": 0.5457, + "step": 2983 + }, + { + "epoch": 0.8275097060454798, + "grad_norm": 0.19120796024799347, + "learning_rate": 1.616156884906403e-05, + "loss": 0.542, + "step": 2984 + }, + { + "epoch": 0.8277870216306157, + "grad_norm": 0.20270869135856628, + "learning_rate": 1.615628905666764e-05, + "loss": 0.5621, + "step": 2985 + }, + { + "epoch": 0.8280643372157516, + "grad_norm": 0.20787782967090607, + "learning_rate": 1.6151008550832377e-05, + "loss": 0.5007, + "step": 2986 + }, + { + "epoch": 0.8283416528008875, + "grad_norm": 0.23962002992630005, + "learning_rate": 1.6145727332588626e-05, + "loss": 0.5351, + "step": 2987 + }, + { + "epoch": 0.8286189683860233, + "grad_norm": 0.19550423324108124, + "learning_rate": 1.614044540296687e-05, + "loss": 0.5599, + "step": 2988 + }, + { + "epoch": 0.8288962839711592, + "grad_norm": 0.20781289041042328, + "learning_rate": 1.6135162762997776e-05, + "loss": 0.5381, + "step": 2989 + }, + { + "epoch": 0.8291735995562951, + "grad_norm": 0.1908697932958603, + "learning_rate": 1.61298794137121e-05, + "loss": 0.5109, + "step": 2990 + }, + { + "epoch": 0.829450915141431, + "grad_norm": 0.19248178601264954, + "learning_rate": 1.6124595356140794e-05, + "loss": 0.5061, + "step": 2991 + }, + { + "epoch": 0.8297282307265669, + "grad_norm": 0.4288308918476105, + "learning_rate": 1.611931059131489e-05, + "loss": 0.5418, + "step": 2992 + }, + { + "epoch": 0.8300055463117028, + "grad_norm": 0.2553388178348541, + "learning_rate": 1.6114025120265604e-05, + "loss": 0.5308, + "step": 2993 + }, + { + "epoch": 0.8302828618968386, + "grad_norm": 0.19662028551101685, + "learning_rate": 1.6108738944024265e-05, + "loss": 0.5647, + "step": 2994 + }, + { + "epoch": 0.8305601774819745, + "grad_norm": 0.1986682265996933, + "learning_rate": 1.6103452063622343e-05, + "loss": 0.5594, + "step": 2995 + }, + { + "epoch": 0.8308374930671104, + "grad_norm": 0.18815158307552338, + "learning_rate": 1.6098164480091454e-05, + "loss": 0.5347, + "step": 2996 + }, + { + "epoch": 0.8311148086522463, + "grad_norm": 0.19016186892986298, + "learning_rate": 1.6092876194463343e-05, + "loss": 0.5022, + "step": 2997 + }, + { + "epoch": 0.8313921242373822, + "grad_norm": 0.19305314123630524, + "learning_rate": 1.6087587207769897e-05, + "loss": 0.5561, + "step": 2998 + }, + { + "epoch": 0.8316694398225181, + "grad_norm": 0.19363488256931305, + "learning_rate": 1.6082297521043134e-05, + "loss": 0.5704, + "step": 2999 + }, + { + "epoch": 0.831946755407654, + "grad_norm": 0.2677402198314667, + "learning_rate": 1.6077007135315212e-05, + "loss": 0.5709, + "step": 3000 + }, + { + "epoch": 0.8322240709927898, + "grad_norm": 0.18514001369476318, + "learning_rate": 1.6071716051618426e-05, + "loss": 0.5328, + "step": 3001 + }, + { + "epoch": 0.8325013865779257, + "grad_norm": 0.20008349418640137, + "learning_rate": 1.606642427098521e-05, + "loss": 0.5535, + "step": 3002 + }, + { + "epoch": 0.8327787021630616, + "grad_norm": 0.20995013415813446, + "learning_rate": 1.606113179444813e-05, + "loss": 0.5541, + "step": 3003 + }, + { + "epoch": 0.8330560177481975, + "grad_norm": 0.21254870295524597, + "learning_rate": 1.6055838623039886e-05, + "loss": 0.5221, + "step": 3004 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.1829461008310318, + "learning_rate": 1.6050544757793312e-05, + "loss": 0.5445, + "step": 3005 + }, + { + "epoch": 0.8336106489184693, + "grad_norm": 0.20135165750980377, + "learning_rate": 1.604525019974139e-05, + "loss": 0.5337, + "step": 3006 + }, + { + "epoch": 0.8338879645036051, + "grad_norm": 0.1872393935918808, + "learning_rate": 1.6039954949917218e-05, + "loss": 0.5506, + "step": 3007 + }, + { + "epoch": 0.834165280088741, + "grad_norm": 0.17982006072998047, + "learning_rate": 1.6034659009354055e-05, + "loss": 0.5252, + "step": 3008 + }, + { + "epoch": 0.8344425956738769, + "grad_norm": 0.19605563580989838, + "learning_rate": 1.6029362379085264e-05, + "loss": 0.527, + "step": 3009 + }, + { + "epoch": 0.8347199112590128, + "grad_norm": 0.20329341292381287, + "learning_rate": 1.602406506014437e-05, + "loss": 0.5729, + "step": 3010 + }, + { + "epoch": 0.8349972268441487, + "grad_norm": 0.18132315576076508, + "learning_rate": 1.6018767053565008e-05, + "loss": 0.5391, + "step": 3011 + }, + { + "epoch": 0.8352745424292846, + "grad_norm": 0.19777770340442657, + "learning_rate": 1.6013468360380966e-05, + "loss": 0.5366, + "step": 3012 + }, + { + "epoch": 0.8355518580144204, + "grad_norm": 0.19389207661151886, + "learning_rate": 1.6008168981626164e-05, + "loss": 0.5569, + "step": 3013 + }, + { + "epoch": 0.8358291735995563, + "grad_norm": 0.18683470785617828, + "learning_rate": 1.6002868918334647e-05, + "loss": 0.5496, + "step": 3014 + }, + { + "epoch": 0.8361064891846922, + "grad_norm": 0.18466855585575104, + "learning_rate": 1.5997568171540594e-05, + "loss": 0.5547, + "step": 3015 + }, + { + "epoch": 0.8363838047698281, + "grad_norm": 0.19307468831539154, + "learning_rate": 1.5992266742278322e-05, + "loss": 0.5518, + "step": 3016 + }, + { + "epoch": 0.836661120354964, + "grad_norm": 0.19225814938545227, + "learning_rate": 1.5986964631582287e-05, + "loss": 0.548, + "step": 3017 + }, + { + "epoch": 0.8369384359400999, + "grad_norm": 0.19282346963882446, + "learning_rate": 1.5981661840487063e-05, + "loss": 0.5594, + "step": 3018 + }, + { + "epoch": 0.8372157515252358, + "grad_norm": 0.18779483437538147, + "learning_rate": 1.5976358370027373e-05, + "loss": 0.5191, + "step": 3019 + }, + { + "epoch": 0.8374930671103716, + "grad_norm": 0.18765175342559814, + "learning_rate": 1.597105422123806e-05, + "loss": 0.5401, + "step": 3020 + }, + { + "epoch": 0.8377703826955075, + "grad_norm": 0.1844259798526764, + "learning_rate": 1.5965749395154107e-05, + "loss": 0.5459, + "step": 3021 + }, + { + "epoch": 0.8380476982806434, + "grad_norm": 0.18752428889274597, + "learning_rate": 1.5960443892810617e-05, + "loss": 0.5339, + "step": 3022 + }, + { + "epoch": 0.8383250138657793, + "grad_norm": 0.19530834257602692, + "learning_rate": 1.5955137715242847e-05, + "loss": 0.5479, + "step": 3023 + }, + { + "epoch": 0.8386023294509152, + "grad_norm": 0.1960788518190384, + "learning_rate": 1.5949830863486166e-05, + "loss": 0.5511, + "step": 3024 + }, + { + "epoch": 0.8388796450360511, + "grad_norm": 0.2669583261013031, + "learning_rate": 1.594452333857608e-05, + "loss": 0.5637, + "step": 3025 + }, + { + "epoch": 0.8391569606211869, + "grad_norm": 0.18217816948890686, + "learning_rate": 1.5939215141548224e-05, + "loss": 0.4905, + "step": 3026 + }, + { + "epoch": 0.8394342762063228, + "grad_norm": 0.19247345626354218, + "learning_rate": 1.5933906273438383e-05, + "loss": 0.5234, + "step": 3027 + }, + { + "epoch": 0.8397115917914587, + "grad_norm": 0.1867692619562149, + "learning_rate": 1.592859673528244e-05, + "loss": 0.5166, + "step": 3028 + }, + { + "epoch": 0.8399889073765946, + "grad_norm": 0.18724261224269867, + "learning_rate": 1.5923286528116446e-05, + "loss": 0.5524, + "step": 3029 + }, + { + "epoch": 0.8402662229617305, + "grad_norm": 0.18874408304691315, + "learning_rate": 1.5917975652976544e-05, + "loss": 0.5283, + "step": 3030 + }, + { + "epoch": 0.8405435385468664, + "grad_norm": 0.2242184728384018, + "learning_rate": 1.5912664110899038e-05, + "loss": 0.532, + "step": 3031 + }, + { + "epoch": 0.8408208541320022, + "grad_norm": 0.20637090504169464, + "learning_rate": 1.5907351902920346e-05, + "loss": 0.5619, + "step": 3032 + }, + { + "epoch": 0.8410981697171381, + "grad_norm": 0.20153935253620148, + "learning_rate": 1.590203903007702e-05, + "loss": 0.5704, + "step": 3033 + }, + { + "epoch": 0.841375485302274, + "grad_norm": 0.1838448941707611, + "learning_rate": 1.5896725493405746e-05, + "loss": 0.5533, + "step": 3034 + }, + { + "epoch": 0.8416528008874099, + "grad_norm": 0.187151700258255, + "learning_rate": 1.589141129394333e-05, + "loss": 0.5276, + "step": 3035 + }, + { + "epoch": 0.8419301164725458, + "grad_norm": 0.19188909232616425, + "learning_rate": 1.5886096432726723e-05, + "loss": 0.5588, + "step": 3036 + }, + { + "epoch": 0.8422074320576817, + "grad_norm": 0.18532563745975494, + "learning_rate": 1.5880780910792984e-05, + "loss": 0.5363, + "step": 3037 + }, + { + "epoch": 0.8424847476428176, + "grad_norm": 0.18388155102729797, + "learning_rate": 1.587546472917932e-05, + "loss": 0.5544, + "step": 3038 + }, + { + "epoch": 0.8427620632279534, + "grad_norm": 0.18780486285686493, + "learning_rate": 1.5870147888923054e-05, + "loss": 0.5501, + "step": 3039 + }, + { + "epoch": 0.8430393788130893, + "grad_norm": 0.18718671798706055, + "learning_rate": 1.5864830391061644e-05, + "loss": 0.5491, + "step": 3040 + }, + { + "epoch": 0.8433166943982252, + "grad_norm": 0.1841050237417221, + "learning_rate": 1.585951223663268e-05, + "loss": 0.539, + "step": 3041 + }, + { + "epoch": 0.8435940099833611, + "grad_norm": 0.17960785329341888, + "learning_rate": 1.5854193426673862e-05, + "loss": 0.5134, + "step": 3042 + }, + { + "epoch": 0.843871325568497, + "grad_norm": 0.1937013566493988, + "learning_rate": 1.5848873962223044e-05, + "loss": 0.5083, + "step": 3043 + }, + { + "epoch": 0.8441486411536329, + "grad_norm": 0.18566550314426422, + "learning_rate": 1.5843553844318193e-05, + "loss": 0.5569, + "step": 3044 + }, + { + "epoch": 0.8444259567387687, + "grad_norm": 0.1879187971353531, + "learning_rate": 1.5838233073997395e-05, + "loss": 0.5611, + "step": 3045 + }, + { + "epoch": 0.8447032723239046, + "grad_norm": 0.1857539415359497, + "learning_rate": 1.5832911652298882e-05, + "loss": 0.5464, + "step": 3046 + }, + { + "epoch": 0.8449805879090405, + "grad_norm": 0.1914960891008377, + "learning_rate": 1.5827589580261e-05, + "loss": 0.5547, + "step": 3047 + }, + { + "epoch": 0.8452579034941764, + "grad_norm": 0.18924832344055176, + "learning_rate": 1.582226685892223e-05, + "loss": 0.5739, + "step": 3048 + }, + { + "epoch": 0.8455352190793123, + "grad_norm": 0.18020758032798767, + "learning_rate": 1.5816943489321174e-05, + "loss": 0.5355, + "step": 3049 + }, + { + "epoch": 0.8458125346644482, + "grad_norm": 0.22242438793182373, + "learning_rate": 1.5811619472496562e-05, + "loss": 0.5551, + "step": 3050 + }, + { + "epoch": 0.846089850249584, + "grad_norm": 0.18375588953495026, + "learning_rate": 1.5806294809487248e-05, + "loss": 0.5393, + "step": 3051 + }, + { + "epoch": 0.8463671658347199, + "grad_norm": 0.20009155571460724, + "learning_rate": 1.5800969501332223e-05, + "loss": 0.5555, + "step": 3052 + }, + { + "epoch": 0.8466444814198558, + "grad_norm": 0.1867419183254242, + "learning_rate": 1.5795643549070588e-05, + "loss": 0.5541, + "step": 3053 + }, + { + "epoch": 0.8469217970049917, + "grad_norm": 0.20315201580524445, + "learning_rate": 1.5790316953741583e-05, + "loss": 0.5302, + "step": 3054 + }, + { + "epoch": 0.8471991125901276, + "grad_norm": 0.19646741449832916, + "learning_rate": 1.578498971638456e-05, + "loss": 0.5491, + "step": 3055 + }, + { + "epoch": 0.8474764281752635, + "grad_norm": 0.1929592341184616, + "learning_rate": 1.5779661838039013e-05, + "loss": 0.554, + "step": 3056 + }, + { + "epoch": 0.8477537437603994, + "grad_norm": 0.18516919016838074, + "learning_rate": 1.577433331974455e-05, + "loss": 0.5474, + "step": 3057 + }, + { + "epoch": 0.8480310593455352, + "grad_norm": 0.17926859855651855, + "learning_rate": 1.57690041625409e-05, + "loss": 0.541, + "step": 3058 + }, + { + "epoch": 0.8483083749306711, + "grad_norm": 0.19980867207050323, + "learning_rate": 1.576367436746793e-05, + "loss": 0.581, + "step": 3059 + }, + { + "epoch": 0.848585690515807, + "grad_norm": 0.20209050178527832, + "learning_rate": 1.575834393556562e-05, + "loss": 0.5749, + "step": 3060 + }, + { + "epoch": 0.8488630061009429, + "grad_norm": 0.19711333513259888, + "learning_rate": 1.575301286787408e-05, + "loss": 0.5537, + "step": 3061 + }, + { + "epoch": 0.8491403216860788, + "grad_norm": 0.185336172580719, + "learning_rate": 1.5747681165433544e-05, + "loss": 0.5378, + "step": 3062 + }, + { + "epoch": 0.8494176372712147, + "grad_norm": 0.2029864341020584, + "learning_rate": 1.5742348829284366e-05, + "loss": 0.5489, + "step": 3063 + }, + { + "epoch": 0.8496949528563505, + "grad_norm": 0.18520487844944, + "learning_rate": 1.5737015860467032e-05, + "loss": 0.5589, + "step": 3064 + }, + { + "epoch": 0.8499722684414864, + "grad_norm": 0.18894660472869873, + "learning_rate": 1.573168226002213e-05, + "loss": 0.5486, + "step": 3065 + }, + { + "epoch": 0.8502495840266223, + "grad_norm": 0.2093021273612976, + "learning_rate": 1.5726348028990404e-05, + "loss": 0.528, + "step": 3066 + }, + { + "epoch": 0.8505268996117582, + "grad_norm": 0.18530364334583282, + "learning_rate": 1.5721013168412698e-05, + "loss": 0.5244, + "step": 3067 + }, + { + "epoch": 0.8508042151968941, + "grad_norm": 0.18845012784004211, + "learning_rate": 1.5715677679329978e-05, + "loss": 0.5211, + "step": 3068 + }, + { + "epoch": 0.85108153078203, + "grad_norm": 0.20350565016269684, + "learning_rate": 1.571034156278335e-05, + "loss": 0.5711, + "step": 3069 + }, + { + "epoch": 0.8513588463671659, + "grad_norm": 0.18500900268554688, + "learning_rate": 1.5705004819814025e-05, + "loss": 0.5116, + "step": 3070 + }, + { + "epoch": 0.8516361619523017, + "grad_norm": 0.18262405693531036, + "learning_rate": 1.5699667451463344e-05, + "loss": 0.5485, + "step": 3071 + }, + { + "epoch": 0.8519134775374376, + "grad_norm": 0.19544097781181335, + "learning_rate": 1.5694329458772776e-05, + "loss": 0.5465, + "step": 3072 + }, + { + "epoch": 0.8521907931225735, + "grad_norm": 0.18556345999240875, + "learning_rate": 1.5688990842783892e-05, + "loss": 0.5409, + "step": 3073 + }, + { + "epoch": 0.8524681087077094, + "grad_norm": 0.23834922909736633, + "learning_rate": 1.5683651604538405e-05, + "loss": 0.5192, + "step": 3074 + }, + { + "epoch": 0.8527454242928453, + "grad_norm": 0.20461677014827728, + "learning_rate": 1.5678311745078138e-05, + "loss": 0.5674, + "step": 3075 + }, + { + "epoch": 0.8530227398779812, + "grad_norm": 0.19039872288703918, + "learning_rate": 1.5672971265445046e-05, + "loss": 0.5433, + "step": 3076 + }, + { + "epoch": 0.853300055463117, + "grad_norm": 0.19499389827251434, + "learning_rate": 1.566763016668119e-05, + "loss": 0.5577, + "step": 3077 + }, + { + "epoch": 0.8535773710482529, + "grad_norm": 0.185109481215477, + "learning_rate": 1.5662288449828767e-05, + "loss": 0.5192, + "step": 3078 + }, + { + "epoch": 0.8538546866333888, + "grad_norm": 0.2005089968442917, + "learning_rate": 1.5656946115930084e-05, + "loss": 0.5423, + "step": 3079 + }, + { + "epoch": 0.8541320022185247, + "grad_norm": 0.19989848136901855, + "learning_rate": 1.5651603166027574e-05, + "loss": 0.5307, + "step": 3080 + }, + { + "epoch": 0.8544093178036606, + "grad_norm": 0.18417179584503174, + "learning_rate": 1.5646259601163783e-05, + "loss": 0.5479, + "step": 3081 + }, + { + "epoch": 0.8546866333887965, + "grad_norm": 0.19350256025791168, + "learning_rate": 1.5640915422381387e-05, + "loss": 0.547, + "step": 3082 + }, + { + "epoch": 0.8549639489739324, + "grad_norm": 0.19599801301956177, + "learning_rate": 1.5635570630723173e-05, + "loss": 0.572, + "step": 3083 + }, + { + "epoch": 0.8552412645590682, + "grad_norm": 0.18424679338932037, + "learning_rate": 1.5630225227232055e-05, + "loss": 0.5421, + "step": 3084 + }, + { + "epoch": 0.8555185801442041, + "grad_norm": 0.18912889063358307, + "learning_rate": 1.562487921295106e-05, + "loss": 0.5503, + "step": 3085 + }, + { + "epoch": 0.85579589572934, + "grad_norm": 0.1971350610256195, + "learning_rate": 1.561953258892334e-05, + "loss": 0.5493, + "step": 3086 + }, + { + "epoch": 0.8560732113144759, + "grad_norm": 0.19339221715927124, + "learning_rate": 1.5614185356192156e-05, + "loss": 0.5249, + "step": 3087 + }, + { + "epoch": 0.8563505268996118, + "grad_norm": 0.22969551384449005, + "learning_rate": 1.5608837515800906e-05, + "loss": 0.5398, + "step": 3088 + }, + { + "epoch": 0.8566278424847477, + "grad_norm": 0.20485931634902954, + "learning_rate": 1.560348906879309e-05, + "loss": 0.5455, + "step": 3089 + }, + { + "epoch": 0.8569051580698835, + "grad_norm": 0.2660558521747589, + "learning_rate": 1.5598140016212328e-05, + "loss": 0.5636, + "step": 3090 + }, + { + "epoch": 0.8571824736550194, + "grad_norm": 0.19878970086574554, + "learning_rate": 1.559279035910237e-05, + "loss": 0.5513, + "step": 3091 + }, + { + "epoch": 0.8574597892401553, + "grad_norm": 0.19739742577075958, + "learning_rate": 1.5587440098507067e-05, + "loss": 0.5521, + "step": 3092 + }, + { + "epoch": 0.8577371048252912, + "grad_norm": 0.19214226305484772, + "learning_rate": 1.55820892354704e-05, + "loss": 0.5721, + "step": 3093 + }, + { + "epoch": 0.8580144204104271, + "grad_norm": 0.19705668091773987, + "learning_rate": 1.5576737771036464e-05, + "loss": 0.555, + "step": 3094 + }, + { + "epoch": 0.858291735995563, + "grad_norm": 0.17740927636623383, + "learning_rate": 1.557138570624948e-05, + "loss": 0.5478, + "step": 3095 + }, + { + "epoch": 0.8585690515806988, + "grad_norm": 0.19050319492816925, + "learning_rate": 1.556603304215376e-05, + "loss": 0.5511, + "step": 3096 + }, + { + "epoch": 0.8588463671658347, + "grad_norm": 0.1907372921705246, + "learning_rate": 1.556067977979377e-05, + "loss": 0.543, + "step": 3097 + }, + { + "epoch": 0.8591236827509706, + "grad_norm": 0.18947333097457886, + "learning_rate": 1.5555325920214055e-05, + "loss": 0.56, + "step": 3098 + }, + { + "epoch": 0.8594009983361065, + "grad_norm": 0.19369062781333923, + "learning_rate": 1.5549971464459308e-05, + "loss": 0.5506, + "step": 3099 + }, + { + "epoch": 0.8596783139212424, + "grad_norm": 0.18818899989128113, + "learning_rate": 1.554461641357432e-05, + "loss": 0.5434, + "step": 3100 + }, + { + "epoch": 0.8599556295063783, + "grad_norm": 0.19132985174655914, + "learning_rate": 1.5539260768604e-05, + "loss": 0.5378, + "step": 3101 + }, + { + "epoch": 0.8602329450915142, + "grad_norm": 0.20558297634124756, + "learning_rate": 1.5533904530593386e-05, + "loss": 0.5428, + "step": 3102 + }, + { + "epoch": 0.86051026067665, + "grad_norm": 0.19788722693920135, + "learning_rate": 1.5528547700587616e-05, + "loss": 0.5417, + "step": 3103 + }, + { + "epoch": 0.8607875762617859, + "grad_norm": 0.20902171730995178, + "learning_rate": 1.552319027963195e-05, + "loss": 0.5426, + "step": 3104 + }, + { + "epoch": 0.8610648918469218, + "grad_norm": 0.18526656925678253, + "learning_rate": 1.5517832268771764e-05, + "loss": 0.5342, + "step": 3105 + }, + { + "epoch": 0.8613422074320577, + "grad_norm": 0.1860983669757843, + "learning_rate": 1.551247366905254e-05, + "loss": 0.5466, + "step": 3106 + }, + { + "epoch": 0.8616195230171936, + "grad_norm": 0.18875092267990112, + "learning_rate": 1.5507114481519895e-05, + "loss": 0.5417, + "step": 3107 + }, + { + "epoch": 0.8618968386023295, + "grad_norm": 0.1913326233625412, + "learning_rate": 1.5501754707219536e-05, + "loss": 0.5589, + "step": 3108 + }, + { + "epoch": 0.8621741541874653, + "grad_norm": 0.19133377075195312, + "learning_rate": 1.549639434719731e-05, + "loss": 0.5345, + "step": 3109 + }, + { + "epoch": 0.8624514697726012, + "grad_norm": 0.18826799094676971, + "learning_rate": 1.549103340249916e-05, + "loss": 0.527, + "step": 3110 + }, + { + "epoch": 0.8627287853577371, + "grad_norm": 0.18173575401306152, + "learning_rate": 1.548567187417114e-05, + "loss": 0.5431, + "step": 3111 + }, + { + "epoch": 0.863006100942873, + "grad_norm": 0.19065049290657043, + "learning_rate": 1.548030976325944e-05, + "loss": 0.5147, + "step": 3112 + }, + { + "epoch": 0.8632834165280089, + "grad_norm": 0.2025771290063858, + "learning_rate": 1.547494707081034e-05, + "loss": 0.5635, + "step": 3113 + }, + { + "epoch": 0.8635607321131448, + "grad_norm": 0.18411415815353394, + "learning_rate": 1.546958379787025e-05, + "loss": 0.5246, + "step": 3114 + }, + { + "epoch": 0.8638380476982807, + "grad_norm": 0.19902820885181427, + "learning_rate": 1.546421994548568e-05, + "loss": 0.5564, + "step": 3115 + }, + { + "epoch": 0.8641153632834165, + "grad_norm": 0.18202657997608185, + "learning_rate": 1.5458855514703266e-05, + "loss": 0.5364, + "step": 3116 + }, + { + "epoch": 0.8643926788685524, + "grad_norm": 0.19231395423412323, + "learning_rate": 1.545349050656974e-05, + "loss": 0.5521, + "step": 3117 + }, + { + "epoch": 0.8646699944536883, + "grad_norm": 0.18620963394641876, + "learning_rate": 1.5448124922131974e-05, + "loss": 0.5308, + "step": 3118 + }, + { + "epoch": 0.8649473100388242, + "grad_norm": 0.18969541788101196, + "learning_rate": 1.5442758762436923e-05, + "loss": 0.5446, + "step": 3119 + }, + { + "epoch": 0.8652246256239601, + "grad_norm": 0.19268915057182312, + "learning_rate": 1.543739202853167e-05, + "loss": 0.531, + "step": 3120 + }, + { + "epoch": 0.865501941209096, + "grad_norm": 0.4672553837299347, + "learning_rate": 1.5432024721463413e-05, + "loss": 0.5564, + "step": 3121 + }, + { + "epoch": 0.8657792567942318, + "grad_norm": 0.19262655079364777, + "learning_rate": 1.5426656842279445e-05, + "loss": 0.558, + "step": 3122 + }, + { + "epoch": 0.8660565723793677, + "grad_norm": 0.20529362559318542, + "learning_rate": 1.5421288392027185e-05, + "loss": 0.5247, + "step": 3123 + }, + { + "epoch": 0.8663338879645036, + "grad_norm": 0.19508616626262665, + "learning_rate": 1.5415919371754166e-05, + "loss": 0.5307, + "step": 3124 + }, + { + "epoch": 0.8666112035496395, + "grad_norm": 0.18688172101974487, + "learning_rate": 1.541054978250802e-05, + "loss": 0.5422, + "step": 3125 + }, + { + "epoch": 0.8668885191347754, + "grad_norm": 0.19961917400360107, + "learning_rate": 1.5405179625336495e-05, + "loss": 0.5598, + "step": 3126 + }, + { + "epoch": 0.8671658347199113, + "grad_norm": 0.18617486953735352, + "learning_rate": 1.5399808901287457e-05, + "loss": 0.5221, + "step": 3127 + }, + { + "epoch": 0.8674431503050472, + "grad_norm": 0.1895580142736435, + "learning_rate": 1.5394437611408873e-05, + "loss": 0.5515, + "step": 3128 + }, + { + "epoch": 0.867720465890183, + "grad_norm": 0.19370147585868835, + "learning_rate": 1.5389065756748826e-05, + "loss": 0.5312, + "step": 3129 + }, + { + "epoch": 0.8679977814753189, + "grad_norm": 0.19625920057296753, + "learning_rate": 1.5383693338355504e-05, + "loss": 0.526, + "step": 3130 + }, + { + "epoch": 0.8682750970604548, + "grad_norm": 0.1878737509250641, + "learning_rate": 1.537832035727721e-05, + "loss": 0.5229, + "step": 3131 + }, + { + "epoch": 0.8685524126455907, + "grad_norm": 0.19030463695526123, + "learning_rate": 1.537294681456235e-05, + "loss": 0.5197, + "step": 3132 + }, + { + "epoch": 0.8688297282307266, + "grad_norm": 0.19420188665390015, + "learning_rate": 1.536757271125946e-05, + "loss": 0.5369, + "step": 3133 + }, + { + "epoch": 0.8691070438158625, + "grad_norm": 0.2041894942522049, + "learning_rate": 1.5362198048417147e-05, + "loss": 0.5315, + "step": 3134 + }, + { + "epoch": 0.8693843594009983, + "grad_norm": 0.1823713183403015, + "learning_rate": 1.535682282708417e-05, + "loss": 0.5358, + "step": 3135 + }, + { + "epoch": 0.8696616749861342, + "grad_norm": 0.18803556263446808, + "learning_rate": 1.5351447048309367e-05, + "loss": 0.5234, + "step": 3136 + }, + { + "epoch": 0.8699389905712701, + "grad_norm": 0.20315398275852203, + "learning_rate": 1.5346070713141697e-05, + "loss": 0.5523, + "step": 3137 + }, + { + "epoch": 0.870216306156406, + "grad_norm": 0.18089507520198822, + "learning_rate": 1.5340693822630224e-05, + "loss": 0.5425, + "step": 3138 + }, + { + "epoch": 0.8704936217415419, + "grad_norm": 0.19417926669120789, + "learning_rate": 1.5335316377824127e-05, + "loss": 0.5329, + "step": 3139 + }, + { + "epoch": 0.8707709373266778, + "grad_norm": 0.19868281483650208, + "learning_rate": 1.5329938379772685e-05, + "loss": 0.5564, + "step": 3140 + }, + { + "epoch": 0.8710482529118136, + "grad_norm": 0.186373770236969, + "learning_rate": 1.5324559829525285e-05, + "loss": 0.5498, + "step": 3141 + }, + { + "epoch": 0.8713255684969495, + "grad_norm": 0.21897022426128387, + "learning_rate": 1.531918072813143e-05, + "loss": 0.5508, + "step": 3142 + }, + { + "epoch": 0.8716028840820854, + "grad_norm": 0.19098646938800812, + "learning_rate": 1.5313801076640715e-05, + "loss": 0.5481, + "step": 3143 + }, + { + "epoch": 0.8718801996672213, + "grad_norm": 0.1954183280467987, + "learning_rate": 1.5308420876102863e-05, + "loss": 0.5531, + "step": 3144 + }, + { + "epoch": 0.8721575152523572, + "grad_norm": 0.18958479166030884, + "learning_rate": 1.5303040127567694e-05, + "loss": 0.5437, + "step": 3145 + }, + { + "epoch": 0.8724348308374931, + "grad_norm": 0.17530956864356995, + "learning_rate": 1.5297658832085126e-05, + "loss": 0.5216, + "step": 3146 + }, + { + "epoch": 0.872712146422629, + "grad_norm": 0.19810372591018677, + "learning_rate": 1.5292276990705202e-05, + "loss": 0.5623, + "step": 3147 + }, + { + "epoch": 0.8729894620077648, + "grad_norm": 0.19809319078922272, + "learning_rate": 1.5286894604478054e-05, + "loss": 0.5226, + "step": 3148 + }, + { + "epoch": 0.8732667775929007, + "grad_norm": 0.20191790163516998, + "learning_rate": 1.528151167445393e-05, + "loss": 0.5612, + "step": 3149 + }, + { + "epoch": 0.8735440931780366, + "grad_norm": 0.18988439440727234, + "learning_rate": 1.5276128201683187e-05, + "loss": 0.5362, + "step": 3150 + }, + { + "epoch": 0.8738214087631725, + "grad_norm": 0.181321382522583, + "learning_rate": 1.5270744187216277e-05, + "loss": 0.5082, + "step": 3151 + }, + { + "epoch": 0.8740987243483084, + "grad_norm": 0.18067540228366852, + "learning_rate": 1.526535963210377e-05, + "loss": 0.529, + "step": 3152 + }, + { + "epoch": 0.8743760399334443, + "grad_norm": 0.19349637627601624, + "learning_rate": 1.5259974537396325e-05, + "loss": 0.5495, + "step": 3153 + }, + { + "epoch": 0.8746533555185801, + "grad_norm": 0.17970281839370728, + "learning_rate": 1.5254588904144735e-05, + "loss": 0.529, + "step": 3154 + }, + { + "epoch": 0.874930671103716, + "grad_norm": 0.18023180961608887, + "learning_rate": 1.5249202733399859e-05, + "loss": 0.5638, + "step": 3155 + }, + { + "epoch": 0.8752079866888519, + "grad_norm": 0.20077262818813324, + "learning_rate": 1.5243816026212695e-05, + "loss": 0.5346, + "step": 3156 + }, + { + "epoch": 0.8754853022739878, + "grad_norm": 0.18828438222408295, + "learning_rate": 1.5238428783634326e-05, + "loss": 0.5721, + "step": 3157 + }, + { + "epoch": 0.8757626178591237, + "grad_norm": 0.19122296571731567, + "learning_rate": 1.5233041006715948e-05, + "loss": 0.5158, + "step": 3158 + }, + { + "epoch": 0.8760399334442596, + "grad_norm": 0.19371193647384644, + "learning_rate": 1.5227652696508859e-05, + "loss": 0.5758, + "step": 3159 + }, + { + "epoch": 0.8763172490293955, + "grad_norm": 0.19302338361740112, + "learning_rate": 1.5222263854064465e-05, + "loss": 0.5643, + "step": 3160 + }, + { + "epoch": 0.8765945646145313, + "grad_norm": 0.28244075179100037, + "learning_rate": 1.5216874480434264e-05, + "loss": 0.5462, + "step": 3161 + }, + { + "epoch": 0.8768718801996672, + "grad_norm": 0.1919064074754715, + "learning_rate": 1.521148457666987e-05, + "loss": 0.5317, + "step": 3162 + }, + { + "epoch": 0.8771491957848031, + "grad_norm": 0.18293742835521698, + "learning_rate": 1.5206094143823e-05, + "loss": 0.514, + "step": 3163 + }, + { + "epoch": 0.877426511369939, + "grad_norm": 0.193914532661438, + "learning_rate": 1.520070318294546e-05, + "loss": 0.5488, + "step": 3164 + }, + { + "epoch": 0.8777038269550749, + "grad_norm": 0.1869155466556549, + "learning_rate": 1.5195311695089175e-05, + "loss": 0.5591, + "step": 3165 + }, + { + "epoch": 0.8779811425402108, + "grad_norm": 0.19056154787540436, + "learning_rate": 1.5189919681306173e-05, + "loss": 0.5584, + "step": 3166 + }, + { + "epoch": 0.8782584581253466, + "grad_norm": 0.1936255842447281, + "learning_rate": 1.5184527142648569e-05, + "loss": 0.5477, + "step": 3167 + }, + { + "epoch": 0.8785357737104825, + "grad_norm": 0.19368582963943481, + "learning_rate": 1.5179134080168595e-05, + "loss": 0.5568, + "step": 3168 + }, + { + "epoch": 0.8788130892956184, + "grad_norm": 0.19679602980613708, + "learning_rate": 1.517374049491858e-05, + "loss": 0.5549, + "step": 3169 + }, + { + "epoch": 0.8790904048807543, + "grad_norm": 0.20116961002349854, + "learning_rate": 1.5168346387950955e-05, + "loss": 0.5565, + "step": 3170 + }, + { + "epoch": 0.8793677204658902, + "grad_norm": 0.1948871910572052, + "learning_rate": 1.5162951760318256e-05, + "loss": 0.5502, + "step": 3171 + }, + { + "epoch": 0.8796450360510261, + "grad_norm": 0.19126254320144653, + "learning_rate": 1.515755661307311e-05, + "loss": 0.5574, + "step": 3172 + }, + { + "epoch": 0.879922351636162, + "grad_norm": 0.18868835270404816, + "learning_rate": 1.5152160947268262e-05, + "loss": 0.5377, + "step": 3173 + }, + { + "epoch": 0.8801996672212978, + "grad_norm": 0.20093926787376404, + "learning_rate": 1.5146764763956542e-05, + "loss": 0.5486, + "step": 3174 + }, + { + "epoch": 0.8804769828064337, + "grad_norm": 0.20768702030181885, + "learning_rate": 1.5141368064190897e-05, + "loss": 0.5431, + "step": 3175 + }, + { + "epoch": 0.8807542983915696, + "grad_norm": 0.19229131937026978, + "learning_rate": 1.5135970849024356e-05, + "loss": 0.5298, + "step": 3176 + }, + { + "epoch": 0.8810316139767055, + "grad_norm": 0.18817956745624542, + "learning_rate": 1.5130573119510064e-05, + "loss": 0.5414, + "step": 3177 + }, + { + "epoch": 0.8813089295618414, + "grad_norm": 0.19554218649864197, + "learning_rate": 1.5125174876701262e-05, + "loss": 0.5381, + "step": 3178 + }, + { + "epoch": 0.8815862451469773, + "grad_norm": 0.1836453080177307, + "learning_rate": 1.5119776121651288e-05, + "loss": 0.508, + "step": 3179 + }, + { + "epoch": 0.8818635607321131, + "grad_norm": 0.2512352764606476, + "learning_rate": 1.5114376855413586e-05, + "loss": 0.5146, + "step": 3180 + }, + { + "epoch": 0.882140876317249, + "grad_norm": 0.19116099178791046, + "learning_rate": 1.5108977079041692e-05, + "loss": 0.5332, + "step": 3181 + }, + { + "epoch": 0.8824181919023849, + "grad_norm": 0.19376236200332642, + "learning_rate": 1.5103576793589244e-05, + "loss": 0.5188, + "step": 3182 + }, + { + "epoch": 0.8826955074875208, + "grad_norm": 0.2322767972946167, + "learning_rate": 1.5098176000109984e-05, + "loss": 0.5468, + "step": 3183 + }, + { + "epoch": 0.8829728230726567, + "grad_norm": 0.19449107348918915, + "learning_rate": 1.5092774699657747e-05, + "loss": 0.5223, + "step": 3184 + }, + { + "epoch": 0.8832501386577926, + "grad_norm": 0.19483071565628052, + "learning_rate": 1.5087372893286475e-05, + "loss": 0.5526, + "step": 3185 + }, + { + "epoch": 0.8835274542429284, + "grad_norm": 0.18979991972446442, + "learning_rate": 1.5081970582050201e-05, + "loss": 0.5145, + "step": 3186 + }, + { + "epoch": 0.8838047698280643, + "grad_norm": 0.18646521866321564, + "learning_rate": 1.5076567767003056e-05, + "loss": 0.5308, + "step": 3187 + }, + { + "epoch": 0.8840820854132002, + "grad_norm": 0.1952681541442871, + "learning_rate": 1.5071164449199277e-05, + "loss": 0.5384, + "step": 3188 + }, + { + "epoch": 0.8843594009983361, + "grad_norm": 0.1863500475883484, + "learning_rate": 1.506576062969319e-05, + "loss": 0.5721, + "step": 3189 + }, + { + "epoch": 0.884636716583472, + "grad_norm": 0.19182690978050232, + "learning_rate": 1.5060356309539226e-05, + "loss": 0.5171, + "step": 3190 + }, + { + "epoch": 0.8849140321686079, + "grad_norm": 0.18404962122440338, + "learning_rate": 1.5054951489791908e-05, + "loss": 0.5362, + "step": 3191 + }, + { + "epoch": 0.8851913477537438, + "grad_norm": 0.19122952222824097, + "learning_rate": 1.5049546171505869e-05, + "loss": 0.5424, + "step": 3192 + }, + { + "epoch": 0.8854686633388796, + "grad_norm": 0.1863165944814682, + "learning_rate": 1.5044140355735816e-05, + "loss": 0.53, + "step": 3193 + }, + { + "epoch": 0.8857459789240155, + "grad_norm": 0.19876129925251007, + "learning_rate": 1.5038734043536582e-05, + "loss": 0.5484, + "step": 3194 + }, + { + "epoch": 0.8860232945091514, + "grad_norm": 0.18133728206157684, + "learning_rate": 1.5033327235963065e-05, + "loss": 0.5036, + "step": 3195 + }, + { + "epoch": 0.8863006100942873, + "grad_norm": 0.19382363557815552, + "learning_rate": 1.5027919934070291e-05, + "loss": 0.5558, + "step": 3196 + }, + { + "epoch": 0.8865779256794232, + "grad_norm": 0.2071426957845688, + "learning_rate": 1.5022512138913358e-05, + "loss": 0.539, + "step": 3197 + }, + { + "epoch": 0.8868552412645591, + "grad_norm": 0.19322386384010315, + "learning_rate": 1.5017103851547476e-05, + "loss": 0.551, + "step": 3198 + }, + { + "epoch": 0.8871325568496949, + "grad_norm": 0.18517249822616577, + "learning_rate": 1.5011695073027942e-05, + "loss": 0.5171, + "step": 3199 + }, + { + "epoch": 0.8874098724348308, + "grad_norm": 0.18865883350372314, + "learning_rate": 1.5006285804410156e-05, + "loss": 0.5206, + "step": 3200 + }, + { + "epoch": 0.8876871880199667, + "grad_norm": 0.19007782638072968, + "learning_rate": 1.5000876046749603e-05, + "loss": 0.5194, + "step": 3201 + }, + { + "epoch": 0.8879645036051026, + "grad_norm": 0.18567977845668793, + "learning_rate": 1.4995465801101877e-05, + "loss": 0.535, + "step": 3202 + }, + { + "epoch": 0.8882418191902385, + "grad_norm": 0.19124850630760193, + "learning_rate": 1.4990055068522654e-05, + "loss": 0.5304, + "step": 3203 + }, + { + "epoch": 0.8885191347753744, + "grad_norm": 0.19194400310516357, + "learning_rate": 1.4984643850067717e-05, + "loss": 0.549, + "step": 3204 + }, + { + "epoch": 0.8887964503605102, + "grad_norm": 0.18528962135314941, + "learning_rate": 1.4979232146792936e-05, + "loss": 0.5053, + "step": 3205 + }, + { + "epoch": 0.8890737659456461, + "grad_norm": 0.1908206045627594, + "learning_rate": 1.4973819959754273e-05, + "loss": 0.5427, + "step": 3206 + }, + { + "epoch": 0.889351081530782, + "grad_norm": 0.18456414341926575, + "learning_rate": 1.4968407290007796e-05, + "loss": 0.567, + "step": 3207 + }, + { + "epoch": 0.8896283971159179, + "grad_norm": 0.19162026047706604, + "learning_rate": 1.496299413860966e-05, + "loss": 0.5609, + "step": 3208 + }, + { + "epoch": 0.8899057127010538, + "grad_norm": 0.18303340673446655, + "learning_rate": 1.4957580506616109e-05, + "loss": 0.5281, + "step": 3209 + }, + { + "epoch": 0.8901830282861897, + "grad_norm": 0.19368955492973328, + "learning_rate": 1.4952166395083486e-05, + "loss": 0.5348, + "step": 3210 + }, + { + "epoch": 0.8904603438713256, + "grad_norm": 0.1844678521156311, + "learning_rate": 1.4946751805068238e-05, + "loss": 0.542, + "step": 3211 + }, + { + "epoch": 0.8907376594564614, + "grad_norm": 0.18950358033180237, + "learning_rate": 1.4941336737626879e-05, + "loss": 0.5558, + "step": 3212 + }, + { + "epoch": 0.8910149750415973, + "grad_norm": 0.19502434134483337, + "learning_rate": 1.4935921193816046e-05, + "loss": 0.5109, + "step": 3213 + }, + { + "epoch": 0.8912922906267332, + "grad_norm": 0.18396303057670593, + "learning_rate": 1.4930505174692447e-05, + "loss": 0.5152, + "step": 3214 + }, + { + "epoch": 0.8915696062118691, + "grad_norm": 0.20252270996570587, + "learning_rate": 1.4925088681312895e-05, + "loss": 0.5219, + "step": 3215 + }, + { + "epoch": 0.891846921797005, + "grad_norm": 0.18550090491771698, + "learning_rate": 1.4919671714734288e-05, + "loss": 0.5321, + "step": 3216 + }, + { + "epoch": 0.8921242373821409, + "grad_norm": 0.19548344612121582, + "learning_rate": 1.4914254276013622e-05, + "loss": 0.559, + "step": 3217 + }, + { + "epoch": 0.8924015529672767, + "grad_norm": 0.1747296005487442, + "learning_rate": 1.4908836366207985e-05, + "loss": 0.5218, + "step": 3218 + }, + { + "epoch": 0.8926788685524126, + "grad_norm": 0.18561501801013947, + "learning_rate": 1.4903417986374548e-05, + "loss": 0.5615, + "step": 3219 + }, + { + "epoch": 0.8929561841375485, + "grad_norm": 0.19387111067771912, + "learning_rate": 1.4897999137570586e-05, + "loss": 0.5437, + "step": 3220 + }, + { + "epoch": 0.8932334997226844, + "grad_norm": 0.19557684659957886, + "learning_rate": 1.4892579820853459e-05, + "loss": 0.5387, + "step": 3221 + }, + { + "epoch": 0.8935108153078203, + "grad_norm": 0.1842055469751358, + "learning_rate": 1.488716003728062e-05, + "loss": 0.5277, + "step": 3222 + }, + { + "epoch": 0.8937881308929562, + "grad_norm": 0.1835155189037323, + "learning_rate": 1.4881739787909607e-05, + "loss": 0.5342, + "step": 3223 + }, + { + "epoch": 0.894065446478092, + "grad_norm": 0.1914055347442627, + "learning_rate": 1.4876319073798061e-05, + "loss": 0.548, + "step": 3224 + }, + { + "epoch": 0.8943427620632279, + "grad_norm": 0.18737882375717163, + "learning_rate": 1.4870897896003705e-05, + "loss": 0.5281, + "step": 3225 + }, + { + "epoch": 0.8946200776483638, + "grad_norm": 0.20303812623023987, + "learning_rate": 1.4865476255584351e-05, + "loss": 0.5603, + "step": 3226 + }, + { + "epoch": 0.8948973932334997, + "grad_norm": 0.20020204782485962, + "learning_rate": 1.486005415359791e-05, + "loss": 0.5434, + "step": 3227 + }, + { + "epoch": 0.8951747088186356, + "grad_norm": 0.1958005726337433, + "learning_rate": 1.4854631591102374e-05, + "loss": 0.522, + "step": 3228 + }, + { + "epoch": 0.8954520244037715, + "grad_norm": 0.23392035067081451, + "learning_rate": 1.4849208569155829e-05, + "loss": 0.5574, + "step": 3229 + }, + { + "epoch": 0.8957293399889074, + "grad_norm": 0.1955118626356125, + "learning_rate": 1.4843785088816455e-05, + "loss": 0.5327, + "step": 3230 + }, + { + "epoch": 0.8960066555740432, + "grad_norm": 0.18367242813110352, + "learning_rate": 1.4838361151142511e-05, + "loss": 0.5166, + "step": 3231 + }, + { + "epoch": 0.8962839711591791, + "grad_norm": 0.19902175664901733, + "learning_rate": 1.4832936757192354e-05, + "loss": 0.5283, + "step": 3232 + }, + { + "epoch": 0.896561286744315, + "grad_norm": 0.19180850684642792, + "learning_rate": 1.4827511908024419e-05, + "loss": 0.5277, + "step": 3233 + }, + { + "epoch": 0.8968386023294509, + "grad_norm": 0.18553797900676727, + "learning_rate": 1.4822086604697253e-05, + "loss": 0.5325, + "step": 3234 + }, + { + "epoch": 0.8971159179145868, + "grad_norm": 0.1895224153995514, + "learning_rate": 1.4816660848269462e-05, + "loss": 0.552, + "step": 3235 + }, + { + "epoch": 0.8973932334997227, + "grad_norm": 0.18854071199893951, + "learning_rate": 1.4811234639799761e-05, + "loss": 0.5471, + "step": 3236 + }, + { + "epoch": 0.8976705490848585, + "grad_norm": 0.17953041195869446, + "learning_rate": 1.480580798034695e-05, + "loss": 0.5556, + "step": 3237 + }, + { + "epoch": 0.8979478646699944, + "grad_norm": 0.1898965984582901, + "learning_rate": 1.480038087096991e-05, + "loss": 0.5282, + "step": 3238 + }, + { + "epoch": 0.8982251802551303, + "grad_norm": 0.19018866121768951, + "learning_rate": 1.4794953312727613e-05, + "loss": 0.5301, + "step": 3239 + }, + { + "epoch": 0.8985024958402662, + "grad_norm": 0.1814391165971756, + "learning_rate": 1.4789525306679122e-05, + "loss": 0.5526, + "step": 3240 + }, + { + "epoch": 0.8987798114254021, + "grad_norm": 0.19814236462116241, + "learning_rate": 1.4784096853883586e-05, + "loss": 0.5741, + "step": 3241 + }, + { + "epoch": 0.899057127010538, + "grad_norm": 0.19339510798454285, + "learning_rate": 1.4778667955400233e-05, + "loss": 0.5375, + "step": 3242 + }, + { + "epoch": 0.8993344425956739, + "grad_norm": 0.29308080673217773, + "learning_rate": 1.4773238612288393e-05, + "loss": 0.5239, + "step": 3243 + }, + { + "epoch": 0.8996117581808097, + "grad_norm": 0.1853429526090622, + "learning_rate": 1.476780882560747e-05, + "loss": 0.5436, + "step": 3244 + }, + { + "epoch": 0.8998890737659456, + "grad_norm": 0.18454696238040924, + "learning_rate": 1.4762378596416961e-05, + "loss": 0.5032, + "step": 3245 + }, + { + "epoch": 0.9001663893510815, + "grad_norm": 0.19088363647460938, + "learning_rate": 1.4756947925776448e-05, + "loss": 0.5369, + "step": 3246 + }, + { + "epoch": 0.9004437049362174, + "grad_norm": 0.18263565003871918, + "learning_rate": 1.4751516814745598e-05, + "loss": 0.5313, + "step": 3247 + }, + { + "epoch": 0.9007210205213533, + "grad_norm": 0.1870230883359909, + "learning_rate": 1.4746085264384165e-05, + "loss": 0.551, + "step": 3248 + }, + { + "epoch": 0.9009983361064892, + "grad_norm": 0.18567577004432678, + "learning_rate": 1.4740653275751987e-05, + "loss": 0.5585, + "step": 3249 + }, + { + "epoch": 0.901275651691625, + "grad_norm": 0.20058301091194153, + "learning_rate": 1.4735220849908987e-05, + "loss": 0.5031, + "step": 3250 + }, + { + "epoch": 0.9015529672767609, + "grad_norm": 0.18694059550762177, + "learning_rate": 1.4729787987915186e-05, + "loss": 0.5334, + "step": 3251 + }, + { + "epoch": 0.9018302828618968, + "grad_norm": 0.18202227354049683, + "learning_rate": 1.4724354690830663e-05, + "loss": 0.5553, + "step": 3252 + }, + { + "epoch": 0.9021075984470327, + "grad_norm": 0.19465142488479614, + "learning_rate": 1.4718920959715616e-05, + "loss": 0.5115, + "step": 3253 + }, + { + "epoch": 0.9023849140321686, + "grad_norm": 0.18764632940292358, + "learning_rate": 1.4713486795630291e-05, + "loss": 0.5546, + "step": 3254 + }, + { + "epoch": 0.9026622296173045, + "grad_norm": 0.2006525844335556, + "learning_rate": 1.4708052199635053e-05, + "loss": 0.5239, + "step": 3255 + }, + { + "epoch": 0.9029395452024404, + "grad_norm": 0.18893550336360931, + "learning_rate": 1.4702617172790325e-05, + "loss": 0.5246, + "step": 3256 + }, + { + "epoch": 0.9032168607875762, + "grad_norm": 0.2028273344039917, + "learning_rate": 1.4697181716156633e-05, + "loss": 0.5548, + "step": 3257 + }, + { + "epoch": 0.9034941763727121, + "grad_norm": 0.18957914412021637, + "learning_rate": 1.4691745830794574e-05, + "loss": 0.5261, + "step": 3258 + }, + { + "epoch": 0.903771491957848, + "grad_norm": 0.19126209616661072, + "learning_rate": 1.4686309517764835e-05, + "loss": 0.5479, + "step": 3259 + }, + { + "epoch": 0.9040488075429839, + "grad_norm": 0.1924603134393692, + "learning_rate": 1.4680872778128183e-05, + "loss": 0.5456, + "step": 3260 + }, + { + "epoch": 0.9043261231281198, + "grad_norm": 0.1945699006319046, + "learning_rate": 1.4675435612945468e-05, + "loss": 0.5437, + "step": 3261 + }, + { + "epoch": 0.9046034387132557, + "grad_norm": 0.18470460176467896, + "learning_rate": 1.466999802327763e-05, + "loss": 0.5342, + "step": 3262 + }, + { + "epoch": 0.9048807542983915, + "grad_norm": 0.18286101520061493, + "learning_rate": 1.4664560010185685e-05, + "loss": 0.5007, + "step": 3263 + }, + { + "epoch": 0.9051580698835274, + "grad_norm": 0.20376376807689667, + "learning_rate": 1.4659121574730736e-05, + "loss": 0.5517, + "step": 3264 + }, + { + "epoch": 0.9054353854686633, + "grad_norm": 0.18289178609848022, + "learning_rate": 1.465368271797396e-05, + "loss": 0.5254, + "step": 3265 + }, + { + "epoch": 0.9057127010537992, + "grad_norm": 0.18297207355499268, + "learning_rate": 1.4648243440976625e-05, + "loss": 0.5287, + "step": 3266 + }, + { + "epoch": 0.9059900166389351, + "grad_norm": 0.19243334233760834, + "learning_rate": 1.464280374480008e-05, + "loss": 0.5408, + "step": 3267 + }, + { + "epoch": 0.906267332224071, + "grad_norm": 0.19214653968811035, + "learning_rate": 1.463736363050575e-05, + "loss": 0.5455, + "step": 3268 + }, + { + "epoch": 0.9065446478092068, + "grad_norm": 0.1994084268808365, + "learning_rate": 1.4631923099155143e-05, + "loss": 0.546, + "step": 3269 + }, + { + "epoch": 0.9068219633943427, + "grad_norm": 0.19682733714580536, + "learning_rate": 1.4626482151809865e-05, + "loss": 0.5031, + "step": 3270 + }, + { + "epoch": 0.9070992789794786, + "grad_norm": 0.3178056478500366, + "learning_rate": 1.462104078953157e-05, + "loss": 0.5183, + "step": 3271 + }, + { + "epoch": 0.9073765945646145, + "grad_norm": 0.1968078464269638, + "learning_rate": 1.4615599013382028e-05, + "loss": 0.5475, + "step": 3272 + }, + { + "epoch": 0.9076539101497504, + "grad_norm": 0.18579484522342682, + "learning_rate": 1.461015682442306e-05, + "loss": 0.5296, + "step": 3273 + }, + { + "epoch": 0.9079312257348863, + "grad_norm": 0.1962941437959671, + "learning_rate": 1.4604714223716595e-05, + "loss": 0.5565, + "step": 3274 + }, + { + "epoch": 0.9082085413200222, + "grad_norm": 0.19378679990768433, + "learning_rate": 1.4599271212324617e-05, + "loss": 0.5506, + "step": 3275 + }, + { + "epoch": 0.908485856905158, + "grad_norm": 0.22947950661182404, + "learning_rate": 1.4593827791309206e-05, + "loss": 0.5625, + "step": 3276 + }, + { + "epoch": 0.9087631724902939, + "grad_norm": 0.1949935257434845, + "learning_rate": 1.458838396173252e-05, + "loss": 0.5327, + "step": 3277 + }, + { + "epoch": 0.9090404880754298, + "grad_norm": 0.19015999138355255, + "learning_rate": 1.458293972465679e-05, + "loss": 0.5168, + "step": 3278 + }, + { + "epoch": 0.9093178036605657, + "grad_norm": 0.19812412559986115, + "learning_rate": 1.4577495081144337e-05, + "loss": 0.5389, + "step": 3279 + }, + { + "epoch": 0.9095951192457016, + "grad_norm": 0.19743192195892334, + "learning_rate": 1.4572050032257548e-05, + "loss": 0.5492, + "step": 3280 + }, + { + "epoch": 0.9098724348308375, + "grad_norm": 0.17543548345565796, + "learning_rate": 1.4566604579058904e-05, + "loss": 0.5064, + "step": 3281 + }, + { + "epoch": 0.9101497504159733, + "grad_norm": 0.20693279802799225, + "learning_rate": 1.4561158722610948e-05, + "loss": 0.5312, + "step": 3282 + }, + { + "epoch": 0.9104270660011092, + "grad_norm": 0.19201938807964325, + "learning_rate": 1.4555712463976318e-05, + "loss": 0.518, + "step": 3283 + }, + { + "epoch": 0.9107043815862451, + "grad_norm": 0.1977842152118683, + "learning_rate": 1.4550265804217722e-05, + "loss": 0.5263, + "step": 3284 + }, + { + "epoch": 0.910981697171381, + "grad_norm": 0.18007364869117737, + "learning_rate": 1.4544818744397947e-05, + "loss": 0.521, + "step": 3285 + }, + { + "epoch": 0.9112590127565169, + "grad_norm": 0.19621910154819489, + "learning_rate": 1.453937128557986e-05, + "loss": 0.5701, + "step": 3286 + }, + { + "epoch": 0.9115363283416528, + "grad_norm": 0.1956057995557785, + "learning_rate": 1.4533923428826399e-05, + "loss": 0.5542, + "step": 3287 + }, + { + "epoch": 0.9118136439267887, + "grad_norm": 0.20553717017173767, + "learning_rate": 1.452847517520059e-05, + "loss": 0.5379, + "step": 3288 + }, + { + "epoch": 0.9120909595119245, + "grad_norm": 0.19297057390213013, + "learning_rate": 1.4523026525765532e-05, + "loss": 0.5562, + "step": 3289 + }, + { + "epoch": 0.9123682750970604, + "grad_norm": 0.19896887242794037, + "learning_rate": 1.4517577481584399e-05, + "loss": 0.5502, + "step": 3290 + }, + { + "epoch": 0.9126455906821963, + "grad_norm": 0.18490025401115417, + "learning_rate": 1.4512128043720447e-05, + "loss": 0.5426, + "step": 3291 + }, + { + "epoch": 0.9129229062673322, + "grad_norm": 0.19716937839984894, + "learning_rate": 1.4506678213236998e-05, + "loss": 0.5433, + "step": 3292 + }, + { + "epoch": 0.9132002218524681, + "grad_norm": 0.19106687605381012, + "learning_rate": 1.4501227991197472e-05, + "loss": 0.5437, + "step": 3293 + }, + { + "epoch": 0.913477537437604, + "grad_norm": 0.18353629112243652, + "learning_rate": 1.4495777378665337e-05, + "loss": 0.5374, + "step": 3294 + }, + { + "epoch": 0.9137548530227398, + "grad_norm": 0.19178996980190277, + "learning_rate": 1.4490326376704161e-05, + "loss": 0.5471, + "step": 3295 + }, + { + "epoch": 0.9140321686078757, + "grad_norm": 0.18615961074829102, + "learning_rate": 1.4484874986377573e-05, + "loss": 0.5503, + "step": 3296 + }, + { + "epoch": 0.9143094841930116, + "grad_norm": 0.19387958943843842, + "learning_rate": 1.447942320874929e-05, + "loss": 0.5174, + "step": 3297 + }, + { + "epoch": 0.9145867997781475, + "grad_norm": 0.18303687870502472, + "learning_rate": 1.4473971044883095e-05, + "loss": 0.5335, + "step": 3298 + }, + { + "epoch": 0.9148641153632834, + "grad_norm": 0.9202612638473511, + "learning_rate": 1.4468518495842848e-05, + "loss": 0.547, + "step": 3299 + }, + { + "epoch": 0.9151414309484193, + "grad_norm": 0.1895207315683365, + "learning_rate": 1.446306556269249e-05, + "loss": 0.5561, + "step": 3300 + }, + { + "epoch": 0.9154187465335551, + "grad_norm": 0.18821243941783905, + "learning_rate": 1.4457612246496027e-05, + "loss": 0.5318, + "step": 3301 + }, + { + "epoch": 0.915696062118691, + "grad_norm": 0.1886141002178192, + "learning_rate": 1.4452158548317551e-05, + "loss": 0.5517, + "step": 3302 + }, + { + "epoch": 0.9159733777038269, + "grad_norm": 0.1940373033285141, + "learning_rate": 1.444670446922122e-05, + "loss": 0.5617, + "step": 3303 + }, + { + "epoch": 0.9162506932889628, + "grad_norm": 0.18342509865760803, + "learning_rate": 1.444125001027127e-05, + "loss": 0.5438, + "step": 3304 + }, + { + "epoch": 0.9165280088740987, + "grad_norm": 0.19425570964813232, + "learning_rate": 1.4435795172532014e-05, + "loss": 0.5413, + "step": 3305 + }, + { + "epoch": 0.9168053244592346, + "grad_norm": 0.19505858421325684, + "learning_rate": 1.4430339957067826e-05, + "loss": 0.5666, + "step": 3306 + }, + { + "epoch": 0.9170826400443705, + "grad_norm": 0.19822010397911072, + "learning_rate": 1.4424884364943172e-05, + "loss": 0.521, + "step": 3307 + }, + { + "epoch": 0.9173599556295063, + "grad_norm": 0.1916007101535797, + "learning_rate": 1.4419428397222582e-05, + "loss": 0.5649, + "step": 3308 + }, + { + "epoch": 0.9176372712146422, + "grad_norm": 0.19743779301643372, + "learning_rate": 1.441397205497065e-05, + "loss": 0.5337, + "step": 3309 + }, + { + "epoch": 0.9179145867997781, + "grad_norm": 0.19250613451004028, + "learning_rate": 1.4408515339252068e-05, + "loss": 0.5477, + "step": 3310 + }, + { + "epoch": 0.918191902384914, + "grad_norm": 0.18895046412944794, + "learning_rate": 1.4403058251131574e-05, + "loss": 0.5782, + "step": 3311 + }, + { + "epoch": 0.9184692179700499, + "grad_norm": 0.1895778477191925, + "learning_rate": 1.4397600791673999e-05, + "loss": 0.5344, + "step": 3312 + }, + { + "epoch": 0.9187465335551858, + "grad_norm": 0.1935378462076187, + "learning_rate": 1.4392142961944228e-05, + "loss": 0.5568, + "step": 3313 + }, + { + "epoch": 0.9190238491403216, + "grad_norm": 0.18773426115512848, + "learning_rate": 1.4386684763007235e-05, + "loss": 0.546, + "step": 3314 + }, + { + "epoch": 0.9193011647254575, + "grad_norm": 0.18107493221759796, + "learning_rate": 1.438122619592806e-05, + "loss": 0.5007, + "step": 3315 + }, + { + "epoch": 0.9195784803105934, + "grad_norm": 0.19908292591571808, + "learning_rate": 1.4375767261771814e-05, + "loss": 0.545, + "step": 3316 + }, + { + "epoch": 0.9198557958957293, + "grad_norm": 0.19010509550571442, + "learning_rate": 1.4370307961603673e-05, + "loss": 0.5593, + "step": 3317 + }, + { + "epoch": 0.9201331114808652, + "grad_norm": 0.19176799058914185, + "learning_rate": 1.4364848296488897e-05, + "loss": 0.5676, + "step": 3318 + }, + { + "epoch": 0.9204104270660011, + "grad_norm": 0.1841881275177002, + "learning_rate": 1.4359388267492812e-05, + "loss": 0.5361, + "step": 3319 + }, + { + "epoch": 0.920687742651137, + "grad_norm": 0.19731801748275757, + "learning_rate": 1.4353927875680808e-05, + "loss": 0.5213, + "step": 3320 + }, + { + "epoch": 0.9209650582362728, + "grad_norm": 0.18500731885433197, + "learning_rate": 1.4348467122118364e-05, + "loss": 0.5417, + "step": 3321 + }, + { + "epoch": 0.9212423738214087, + "grad_norm": 0.2047429382801056, + "learning_rate": 1.4343006007871004e-05, + "loss": 0.5333, + "step": 3322 + }, + { + "epoch": 0.9215196894065446, + "grad_norm": 0.2538173198699951, + "learning_rate": 1.4337544534004346e-05, + "loss": 0.5529, + "step": 3323 + }, + { + "epoch": 0.9217970049916805, + "grad_norm": 0.1965561956167221, + "learning_rate": 1.4332082701584063e-05, + "loss": 0.5432, + "step": 3324 + }, + { + "epoch": 0.9220743205768164, + "grad_norm": 0.1974562108516693, + "learning_rate": 1.4326620511675906e-05, + "loss": 0.5159, + "step": 3325 + }, + { + "epoch": 0.9223516361619523, + "grad_norm": 0.19231897592544556, + "learning_rate": 1.4321157965345688e-05, + "loss": 0.5674, + "step": 3326 + }, + { + "epoch": 0.9226289517470881, + "grad_norm": 0.19817808270454407, + "learning_rate": 1.4315695063659304e-05, + "loss": 0.5496, + "step": 3327 + }, + { + "epoch": 0.922906267332224, + "grad_norm": 0.1902833878993988, + "learning_rate": 1.4310231807682706e-05, + "loss": 0.5342, + "step": 3328 + }, + { + "epoch": 0.9231835829173599, + "grad_norm": 0.19699136912822723, + "learning_rate": 1.4304768198481923e-05, + "loss": 0.5417, + "step": 3329 + }, + { + "epoch": 0.9234608985024958, + "grad_norm": 0.19054579734802246, + "learning_rate": 1.4299304237123043e-05, + "loss": 0.5556, + "step": 3330 + }, + { + "epoch": 0.9237382140876317, + "grad_norm": 0.191939577460289, + "learning_rate": 1.4293839924672242e-05, + "loss": 0.5304, + "step": 3331 + }, + { + "epoch": 0.9240155296727676, + "grad_norm": 0.22973594069480896, + "learning_rate": 1.4288375262195739e-05, + "loss": 0.5337, + "step": 3332 + }, + { + "epoch": 0.9242928452579035, + "grad_norm": 0.1892794668674469, + "learning_rate": 1.4282910250759843e-05, + "loss": 0.5527, + "step": 3333 + }, + { + "epoch": 0.9245701608430393, + "grad_norm": 0.19066397845745087, + "learning_rate": 1.427744489143092e-05, + "loss": 0.5446, + "step": 3334 + }, + { + "epoch": 0.9248474764281752, + "grad_norm": 0.18730609118938446, + "learning_rate": 1.4271979185275402e-05, + "loss": 0.5425, + "step": 3335 + }, + { + "epoch": 0.9251247920133111, + "grad_norm": 0.18019746243953705, + "learning_rate": 1.4266513133359802e-05, + "loss": 0.5263, + "step": 3336 + }, + { + "epoch": 0.925402107598447, + "grad_norm": 0.19604218006134033, + "learning_rate": 1.4261046736750686e-05, + "loss": 0.5552, + "step": 3337 + }, + { + "epoch": 0.9256794231835829, + "grad_norm": 0.1902369260787964, + "learning_rate": 1.4255579996514693e-05, + "loss": 0.5531, + "step": 3338 + }, + { + "epoch": 0.9259567387687188, + "grad_norm": 0.19522936642169952, + "learning_rate": 1.4250112913718525e-05, + "loss": 0.545, + "step": 3339 + }, + { + "epoch": 0.9262340543538546, + "grad_norm": 0.21855325996875763, + "learning_rate": 1.4244645489428968e-05, + "loss": 0.5133, + "step": 3340 + }, + { + "epoch": 0.9265113699389905, + "grad_norm": 0.20022931694984436, + "learning_rate": 1.4239177724712843e-05, + "loss": 0.5392, + "step": 3341 + }, + { + "epoch": 0.9267886855241264, + "grad_norm": 0.19183357059955597, + "learning_rate": 1.423370962063707e-05, + "loss": 0.5338, + "step": 3342 + }, + { + "epoch": 0.9270660011092623, + "grad_norm": 0.18884657323360443, + "learning_rate": 1.4228241178268617e-05, + "loss": 0.5143, + "step": 3343 + }, + { + "epoch": 0.9273433166943982, + "grad_norm": 0.19082637131214142, + "learning_rate": 1.4222772398674522e-05, + "loss": 0.5281, + "step": 3344 + }, + { + "epoch": 0.9276206322795341, + "grad_norm": 0.20209652185440063, + "learning_rate": 1.4217303282921888e-05, + "loss": 0.526, + "step": 3345 + }, + { + "epoch": 0.92789794786467, + "grad_norm": 0.18856458365917206, + "learning_rate": 1.4211833832077881e-05, + "loss": 0.52, + "step": 3346 + }, + { + "epoch": 0.9281752634498058, + "grad_norm": 0.1941954344511032, + "learning_rate": 1.4206364047209742e-05, + "loss": 0.5401, + "step": 3347 + }, + { + "epoch": 0.9284525790349417, + "grad_norm": 0.18749526143074036, + "learning_rate": 1.4200893929384767e-05, + "loss": 0.5449, + "step": 3348 + }, + { + "epoch": 0.9287298946200776, + "grad_norm": 0.18534240126609802, + "learning_rate": 1.4195423479670319e-05, + "loss": 0.5039, + "step": 3349 + }, + { + "epoch": 0.9290072102052135, + "grad_norm": 0.18905483186244965, + "learning_rate": 1.4189952699133837e-05, + "loss": 0.5632, + "step": 3350 + }, + { + "epoch": 0.9292845257903494, + "grad_norm": 0.20442554354667664, + "learning_rate": 1.4184481588842805e-05, + "loss": 0.5339, + "step": 3351 + }, + { + "epoch": 0.9295618413754853, + "grad_norm": 0.1810925155878067, + "learning_rate": 1.4179010149864785e-05, + "loss": 0.512, + "step": 3352 + }, + { + "epoch": 0.9298391569606211, + "grad_norm": 0.2010018676519394, + "learning_rate": 1.4173538383267404e-05, + "loss": 0.5514, + "step": 3353 + }, + { + "epoch": 0.930116472545757, + "grad_norm": 0.18728989362716675, + "learning_rate": 1.4168066290118342e-05, + "loss": 0.5161, + "step": 3354 + }, + { + "epoch": 0.9303937881308929, + "grad_norm": 0.18747897446155548, + "learning_rate": 1.4162593871485352e-05, + "loss": 0.5549, + "step": 3355 + }, + { + "epoch": 0.9306711037160288, + "grad_norm": 0.20961935818195343, + "learning_rate": 1.415712112843625e-05, + "loss": 0.5422, + "step": 3356 + }, + { + "epoch": 0.9309484193011647, + "grad_norm": 0.1834210753440857, + "learning_rate": 1.415164806203891e-05, + "loss": 0.5074, + "step": 3357 + }, + { + "epoch": 0.9312257348863006, + "grad_norm": 0.1954893171787262, + "learning_rate": 1.4146174673361273e-05, + "loss": 0.5722, + "step": 3358 + }, + { + "epoch": 0.9315030504714364, + "grad_norm": 0.1870308816432953, + "learning_rate": 1.4140700963471346e-05, + "loss": 0.5381, + "step": 3359 + }, + { + "epoch": 0.9317803660565723, + "grad_norm": 0.19276247918605804, + "learning_rate": 1.4135226933437185e-05, + "loss": 0.5517, + "step": 3360 + }, + { + "epoch": 0.9320576816417082, + "grad_norm": 0.18969060480594635, + "learning_rate": 1.412975258432693e-05, + "loss": 0.5347, + "step": 3361 + }, + { + "epoch": 0.9323349972268441, + "grad_norm": 0.19431856274604797, + "learning_rate": 1.4124277917208765e-05, + "loss": 0.5077, + "step": 3362 + }, + { + "epoch": 0.93261231281198, + "grad_norm": 0.1918652504682541, + "learning_rate": 1.4118802933150943e-05, + "loss": 0.526, + "step": 3363 + }, + { + "epoch": 0.9328896283971159, + "grad_norm": 0.19903786480426788, + "learning_rate": 1.4113327633221782e-05, + "loss": 0.5312, + "step": 3364 + }, + { + "epoch": 0.9331669439822518, + "grad_norm": 0.21233876049518585, + "learning_rate": 1.4107852018489653e-05, + "loss": 0.5256, + "step": 3365 + }, + { + "epoch": 0.9334442595673876, + "grad_norm": 0.1878969967365265, + "learning_rate": 1.4102376090022997e-05, + "loss": 0.5439, + "step": 3366 + }, + { + "epoch": 0.9337215751525235, + "grad_norm": 0.194586381316185, + "learning_rate": 1.409689984889031e-05, + "loss": 0.5392, + "step": 3367 + }, + { + "epoch": 0.9339988907376594, + "grad_norm": 0.1912042200565338, + "learning_rate": 1.4091423296160152e-05, + "loss": 0.5269, + "step": 3368 + }, + { + "epoch": 0.9342762063227953, + "grad_norm": 0.18847908079624176, + "learning_rate": 1.4085946432901154e-05, + "loss": 0.5175, + "step": 3369 + }, + { + "epoch": 0.9345535219079312, + "grad_norm": 0.19912898540496826, + "learning_rate": 1.4080469260181977e-05, + "loss": 0.5336, + "step": 3370 + }, + { + "epoch": 0.9348308374930671, + "grad_norm": 0.1928989291191101, + "learning_rate": 1.4074991779071378e-05, + "loss": 0.5847, + "step": 3371 + }, + { + "epoch": 0.9351081530782029, + "grad_norm": 0.20281600952148438, + "learning_rate": 1.4069513990638156e-05, + "loss": 0.5772, + "step": 3372 + }, + { + "epoch": 0.9353854686633388, + "grad_norm": 0.21058966219425201, + "learning_rate": 1.4064035895951169e-05, + "loss": 0.5201, + "step": 3373 + }, + { + "epoch": 0.9356627842484747, + "grad_norm": 0.190606027841568, + "learning_rate": 1.4058557496079342e-05, + "loss": 0.5226, + "step": 3374 + }, + { + "epoch": 0.9359400998336106, + "grad_norm": 0.1862584948539734, + "learning_rate": 1.4053078792091654e-05, + "loss": 0.5275, + "step": 3375 + }, + { + "epoch": 0.9362174154187465, + "grad_norm": 0.1864033192396164, + "learning_rate": 1.4047599785057144e-05, + "loss": 0.515, + "step": 3376 + }, + { + "epoch": 0.9364947310038824, + "grad_norm": 0.21961943805217743, + "learning_rate": 1.4042120476044912e-05, + "loss": 0.5344, + "step": 3377 + }, + { + "epoch": 0.9367720465890182, + "grad_norm": 0.18101632595062256, + "learning_rate": 1.4036640866124123e-05, + "loss": 0.547, + "step": 3378 + }, + { + "epoch": 0.9370493621741541, + "grad_norm": 0.21359553933143616, + "learning_rate": 1.4031160956363982e-05, + "loss": 0.5392, + "step": 3379 + }, + { + "epoch": 0.93732667775929, + "grad_norm": 0.1979496330022812, + "learning_rate": 1.4025680747833775e-05, + "loss": 0.5188, + "step": 3380 + }, + { + "epoch": 0.937603993344426, + "grad_norm": 0.18914289772510529, + "learning_rate": 1.402020024160283e-05, + "loss": 0.5291, + "step": 3381 + }, + { + "epoch": 0.9378813089295619, + "grad_norm": 0.18374024331569672, + "learning_rate": 1.4014719438740543e-05, + "loss": 0.5368, + "step": 3382 + }, + { + "epoch": 0.9381586245146978, + "grad_norm": 0.201919287443161, + "learning_rate": 1.4009238340316358e-05, + "loss": 0.5429, + "step": 3383 + }, + { + "epoch": 0.9384359400998337, + "grad_norm": 0.19644081592559814, + "learning_rate": 1.400375694739979e-05, + "loss": 0.5151, + "step": 3384 + }, + { + "epoch": 0.9387132556849695, + "grad_norm": 0.18970987200737, + "learning_rate": 1.39982752610604e-05, + "loss": 0.5852, + "step": 3385 + }, + { + "epoch": 0.9389905712701054, + "grad_norm": 0.19986121356487274, + "learning_rate": 1.3992793282367808e-05, + "loss": 0.5209, + "step": 3386 + }, + { + "epoch": 0.9392678868552413, + "grad_norm": 0.19541729986667633, + "learning_rate": 1.3987311012391698e-05, + "loss": 0.5315, + "step": 3387 + }, + { + "epoch": 0.9395452024403772, + "grad_norm": 0.1904657930135727, + "learning_rate": 1.3981828452201804e-05, + "loss": 0.5483, + "step": 3388 + }, + { + "epoch": 0.9398225180255131, + "grad_norm": 0.19447685778141022, + "learning_rate": 1.3976345602867916e-05, + "loss": 0.5446, + "step": 3389 + }, + { + "epoch": 0.940099833610649, + "grad_norm": 0.1892615556716919, + "learning_rate": 1.3970862465459891e-05, + "loss": 0.5366, + "step": 3390 + }, + { + "epoch": 0.9403771491957849, + "grad_norm": 0.1949482262134552, + "learning_rate": 1.3965379041047624e-05, + "loss": 0.5577, + "step": 3391 + }, + { + "epoch": 0.9406544647809207, + "grad_norm": 0.19417253136634827, + "learning_rate": 1.3959895330701083e-05, + "loss": 0.5431, + "step": 3392 + }, + { + "epoch": 0.9409317803660566, + "grad_norm": 0.19106963276863098, + "learning_rate": 1.3954411335490284e-05, + "loss": 0.5308, + "step": 3393 + }, + { + "epoch": 0.9412090959511925, + "grad_norm": 0.19582101702690125, + "learning_rate": 1.39489270564853e-05, + "loss": 0.5221, + "step": 3394 + }, + { + "epoch": 0.9414864115363284, + "grad_norm": 0.1944214105606079, + "learning_rate": 1.3943442494756259e-05, + "loss": 0.5428, + "step": 3395 + }, + { + "epoch": 0.9417637271214643, + "grad_norm": 0.19008556008338928, + "learning_rate": 1.3937957651373342e-05, + "loss": 0.5458, + "step": 3396 + }, + { + "epoch": 0.9420410427066002, + "grad_norm": 0.186477929353714, + "learning_rate": 1.3932472527406792e-05, + "loss": 0.5315, + "step": 3397 + }, + { + "epoch": 0.942318358291736, + "grad_norm": 0.20386064052581787, + "learning_rate": 1.3926987123926897e-05, + "loss": 0.5309, + "step": 3398 + }, + { + "epoch": 0.9425956738768719, + "grad_norm": 0.18348954617977142, + "learning_rate": 1.3921501442004011e-05, + "loss": 0.5333, + "step": 3399 + }, + { + "epoch": 0.9428729894620078, + "grad_norm": 0.19923634827136993, + "learning_rate": 1.3916015482708528e-05, + "loss": 0.5301, + "step": 3400 + }, + { + "epoch": 0.9431503050471437, + "grad_norm": 0.19134169816970825, + "learning_rate": 1.3910529247110906e-05, + "loss": 0.519, + "step": 3401 + }, + { + "epoch": 0.9434276206322796, + "grad_norm": 0.18783889710903168, + "learning_rate": 1.390504273628166e-05, + "loss": 0.5207, + "step": 3402 + }, + { + "epoch": 0.9437049362174155, + "grad_norm": 0.18133123219013214, + "learning_rate": 1.3899555951291348e-05, + "loss": 0.5244, + "step": 3403 + }, + { + "epoch": 0.9439822518025514, + "grad_norm": 0.17967382073402405, + "learning_rate": 1.3894068893210594e-05, + "loss": 0.558, + "step": 3404 + }, + { + "epoch": 0.9442595673876872, + "grad_norm": 0.1942291408777237, + "learning_rate": 1.3888581563110059e-05, + "loss": 0.5307, + "step": 3405 + }, + { + "epoch": 0.9445368829728231, + "grad_norm": 0.22371730208396912, + "learning_rate": 1.3883093962060472e-05, + "loss": 0.5386, + "step": 3406 + }, + { + "epoch": 0.944814198557959, + "grad_norm": 0.18771930038928986, + "learning_rate": 1.387760609113261e-05, + "loss": 0.5267, + "step": 3407 + }, + { + "epoch": 0.9450915141430949, + "grad_norm": 0.19484716653823853, + "learning_rate": 1.3872117951397298e-05, + "loss": 0.5578, + "step": 3408 + }, + { + "epoch": 0.9453688297282308, + "grad_norm": 0.190118208527565, + "learning_rate": 1.3866629543925424e-05, + "loss": 0.4886, + "step": 3409 + }, + { + "epoch": 0.9456461453133667, + "grad_norm": 0.18805035948753357, + "learning_rate": 1.3861140869787914e-05, + "loss": 0.5393, + "step": 3410 + }, + { + "epoch": 0.9459234608985025, + "grad_norm": 0.1881994605064392, + "learning_rate": 1.385565193005576e-05, + "loss": 0.5368, + "step": 3411 + }, + { + "epoch": 0.9462007764836384, + "grad_norm": 0.18705572187900543, + "learning_rate": 1.3850162725799997e-05, + "loss": 0.5706, + "step": 3412 + }, + { + "epoch": 0.9464780920687743, + "grad_norm": 0.23857946693897247, + "learning_rate": 1.3844673258091714e-05, + "loss": 0.5151, + "step": 3413 + }, + { + "epoch": 0.9467554076539102, + "grad_norm": 0.190442755818367, + "learning_rate": 1.383918352800205e-05, + "loss": 0.5287, + "step": 3414 + }, + { + "epoch": 0.9470327232390461, + "grad_norm": 0.19661828875541687, + "learning_rate": 1.38336935366022e-05, + "loss": 0.5474, + "step": 3415 + }, + { + "epoch": 0.947310038824182, + "grad_norm": 0.1797836571931839, + "learning_rate": 1.3828203284963409e-05, + "loss": 0.5416, + "step": 3416 + }, + { + "epoch": 0.9475873544093179, + "grad_norm": 0.23815502226352692, + "learning_rate": 1.382271277415696e-05, + "loss": 0.5501, + "step": 3417 + }, + { + "epoch": 0.9478646699944537, + "grad_norm": 0.19180168211460114, + "learning_rate": 1.381722200525421e-05, + "loss": 0.5254, + "step": 3418 + }, + { + "epoch": 0.9481419855795896, + "grad_norm": 0.2042774260044098, + "learning_rate": 1.3811730979326545e-05, + "loss": 0.5437, + "step": 3419 + }, + { + "epoch": 0.9484193011647255, + "grad_norm": 0.1914202868938446, + "learning_rate": 1.3806239697445414e-05, + "loss": 0.5602, + "step": 3420 + }, + { + "epoch": 0.9486966167498614, + "grad_norm": 0.2009340077638626, + "learning_rate": 1.3800748160682309e-05, + "loss": 0.5497, + "step": 3421 + }, + { + "epoch": 0.9489739323349973, + "grad_norm": 0.19518032670021057, + "learning_rate": 1.3795256370108776e-05, + "loss": 0.5149, + "step": 3422 + }, + { + "epoch": 0.9492512479201332, + "grad_norm": 0.19419553875923157, + "learning_rate": 1.3789764326796407e-05, + "loss": 0.5407, + "step": 3423 + }, + { + "epoch": 0.949528563505269, + "grad_norm": 0.19669625163078308, + "learning_rate": 1.3784272031816844e-05, + "loss": 0.537, + "step": 3424 + }, + { + "epoch": 0.9498058790904049, + "grad_norm": 0.1969965547323227, + "learning_rate": 1.3778779486241786e-05, + "loss": 0.5324, + "step": 3425 + }, + { + "epoch": 0.9500831946755408, + "grad_norm": 0.1851070374250412, + "learning_rate": 1.3773286691142966e-05, + "loss": 0.5533, + "step": 3426 + }, + { + "epoch": 0.9503605102606767, + "grad_norm": 0.18586984276771545, + "learning_rate": 1.3767793647592175e-05, + "loss": 0.5347, + "step": 3427 + }, + { + "epoch": 0.9506378258458126, + "grad_norm": 0.1796058714389801, + "learning_rate": 1.3762300356661261e-05, + "loss": 0.5223, + "step": 3428 + }, + { + "epoch": 0.9509151414309485, + "grad_norm": 0.1968797892332077, + "learning_rate": 1.3756806819422097e-05, + "loss": 0.5296, + "step": 3429 + }, + { + "epoch": 0.9511924570160843, + "grad_norm": 0.19946354627609253, + "learning_rate": 1.3751313036946627e-05, + "loss": 0.555, + "step": 3430 + }, + { + "epoch": 0.9514697726012202, + "grad_norm": 0.19412177801132202, + "learning_rate": 1.3745819010306832e-05, + "loss": 0.5367, + "step": 3431 + }, + { + "epoch": 0.9517470881863561, + "grad_norm": 0.1967850923538208, + "learning_rate": 1.374032474057474e-05, + "loss": 0.5355, + "step": 3432 + }, + { + "epoch": 0.952024403771492, + "grad_norm": 0.26605215668678284, + "learning_rate": 1.3734830228822428e-05, + "loss": 0.5161, + "step": 3433 + }, + { + "epoch": 0.9523017193566279, + "grad_norm": 0.21650773286819458, + "learning_rate": 1.372933547612202e-05, + "loss": 0.5398, + "step": 3434 + }, + { + "epoch": 0.9525790349417638, + "grad_norm": 0.20562243461608887, + "learning_rate": 1.3723840483545697e-05, + "loss": 0.5509, + "step": 3435 + }, + { + "epoch": 0.9528563505268997, + "grad_norm": 0.19407188892364502, + "learning_rate": 1.3718345252165663e-05, + "loss": 0.5373, + "step": 3436 + }, + { + "epoch": 0.9531336661120355, + "grad_norm": 0.19095157086849213, + "learning_rate": 1.3712849783054197e-05, + "loss": 0.54, + "step": 3437 + }, + { + "epoch": 0.9534109816971714, + "grad_norm": 0.1831715852022171, + "learning_rate": 1.3707354077283599e-05, + "loss": 0.5451, + "step": 3438 + }, + { + "epoch": 0.9536882972823073, + "grad_norm": 0.19795599579811096, + "learning_rate": 1.3701858135926238e-05, + "loss": 0.5329, + "step": 3439 + }, + { + "epoch": 0.9539656128674432, + "grad_norm": 0.19685760140419006, + "learning_rate": 1.3696361960054506e-05, + "loss": 0.5525, + "step": 3440 + }, + { + "epoch": 0.9542429284525791, + "grad_norm": 0.1895620971918106, + "learning_rate": 1.3690865550740864e-05, + "loss": 0.539, + "step": 3441 + }, + { + "epoch": 0.954520244037715, + "grad_norm": 0.1906086504459381, + "learning_rate": 1.3685368909057799e-05, + "loss": 0.5449, + "step": 3442 + }, + { + "epoch": 0.9547975596228508, + "grad_norm": 0.19026583433151245, + "learning_rate": 1.3679872036077853e-05, + "loss": 0.5303, + "step": 3443 + }, + { + "epoch": 0.9550748752079867, + "grad_norm": 0.1858745813369751, + "learning_rate": 1.3674374932873615e-05, + "loss": 0.5364, + "step": 3444 + }, + { + "epoch": 0.9553521907931226, + "grad_norm": 0.19552922248840332, + "learning_rate": 1.3668877600517712e-05, + "loss": 0.5367, + "step": 3445 + }, + { + "epoch": 0.9556295063782585, + "grad_norm": 0.1855076402425766, + "learning_rate": 1.3663380040082821e-05, + "loss": 0.532, + "step": 3446 + }, + { + "epoch": 0.9559068219633944, + "grad_norm": 0.19864481687545776, + "learning_rate": 1.365788225264166e-05, + "loss": 0.5252, + "step": 3447 + }, + { + "epoch": 0.9561841375485303, + "grad_norm": 0.18544144928455353, + "learning_rate": 1.3652384239266993e-05, + "loss": 0.5359, + "step": 3448 + }, + { + "epoch": 0.9564614531336662, + "grad_norm": 0.18177370727062225, + "learning_rate": 1.364688600103163e-05, + "loss": 0.5516, + "step": 3449 + }, + { + "epoch": 0.956738768718802, + "grad_norm": 0.2049761265516281, + "learning_rate": 1.3641387539008424e-05, + "loss": 0.5446, + "step": 3450 + }, + { + "epoch": 0.9570160843039379, + "grad_norm": 0.18312260508537292, + "learning_rate": 1.3635888854270268e-05, + "loss": 0.5292, + "step": 3451 + }, + { + "epoch": 0.9572933998890738, + "grad_norm": 0.18809424340724945, + "learning_rate": 1.36303899478901e-05, + "loss": 0.5296, + "step": 3452 + }, + { + "epoch": 0.9575707154742097, + "grad_norm": 0.183831587433815, + "learning_rate": 1.3624890820940902e-05, + "loss": 0.5213, + "step": 3453 + }, + { + "epoch": 0.9578480310593456, + "grad_norm": 0.18347223103046417, + "learning_rate": 1.3619391474495708e-05, + "loss": 0.5516, + "step": 3454 + }, + { + "epoch": 0.9581253466444815, + "grad_norm": 0.1897597759962082, + "learning_rate": 1.3613891909627575e-05, + "loss": 0.5539, + "step": 3455 + }, + { + "epoch": 0.9584026622296173, + "grad_norm": 0.19610610604286194, + "learning_rate": 1.360839212740962e-05, + "loss": 0.5581, + "step": 3456 + }, + { + "epoch": 0.9586799778147532, + "grad_norm": 0.20589366555213928, + "learning_rate": 1.3602892128914992e-05, + "loss": 0.5266, + "step": 3457 + }, + { + "epoch": 0.9589572933998891, + "grad_norm": 0.18203216791152954, + "learning_rate": 1.3597391915216896e-05, + "loss": 0.5222, + "step": 3458 + }, + { + "epoch": 0.959234608985025, + "grad_norm": 0.19360937178134918, + "learning_rate": 1.3591891487388553e-05, + "loss": 0.5271, + "step": 3459 + }, + { + "epoch": 0.9595119245701609, + "grad_norm": 0.2998809814453125, + "learning_rate": 1.3586390846503259e-05, + "loss": 0.5281, + "step": 3460 + }, + { + "epoch": 0.9597892401552968, + "grad_norm": 0.18753725290298462, + "learning_rate": 1.3580889993634322e-05, + "loss": 0.5553, + "step": 3461 + }, + { + "epoch": 0.9600665557404326, + "grad_norm": 0.19862516224384308, + "learning_rate": 1.3575388929855112e-05, + "loss": 0.4975, + "step": 3462 + }, + { + "epoch": 0.9603438713255685, + "grad_norm": 0.18844319880008698, + "learning_rate": 1.3569887656239033e-05, + "loss": 0.4977, + "step": 3463 + }, + { + "epoch": 0.9606211869107044, + "grad_norm": 0.2365204095840454, + "learning_rate": 1.3564386173859523e-05, + "loss": 0.5243, + "step": 3464 + }, + { + "epoch": 0.9608985024958403, + "grad_norm": 0.19346970319747925, + "learning_rate": 1.3558884483790072e-05, + "loss": 0.5504, + "step": 3465 + }, + { + "epoch": 0.9611758180809762, + "grad_norm": 0.19397403299808502, + "learning_rate": 1.3553382587104201e-05, + "loss": 0.5448, + "step": 3466 + }, + { + "epoch": 0.9614531336661121, + "grad_norm": 0.19531095027923584, + "learning_rate": 1.3547880484875477e-05, + "loss": 0.5614, + "step": 3467 + }, + { + "epoch": 0.961730449251248, + "grad_norm": 0.20053645968437195, + "learning_rate": 1.354237817817751e-05, + "loss": 0.5412, + "step": 3468 + }, + { + "epoch": 0.9620077648363838, + "grad_norm": 0.19779855012893677, + "learning_rate": 1.3536875668083943e-05, + "loss": 0.5675, + "step": 3469 + }, + { + "epoch": 0.9622850804215197, + "grad_norm": 0.19257789850234985, + "learning_rate": 1.3531372955668462e-05, + "loss": 0.5128, + "step": 3470 + }, + { + "epoch": 0.9625623960066556, + "grad_norm": 0.19003498554229736, + "learning_rate": 1.352587004200479e-05, + "loss": 0.545, + "step": 3471 + }, + { + "epoch": 0.9628397115917915, + "grad_norm": 0.1935829222202301, + "learning_rate": 1.3520366928166695e-05, + "loss": 0.5568, + "step": 3472 + }, + { + "epoch": 0.9631170271769274, + "grad_norm": 0.19856667518615723, + "learning_rate": 1.3514863615227979e-05, + "loss": 0.5247, + "step": 3473 + }, + { + "epoch": 0.9633943427620633, + "grad_norm": 0.18320922553539276, + "learning_rate": 1.3509360104262478e-05, + "loss": 0.5358, + "step": 3474 + }, + { + "epoch": 0.9636716583471991, + "grad_norm": 0.20552265644073486, + "learning_rate": 1.3503856396344086e-05, + "loss": 0.5631, + "step": 3475 + }, + { + "epoch": 0.963948973932335, + "grad_norm": 0.19140848517417908, + "learning_rate": 1.3498352492546706e-05, + "loss": 0.5241, + "step": 3476 + }, + { + "epoch": 0.9642262895174709, + "grad_norm": 0.18474581837654114, + "learning_rate": 1.3492848393944312e-05, + "loss": 0.4961, + "step": 3477 + }, + { + "epoch": 0.9645036051026068, + "grad_norm": 0.1950991153717041, + "learning_rate": 1.3487344101610885e-05, + "loss": 0.5223, + "step": 3478 + }, + { + "epoch": 0.9647809206877427, + "grad_norm": 0.19947074353694916, + "learning_rate": 1.348183961662047e-05, + "loss": 0.5232, + "step": 3479 + }, + { + "epoch": 0.9650582362728786, + "grad_norm": 0.19454912841320038, + "learning_rate": 1.3476334940047127e-05, + "loss": 0.5611, + "step": 3480 + }, + { + "epoch": 0.9653355518580145, + "grad_norm": 0.19796130061149597, + "learning_rate": 1.3470830072964973e-05, + "loss": 0.5459, + "step": 3481 + }, + { + "epoch": 0.9656128674431503, + "grad_norm": 0.1934625804424286, + "learning_rate": 1.346532501644815e-05, + "loss": 0.5142, + "step": 3482 + }, + { + "epoch": 0.9658901830282862, + "grad_norm": 0.18265816569328308, + "learning_rate": 1.345981977157084e-05, + "loss": 0.5416, + "step": 3483 + }, + { + "epoch": 0.9661674986134221, + "grad_norm": 0.19438108801841736, + "learning_rate": 1.3454314339407262e-05, + "loss": 0.5264, + "step": 3484 + }, + { + "epoch": 0.966444814198558, + "grad_norm": 0.20060043036937714, + "learning_rate": 1.3448808721031673e-05, + "loss": 0.5472, + "step": 3485 + }, + { + "epoch": 0.9667221297836939, + "grad_norm": 0.20769962668418884, + "learning_rate": 1.3443302917518361e-05, + "loss": 0.5334, + "step": 3486 + }, + { + "epoch": 0.9669994453688298, + "grad_norm": 0.19313625991344452, + "learning_rate": 1.3437796929941661e-05, + "loss": 0.5565, + "step": 3487 + }, + { + "epoch": 0.9672767609539656, + "grad_norm": 0.20908023416996002, + "learning_rate": 1.3432290759375935e-05, + "loss": 0.5485, + "step": 3488 + }, + { + "epoch": 0.9675540765391015, + "grad_norm": 0.1994648575782776, + "learning_rate": 1.342678440689558e-05, + "loss": 0.5566, + "step": 3489 + }, + { + "epoch": 0.9678313921242374, + "grad_norm": 0.18501751124858856, + "learning_rate": 1.342127787357503e-05, + "loss": 0.5345, + "step": 3490 + }, + { + "epoch": 0.9681087077093733, + "grad_norm": 0.17410939931869507, + "learning_rate": 1.341577116048876e-05, + "loss": 0.5523, + "step": 3491 + }, + { + "epoch": 0.9683860232945092, + "grad_norm": 0.1948653906583786, + "learning_rate": 1.3410264268711276e-05, + "loss": 0.5419, + "step": 3492 + }, + { + "epoch": 0.9686633388796451, + "grad_norm": 0.1910163313150406, + "learning_rate": 1.3404757199317108e-05, + "loss": 0.5215, + "step": 3493 + }, + { + "epoch": 0.968940654464781, + "grad_norm": 0.21614839136600494, + "learning_rate": 1.3399249953380849e-05, + "loss": 0.5235, + "step": 3494 + }, + { + "epoch": 0.9692179700499168, + "grad_norm": 0.19417038559913635, + "learning_rate": 1.3393742531977094e-05, + "loss": 0.5504, + "step": 3495 + }, + { + "epoch": 0.9694952856350527, + "grad_norm": 0.18813247978687286, + "learning_rate": 1.3388234936180493e-05, + "loss": 0.5334, + "step": 3496 + }, + { + "epoch": 0.9697726012201886, + "grad_norm": 0.18560314178466797, + "learning_rate": 1.3382727167065723e-05, + "loss": 0.5295, + "step": 3497 + }, + { + "epoch": 0.9700499168053245, + "grad_norm": 0.19894826412200928, + "learning_rate": 1.3377219225707495e-05, + "loss": 0.5235, + "step": 3498 + }, + { + "epoch": 0.9703272323904604, + "grad_norm": 0.18726693093776703, + "learning_rate": 1.3371711113180552e-05, + "loss": 0.5032, + "step": 3499 + }, + { + "epoch": 0.9706045479755963, + "grad_norm": 0.20481140911579132, + "learning_rate": 1.3366202830559679e-05, + "loss": 0.5465, + "step": 3500 + }, + { + "epoch": 0.9708818635607321, + "grad_norm": 0.18740220367908478, + "learning_rate": 1.3360694378919683e-05, + "loss": 0.5052, + "step": 3501 + }, + { + "epoch": 0.971159179145868, + "grad_norm": 0.2005304992198944, + "learning_rate": 1.3355185759335409e-05, + "loss": 0.5565, + "step": 3502 + }, + { + "epoch": 0.9714364947310039, + "grad_norm": 0.1884390115737915, + "learning_rate": 1.3349676972881736e-05, + "loss": 0.5513, + "step": 3503 + }, + { + "epoch": 0.9717138103161398, + "grad_norm": 0.19259166717529297, + "learning_rate": 1.3344168020633574e-05, + "loss": 0.5136, + "step": 3504 + }, + { + "epoch": 0.9719911259012757, + "grad_norm": 0.18772351741790771, + "learning_rate": 1.3338658903665868e-05, + "loss": 0.5418, + "step": 3505 + }, + { + "epoch": 0.9722684414864116, + "grad_norm": 0.19562283158302307, + "learning_rate": 1.3333149623053584e-05, + "loss": 0.5291, + "step": 3506 + }, + { + "epoch": 0.9725457570715474, + "grad_norm": 0.18652015924453735, + "learning_rate": 1.332764017987174e-05, + "loss": 0.5406, + "step": 3507 + }, + { + "epoch": 0.9728230726566833, + "grad_norm": 0.19781257212162018, + "learning_rate": 1.3322130575195366e-05, + "loss": 0.5253, + "step": 3508 + }, + { + "epoch": 0.9731003882418192, + "grad_norm": 0.20048947632312775, + "learning_rate": 1.3316620810099536e-05, + "loss": 0.5339, + "step": 3509 + }, + { + "epoch": 0.9733777038269551, + "grad_norm": 0.1913914531469345, + "learning_rate": 1.331111088565935e-05, + "loss": 0.5399, + "step": 3510 + }, + { + "epoch": 0.973655019412091, + "grad_norm": 0.18670813739299774, + "learning_rate": 1.3305600802949941e-05, + "loss": 0.4995, + "step": 3511 + }, + { + "epoch": 0.9739323349972269, + "grad_norm": 0.1913759857416153, + "learning_rate": 1.3300090563046472e-05, + "loss": 0.5324, + "step": 3512 + }, + { + "epoch": 0.9742096505823628, + "grad_norm": 0.18199758231639862, + "learning_rate": 1.3294580167024135e-05, + "loss": 0.535, + "step": 3513 + }, + { + "epoch": 0.9744869661674986, + "grad_norm": 0.18627969920635223, + "learning_rate": 1.328906961595815e-05, + "loss": 0.5282, + "step": 3514 + }, + { + "epoch": 0.9747642817526345, + "grad_norm": 0.1850520372390747, + "learning_rate": 1.3283558910923785e-05, + "loss": 0.5126, + "step": 3515 + }, + { + "epoch": 0.9750415973377704, + "grad_norm": 0.19907443225383759, + "learning_rate": 1.327804805299631e-05, + "loss": 0.5228, + "step": 3516 + }, + { + "epoch": 0.9753189129229063, + "grad_norm": 0.19155220687389374, + "learning_rate": 1.3272537043251054e-05, + "loss": 0.5289, + "step": 3517 + }, + { + "epoch": 0.9755962285080422, + "grad_norm": 0.19737032055854797, + "learning_rate": 1.3267025882763345e-05, + "loss": 0.5342, + "step": 3518 + }, + { + "epoch": 0.9758735440931781, + "grad_norm": 0.186650350689888, + "learning_rate": 1.3261514572608569e-05, + "loss": 0.5542, + "step": 3519 + }, + { + "epoch": 0.9761508596783139, + "grad_norm": 0.1934208869934082, + "learning_rate": 1.3256003113862122e-05, + "loss": 0.53, + "step": 3520 + }, + { + "epoch": 0.9764281752634498, + "grad_norm": 0.1932929903268814, + "learning_rate": 1.3250491507599439e-05, + "loss": 0.5322, + "step": 3521 + }, + { + "epoch": 0.9767054908485857, + "grad_norm": 0.20356619358062744, + "learning_rate": 1.3244979754895978e-05, + "loss": 0.5443, + "step": 3522 + }, + { + "epoch": 0.9769828064337216, + "grad_norm": 0.19713589549064636, + "learning_rate": 1.3239467856827229e-05, + "loss": 0.5335, + "step": 3523 + }, + { + "epoch": 0.9772601220188575, + "grad_norm": 0.18841132521629333, + "learning_rate": 1.323395581446871e-05, + "loss": 0.5614, + "step": 3524 + }, + { + "epoch": 0.9775374376039934, + "grad_norm": 0.18878857791423798, + "learning_rate": 1.3228443628895962e-05, + "loss": 0.536, + "step": 3525 + }, + { + "epoch": 0.9778147531891292, + "grad_norm": 0.1925961673259735, + "learning_rate": 1.3222931301184565e-05, + "loss": 0.5743, + "step": 3526 + }, + { + "epoch": 0.9780920687742651, + "grad_norm": 0.2000453919172287, + "learning_rate": 1.321741883241012e-05, + "loss": 0.5144, + "step": 3527 + }, + { + "epoch": 0.978369384359401, + "grad_norm": 0.1939268261194229, + "learning_rate": 1.3211906223648251e-05, + "loss": 0.5379, + "step": 3528 + }, + { + "epoch": 0.9786466999445369, + "grad_norm": 0.19331765174865723, + "learning_rate": 1.3206393475974615e-05, + "loss": 0.5428, + "step": 3529 + }, + { + "epoch": 0.9789240155296728, + "grad_norm": 0.20399914681911469, + "learning_rate": 1.3200880590464898e-05, + "loss": 0.5323, + "step": 3530 + }, + { + "epoch": 0.9792013311148087, + "grad_norm": 0.2036525458097458, + "learning_rate": 1.3195367568194807e-05, + "loss": 0.541, + "step": 3531 + }, + { + "epoch": 0.9794786466999446, + "grad_norm": 0.19523896276950836, + "learning_rate": 1.3189854410240082e-05, + "loss": 0.5479, + "step": 3532 + }, + { + "epoch": 0.9797559622850804, + "grad_norm": 0.20129665732383728, + "learning_rate": 1.318434111767648e-05, + "loss": 0.5565, + "step": 3533 + }, + { + "epoch": 0.9800332778702163, + "grad_norm": 0.19564275443553925, + "learning_rate": 1.3178827691579801e-05, + "loss": 0.5781, + "step": 3534 + }, + { + "epoch": 0.9803105934553522, + "grad_norm": 0.2035827934741974, + "learning_rate": 1.317331413302585e-05, + "loss": 0.554, + "step": 3535 + }, + { + "epoch": 0.9805879090404881, + "grad_norm": 0.1899930238723755, + "learning_rate": 1.3167800443090475e-05, + "loss": 0.5187, + "step": 3536 + }, + { + "epoch": 0.980865224625624, + "grad_norm": 0.1863166242837906, + "learning_rate": 1.3162286622849538e-05, + "loss": 0.5199, + "step": 3537 + }, + { + "epoch": 0.9811425402107599, + "grad_norm": 0.20462384819984436, + "learning_rate": 1.3156772673378936e-05, + "loss": 0.5479, + "step": 3538 + }, + { + "epoch": 0.9814198557958957, + "grad_norm": 0.19956259429454803, + "learning_rate": 1.3151258595754581e-05, + "loss": 0.5492, + "step": 3539 + }, + { + "epoch": 0.9816971713810316, + "grad_norm": 0.2239493578672409, + "learning_rate": 1.3145744391052422e-05, + "loss": 0.563, + "step": 3540 + }, + { + "epoch": 0.9819744869661675, + "grad_norm": 0.20383387804031372, + "learning_rate": 1.3140230060348425e-05, + "loss": 0.5309, + "step": 3541 + }, + { + "epoch": 0.9822518025513034, + "grad_norm": 0.1931990385055542, + "learning_rate": 1.3134715604718579e-05, + "loss": 0.5233, + "step": 3542 + }, + { + "epoch": 0.9825291181364393, + "grad_norm": 0.19594894349575043, + "learning_rate": 1.3129201025238902e-05, + "loss": 0.5366, + "step": 3543 + }, + { + "epoch": 0.9828064337215752, + "grad_norm": 0.3933532238006592, + "learning_rate": 1.3123686322985434e-05, + "loss": 0.5762, + "step": 3544 + }, + { + "epoch": 0.983083749306711, + "grad_norm": 0.19356705248355865, + "learning_rate": 1.311817149903424e-05, + "loss": 0.5247, + "step": 3545 + }, + { + "epoch": 0.9833610648918469, + "grad_norm": 0.19310222566127777, + "learning_rate": 1.3112656554461405e-05, + "loss": 0.5501, + "step": 3546 + }, + { + "epoch": 0.9836383804769828, + "grad_norm": 0.1923639476299286, + "learning_rate": 1.310714149034305e-05, + "loss": 0.5265, + "step": 3547 + }, + { + "epoch": 0.9839156960621187, + "grad_norm": 0.19612760841846466, + "learning_rate": 1.3101626307755303e-05, + "loss": 0.5292, + "step": 3548 + }, + { + "epoch": 0.9841930116472546, + "grad_norm": 0.19919681549072266, + "learning_rate": 1.3096111007774322e-05, + "loss": 0.565, + "step": 3549 + }, + { + "epoch": 0.9844703272323905, + "grad_norm": 0.20053736865520477, + "learning_rate": 1.3090595591476293e-05, + "loss": 0.5669, + "step": 3550 + }, + { + "epoch": 0.9847476428175264, + "grad_norm": 0.18242114782333374, + "learning_rate": 1.3085080059937413e-05, + "loss": 0.531, + "step": 3551 + }, + { + "epoch": 0.9850249584026622, + "grad_norm": 0.18836815655231476, + "learning_rate": 1.3079564414233912e-05, + "loss": 0.5238, + "step": 3552 + }, + { + "epoch": 0.9853022739877981, + "grad_norm": 0.185248464345932, + "learning_rate": 1.3074048655442042e-05, + "loss": 0.5294, + "step": 3553 + }, + { + "epoch": 0.985579589572934, + "grad_norm": 0.18509770929813385, + "learning_rate": 1.3068532784638065e-05, + "loss": 0.5492, + "step": 3554 + }, + { + "epoch": 0.9858569051580699, + "grad_norm": 0.1903439611196518, + "learning_rate": 1.3063016802898288e-05, + "loss": 0.5463, + "step": 3555 + }, + { + "epoch": 0.9861342207432058, + "grad_norm": 0.19315816462039948, + "learning_rate": 1.3057500711299006e-05, + "loss": 0.533, + "step": 3556 + }, + { + "epoch": 0.9864115363283417, + "grad_norm": 0.19946327805519104, + "learning_rate": 1.305198451091657e-05, + "loss": 0.5282, + "step": 3557 + }, + { + "epoch": 0.9866888519134775, + "grad_norm": 0.1908363550901413, + "learning_rate": 1.3046468202827328e-05, + "loss": 0.5208, + "step": 3558 + }, + { + "epoch": 0.9869661674986134, + "grad_norm": 0.1873141974210739, + "learning_rate": 1.304095178810766e-05, + "loss": 0.5146, + "step": 3559 + }, + { + "epoch": 0.9872434830837493, + "grad_norm": 0.18219764530658722, + "learning_rate": 1.303543526783397e-05, + "loss": 0.5109, + "step": 3560 + }, + { + "epoch": 0.9875207986688852, + "grad_norm": 0.1835818737745285, + "learning_rate": 1.3029918643082673e-05, + "loss": 0.5352, + "step": 3561 + }, + { + "epoch": 0.9877981142540211, + "grad_norm": 0.1892389953136444, + "learning_rate": 1.3024401914930207e-05, + "loss": 0.5209, + "step": 3562 + }, + { + "epoch": 0.988075429839157, + "grad_norm": 0.18364709615707397, + "learning_rate": 1.3018885084453036e-05, + "loss": 0.5213, + "step": 3563 + }, + { + "epoch": 0.9883527454242929, + "grad_norm": 0.18862204253673553, + "learning_rate": 1.3013368152727634e-05, + "loss": 0.5151, + "step": 3564 + }, + { + "epoch": 0.9886300610094287, + "grad_norm": 0.1929624229669571, + "learning_rate": 1.3007851120830506e-05, + "loss": 0.5347, + "step": 3565 + }, + { + "epoch": 0.9889073765945646, + "grad_norm": 0.19009216129779816, + "learning_rate": 1.3002333989838167e-05, + "loss": 0.5589, + "step": 3566 + }, + { + "epoch": 0.9891846921797005, + "grad_norm": 0.1797810047864914, + "learning_rate": 1.299681676082716e-05, + "loss": 0.5186, + "step": 3567 + }, + { + "epoch": 0.9894620077648364, + "grad_norm": 0.1861737072467804, + "learning_rate": 1.2991299434874038e-05, + "loss": 0.5292, + "step": 3568 + }, + { + "epoch": 0.9897393233499723, + "grad_norm": 0.186946839094162, + "learning_rate": 1.298578201305538e-05, + "loss": 0.5393, + "step": 3569 + }, + { + "epoch": 0.9900166389351082, + "grad_norm": 0.20192833244800568, + "learning_rate": 1.2980264496447784e-05, + "loss": 0.5487, + "step": 3570 + }, + { + "epoch": 0.990293954520244, + "grad_norm": 0.1849672794342041, + "learning_rate": 1.2974746886127858e-05, + "loss": 0.5342, + "step": 3571 + }, + { + "epoch": 0.9905712701053799, + "grad_norm": 0.18184737861156464, + "learning_rate": 1.2969229183172236e-05, + "loss": 0.5387, + "step": 3572 + }, + { + "epoch": 0.9908485856905158, + "grad_norm": 0.19623394310474396, + "learning_rate": 1.2963711388657566e-05, + "loss": 0.5588, + "step": 3573 + }, + { + "epoch": 0.9911259012756517, + "grad_norm": 0.19016727805137634, + "learning_rate": 1.2958193503660524e-05, + "loss": 0.5393, + "step": 3574 + }, + { + "epoch": 0.9914032168607876, + "grad_norm": 0.19216102361679077, + "learning_rate": 1.2952675529257785e-05, + "loss": 0.5383, + "step": 3575 + }, + { + "epoch": 0.9916805324459235, + "grad_norm": 0.21044804155826569, + "learning_rate": 1.2947157466526062e-05, + "loss": 0.5453, + "step": 3576 + }, + { + "epoch": 0.9919578480310594, + "grad_norm": 0.19791410863399506, + "learning_rate": 1.2941639316542062e-05, + "loss": 0.5562, + "step": 3577 + }, + { + "epoch": 0.9922351636161952, + "grad_norm": 0.18991726636886597, + "learning_rate": 1.2936121080382534e-05, + "loss": 0.4977, + "step": 3578 + }, + { + "epoch": 0.9925124792013311, + "grad_norm": 0.19211836159229279, + "learning_rate": 1.293060275912423e-05, + "loss": 0.5227, + "step": 3579 + }, + { + "epoch": 0.992789794786467, + "grad_norm": 0.19635222852230072, + "learning_rate": 1.292508435384392e-05, + "loss": 0.5454, + "step": 3580 + }, + { + "epoch": 0.9930671103716029, + "grad_norm": 0.18526218831539154, + "learning_rate": 1.2919565865618388e-05, + "loss": 0.5429, + "step": 3581 + }, + { + "epoch": 0.9933444259567388, + "grad_norm": 0.1954220086336136, + "learning_rate": 1.291404729552444e-05, + "loss": 0.5479, + "step": 3582 + }, + { + "epoch": 0.9936217415418747, + "grad_norm": 0.1860000044107437, + "learning_rate": 1.2908528644638895e-05, + "loss": 0.5291, + "step": 3583 + }, + { + "epoch": 0.9938990571270105, + "grad_norm": 0.19651706516742706, + "learning_rate": 1.2903009914038586e-05, + "loss": 0.5345, + "step": 3584 + }, + { + "epoch": 0.9941763727121464, + "grad_norm": 0.18839724361896515, + "learning_rate": 1.2897491104800366e-05, + "loss": 0.5624, + "step": 3585 + }, + { + "epoch": 0.9944536882972823, + "grad_norm": 0.1895817518234253, + "learning_rate": 1.28919722180011e-05, + "loss": 0.5577, + "step": 3586 + }, + { + "epoch": 0.9947310038824182, + "grad_norm": 0.18223224580287933, + "learning_rate": 1.288645325471767e-05, + "loss": 0.5511, + "step": 3587 + }, + { + "epoch": 0.9950083194675541, + "grad_norm": 0.18843968212604523, + "learning_rate": 1.2880934216026971e-05, + "loss": 0.5416, + "step": 3588 + }, + { + "epoch": 0.99528563505269, + "grad_norm": 0.18690787255764008, + "learning_rate": 1.2875415103005915e-05, + "loss": 0.5348, + "step": 3589 + }, + { + "epoch": 0.9955629506378258, + "grad_norm": 0.18256263434886932, + "learning_rate": 1.2869895916731426e-05, + "loss": 0.5147, + "step": 3590 + }, + { + "epoch": 0.9958402662229617, + "grad_norm": 0.18710075318813324, + "learning_rate": 1.2864376658280441e-05, + "loss": 0.5245, + "step": 3591 + }, + { + "epoch": 0.9961175818080976, + "grad_norm": 0.20847736299037933, + "learning_rate": 1.2858857328729915e-05, + "loss": 0.5432, + "step": 3592 + }, + { + "epoch": 0.9963948973932335, + "grad_norm": 0.18452630937099457, + "learning_rate": 1.2853337929156822e-05, + "loss": 0.5279, + "step": 3593 + }, + { + "epoch": 0.9966722129783694, + "grad_norm": 0.19052648544311523, + "learning_rate": 1.2847818460638131e-05, + "loss": 0.5312, + "step": 3594 + }, + { + "epoch": 0.9969495285635053, + "grad_norm": 0.19351086020469666, + "learning_rate": 1.2842298924250848e-05, + "loss": 0.525, + "step": 3595 + }, + { + "epoch": 0.9972268441486412, + "grad_norm": 0.17867478728294373, + "learning_rate": 1.2836779321071974e-05, + "loss": 0.5493, + "step": 3596 + }, + { + "epoch": 0.997504159733777, + "grad_norm": 0.18988832831382751, + "learning_rate": 1.2831259652178532e-05, + "loss": 0.5449, + "step": 3597 + }, + { + "epoch": 0.9977814753189129, + "grad_norm": 0.18697360157966614, + "learning_rate": 1.2825739918647553e-05, + "loss": 0.5146, + "step": 3598 + }, + { + "epoch": 0.9980587909040488, + "grad_norm": 0.19430890679359436, + "learning_rate": 1.2820220121556087e-05, + "loss": 0.5423, + "step": 3599 + }, + { + "epoch": 0.9983361064891847, + "grad_norm": 0.19263558089733124, + "learning_rate": 1.2814700261981195e-05, + "loss": 0.5393, + "step": 3600 + }, + { + "epoch": 0.9986134220743206, + "grad_norm": 0.1892475187778473, + "learning_rate": 1.2809180340999938e-05, + "loss": 0.5205, + "step": 3601 + }, + { + "epoch": 0.9988907376594565, + "grad_norm": 0.19061142206192017, + "learning_rate": 1.280366035968941e-05, + "loss": 0.5195, + "step": 3602 + }, + { + "epoch": 0.9991680532445923, + "grad_norm": 0.1829683780670166, + "learning_rate": 1.2798140319126695e-05, + "loss": 0.5111, + "step": 3603 + }, + { + "epoch": 0.9994453688297282, + "grad_norm": 0.18301549553871155, + "learning_rate": 1.279262022038891e-05, + "loss": 0.5393, + "step": 3604 + }, + { + "epoch": 0.9997226844148641, + "grad_norm": 0.18907521665096283, + "learning_rate": 1.2787100064553162e-05, + "loss": 0.534, + "step": 3605 + }, + { + "epoch": 1.0, + "grad_norm": 0.20034904778003693, + "learning_rate": 1.2781579852696588e-05, + "loss": 0.5388, + "step": 3606 + }, + { + "epoch": 1.0, + "eval_loss": 0.8209076523780823, + "eval_runtime": 415.5978, + "eval_samples_per_second": 98.684, + "eval_steps_per_second": 1.542, + "step": 3606 + }, + { + "epoch": 1.0002773155851359, + "grad_norm": 0.19878089427947998, + "learning_rate": 1.2776059585896324e-05, + "loss": 0.537, + "step": 3607 + }, + { + "epoch": 1.0005546311702718, + "grad_norm": 0.18699733912944794, + "learning_rate": 1.2770539265229522e-05, + "loss": 0.5383, + "step": 3608 + }, + { + "epoch": 1.0008319467554077, + "grad_norm": 0.18291421234607697, + "learning_rate": 1.2765018891773343e-05, + "loss": 0.5536, + "step": 3609 + }, + { + "epoch": 1.0011092623405435, + "grad_norm": 0.19343863427639008, + "learning_rate": 1.2759498466604951e-05, + "loss": 0.5325, + "step": 3610 + }, + { + "epoch": 1.0013865779256794, + "grad_norm": 0.19165514409542084, + "learning_rate": 1.2753977990801536e-05, + "loss": 0.5405, + "step": 3611 + }, + { + "epoch": 1.0016638935108153, + "grad_norm": 0.1763203889131546, + "learning_rate": 1.2748457465440289e-05, + "loss": 0.5398, + "step": 3612 + }, + { + "epoch": 1.0019412090959512, + "grad_norm": 0.2186586856842041, + "learning_rate": 1.27429368915984e-05, + "loss": 0.5192, + "step": 3613 + }, + { + "epoch": 1.002218524681087, + "grad_norm": 0.19609209895133972, + "learning_rate": 1.2737416270353094e-05, + "loss": 0.5364, + "step": 3614 + }, + { + "epoch": 1.002495840266223, + "grad_norm": 0.20778897404670715, + "learning_rate": 1.273189560278158e-05, + "loss": 0.5505, + "step": 3615 + }, + { + "epoch": 1.0027731558513588, + "grad_norm": 0.2087172418832779, + "learning_rate": 1.2726374889961095e-05, + "loss": 0.5627, + "step": 3616 + }, + { + "epoch": 1.0030504714364947, + "grad_norm": 0.19748911261558533, + "learning_rate": 1.2720854132968865e-05, + "loss": 0.541, + "step": 3617 + }, + { + "epoch": 1.0033277870216306, + "grad_norm": 0.18848268687725067, + "learning_rate": 1.2715333332882146e-05, + "loss": 0.5373, + "step": 3618 + }, + { + "epoch": 1.0036051026067665, + "grad_norm": 0.19345615804195404, + "learning_rate": 1.2709812490778187e-05, + "loss": 0.5429, + "step": 3619 + }, + { + "epoch": 1.0038824181919024, + "grad_norm": 0.18660636246204376, + "learning_rate": 1.270429160773425e-05, + "loss": 0.5608, + "step": 3620 + }, + { + "epoch": 1.0041597337770383, + "grad_norm": 0.1913338303565979, + "learning_rate": 1.2698770684827612e-05, + "loss": 0.54, + "step": 3621 + }, + { + "epoch": 1.0044370493621742, + "grad_norm": 0.258350670337677, + "learning_rate": 1.2693249723135542e-05, + "loss": 0.4896, + "step": 3622 + }, + { + "epoch": 1.00471436494731, + "grad_norm": 0.18567179143428802, + "learning_rate": 1.2687728723735337e-05, + "loss": 0.5154, + "step": 3623 + }, + { + "epoch": 1.004991680532446, + "grad_norm": 0.1973087191581726, + "learning_rate": 1.2682207687704279e-05, + "loss": 0.557, + "step": 3624 + }, + { + "epoch": 1.0052689961175818, + "grad_norm": 0.22406698763370514, + "learning_rate": 1.2676686616119675e-05, + "loss": 0.5303, + "step": 3625 + }, + { + "epoch": 1.0055463117027177, + "grad_norm": 0.19072286784648895, + "learning_rate": 1.2671165510058834e-05, + "loss": 0.5455, + "step": 3626 + }, + { + "epoch": 1.0058236272878536, + "grad_norm": 0.1937733143568039, + "learning_rate": 1.2665644370599064e-05, + "loss": 0.5198, + "step": 3627 + }, + { + "epoch": 1.0061009428729895, + "grad_norm": 0.2021392285823822, + "learning_rate": 1.2660123198817692e-05, + "loss": 0.5313, + "step": 3628 + }, + { + "epoch": 1.0063782584581253, + "grad_norm": 0.18881459534168243, + "learning_rate": 1.2654601995792036e-05, + "loss": 0.5254, + "step": 3629 + }, + { + "epoch": 1.0066555740432612, + "grad_norm": 0.18779276311397552, + "learning_rate": 1.2649080762599442e-05, + "loss": 0.5202, + "step": 3630 + }, + { + "epoch": 1.006932889628397, + "grad_norm": 0.1880016028881073, + "learning_rate": 1.2643559500317234e-05, + "loss": 0.5177, + "step": 3631 + }, + { + "epoch": 1.007210205213533, + "grad_norm": 0.18926078081130981, + "learning_rate": 1.2638038210022765e-05, + "loss": 0.5349, + "step": 3632 + }, + { + "epoch": 1.0074875207986689, + "grad_norm": 0.18671317398548126, + "learning_rate": 1.2632516892793389e-05, + "loss": 0.5284, + "step": 3633 + }, + { + "epoch": 1.0077648363838048, + "grad_norm": 0.19339770078659058, + "learning_rate": 1.2626995549706452e-05, + "loss": 0.5238, + "step": 3634 + }, + { + "epoch": 1.0080421519689406, + "grad_norm": 0.18992464244365692, + "learning_rate": 1.2621474181839322e-05, + "loss": 0.5461, + "step": 3635 + }, + { + "epoch": 1.0083194675540765, + "grad_norm": 0.19679436087608337, + "learning_rate": 1.2615952790269356e-05, + "loss": 0.5174, + "step": 3636 + }, + { + "epoch": 1.0085967831392124, + "grad_norm": 0.1883174180984497, + "learning_rate": 1.2610431376073931e-05, + "loss": 0.5542, + "step": 3637 + }, + { + "epoch": 1.0088740987243483, + "grad_norm": 0.1904815286397934, + "learning_rate": 1.260490994033042e-05, + "loss": 0.5224, + "step": 3638 + }, + { + "epoch": 1.0091514143094842, + "grad_norm": 0.18984469771385193, + "learning_rate": 1.2599388484116198e-05, + "loss": 0.5237, + "step": 3639 + }, + { + "epoch": 1.00942872989462, + "grad_norm": 0.19481045007705688, + "learning_rate": 1.259386700850865e-05, + "loss": 0.5315, + "step": 3640 + }, + { + "epoch": 1.009706045479756, + "grad_norm": 0.17928001284599304, + "learning_rate": 1.2588345514585163e-05, + "loss": 0.5292, + "step": 3641 + }, + { + "epoch": 1.0099833610648918, + "grad_norm": 0.1879146546125412, + "learning_rate": 1.2582824003423124e-05, + "loss": 0.5269, + "step": 3642 + }, + { + "epoch": 1.0102606766500277, + "grad_norm": 0.18970663845539093, + "learning_rate": 1.2577302476099926e-05, + "loss": 0.5107, + "step": 3643 + }, + { + "epoch": 1.0105379922351636, + "grad_norm": 0.19013284146785736, + "learning_rate": 1.257178093369297e-05, + "loss": 0.5348, + "step": 3644 + }, + { + "epoch": 1.0108153078202995, + "grad_norm": 0.19010770320892334, + "learning_rate": 1.2566259377279652e-05, + "loss": 0.5264, + "step": 3645 + }, + { + "epoch": 1.0110926234054354, + "grad_norm": 0.18681344389915466, + "learning_rate": 1.2560737807937374e-05, + "loss": 0.497, + "step": 3646 + }, + { + "epoch": 1.0113699389905713, + "grad_norm": 0.1887647807598114, + "learning_rate": 1.2555216226743537e-05, + "loss": 0.5271, + "step": 3647 + }, + { + "epoch": 1.0116472545757071, + "grad_norm": 0.18254730105400085, + "learning_rate": 1.2549694634775555e-05, + "loss": 0.5283, + "step": 3648 + }, + { + "epoch": 1.011924570160843, + "grad_norm": 0.1893124133348465, + "learning_rate": 1.2544173033110832e-05, + "loss": 0.5273, + "step": 3649 + }, + { + "epoch": 1.012201885745979, + "grad_norm": 0.18291127681732178, + "learning_rate": 1.2538651422826777e-05, + "loss": 0.5425, + "step": 3650 + }, + { + "epoch": 1.0124792013311148, + "grad_norm": 0.200171560049057, + "learning_rate": 1.2533129805000807e-05, + "loss": 0.532, + "step": 3651 + }, + { + "epoch": 1.0127565169162507, + "grad_norm": 0.1930498629808426, + "learning_rate": 1.2527608180710338e-05, + "loss": 0.5295, + "step": 3652 + }, + { + "epoch": 1.0130338325013866, + "grad_norm": 0.18753856420516968, + "learning_rate": 1.2522086551032778e-05, + "loss": 0.5358, + "step": 3653 + }, + { + "epoch": 1.0133111480865225, + "grad_norm": 0.19522987306118011, + "learning_rate": 1.2516564917045548e-05, + "loss": 0.5332, + "step": 3654 + }, + { + "epoch": 1.0135884636716583, + "grad_norm": 0.17891767621040344, + "learning_rate": 1.2511043279826062e-05, + "loss": 0.5166, + "step": 3655 + }, + { + "epoch": 1.0138657792567942, + "grad_norm": 0.18819722533226013, + "learning_rate": 1.250552164045174e-05, + "loss": 0.5445, + "step": 3656 + }, + { + "epoch": 1.01414309484193, + "grad_norm": 0.1814819872379303, + "learning_rate": 1.25e-05, + "loss": 0.5191, + "step": 3657 + }, + { + "epoch": 1.014420410427066, + "grad_norm": 0.18296414613723755, + "learning_rate": 1.2494478359548261e-05, + "loss": 0.5173, + "step": 3658 + }, + { + "epoch": 1.0146977260122019, + "grad_norm": 0.20985326170921326, + "learning_rate": 1.2488956720173939e-05, + "loss": 0.5062, + "step": 3659 + }, + { + "epoch": 1.0149750415973378, + "grad_norm": 0.17893539369106293, + "learning_rate": 1.2483435082954453e-05, + "loss": 0.5458, + "step": 3660 + }, + { + "epoch": 1.0152523571824736, + "grad_norm": 0.19261109828948975, + "learning_rate": 1.2477913448967227e-05, + "loss": 0.5637, + "step": 3661 + }, + { + "epoch": 1.0155296727676095, + "grad_norm": 0.2319900244474411, + "learning_rate": 1.2472391819289667e-05, + "loss": 0.5379, + "step": 3662 + }, + { + "epoch": 1.0158069883527454, + "grad_norm": 0.1867554783821106, + "learning_rate": 1.2466870194999192e-05, + "loss": 0.5321, + "step": 3663 + }, + { + "epoch": 1.0160843039378813, + "grad_norm": 0.1840423047542572, + "learning_rate": 1.2461348577173224e-05, + "loss": 0.5228, + "step": 3664 + }, + { + "epoch": 1.0163616195230172, + "grad_norm": 0.18526844680309296, + "learning_rate": 1.2455826966889175e-05, + "loss": 0.5387, + "step": 3665 + }, + { + "epoch": 1.016638935108153, + "grad_norm": 0.19361324608325958, + "learning_rate": 1.2450305365224446e-05, + "loss": 0.5261, + "step": 3666 + }, + { + "epoch": 1.016916250693289, + "grad_norm": 0.19102880358695984, + "learning_rate": 1.2444783773256466e-05, + "loss": 0.5433, + "step": 3667 + }, + { + "epoch": 1.0171935662784248, + "grad_norm": 0.18817844986915588, + "learning_rate": 1.2439262192062631e-05, + "loss": 0.538, + "step": 3668 + }, + { + "epoch": 1.0174708818635607, + "grad_norm": 0.1901366263628006, + "learning_rate": 1.2433740622720353e-05, + "loss": 0.5453, + "step": 3669 + }, + { + "epoch": 1.0177481974486966, + "grad_norm": 0.22999554872512817, + "learning_rate": 1.242821906630703e-05, + "loss": 0.5164, + "step": 3670 + }, + { + "epoch": 1.0180255130338325, + "grad_norm": 0.1897146850824356, + "learning_rate": 1.2422697523900075e-05, + "loss": 0.5528, + "step": 3671 + }, + { + "epoch": 1.0183028286189684, + "grad_norm": 0.1865740269422531, + "learning_rate": 1.241717599657688e-05, + "loss": 0.53, + "step": 3672 + }, + { + "epoch": 1.0185801442041043, + "grad_norm": 0.18005676567554474, + "learning_rate": 1.2411654485414839e-05, + "loss": 0.5056, + "step": 3673 + }, + { + "epoch": 1.0188574597892401, + "grad_norm": 0.1854349970817566, + "learning_rate": 1.240613299149135e-05, + "loss": 0.549, + "step": 3674 + }, + { + "epoch": 1.019134775374376, + "grad_norm": 0.19649791717529297, + "learning_rate": 1.2400611515883805e-05, + "loss": 0.5321, + "step": 3675 + }, + { + "epoch": 1.019412090959512, + "grad_norm": 0.18856771290302277, + "learning_rate": 1.2395090059669585e-05, + "loss": 0.5304, + "step": 3676 + }, + { + "epoch": 1.0196894065446478, + "grad_norm": 0.18438786268234253, + "learning_rate": 1.238956862392607e-05, + "loss": 0.551, + "step": 3677 + }, + { + "epoch": 1.0199667221297837, + "grad_norm": 0.1798408478498459, + "learning_rate": 1.2384047209730647e-05, + "loss": 0.5237, + "step": 3678 + }, + { + "epoch": 1.0202440377149196, + "grad_norm": 0.17963095009326935, + "learning_rate": 1.2378525818160683e-05, + "loss": 0.5229, + "step": 3679 + }, + { + "epoch": 1.0205213533000554, + "grad_norm": 0.18897545337677002, + "learning_rate": 1.237300445029355e-05, + "loss": 0.519, + "step": 3680 + }, + { + "epoch": 1.0207986688851913, + "grad_norm": 0.18016240000724792, + "learning_rate": 1.2367483107206614e-05, + "loss": 0.5092, + "step": 3681 + }, + { + "epoch": 1.0210759844703272, + "grad_norm": 0.18622079491615295, + "learning_rate": 1.2361961789977238e-05, + "loss": 0.505, + "step": 3682 + }, + { + "epoch": 1.021353300055463, + "grad_norm": 0.1844959259033203, + "learning_rate": 1.2356440499682769e-05, + "loss": 0.5358, + "step": 3683 + }, + { + "epoch": 1.021630615640599, + "grad_norm": 0.18017494678497314, + "learning_rate": 1.2350919237400563e-05, + "loss": 0.521, + "step": 3684 + }, + { + "epoch": 1.0219079312257349, + "grad_norm": 0.18644961714744568, + "learning_rate": 1.2345398004207965e-05, + "loss": 0.5239, + "step": 3685 + }, + { + "epoch": 1.0221852468108708, + "grad_norm": 0.18692266941070557, + "learning_rate": 1.2339876801182315e-05, + "loss": 0.5055, + "step": 3686 + }, + { + "epoch": 1.0224625623960066, + "grad_norm": 0.18799254298210144, + "learning_rate": 1.2334355629400934e-05, + "loss": 0.5153, + "step": 3687 + }, + { + "epoch": 1.0227398779811425, + "grad_norm": 0.19313998520374298, + "learning_rate": 1.2328834489941168e-05, + "loss": 0.5274, + "step": 3688 + }, + { + "epoch": 1.0230171935662784, + "grad_norm": 0.24748623371124268, + "learning_rate": 1.2323313383880326e-05, + "loss": 0.5168, + "step": 3689 + }, + { + "epoch": 1.0232945091514143, + "grad_norm": 0.18509146571159363, + "learning_rate": 1.231779231229572e-05, + "loss": 0.508, + "step": 3690 + }, + { + "epoch": 1.0235718247365502, + "grad_norm": 0.2279065102338791, + "learning_rate": 1.2312271276264666e-05, + "loss": 0.5219, + "step": 3691 + }, + { + "epoch": 1.023849140321686, + "grad_norm": 0.18918190896511078, + "learning_rate": 1.230675027686446e-05, + "loss": 0.509, + "step": 3692 + }, + { + "epoch": 1.024126455906822, + "grad_norm": 0.19168592989444733, + "learning_rate": 1.2301229315172394e-05, + "loss": 0.5128, + "step": 3693 + }, + { + "epoch": 1.0244037714919578, + "grad_norm": 0.18460464477539062, + "learning_rate": 1.229570839226575e-05, + "loss": 0.4949, + "step": 3694 + }, + { + "epoch": 1.0246810870770937, + "grad_norm": 0.1955164670944214, + "learning_rate": 1.2290187509221816e-05, + "loss": 0.5084, + "step": 3695 + }, + { + "epoch": 1.0249584026622296, + "grad_norm": 0.21448062360286713, + "learning_rate": 1.2284666667117858e-05, + "loss": 0.5258, + "step": 3696 + }, + { + "epoch": 1.0252357182473655, + "grad_norm": 0.195621058344841, + "learning_rate": 1.2279145867031136e-05, + "loss": 0.5409, + "step": 3697 + }, + { + "epoch": 1.0255130338325014, + "grad_norm": 0.18348294496536255, + "learning_rate": 1.2273625110038908e-05, + "loss": 0.5288, + "step": 3698 + }, + { + "epoch": 1.0257903494176372, + "grad_norm": 0.2350398451089859, + "learning_rate": 1.2268104397218421e-05, + "loss": 0.5176, + "step": 3699 + }, + { + "epoch": 1.0260676650027731, + "grad_norm": 0.18816140294075012, + "learning_rate": 1.2262583729646909e-05, + "loss": 0.5174, + "step": 3700 + }, + { + "epoch": 1.026344980587909, + "grad_norm": 0.18344999849796295, + "learning_rate": 1.22570631084016e-05, + "loss": 0.5001, + "step": 3701 + }, + { + "epoch": 1.026622296173045, + "grad_norm": 0.19813844561576843, + "learning_rate": 1.2251542534559716e-05, + "loss": 0.5229, + "step": 3702 + }, + { + "epoch": 1.0268996117581808, + "grad_norm": 0.18872547149658203, + "learning_rate": 1.2246022009198469e-05, + "loss": 0.5265, + "step": 3703 + }, + { + "epoch": 1.0271769273433167, + "grad_norm": 0.19747066497802734, + "learning_rate": 1.2240501533395048e-05, + "loss": 0.5381, + "step": 3704 + }, + { + "epoch": 1.0274542429284526, + "grad_norm": 0.1966402530670166, + "learning_rate": 1.2234981108226662e-05, + "loss": 0.5447, + "step": 3705 + }, + { + "epoch": 1.0277315585135884, + "grad_norm": 0.1881251186132431, + "learning_rate": 1.222946073477048e-05, + "loss": 0.4944, + "step": 3706 + }, + { + "epoch": 1.0280088740987243, + "grad_norm": 0.19006265699863434, + "learning_rate": 1.222394041410368e-05, + "loss": 0.5279, + "step": 3707 + }, + { + "epoch": 1.0282861896838602, + "grad_norm": 0.17501787841320038, + "learning_rate": 1.2218420147303412e-05, + "loss": 0.4797, + "step": 3708 + }, + { + "epoch": 1.028563505268996, + "grad_norm": 0.17646637558937073, + "learning_rate": 1.2212899935446841e-05, + "loss": 0.4878, + "step": 3709 + }, + { + "epoch": 1.028840820854132, + "grad_norm": 0.19758492708206177, + "learning_rate": 1.2207379779611095e-05, + "loss": 0.5238, + "step": 3710 + }, + { + "epoch": 1.0291181364392679, + "grad_norm": 0.19391943514347076, + "learning_rate": 1.2201859680873305e-05, + "loss": 0.5086, + "step": 3711 + }, + { + "epoch": 1.0293954520244037, + "grad_norm": 0.18827463686466217, + "learning_rate": 1.2196339640310595e-05, + "loss": 0.5257, + "step": 3712 + }, + { + "epoch": 1.0296727676095396, + "grad_norm": 0.19285833835601807, + "learning_rate": 1.2190819659000063e-05, + "loss": 0.5179, + "step": 3713 + }, + { + "epoch": 1.0299500831946755, + "grad_norm": 0.18147997558116913, + "learning_rate": 1.2185299738018813e-05, + "loss": 0.4985, + "step": 3714 + }, + { + "epoch": 1.0302273987798114, + "grad_norm": 0.19085568189620972, + "learning_rate": 1.2179779878443915e-05, + "loss": 0.5131, + "step": 3715 + }, + { + "epoch": 1.0305047143649473, + "grad_norm": 0.20084667205810547, + "learning_rate": 1.217426008135245e-05, + "loss": 0.519, + "step": 3716 + }, + { + "epoch": 1.0307820299500832, + "grad_norm": 0.20030198991298676, + "learning_rate": 1.2168740347821473e-05, + "loss": 0.5277, + "step": 3717 + }, + { + "epoch": 1.031059345535219, + "grad_norm": 0.1940310150384903, + "learning_rate": 1.2163220678928028e-05, + "loss": 0.5001, + "step": 3718 + }, + { + "epoch": 1.031336661120355, + "grad_norm": 0.18660347163677216, + "learning_rate": 1.2157701075749153e-05, + "loss": 0.529, + "step": 3719 + }, + { + "epoch": 1.0316139767054908, + "grad_norm": 0.1930490881204605, + "learning_rate": 1.2152181539361871e-05, + "loss": 0.5056, + "step": 3720 + }, + { + "epoch": 1.0318912922906267, + "grad_norm": 0.1963385045528412, + "learning_rate": 1.2146662070843184e-05, + "loss": 0.5057, + "step": 3721 + }, + { + "epoch": 1.0321686078757626, + "grad_norm": 0.19734053313732147, + "learning_rate": 1.2141142671270085e-05, + "loss": 0.5287, + "step": 3722 + }, + { + "epoch": 1.0324459234608985, + "grad_norm": 0.18422985076904297, + "learning_rate": 1.2135623341719561e-05, + "loss": 0.5135, + "step": 3723 + }, + { + "epoch": 1.0327232390460344, + "grad_norm": 0.18918901681900024, + "learning_rate": 1.213010408326858e-05, + "loss": 0.5098, + "step": 3724 + }, + { + "epoch": 1.0330005546311702, + "grad_norm": 0.18276064097881317, + "learning_rate": 1.2124584896994085e-05, + "loss": 0.5098, + "step": 3725 + }, + { + "epoch": 1.0332778702163061, + "grad_norm": 0.2021724432706833, + "learning_rate": 1.2119065783973031e-05, + "loss": 0.5127, + "step": 3726 + }, + { + "epoch": 1.033555185801442, + "grad_norm": 0.18858903646469116, + "learning_rate": 1.2113546745282333e-05, + "loss": 0.5057, + "step": 3727 + }, + { + "epoch": 1.033832501386578, + "grad_norm": 0.18476137518882751, + "learning_rate": 1.2108027781998902e-05, + "loss": 0.5254, + "step": 3728 + }, + { + "epoch": 1.0341098169717138, + "grad_norm": 0.18318045139312744, + "learning_rate": 1.2102508895199633e-05, + "loss": 0.5248, + "step": 3729 + }, + { + "epoch": 1.0343871325568497, + "grad_norm": 0.18871383368968964, + "learning_rate": 1.2096990085961417e-05, + "loss": 0.5073, + "step": 3730 + }, + { + "epoch": 1.0346644481419855, + "grad_norm": 0.2086830884218216, + "learning_rate": 1.209147135536111e-05, + "loss": 0.5038, + "step": 3731 + }, + { + "epoch": 1.0349417637271214, + "grad_norm": 0.1914805769920349, + "learning_rate": 1.2085952704475562e-05, + "loss": 0.5322, + "step": 3732 + }, + { + "epoch": 1.0352190793122573, + "grad_norm": 0.18802616000175476, + "learning_rate": 1.2080434134381615e-05, + "loss": 0.5161, + "step": 3733 + }, + { + "epoch": 1.0354963948973932, + "grad_norm": 0.1888350397348404, + "learning_rate": 1.2074915646156083e-05, + "loss": 0.5178, + "step": 3734 + }, + { + "epoch": 1.035773710482529, + "grad_norm": 0.1935519278049469, + "learning_rate": 1.2069397240875774e-05, + "loss": 0.5151, + "step": 3735 + }, + { + "epoch": 1.036051026067665, + "grad_norm": 0.18621766567230225, + "learning_rate": 1.2063878919617467e-05, + "loss": 0.5034, + "step": 3736 + }, + { + "epoch": 1.0363283416528009, + "grad_norm": 0.20036379992961884, + "learning_rate": 1.2058360683457941e-05, + "loss": 0.518, + "step": 3737 + }, + { + "epoch": 1.0366056572379367, + "grad_norm": 0.19412393867969513, + "learning_rate": 1.2052842533473945e-05, + "loss": 0.5095, + "step": 3738 + }, + { + "epoch": 1.0368829728230726, + "grad_norm": 0.20189900696277618, + "learning_rate": 1.2047324470742216e-05, + "loss": 0.5, + "step": 3739 + }, + { + "epoch": 1.0371602884082085, + "grad_norm": 0.19598862528800964, + "learning_rate": 1.204180649633948e-05, + "loss": 0.5352, + "step": 3740 + }, + { + "epoch": 1.0374376039933444, + "grad_norm": 0.18859632313251495, + "learning_rate": 1.2036288611342436e-05, + "loss": 0.5148, + "step": 3741 + }, + { + "epoch": 1.0377149195784803, + "grad_norm": 0.18398115038871765, + "learning_rate": 1.2030770816827769e-05, + "loss": 0.519, + "step": 3742 + }, + { + "epoch": 1.0379922351636162, + "grad_norm": 0.19209997355937958, + "learning_rate": 1.2025253113872144e-05, + "loss": 0.535, + "step": 3743 + }, + { + "epoch": 1.038269550748752, + "grad_norm": 0.20428086817264557, + "learning_rate": 1.2019735503552219e-05, + "loss": 0.5346, + "step": 3744 + }, + { + "epoch": 1.038546866333888, + "grad_norm": 0.20181040465831757, + "learning_rate": 1.2014217986944624e-05, + "loss": 0.5171, + "step": 3745 + }, + { + "epoch": 1.0388241819190238, + "grad_norm": 0.18710096180438995, + "learning_rate": 1.200870056512596e-05, + "loss": 0.5059, + "step": 3746 + }, + { + "epoch": 1.0391014975041597, + "grad_norm": 0.18923644721508026, + "learning_rate": 1.2003183239172843e-05, + "loss": 0.5271, + "step": 3747 + }, + { + "epoch": 1.0393788130892956, + "grad_norm": 0.20316655933856964, + "learning_rate": 1.1997666010161836e-05, + "loss": 0.5138, + "step": 3748 + }, + { + "epoch": 1.0396561286744315, + "grad_norm": 0.18475180864334106, + "learning_rate": 1.1992148879169499e-05, + "loss": 0.4857, + "step": 3749 + }, + { + "epoch": 1.0399334442595674, + "grad_norm": 0.18830114603042603, + "learning_rate": 1.1986631847272367e-05, + "loss": 0.5201, + "step": 3750 + }, + { + "epoch": 1.0402107598447032, + "grad_norm": 0.1905670464038849, + "learning_rate": 1.1981114915546967e-05, + "loss": 0.4813, + "step": 3751 + }, + { + "epoch": 1.0404880754298391, + "grad_norm": 0.18634763360023499, + "learning_rate": 1.1975598085069798e-05, + "loss": 0.53, + "step": 3752 + }, + { + "epoch": 1.040765391014975, + "grad_norm": 0.19833336770534515, + "learning_rate": 1.197008135691733e-05, + "loss": 0.5381, + "step": 3753 + }, + { + "epoch": 1.041042706600111, + "grad_norm": 0.19526107609272003, + "learning_rate": 1.1964564732166032e-05, + "loss": 0.514, + "step": 3754 + }, + { + "epoch": 1.0413200221852468, + "grad_norm": 0.1905699223279953, + "learning_rate": 1.195904821189234e-05, + "loss": 0.5321, + "step": 3755 + }, + { + "epoch": 1.0415973377703827, + "grad_norm": 0.203719362616539, + "learning_rate": 1.1953531797172673e-05, + "loss": 0.5565, + "step": 3756 + }, + { + "epoch": 1.0418746533555185, + "grad_norm": 0.1842213273048401, + "learning_rate": 1.1948015489083433e-05, + "loss": 0.5008, + "step": 3757 + }, + { + "epoch": 1.0421519689406544, + "grad_norm": 0.2012074589729309, + "learning_rate": 1.1942499288700997e-05, + "loss": 0.5199, + "step": 3758 + }, + { + "epoch": 1.0424292845257903, + "grad_norm": 0.19444574415683746, + "learning_rate": 1.193698319710172e-05, + "loss": 0.5228, + "step": 3759 + }, + { + "epoch": 1.0427066001109262, + "grad_norm": 0.1847216635942459, + "learning_rate": 1.1931467215361934e-05, + "loss": 0.4973, + "step": 3760 + }, + { + "epoch": 1.042983915696062, + "grad_norm": 0.2011430710554123, + "learning_rate": 1.192595134455796e-05, + "loss": 0.5167, + "step": 3761 + }, + { + "epoch": 1.043261231281198, + "grad_norm": 0.1845628172159195, + "learning_rate": 1.192043558576609e-05, + "loss": 0.5279, + "step": 3762 + }, + { + "epoch": 1.0435385468663338, + "grad_norm": 0.19030199944972992, + "learning_rate": 1.1914919940062585e-05, + "loss": 0.5217, + "step": 3763 + }, + { + "epoch": 1.0438158624514697, + "grad_norm": 0.1975383758544922, + "learning_rate": 1.190940440852371e-05, + "loss": 0.5218, + "step": 3764 + }, + { + "epoch": 1.0440931780366056, + "grad_norm": 0.19011008739471436, + "learning_rate": 1.190388899222568e-05, + "loss": 0.5273, + "step": 3765 + }, + { + "epoch": 1.0443704936217415, + "grad_norm": 0.18708908557891846, + "learning_rate": 1.1898373692244699e-05, + "loss": 0.5336, + "step": 3766 + }, + { + "epoch": 1.0446478092068774, + "grad_norm": 0.18800389766693115, + "learning_rate": 1.189285850965695e-05, + "loss": 0.5131, + "step": 3767 + }, + { + "epoch": 1.0449251247920133, + "grad_norm": 0.187372624874115, + "learning_rate": 1.1887343445538597e-05, + "loss": 0.5171, + "step": 3768 + }, + { + "epoch": 1.0452024403771492, + "grad_norm": 0.19775499403476715, + "learning_rate": 1.1881828500965765e-05, + "loss": 0.5291, + "step": 3769 + }, + { + "epoch": 1.045479755962285, + "grad_norm": 0.19341982901096344, + "learning_rate": 1.1876313677014569e-05, + "loss": 0.5253, + "step": 3770 + }, + { + "epoch": 1.045757071547421, + "grad_norm": 0.19665475189685822, + "learning_rate": 1.1870798974761102e-05, + "loss": 0.5123, + "step": 3771 + }, + { + "epoch": 1.0460343871325568, + "grad_norm": 0.19843867421150208, + "learning_rate": 1.1865284395281426e-05, + "loss": 0.4966, + "step": 3772 + }, + { + "epoch": 1.0463117027176927, + "grad_norm": 0.18300753831863403, + "learning_rate": 1.1859769939651582e-05, + "loss": 0.5091, + "step": 3773 + }, + { + "epoch": 1.0465890183028286, + "grad_norm": 0.1935308873653412, + "learning_rate": 1.1854255608947581e-05, + "loss": 0.5112, + "step": 3774 + }, + { + "epoch": 1.0468663338879645, + "grad_norm": 0.19752877950668335, + "learning_rate": 1.1848741404245421e-05, + "loss": 0.5062, + "step": 3775 + }, + { + "epoch": 1.0471436494731003, + "grad_norm": 0.18157391250133514, + "learning_rate": 1.1843227326621069e-05, + "loss": 0.5018, + "step": 3776 + }, + { + "epoch": 1.0474209650582362, + "grad_norm": 0.20472866296768188, + "learning_rate": 1.1837713377150463e-05, + "loss": 0.5358, + "step": 3777 + }, + { + "epoch": 1.0476982806433721, + "grad_norm": 0.1965123564004898, + "learning_rate": 1.1832199556909528e-05, + "loss": 0.5269, + "step": 3778 + }, + { + "epoch": 1.047975596228508, + "grad_norm": 0.19199901819229126, + "learning_rate": 1.1826685866974153e-05, + "loss": 0.4876, + "step": 3779 + }, + { + "epoch": 1.0482529118136439, + "grad_norm": 0.1838652491569519, + "learning_rate": 1.1821172308420203e-05, + "loss": 0.542, + "step": 3780 + }, + { + "epoch": 1.0485302273987798, + "grad_norm": 0.19129469990730286, + "learning_rate": 1.1815658882323519e-05, + "loss": 0.4951, + "step": 3781 + }, + { + "epoch": 1.0488075429839157, + "grad_norm": 0.18997079133987427, + "learning_rate": 1.181014558975992e-05, + "loss": 0.4899, + "step": 3782 + }, + { + "epoch": 1.0490848585690515, + "grad_norm": 0.19084186851978302, + "learning_rate": 1.1804632431805197e-05, + "loss": 0.5315, + "step": 3783 + }, + { + "epoch": 1.0493621741541874, + "grad_norm": 0.18761594593524933, + "learning_rate": 1.1799119409535101e-05, + "loss": 0.5063, + "step": 3784 + }, + { + "epoch": 1.0496394897393233, + "grad_norm": 0.1877257525920868, + "learning_rate": 1.1793606524025388e-05, + "loss": 0.506, + "step": 3785 + }, + { + "epoch": 1.0499168053244592, + "grad_norm": 0.19573919475078583, + "learning_rate": 1.1788093776351752e-05, + "loss": 0.5218, + "step": 3786 + }, + { + "epoch": 1.050194120909595, + "grad_norm": 0.19596537947654724, + "learning_rate": 1.1782581167589883e-05, + "loss": 0.5252, + "step": 3787 + }, + { + "epoch": 1.050471436494731, + "grad_norm": 0.19336049258708954, + "learning_rate": 1.1777068698815434e-05, + "loss": 0.5247, + "step": 3788 + }, + { + "epoch": 1.0507487520798668, + "grad_norm": 0.18527399003505707, + "learning_rate": 1.1771556371104039e-05, + "loss": 0.5034, + "step": 3789 + }, + { + "epoch": 1.0510260676650027, + "grad_norm": 0.18503272533416748, + "learning_rate": 1.1766044185531296e-05, + "loss": 0.4683, + "step": 3790 + }, + { + "epoch": 1.0513033832501386, + "grad_norm": 0.1880425363779068, + "learning_rate": 1.1760532143172772e-05, + "loss": 0.5317, + "step": 3791 + }, + { + "epoch": 1.0515806988352745, + "grad_norm": 0.18435950577259064, + "learning_rate": 1.1755020245104025e-05, + "loss": 0.5474, + "step": 3792 + }, + { + "epoch": 1.0518580144204104, + "grad_norm": 0.18995283544063568, + "learning_rate": 1.1749508492400564e-05, + "loss": 0.5066, + "step": 3793 + }, + { + "epoch": 1.0521353300055463, + "grad_norm": 0.18789511919021606, + "learning_rate": 1.1743996886137882e-05, + "loss": 0.5064, + "step": 3794 + }, + { + "epoch": 1.0524126455906821, + "grad_norm": 0.17998188734054565, + "learning_rate": 1.1738485427391431e-05, + "loss": 0.5077, + "step": 3795 + }, + { + "epoch": 1.052689961175818, + "grad_norm": 0.21583081781864166, + "learning_rate": 1.1732974117236656e-05, + "loss": 0.5048, + "step": 3796 + }, + { + "epoch": 1.052967276760954, + "grad_norm": 0.21214094758033752, + "learning_rate": 1.172746295674895e-05, + "loss": 0.5244, + "step": 3797 + }, + { + "epoch": 1.0532445923460898, + "grad_norm": 0.1979261040687561, + "learning_rate": 1.1721951947003689e-05, + "loss": 0.5102, + "step": 3798 + }, + { + "epoch": 1.0535219079312257, + "grad_norm": 0.1923878788948059, + "learning_rate": 1.1716441089076216e-05, + "loss": 0.5165, + "step": 3799 + }, + { + "epoch": 1.0537992235163616, + "grad_norm": 0.19521905481815338, + "learning_rate": 1.1710930384041852e-05, + "loss": 0.5411, + "step": 3800 + }, + { + "epoch": 1.0540765391014975, + "grad_norm": 0.1916571408510208, + "learning_rate": 1.1705419832975873e-05, + "loss": 0.5001, + "step": 3801 + }, + { + "epoch": 1.0543538546866333, + "grad_norm": 0.19345450401306152, + "learning_rate": 1.1699909436953532e-05, + "loss": 0.5371, + "step": 3802 + }, + { + "epoch": 1.0546311702717692, + "grad_norm": 0.2184910923242569, + "learning_rate": 1.1694399197050062e-05, + "loss": 0.5138, + "step": 3803 + }, + { + "epoch": 1.054908485856905, + "grad_norm": 0.19563588500022888, + "learning_rate": 1.1688889114340653e-05, + "loss": 0.51, + "step": 3804 + }, + { + "epoch": 1.055185801442041, + "grad_norm": 0.19763770699501038, + "learning_rate": 1.1683379189900465e-05, + "loss": 0.5183, + "step": 3805 + }, + { + "epoch": 1.0554631170271769, + "grad_norm": 0.19711549580097198, + "learning_rate": 1.1677869424804637e-05, + "loss": 0.4926, + "step": 3806 + }, + { + "epoch": 1.0557404326123128, + "grad_norm": 0.2019127458333969, + "learning_rate": 1.1672359820128265e-05, + "loss": 0.5459, + "step": 3807 + }, + { + "epoch": 1.0560177481974486, + "grad_norm": 0.2181474268436432, + "learning_rate": 1.166685037694642e-05, + "loss": 0.5211, + "step": 3808 + }, + { + "epoch": 1.0562950637825845, + "grad_norm": 0.20076914131641388, + "learning_rate": 1.1661341096334136e-05, + "loss": 0.5064, + "step": 3809 + }, + { + "epoch": 1.0565723793677204, + "grad_norm": 0.19240239262580872, + "learning_rate": 1.1655831979366427e-05, + "loss": 0.5198, + "step": 3810 + }, + { + "epoch": 1.0568496949528563, + "grad_norm": 0.19020958244800568, + "learning_rate": 1.1650323027118269e-05, + "loss": 0.5131, + "step": 3811 + }, + { + "epoch": 1.0571270105379922, + "grad_norm": 0.19757165014743805, + "learning_rate": 1.1644814240664594e-05, + "loss": 0.5075, + "step": 3812 + }, + { + "epoch": 1.057404326123128, + "grad_norm": 0.199832484126091, + "learning_rate": 1.1639305621080321e-05, + "loss": 0.4984, + "step": 3813 + }, + { + "epoch": 1.057681641708264, + "grad_norm": 0.19308264553546906, + "learning_rate": 1.1633797169440326e-05, + "loss": 0.5108, + "step": 3814 + }, + { + "epoch": 1.0579589572933998, + "grad_norm": 0.19443538784980774, + "learning_rate": 1.1628288886819453e-05, + "loss": 0.4896, + "step": 3815 + }, + { + "epoch": 1.0582362728785357, + "grad_norm": 0.21316394209861755, + "learning_rate": 1.1622780774292506e-05, + "loss": 0.5378, + "step": 3816 + }, + { + "epoch": 1.0585135884636716, + "grad_norm": 0.1971164047718048, + "learning_rate": 1.1617272832934282e-05, + "loss": 0.5282, + "step": 3817 + }, + { + "epoch": 1.0587909040488075, + "grad_norm": 0.19226108491420746, + "learning_rate": 1.161176506381951e-05, + "loss": 0.5409, + "step": 3818 + }, + { + "epoch": 1.0590682196339434, + "grad_norm": 0.17965470254421234, + "learning_rate": 1.1606257468022907e-05, + "loss": 0.5022, + "step": 3819 + }, + { + "epoch": 1.0593455352190793, + "grad_norm": 0.191642165184021, + "learning_rate": 1.1600750046619154e-05, + "loss": 0.5034, + "step": 3820 + }, + { + "epoch": 1.0596228508042151, + "grad_norm": 0.19330790638923645, + "learning_rate": 1.1595242800682893e-05, + "loss": 0.5155, + "step": 3821 + }, + { + "epoch": 1.059900166389351, + "grad_norm": 0.20066289603710175, + "learning_rate": 1.1589735731288725e-05, + "loss": 0.5337, + "step": 3822 + }, + { + "epoch": 1.060177481974487, + "grad_norm": 0.1891581416130066, + "learning_rate": 1.1584228839511242e-05, + "loss": 0.5264, + "step": 3823 + }, + { + "epoch": 1.0604547975596228, + "grad_norm": 0.19226865470409393, + "learning_rate": 1.1578722126424971e-05, + "loss": 0.4739, + "step": 3824 + }, + { + "epoch": 1.0607321131447587, + "grad_norm": 0.1854773312807083, + "learning_rate": 1.1573215593104425e-05, + "loss": 0.527, + "step": 3825 + }, + { + "epoch": 1.0610094287298946, + "grad_norm": 0.21180889010429382, + "learning_rate": 1.1567709240624067e-05, + "loss": 0.4695, + "step": 3826 + }, + { + "epoch": 1.0612867443150305, + "grad_norm": 0.1897820234298706, + "learning_rate": 1.1562203070058341e-05, + "loss": 0.4995, + "step": 3827 + }, + { + "epoch": 1.0615640599001663, + "grad_norm": 0.1939866989850998, + "learning_rate": 1.155669708248164e-05, + "loss": 0.5126, + "step": 3828 + }, + { + "epoch": 1.0618413754853022, + "grad_norm": 0.22312824428081512, + "learning_rate": 1.1551191278968328e-05, + "loss": 0.5291, + "step": 3829 + }, + { + "epoch": 1.062118691070438, + "grad_norm": 0.19337856769561768, + "learning_rate": 1.1545685660592741e-05, + "loss": 0.5188, + "step": 3830 + }, + { + "epoch": 1.062396006655574, + "grad_norm": 0.19467610120773315, + "learning_rate": 1.1540180228429164e-05, + "loss": 0.5388, + "step": 3831 + }, + { + "epoch": 1.0626733222407099, + "grad_norm": 0.19284315407276154, + "learning_rate": 1.1534674983551857e-05, + "loss": 0.5011, + "step": 3832 + }, + { + "epoch": 1.0629506378258458, + "grad_norm": 0.22472496330738068, + "learning_rate": 1.1529169927035028e-05, + "loss": 0.5105, + "step": 3833 + }, + { + "epoch": 1.0632279534109816, + "grad_norm": 0.18030276894569397, + "learning_rate": 1.1523665059952876e-05, + "loss": 0.515, + "step": 3834 + }, + { + "epoch": 1.0635052689961175, + "grad_norm": 0.19472011923789978, + "learning_rate": 1.1518160383379534e-05, + "loss": 0.5005, + "step": 3835 + }, + { + "epoch": 1.0637825845812534, + "grad_norm": 0.1929817497730255, + "learning_rate": 1.1512655898389115e-05, + "loss": 0.5114, + "step": 3836 + }, + { + "epoch": 1.0640599001663893, + "grad_norm": 0.1856139898300171, + "learning_rate": 1.150715160605569e-05, + "loss": 0.4847, + "step": 3837 + }, + { + "epoch": 1.0643372157515252, + "grad_norm": 0.1880098432302475, + "learning_rate": 1.1501647507453295e-05, + "loss": 0.5346, + "step": 3838 + }, + { + "epoch": 1.064614531336661, + "grad_norm": 0.18800011277198792, + "learning_rate": 1.149614360365592e-05, + "loss": 0.5113, + "step": 3839 + }, + { + "epoch": 1.064891846921797, + "grad_norm": 0.19980834424495697, + "learning_rate": 1.1490639895737523e-05, + "loss": 0.5214, + "step": 3840 + }, + { + "epoch": 1.0651691625069328, + "grad_norm": 0.18565940856933594, + "learning_rate": 1.1485136384772024e-05, + "loss": 0.5108, + "step": 3841 + }, + { + "epoch": 1.0654464780920687, + "grad_norm": 0.18709343671798706, + "learning_rate": 1.1479633071833306e-05, + "loss": 0.5071, + "step": 3842 + }, + { + "epoch": 1.0657237936772046, + "grad_norm": 0.19222643971443176, + "learning_rate": 1.1474129957995209e-05, + "loss": 0.5084, + "step": 3843 + }, + { + "epoch": 1.0660011092623405, + "grad_norm": 0.18951910734176636, + "learning_rate": 1.146862704433154e-05, + "loss": 0.4997, + "step": 3844 + }, + { + "epoch": 1.0662784248474764, + "grad_norm": 0.19320812821388245, + "learning_rate": 1.146312433191606e-05, + "loss": 0.5397, + "step": 3845 + }, + { + "epoch": 1.0665557404326123, + "grad_norm": 0.19874915480613708, + "learning_rate": 1.1457621821822492e-05, + "loss": 0.4976, + "step": 3846 + }, + { + "epoch": 1.0668330560177481, + "grad_norm": 0.19041554629802704, + "learning_rate": 1.1452119515124524e-05, + "loss": 0.5293, + "step": 3847 + }, + { + "epoch": 1.067110371602884, + "grad_norm": 0.18619757890701294, + "learning_rate": 1.1446617412895802e-05, + "loss": 0.5008, + "step": 3848 + }, + { + "epoch": 1.06738768718802, + "grad_norm": 0.18963642418384552, + "learning_rate": 1.1441115516209936e-05, + "loss": 0.5099, + "step": 3849 + }, + { + "epoch": 1.0676650027731558, + "grad_norm": 0.1950794905424118, + "learning_rate": 1.143561382614048e-05, + "loss": 0.5103, + "step": 3850 + }, + { + "epoch": 1.0679423183582917, + "grad_norm": 0.1896001100540161, + "learning_rate": 1.1430112343760971e-05, + "loss": 0.529, + "step": 3851 + }, + { + "epoch": 1.0682196339434276, + "grad_norm": 0.20062585175037384, + "learning_rate": 1.142461107014489e-05, + "loss": 0.5222, + "step": 3852 + }, + { + "epoch": 1.0684969495285634, + "grad_norm": 0.19482149183750153, + "learning_rate": 1.1419110006365682e-05, + "loss": 0.497, + "step": 3853 + }, + { + "epoch": 1.0687742651136993, + "grad_norm": 0.1913536936044693, + "learning_rate": 1.1413609153496742e-05, + "loss": 0.5164, + "step": 3854 + }, + { + "epoch": 1.0690515806988352, + "grad_norm": 0.20729845762252808, + "learning_rate": 1.140810851261145e-05, + "loss": 0.5594, + "step": 3855 + }, + { + "epoch": 1.069328896283971, + "grad_norm": 0.20025323331356049, + "learning_rate": 1.1402608084783112e-05, + "loss": 0.4986, + "step": 3856 + }, + { + "epoch": 1.069606211869107, + "grad_norm": 0.21821513772010803, + "learning_rate": 1.1397107871085009e-05, + "loss": 0.5312, + "step": 3857 + }, + { + "epoch": 1.0698835274542429, + "grad_norm": 0.19112320244312286, + "learning_rate": 1.1391607872590381e-05, + "loss": 0.4957, + "step": 3858 + }, + { + "epoch": 1.0701608430393788, + "grad_norm": 0.18980076909065247, + "learning_rate": 1.138610809037243e-05, + "loss": 0.5235, + "step": 3859 + }, + { + "epoch": 1.0704381586245146, + "grad_norm": 0.18163329362869263, + "learning_rate": 1.1380608525504298e-05, + "loss": 0.4966, + "step": 3860 + }, + { + "epoch": 1.0707154742096505, + "grad_norm": 0.19191473722457886, + "learning_rate": 1.1375109179059098e-05, + "loss": 0.5233, + "step": 3861 + }, + { + "epoch": 1.0709927897947864, + "grad_norm": 0.18972338736057281, + "learning_rate": 1.1369610052109902e-05, + "loss": 0.516, + "step": 3862 + }, + { + "epoch": 1.0712701053799223, + "grad_norm": 0.1891545057296753, + "learning_rate": 1.1364111145729737e-05, + "loss": 0.5029, + "step": 3863 + }, + { + "epoch": 1.0715474209650582, + "grad_norm": 0.1886187046766281, + "learning_rate": 1.1358612460991577e-05, + "loss": 0.5157, + "step": 3864 + }, + { + "epoch": 1.071824736550194, + "grad_norm": 0.2039126604795456, + "learning_rate": 1.1353113998968371e-05, + "loss": 0.535, + "step": 3865 + }, + { + "epoch": 1.07210205213533, + "grad_norm": 0.19889415800571442, + "learning_rate": 1.134761576073301e-05, + "loss": 0.5235, + "step": 3866 + }, + { + "epoch": 1.0723793677204658, + "grad_norm": 0.18273714184761047, + "learning_rate": 1.1342117747358344e-05, + "loss": 0.5083, + "step": 3867 + }, + { + "epoch": 1.0726566833056017, + "grad_norm": 0.19936375319957733, + "learning_rate": 1.1336619959917182e-05, + "loss": 0.508, + "step": 3868 + }, + { + "epoch": 1.0729339988907376, + "grad_norm": 0.19671058654785156, + "learning_rate": 1.133112239948229e-05, + "loss": 0.5239, + "step": 3869 + }, + { + "epoch": 1.0732113144758735, + "grad_norm": 0.18869206309318542, + "learning_rate": 1.132562506712639e-05, + "loss": 0.5204, + "step": 3870 + }, + { + "epoch": 1.0734886300610094, + "grad_norm": 0.18764182925224304, + "learning_rate": 1.132012796392215e-05, + "loss": 0.4993, + "step": 3871 + }, + { + "epoch": 1.0737659456461452, + "grad_norm": 0.21141819655895233, + "learning_rate": 1.1314631090942204e-05, + "loss": 0.4947, + "step": 3872 + }, + { + "epoch": 1.0740432612312811, + "grad_norm": 0.18649768829345703, + "learning_rate": 1.130913444925914e-05, + "loss": 0.5258, + "step": 3873 + }, + { + "epoch": 1.074320576816417, + "grad_norm": 0.18479731678962708, + "learning_rate": 1.1303638039945498e-05, + "loss": 0.4958, + "step": 3874 + }, + { + "epoch": 1.074597892401553, + "grad_norm": 0.20172236859798431, + "learning_rate": 1.1298141864073763e-05, + "loss": 0.5181, + "step": 3875 + }, + { + "epoch": 1.0748752079866888, + "grad_norm": 0.194131001830101, + "learning_rate": 1.1292645922716404e-05, + "loss": 0.538, + "step": 3876 + }, + { + "epoch": 1.0751525235718247, + "grad_norm": 0.18898829817771912, + "learning_rate": 1.1287150216945808e-05, + "loss": 0.4733, + "step": 3877 + }, + { + "epoch": 1.0754298391569606, + "grad_norm": 0.19593171775341034, + "learning_rate": 1.1281654747834337e-05, + "loss": 0.5304, + "step": 3878 + }, + { + "epoch": 1.0757071547420964, + "grad_norm": 0.1899636685848236, + "learning_rate": 1.1276159516454308e-05, + "loss": 0.5191, + "step": 3879 + }, + { + "epoch": 1.0759844703272323, + "grad_norm": 0.1919141411781311, + "learning_rate": 1.1270664523877982e-05, + "loss": 0.5044, + "step": 3880 + }, + { + "epoch": 1.0762617859123682, + "grad_norm": 0.1983211487531662, + "learning_rate": 1.1265169771177573e-05, + "loss": 0.4931, + "step": 3881 + }, + { + "epoch": 1.076539101497504, + "grad_norm": 0.1884281188249588, + "learning_rate": 1.1259675259425263e-05, + "loss": 0.5212, + "step": 3882 + }, + { + "epoch": 1.07681641708264, + "grad_norm": 0.18257342278957367, + "learning_rate": 1.125418098969317e-05, + "loss": 0.4964, + "step": 3883 + }, + { + "epoch": 1.0770937326677759, + "grad_norm": 0.19225247204303741, + "learning_rate": 1.1248686963053374e-05, + "loss": 0.5216, + "step": 3884 + }, + { + "epoch": 1.0773710482529117, + "grad_norm": 0.21373282372951508, + "learning_rate": 1.1243193180577902e-05, + "loss": 0.5106, + "step": 3885 + }, + { + "epoch": 1.0776483638380476, + "grad_norm": 0.21381765604019165, + "learning_rate": 1.123769964333874e-05, + "loss": 0.4909, + "step": 3886 + }, + { + "epoch": 1.0779256794231835, + "grad_norm": 0.18304717540740967, + "learning_rate": 1.1232206352407828e-05, + "loss": 0.5087, + "step": 3887 + }, + { + "epoch": 1.0782029950083194, + "grad_norm": 0.18388831615447998, + "learning_rate": 1.1226713308857036e-05, + "loss": 0.4925, + "step": 3888 + }, + { + "epoch": 1.0784803105934553, + "grad_norm": 0.19412866234779358, + "learning_rate": 1.1221220513758219e-05, + "loss": 0.5099, + "step": 3889 + }, + { + "epoch": 1.0787576261785912, + "grad_norm": 0.19013476371765137, + "learning_rate": 1.1215727968183159e-05, + "loss": 0.5227, + "step": 3890 + }, + { + "epoch": 1.079034941763727, + "grad_norm": 0.19106021523475647, + "learning_rate": 1.1210235673203601e-05, + "loss": 0.5123, + "step": 3891 + }, + { + "epoch": 1.079312257348863, + "grad_norm": 0.18381288647651672, + "learning_rate": 1.1204743629891225e-05, + "loss": 0.5182, + "step": 3892 + }, + { + "epoch": 1.0795895729339988, + "grad_norm": 0.19413931667804718, + "learning_rate": 1.1199251839317696e-05, + "loss": 0.5014, + "step": 3893 + }, + { + "epoch": 1.0798668885191347, + "grad_norm": 0.18920141458511353, + "learning_rate": 1.119376030255459e-05, + "loss": 0.5136, + "step": 3894 + }, + { + "epoch": 1.0801442041042706, + "grad_norm": 0.19880147278308868, + "learning_rate": 1.1188269020673456e-05, + "loss": 0.5539, + "step": 3895 + }, + { + "epoch": 1.0804215196894065, + "grad_norm": 0.1854148656129837, + "learning_rate": 1.118277799474579e-05, + "loss": 0.4911, + "step": 3896 + }, + { + "epoch": 1.0806988352745424, + "grad_norm": 0.18824362754821777, + "learning_rate": 1.1177287225843041e-05, + "loss": 0.4769, + "step": 3897 + }, + { + "epoch": 1.0809761508596782, + "grad_norm": 0.1937967985868454, + "learning_rate": 1.1171796715036597e-05, + "loss": 0.5147, + "step": 3898 + }, + { + "epoch": 1.0812534664448141, + "grad_norm": 0.18958862125873566, + "learning_rate": 1.11663064633978e-05, + "loss": 0.5197, + "step": 3899 + }, + { + "epoch": 1.08153078202995, + "grad_norm": 0.1861356496810913, + "learning_rate": 1.1160816471997951e-05, + "loss": 0.5102, + "step": 3900 + }, + { + "epoch": 1.081808097615086, + "grad_norm": 0.19085288047790527, + "learning_rate": 1.115532674190829e-05, + "loss": 0.4957, + "step": 3901 + }, + { + "epoch": 1.0820854132002218, + "grad_norm": 0.19016428291797638, + "learning_rate": 1.1149837274200004e-05, + "loss": 0.523, + "step": 3902 + }, + { + "epoch": 1.0823627287853577, + "grad_norm": 0.18524803221225739, + "learning_rate": 1.1144348069944244e-05, + "loss": 0.4999, + "step": 3903 + }, + { + "epoch": 1.0826400443704935, + "grad_norm": 0.18277236819267273, + "learning_rate": 1.1138859130212089e-05, + "loss": 0.5139, + "step": 3904 + }, + { + "epoch": 1.0829173599556294, + "grad_norm": 0.18841886520385742, + "learning_rate": 1.113337045607458e-05, + "loss": 0.5124, + "step": 3905 + }, + { + "epoch": 1.0831946755407653, + "grad_norm": 0.19280953705310822, + "learning_rate": 1.1127882048602703e-05, + "loss": 0.4953, + "step": 3906 + }, + { + "epoch": 1.0834719911259012, + "grad_norm": 0.19617757201194763, + "learning_rate": 1.1122393908867392e-05, + "loss": 0.5144, + "step": 3907 + }, + { + "epoch": 1.083749306711037, + "grad_norm": 0.19670793414115906, + "learning_rate": 1.1116906037939532e-05, + "loss": 0.5048, + "step": 3908 + }, + { + "epoch": 1.084026622296173, + "grad_norm": 0.18718746304512024, + "learning_rate": 1.1111418436889944e-05, + "loss": 0.5164, + "step": 3909 + }, + { + "epoch": 1.0843039378813089, + "grad_norm": 0.18242870271205902, + "learning_rate": 1.110593110678941e-05, + "loss": 0.5263, + "step": 3910 + }, + { + "epoch": 1.0845812534664447, + "grad_norm": 0.1876215934753418, + "learning_rate": 1.1100444048708653e-05, + "loss": 0.504, + "step": 3911 + }, + { + "epoch": 1.0848585690515806, + "grad_norm": 0.19080670177936554, + "learning_rate": 1.1094957263718345e-05, + "loss": 0.5236, + "step": 3912 + }, + { + "epoch": 1.0851358846367165, + "grad_norm": 0.194438174366951, + "learning_rate": 1.1089470752889093e-05, + "loss": 0.5175, + "step": 3913 + }, + { + "epoch": 1.0854132002218524, + "grad_norm": 0.19020043313503265, + "learning_rate": 1.1083984517291476e-05, + "loss": 0.5164, + "step": 3914 + }, + { + "epoch": 1.0856905158069883, + "grad_norm": 0.1921057403087616, + "learning_rate": 1.1078498557995995e-05, + "loss": 0.5049, + "step": 3915 + }, + { + "epoch": 1.0859678313921242, + "grad_norm": 0.19786351919174194, + "learning_rate": 1.1073012876073103e-05, + "loss": 0.5317, + "step": 3916 + }, + { + "epoch": 1.08624514697726, + "grad_norm": 0.1904228925704956, + "learning_rate": 1.106752747259321e-05, + "loss": 0.5635, + "step": 3917 + }, + { + "epoch": 1.086522462562396, + "grad_norm": 0.1912548840045929, + "learning_rate": 1.106204234862666e-05, + "loss": 0.5369, + "step": 3918 + }, + { + "epoch": 1.0867997781475318, + "grad_norm": 0.1982937604188919, + "learning_rate": 1.1056557505243746e-05, + "loss": 0.5137, + "step": 3919 + }, + { + "epoch": 1.0870770937326677, + "grad_norm": 0.19235102832317352, + "learning_rate": 1.1051072943514703e-05, + "loss": 0.5317, + "step": 3920 + }, + { + "epoch": 1.0873544093178036, + "grad_norm": 0.2086210548877716, + "learning_rate": 1.1045588664509717e-05, + "loss": 0.4916, + "step": 3921 + }, + { + "epoch": 1.0876317249029395, + "grad_norm": 0.19181272387504578, + "learning_rate": 1.104010466929892e-05, + "loss": 0.5142, + "step": 3922 + }, + { + "epoch": 1.0879090404880754, + "grad_norm": 0.18342861533164978, + "learning_rate": 1.1034620958952377e-05, + "loss": 0.5328, + "step": 3923 + }, + { + "epoch": 1.0881863560732112, + "grad_norm": 0.18724067509174347, + "learning_rate": 1.1029137534540113e-05, + "loss": 0.4951, + "step": 3924 + }, + { + "epoch": 1.0884636716583471, + "grad_norm": 0.19535782933235168, + "learning_rate": 1.1023654397132087e-05, + "loss": 0.498, + "step": 3925 + }, + { + "epoch": 1.088740987243483, + "grad_norm": 0.24757403135299683, + "learning_rate": 1.10181715477982e-05, + "loss": 0.4967, + "step": 3926 + }, + { + "epoch": 1.089018302828619, + "grad_norm": 0.1959121972322464, + "learning_rate": 1.1012688987608303e-05, + "loss": 0.5174, + "step": 3927 + }, + { + "epoch": 1.0892956184137548, + "grad_norm": 0.20022380352020264, + "learning_rate": 1.1007206717632193e-05, + "loss": 0.5224, + "step": 3928 + }, + { + "epoch": 1.0895729339988907, + "grad_norm": 0.18869644403457642, + "learning_rate": 1.1001724738939606e-05, + "loss": 0.5269, + "step": 3929 + }, + { + "epoch": 1.0898502495840265, + "grad_norm": 0.19060127437114716, + "learning_rate": 1.099624305260021e-05, + "loss": 0.517, + "step": 3930 + }, + { + "epoch": 1.0901275651691624, + "grad_norm": 0.18450330197811127, + "learning_rate": 1.0990761659683643e-05, + "loss": 0.5134, + "step": 3931 + }, + { + "epoch": 1.0904048807542983, + "grad_norm": 0.20092949271202087, + "learning_rate": 1.0985280561259462e-05, + "loss": 0.5229, + "step": 3932 + }, + { + "epoch": 1.0906821963394342, + "grad_norm": 0.187063530087471, + "learning_rate": 1.0979799758397173e-05, + "loss": 0.5074, + "step": 3933 + }, + { + "epoch": 1.09095951192457, + "grad_norm": 0.21675853431224823, + "learning_rate": 1.0974319252166226e-05, + "loss": 0.5443, + "step": 3934 + }, + { + "epoch": 1.091236827509706, + "grad_norm": 0.19182507693767548, + "learning_rate": 1.0968839043636021e-05, + "loss": 0.5165, + "step": 3935 + }, + { + "epoch": 1.0915141430948418, + "grad_norm": 0.19931842386722565, + "learning_rate": 1.0963359133875884e-05, + "loss": 0.5362, + "step": 3936 + }, + { + "epoch": 1.0917914586799777, + "grad_norm": 0.19970446825027466, + "learning_rate": 1.0957879523955087e-05, + "loss": 0.5175, + "step": 3937 + }, + { + "epoch": 1.0920687742651136, + "grad_norm": 0.18823319673538208, + "learning_rate": 1.0952400214942857e-05, + "loss": 0.5184, + "step": 3938 + }, + { + "epoch": 1.0923460898502495, + "grad_norm": 0.19610384106636047, + "learning_rate": 1.094692120790835e-05, + "loss": 0.5194, + "step": 3939 + }, + { + "epoch": 1.0926234054353854, + "grad_norm": 0.2017711102962494, + "learning_rate": 1.0941442503920664e-05, + "loss": 0.5254, + "step": 3940 + }, + { + "epoch": 1.0929007210205213, + "grad_norm": 0.18085281550884247, + "learning_rate": 1.0935964104048834e-05, + "loss": 0.4865, + "step": 3941 + }, + { + "epoch": 1.0931780366056572, + "grad_norm": 0.18639199435710907, + "learning_rate": 1.0930486009361847e-05, + "loss": 0.4902, + "step": 3942 + }, + { + "epoch": 1.093455352190793, + "grad_norm": 0.1943678855895996, + "learning_rate": 1.0925008220928624e-05, + "loss": 0.5216, + "step": 3943 + }, + { + "epoch": 1.093732667775929, + "grad_norm": 0.19227345287799835, + "learning_rate": 1.0919530739818022e-05, + "loss": 0.4983, + "step": 3944 + }, + { + "epoch": 1.0940099833610648, + "grad_norm": 0.18952839076519012, + "learning_rate": 1.091405356709885e-05, + "loss": 0.496, + "step": 3945 + }, + { + "epoch": 1.0942872989462007, + "grad_norm": 0.20736488699913025, + "learning_rate": 1.090857670383985e-05, + "loss": 0.5296, + "step": 3946 + }, + { + "epoch": 1.0945646145313366, + "grad_norm": 0.1944408416748047, + "learning_rate": 1.090310015110969e-05, + "loss": 0.5024, + "step": 3947 + }, + { + "epoch": 1.0948419301164725, + "grad_norm": 0.2053939253091812, + "learning_rate": 1.0897623909977006e-05, + "loss": 0.4962, + "step": 3948 + }, + { + "epoch": 1.0951192457016083, + "grad_norm": 0.200760155916214, + "learning_rate": 1.089214798151035e-05, + "loss": 0.5123, + "step": 3949 + }, + { + "epoch": 1.0953965612867442, + "grad_norm": 0.19097808003425598, + "learning_rate": 1.0886672366778224e-05, + "loss": 0.4831, + "step": 3950 + }, + { + "epoch": 1.0956738768718801, + "grad_norm": 0.1900891810655594, + "learning_rate": 1.0881197066849055e-05, + "loss": 0.5154, + "step": 3951 + }, + { + "epoch": 1.095951192457016, + "grad_norm": 0.19628465175628662, + "learning_rate": 1.0875722082791237e-05, + "loss": 0.4804, + "step": 3952 + }, + { + "epoch": 1.0962285080421519, + "grad_norm": 0.18627870082855225, + "learning_rate": 1.0870247415673072e-05, + "loss": 0.4819, + "step": 3953 + }, + { + "epoch": 1.0965058236272878, + "grad_norm": 0.1861443817615509, + "learning_rate": 1.0864773066562814e-05, + "loss": 0.4924, + "step": 3954 + }, + { + "epoch": 1.0967831392124237, + "grad_norm": 0.2025613784790039, + "learning_rate": 1.0859299036528657e-05, + "loss": 0.5246, + "step": 3955 + }, + { + "epoch": 1.0970604547975595, + "grad_norm": 0.18845801055431366, + "learning_rate": 1.0853825326638731e-05, + "loss": 0.499, + "step": 3956 + }, + { + "epoch": 1.0973377703826954, + "grad_norm": 0.1874900460243225, + "learning_rate": 1.0848351937961094e-05, + "loss": 0.5166, + "step": 3957 + }, + { + "epoch": 1.0976150859678313, + "grad_norm": 0.20147672295570374, + "learning_rate": 1.0842878871563752e-05, + "loss": 0.5236, + "step": 3958 + }, + { + "epoch": 1.0978924015529672, + "grad_norm": 0.2169303447008133, + "learning_rate": 1.083740612851465e-05, + "loss": 0.4983, + "step": 3959 + }, + { + "epoch": 1.098169717138103, + "grad_norm": 0.183817520737648, + "learning_rate": 1.083193370988166e-05, + "loss": 0.5103, + "step": 3960 + }, + { + "epoch": 1.098447032723239, + "grad_norm": 0.2015179991722107, + "learning_rate": 1.0826461616732596e-05, + "loss": 0.4943, + "step": 3961 + }, + { + "epoch": 1.0987243483083748, + "grad_norm": 0.19928953051567078, + "learning_rate": 1.0820989850135216e-05, + "loss": 0.5246, + "step": 3962 + }, + { + "epoch": 1.0990016638935107, + "grad_norm": 0.18814896047115326, + "learning_rate": 1.0815518411157198e-05, + "loss": 0.5284, + "step": 3963 + }, + { + "epoch": 1.0992789794786466, + "grad_norm": 0.20342199504375458, + "learning_rate": 1.0810047300866166e-05, + "loss": 0.5014, + "step": 3964 + }, + { + "epoch": 1.0995562950637825, + "grad_norm": 0.19590109586715698, + "learning_rate": 1.0804576520329679e-05, + "loss": 0.5148, + "step": 3965 + }, + { + "epoch": 1.0998336106489184, + "grad_norm": 0.1944301873445511, + "learning_rate": 1.0799106070615235e-05, + "loss": 0.4809, + "step": 3966 + }, + { + "epoch": 1.1001109262340543, + "grad_norm": 0.19030778110027313, + "learning_rate": 1.0793635952790264e-05, + "loss": 0.5093, + "step": 3967 + }, + { + "epoch": 1.1003882418191901, + "grad_norm": 0.1883806735277176, + "learning_rate": 1.0788166167922118e-05, + "loss": 0.5018, + "step": 3968 + }, + { + "epoch": 1.100665557404326, + "grad_norm": 0.21474680304527283, + "learning_rate": 1.0782696717078117e-05, + "loss": 0.5091, + "step": 3969 + }, + { + "epoch": 1.100942872989462, + "grad_norm": 0.24823465943336487, + "learning_rate": 1.0777227601325482e-05, + "loss": 0.5073, + "step": 3970 + }, + { + "epoch": 1.1012201885745978, + "grad_norm": 0.18175634741783142, + "learning_rate": 1.0771758821731386e-05, + "loss": 0.4993, + "step": 3971 + }, + { + "epoch": 1.1014975041597337, + "grad_norm": 0.18903425335884094, + "learning_rate": 1.0766290379362928e-05, + "loss": 0.5283, + "step": 3972 + }, + { + "epoch": 1.1017748197448696, + "grad_norm": 0.19946777820587158, + "learning_rate": 1.0760822275287159e-05, + "loss": 0.5284, + "step": 3973 + }, + { + "epoch": 1.1020521353300055, + "grad_norm": 0.19179320335388184, + "learning_rate": 1.075535451057104e-05, + "loss": 0.5264, + "step": 3974 + }, + { + "epoch": 1.1023294509151413, + "grad_norm": 0.18837237358093262, + "learning_rate": 1.0749887086281474e-05, + "loss": 0.5051, + "step": 3975 + }, + { + "epoch": 1.1026067665002772, + "grad_norm": 0.19744771718978882, + "learning_rate": 1.0744420003485312e-05, + "loss": 0.5313, + "step": 3976 + }, + { + "epoch": 1.102884082085413, + "grad_norm": 0.20925208926200867, + "learning_rate": 1.0738953263249319e-05, + "loss": 0.5033, + "step": 3977 + }, + { + "epoch": 1.103161397670549, + "grad_norm": 0.19691763818264008, + "learning_rate": 1.0733486866640203e-05, + "loss": 0.5205, + "step": 3978 + }, + { + "epoch": 1.1034387132556849, + "grad_norm": 0.19739267230033875, + "learning_rate": 1.07280208147246e-05, + "loss": 0.4742, + "step": 3979 + }, + { + "epoch": 1.1037160288408208, + "grad_norm": 0.1945018321275711, + "learning_rate": 1.0722555108569085e-05, + "loss": 0.4977, + "step": 3980 + }, + { + "epoch": 1.1039933444259566, + "grad_norm": 0.19838854670524597, + "learning_rate": 1.071708974924016e-05, + "loss": 0.5137, + "step": 3981 + }, + { + "epoch": 1.1042706600110925, + "grad_norm": 0.21607692539691925, + "learning_rate": 1.071162473780426e-05, + "loss": 0.5166, + "step": 3982 + }, + { + "epoch": 1.1045479755962284, + "grad_norm": 0.17799584567546844, + "learning_rate": 1.0706160075327761e-05, + "loss": 0.4938, + "step": 3983 + }, + { + "epoch": 1.1048252911813643, + "grad_norm": 0.1956464648246765, + "learning_rate": 1.0700695762876958e-05, + "loss": 0.5244, + "step": 3984 + }, + { + "epoch": 1.1051026067665002, + "grad_norm": 0.18592742085456848, + "learning_rate": 1.0695231801518083e-05, + "loss": 0.466, + "step": 3985 + }, + { + "epoch": 1.105379922351636, + "grad_norm": 0.19382350146770477, + "learning_rate": 1.0689768192317296e-05, + "loss": 0.4924, + "step": 3986 + }, + { + "epoch": 1.105657237936772, + "grad_norm": 0.2073841392993927, + "learning_rate": 1.0684304936340697e-05, + "loss": 0.5109, + "step": 3987 + }, + { + "epoch": 1.1059345535219078, + "grad_norm": 0.19084088504314423, + "learning_rate": 1.0678842034654315e-05, + "loss": 0.5137, + "step": 3988 + }, + { + "epoch": 1.1062118691070437, + "grad_norm": 0.1962527185678482, + "learning_rate": 1.0673379488324095e-05, + "loss": 0.5245, + "step": 3989 + }, + { + "epoch": 1.1064891846921796, + "grad_norm": 0.18985575437545776, + "learning_rate": 1.066791729841594e-05, + "loss": 0.4747, + "step": 3990 + }, + { + "epoch": 1.1067665002773155, + "grad_norm": 0.2049971967935562, + "learning_rate": 1.0662455465995657e-05, + "loss": 0.4954, + "step": 3991 + }, + { + "epoch": 1.1070438158624514, + "grad_norm": 0.19408412277698517, + "learning_rate": 1.0656993992128999e-05, + "loss": 0.5252, + "step": 3992 + }, + { + "epoch": 1.1073211314475873, + "grad_norm": 0.19346264004707336, + "learning_rate": 1.0651532877881639e-05, + "loss": 0.4995, + "step": 3993 + }, + { + "epoch": 1.1075984470327231, + "grad_norm": 0.19205859303474426, + "learning_rate": 1.0646072124319193e-05, + "loss": 0.5091, + "step": 3994 + }, + { + "epoch": 1.107875762617859, + "grad_norm": 0.20613279938697815, + "learning_rate": 1.0640611732507192e-05, + "loss": 0.5191, + "step": 3995 + }, + { + "epoch": 1.108153078202995, + "grad_norm": 0.1821645051240921, + "learning_rate": 1.0635151703511104e-05, + "loss": 0.493, + "step": 3996 + }, + { + "epoch": 1.1084303937881308, + "grad_norm": 0.20445798337459564, + "learning_rate": 1.062969203839633e-05, + "loss": 0.5365, + "step": 3997 + }, + { + "epoch": 1.1087077093732667, + "grad_norm": 0.19484387338161469, + "learning_rate": 1.062423273822819e-05, + "loss": 0.5099, + "step": 3998 + }, + { + "epoch": 1.1089850249584026, + "grad_norm": 0.19972245395183563, + "learning_rate": 1.0618773804071943e-05, + "loss": 0.5233, + "step": 3999 + }, + { + "epoch": 1.1092623405435384, + "grad_norm": 0.20230787992477417, + "learning_rate": 1.0613315236992766e-05, + "loss": 0.524, + "step": 4000 + }, + { + "epoch": 1.1095396561286743, + "grad_norm": 0.18941400945186615, + "learning_rate": 1.0607857038055774e-05, + "loss": 0.4988, + "step": 4001 + }, + { + "epoch": 1.1098169717138102, + "grad_norm": 0.1921200454235077, + "learning_rate": 1.0602399208326006e-05, + "loss": 0.5355, + "step": 4002 + }, + { + "epoch": 1.110094287298946, + "grad_norm": 0.1832018345594406, + "learning_rate": 1.0596941748868426e-05, + "loss": 0.499, + "step": 4003 + }, + { + "epoch": 1.110371602884082, + "grad_norm": 0.19228579103946686, + "learning_rate": 1.0591484660747933e-05, + "loss": 0.5189, + "step": 4004 + }, + { + "epoch": 1.1106489184692179, + "grad_norm": 0.19055628776550293, + "learning_rate": 1.0586027945029352e-05, + "loss": 0.5012, + "step": 4005 + }, + { + "epoch": 1.1109262340543538, + "grad_norm": 0.19514985382556915, + "learning_rate": 1.0580571602777425e-05, + "loss": 0.4883, + "step": 4006 + }, + { + "epoch": 1.1112035496394896, + "grad_norm": 0.1857844740152359, + "learning_rate": 1.057511563505683e-05, + "loss": 0.4863, + "step": 4007 + }, + { + "epoch": 1.1114808652246255, + "grad_norm": 0.19990986585617065, + "learning_rate": 1.0569660042932177e-05, + "loss": 0.5079, + "step": 4008 + }, + { + "epoch": 1.1117581808097614, + "grad_norm": 0.1960282176733017, + "learning_rate": 1.0564204827467994e-05, + "loss": 0.5277, + "step": 4009 + }, + { + "epoch": 1.1120354963948973, + "grad_norm": 0.20044219493865967, + "learning_rate": 1.0558749989728729e-05, + "loss": 0.5121, + "step": 4010 + }, + { + "epoch": 1.1123128119800332, + "grad_norm": 0.19683614373207092, + "learning_rate": 1.0553295530778784e-05, + "loss": 0.4932, + "step": 4011 + }, + { + "epoch": 1.112590127565169, + "grad_norm": 0.1851675808429718, + "learning_rate": 1.0547841451682453e-05, + "loss": 0.4994, + "step": 4012 + }, + { + "epoch": 1.112867443150305, + "grad_norm": 0.20108520984649658, + "learning_rate": 1.0542387753503974e-05, + "loss": 0.5352, + "step": 4013 + }, + { + "epoch": 1.1131447587354408, + "grad_norm": 0.18748925626277924, + "learning_rate": 1.0536934437307514e-05, + "loss": 0.4929, + "step": 4014 + }, + { + "epoch": 1.1134220743205767, + "grad_norm": 0.183696448802948, + "learning_rate": 1.0531481504157153e-05, + "loss": 0.5099, + "step": 4015 + }, + { + "epoch": 1.1136993899057126, + "grad_norm": 0.20530031621456146, + "learning_rate": 1.0526028955116912e-05, + "loss": 0.5101, + "step": 4016 + }, + { + "epoch": 1.1139767054908485, + "grad_norm": 0.20076102018356323, + "learning_rate": 1.0520576791250711e-05, + "loss": 0.533, + "step": 4017 + }, + { + "epoch": 1.1142540210759844, + "grad_norm": 0.2018887996673584, + "learning_rate": 1.0515125013622428e-05, + "loss": 0.5015, + "step": 4018 + }, + { + "epoch": 1.1145313366611203, + "grad_norm": 0.1974845677614212, + "learning_rate": 1.0509673623295843e-05, + "loss": 0.4778, + "step": 4019 + }, + { + "epoch": 1.1148086522462561, + "grad_norm": 0.18530985713005066, + "learning_rate": 1.0504222621334664e-05, + "loss": 0.5003, + "step": 4020 + }, + { + "epoch": 1.115085967831392, + "grad_norm": 0.1943405568599701, + "learning_rate": 1.0498772008802531e-05, + "loss": 0.5041, + "step": 4021 + }, + { + "epoch": 1.115363283416528, + "grad_norm": 0.19365082681179047, + "learning_rate": 1.0493321786763003e-05, + "loss": 0.5156, + "step": 4022 + }, + { + "epoch": 1.1156405990016638, + "grad_norm": 0.20058946311473846, + "learning_rate": 1.0487871956279558e-05, + "loss": 0.4925, + "step": 4023 + }, + { + "epoch": 1.1159179145867997, + "grad_norm": 0.1888214498758316, + "learning_rate": 1.0482422518415602e-05, + "loss": 0.5321, + "step": 4024 + }, + { + "epoch": 1.1161952301719356, + "grad_norm": 0.19549217820167542, + "learning_rate": 1.047697347423447e-05, + "loss": 0.5257, + "step": 4025 + }, + { + "epoch": 1.1164725457570714, + "grad_norm": 0.19175288081169128, + "learning_rate": 1.0471524824799413e-05, + "loss": 0.4774, + "step": 4026 + }, + { + "epoch": 1.1167498613422073, + "grad_norm": 0.19548679888248444, + "learning_rate": 1.04660765711736e-05, + "loss": 0.4782, + "step": 4027 + }, + { + "epoch": 1.1170271769273432, + "grad_norm": 0.20252208411693573, + "learning_rate": 1.0460628714420145e-05, + "loss": 0.5024, + "step": 4028 + }, + { + "epoch": 1.117304492512479, + "grad_norm": 0.18733076751232147, + "learning_rate": 1.0455181255602056e-05, + "loss": 0.4894, + "step": 4029 + }, + { + "epoch": 1.117581808097615, + "grad_norm": 0.19177298247814178, + "learning_rate": 1.0449734195782281e-05, + "loss": 0.5067, + "step": 4030 + }, + { + "epoch": 1.1178591236827509, + "grad_norm": 0.18974518775939941, + "learning_rate": 1.0444287536023681e-05, + "loss": 0.4863, + "step": 4031 + }, + { + "epoch": 1.1181364392678868, + "grad_norm": 0.20173045992851257, + "learning_rate": 1.0438841277389055e-05, + "loss": 0.507, + "step": 4032 + }, + { + "epoch": 1.1184137548530226, + "grad_norm": 0.189706489443779, + "learning_rate": 1.0433395420941101e-05, + "loss": 0.495, + "step": 4033 + }, + { + "epoch": 1.1186910704381585, + "grad_norm": 0.20018784701824188, + "learning_rate": 1.0427949967742452e-05, + "loss": 0.5267, + "step": 4034 + }, + { + "epoch": 1.1189683860232944, + "grad_norm": 0.18162201344966888, + "learning_rate": 1.0422504918855664e-05, + "loss": 0.5028, + "step": 4035 + }, + { + "epoch": 1.1192457016084303, + "grad_norm": 0.2000206708908081, + "learning_rate": 1.041706027534321e-05, + "loss": 0.5162, + "step": 4036 + }, + { + "epoch": 1.1195230171935662, + "grad_norm": 0.19805558025836945, + "learning_rate": 1.0411616038267486e-05, + "loss": 0.4861, + "step": 4037 + }, + { + "epoch": 1.119800332778702, + "grad_norm": 0.19431713223457336, + "learning_rate": 1.0406172208690797e-05, + "loss": 0.4991, + "step": 4038 + }, + { + "epoch": 1.120077648363838, + "grad_norm": 0.2039175033569336, + "learning_rate": 1.0400728787675387e-05, + "loss": 0.539, + "step": 4039 + }, + { + "epoch": 1.1203549639489738, + "grad_norm": 0.19760684669017792, + "learning_rate": 1.039528577628341e-05, + "loss": 0.5149, + "step": 4040 + }, + { + "epoch": 1.1206322795341097, + "grad_norm": 0.18955372273921967, + "learning_rate": 1.038984317557694e-05, + "loss": 0.4901, + "step": 4041 + }, + { + "epoch": 1.1209095951192456, + "grad_norm": 0.19011299312114716, + "learning_rate": 1.0384400986617977e-05, + "loss": 0.518, + "step": 4042 + }, + { + "epoch": 1.1211869107043815, + "grad_norm": 0.18622393906116486, + "learning_rate": 1.0378959210468434e-05, + "loss": 0.4912, + "step": 4043 + }, + { + "epoch": 1.1214642262895174, + "grad_norm": 0.19130538403987885, + "learning_rate": 1.0373517848190143e-05, + "loss": 0.4952, + "step": 4044 + }, + { + "epoch": 1.1217415418746532, + "grad_norm": 0.19988033175468445, + "learning_rate": 1.0368076900844856e-05, + "loss": 0.4964, + "step": 4045 + }, + { + "epoch": 1.1220188574597891, + "grad_norm": 0.20225760340690613, + "learning_rate": 1.0362636369494254e-05, + "loss": 0.5338, + "step": 4046 + }, + { + "epoch": 1.122296173044925, + "grad_norm": 0.2051144242286682, + "learning_rate": 1.0357196255199928e-05, + "loss": 0.5251, + "step": 4047 + }, + { + "epoch": 1.122573488630061, + "grad_norm": 0.18915054202079773, + "learning_rate": 1.0351756559023374e-05, + "loss": 0.4812, + "step": 4048 + }, + { + "epoch": 1.1228508042151968, + "grad_norm": 0.20346355438232422, + "learning_rate": 1.0346317282026045e-05, + "loss": 0.5366, + "step": 4049 + }, + { + "epoch": 1.1231281198003327, + "grad_norm": 0.1956186592578888, + "learning_rate": 1.0340878425269269e-05, + "loss": 0.5396, + "step": 4050 + }, + { + "epoch": 1.1234054353854686, + "grad_norm": 0.18970191478729248, + "learning_rate": 1.0335439989814316e-05, + "loss": 0.4687, + "step": 4051 + }, + { + "epoch": 1.1236827509706044, + "grad_norm": 0.19612254202365875, + "learning_rate": 1.033000197672237e-05, + "loss": 0.5025, + "step": 4052 + }, + { + "epoch": 1.1239600665557403, + "grad_norm": 0.19224813580513, + "learning_rate": 1.0324564387054535e-05, + "loss": 0.494, + "step": 4053 + }, + { + "epoch": 1.1242373821408762, + "grad_norm": 0.1851874589920044, + "learning_rate": 1.0319127221871823e-05, + "loss": 0.521, + "step": 4054 + }, + { + "epoch": 1.124514697726012, + "grad_norm": 0.19579973816871643, + "learning_rate": 1.0313690482235168e-05, + "loss": 0.5012, + "step": 4055 + }, + { + "epoch": 1.124792013311148, + "grad_norm": 0.21103787422180176, + "learning_rate": 1.0308254169205428e-05, + "loss": 0.4969, + "step": 4056 + }, + { + "epoch": 1.1250693288962839, + "grad_norm": 0.1894204467535019, + "learning_rate": 1.030281828384337e-05, + "loss": 0.494, + "step": 4057 + }, + { + "epoch": 1.1253466444814197, + "grad_norm": 0.1914178431034088, + "learning_rate": 1.0297382827209679e-05, + "loss": 0.5094, + "step": 4058 + }, + { + "epoch": 1.1256239600665556, + "grad_norm": 0.18707357347011566, + "learning_rate": 1.0291947800364948e-05, + "loss": 0.5023, + "step": 4059 + }, + { + "epoch": 1.1259012756516915, + "grad_norm": 0.19480562210083008, + "learning_rate": 1.0286513204369712e-05, + "loss": 0.5154, + "step": 4060 + }, + { + "epoch": 1.1261785912368274, + "grad_norm": 0.18776677548885345, + "learning_rate": 1.0281079040284392e-05, + "loss": 0.5143, + "step": 4061 + }, + { + "epoch": 1.1264559068219633, + "grad_norm": 0.18544964492321014, + "learning_rate": 1.0275645309169337e-05, + "loss": 0.4906, + "step": 4062 + }, + { + "epoch": 1.1267332224070992, + "grad_norm": 0.18632815778255463, + "learning_rate": 1.0270212012084817e-05, + "loss": 0.5276, + "step": 4063 + }, + { + "epoch": 1.127010537992235, + "grad_norm": 0.206988126039505, + "learning_rate": 1.0264779150091014e-05, + "loss": 0.5146, + "step": 4064 + }, + { + "epoch": 1.127287853577371, + "grad_norm": 0.19988800585269928, + "learning_rate": 1.0259346724248018e-05, + "loss": 0.4973, + "step": 4065 + }, + { + "epoch": 1.1275651691625068, + "grad_norm": 0.18873746693134308, + "learning_rate": 1.0253914735615838e-05, + "loss": 0.5136, + "step": 4066 + }, + { + "epoch": 1.1278424847476427, + "grad_norm": 0.19661124050617218, + "learning_rate": 1.0248483185254403e-05, + "loss": 0.5418, + "step": 4067 + }, + { + "epoch": 1.1281198003327786, + "grad_norm": 0.2008582353591919, + "learning_rate": 1.0243052074223555e-05, + "loss": 0.508, + "step": 4068 + }, + { + "epoch": 1.1283971159179145, + "grad_norm": 0.19019687175750732, + "learning_rate": 1.023762140358304e-05, + "loss": 0.5072, + "step": 4069 + }, + { + "epoch": 1.1286744315030504, + "grad_norm": 0.19475378096103668, + "learning_rate": 1.0232191174392532e-05, + "loss": 0.5282, + "step": 4070 + }, + { + "epoch": 1.1289517470881862, + "grad_norm": 0.18883496522903442, + "learning_rate": 1.0226761387711612e-05, + "loss": 0.5092, + "step": 4071 + }, + { + "epoch": 1.1292290626733221, + "grad_norm": 0.1904006004333496, + "learning_rate": 1.0221332044599768e-05, + "loss": 0.5302, + "step": 4072 + }, + { + "epoch": 1.129506378258458, + "grad_norm": 0.19037893414497375, + "learning_rate": 1.0215903146116417e-05, + "loss": 0.4916, + "step": 4073 + }, + { + "epoch": 1.129783693843594, + "grad_norm": 0.2003484070301056, + "learning_rate": 1.021047469332088e-05, + "loss": 0.5259, + "step": 4074 + }, + { + "epoch": 1.1300610094287298, + "grad_norm": 0.18021680414676666, + "learning_rate": 1.0205046687272392e-05, + "loss": 0.5038, + "step": 4075 + }, + { + "epoch": 1.1303383250138657, + "grad_norm": 0.19366797804832458, + "learning_rate": 1.0199619129030093e-05, + "loss": 0.5091, + "step": 4076 + }, + { + "epoch": 1.1306156405990015, + "grad_norm": 0.19460560381412506, + "learning_rate": 1.0194192019653053e-05, + "loss": 0.5071, + "step": 4077 + }, + { + "epoch": 1.1308929561841374, + "grad_norm": 0.18521569669246674, + "learning_rate": 1.018876536020024e-05, + "loss": 0.5031, + "step": 4078 + }, + { + "epoch": 1.1311702717692733, + "grad_norm": 0.18586260080337524, + "learning_rate": 1.018333915173054e-05, + "loss": 0.4893, + "step": 4079 + }, + { + "epoch": 1.1314475873544092, + "grad_norm": 0.20619140565395355, + "learning_rate": 1.0177913395302748e-05, + "loss": 0.5124, + "step": 4080 + }, + { + "epoch": 1.131724902939545, + "grad_norm": 0.2014811784029007, + "learning_rate": 1.0172488091975583e-05, + "loss": 0.5045, + "step": 4081 + }, + { + "epoch": 1.132002218524681, + "grad_norm": 0.2058635950088501, + "learning_rate": 1.0167063242807654e-05, + "loss": 0.4832, + "step": 4082 + }, + { + "epoch": 1.1322795341098169, + "grad_norm": 0.19578316807746887, + "learning_rate": 1.016163884885749e-05, + "loss": 0.501, + "step": 4083 + }, + { + "epoch": 1.1325568496949527, + "grad_norm": 0.1896321028470993, + "learning_rate": 1.0156214911183546e-05, + "loss": 0.5113, + "step": 4084 + }, + { + "epoch": 1.1328341652800886, + "grad_norm": 0.19131551682949066, + "learning_rate": 1.0150791430844172e-05, + "loss": 0.502, + "step": 4085 + }, + { + "epoch": 1.1331114808652245, + "grad_norm": 0.20685172080993652, + "learning_rate": 1.0145368408897624e-05, + "loss": 0.4907, + "step": 4086 + }, + { + "epoch": 1.1333887964503604, + "grad_norm": 0.21301031112670898, + "learning_rate": 1.0139945846402091e-05, + "loss": 0.5013, + "step": 4087 + }, + { + "epoch": 1.1336661120354963, + "grad_norm": 0.1879834532737732, + "learning_rate": 1.013452374441565e-05, + "loss": 0.5197, + "step": 4088 + }, + { + "epoch": 1.1339434276206322, + "grad_norm": 0.19364705681800842, + "learning_rate": 1.01291021039963e-05, + "loss": 0.5064, + "step": 4089 + }, + { + "epoch": 1.134220743205768, + "grad_norm": 0.18613065779209137, + "learning_rate": 1.012368092620194e-05, + "loss": 0.4874, + "step": 4090 + }, + { + "epoch": 1.134498058790904, + "grad_norm": 0.1905667632818222, + "learning_rate": 1.0118260212090397e-05, + "loss": 0.5038, + "step": 4091 + }, + { + "epoch": 1.1347753743760398, + "grad_norm": 0.19485412538051605, + "learning_rate": 1.0112839962719387e-05, + "loss": 0.4862, + "step": 4092 + }, + { + "epoch": 1.1350526899611757, + "grad_norm": 0.1890053004026413, + "learning_rate": 1.0107420179146542e-05, + "loss": 0.5268, + "step": 4093 + }, + { + "epoch": 1.1353300055463116, + "grad_norm": 0.19711565971374512, + "learning_rate": 1.0102000862429415e-05, + "loss": 0.4994, + "step": 4094 + }, + { + "epoch": 1.1356073211314475, + "grad_norm": 0.18928836286067963, + "learning_rate": 1.0096582013625455e-05, + "loss": 0.5112, + "step": 4095 + }, + { + "epoch": 1.1358846367165834, + "grad_norm": 0.1987476795911789, + "learning_rate": 1.0091163633792023e-05, + "loss": 0.4815, + "step": 4096 + }, + { + "epoch": 1.1361619523017192, + "grad_norm": 0.20385397970676422, + "learning_rate": 1.0085745723986379e-05, + "loss": 0.5387, + "step": 4097 + }, + { + "epoch": 1.1364392678868551, + "grad_norm": 0.1833743155002594, + "learning_rate": 1.0080328285265715e-05, + "loss": 0.5216, + "step": 4098 + }, + { + "epoch": 1.136716583471991, + "grad_norm": 0.19815048575401306, + "learning_rate": 1.007491131868711e-05, + "loss": 0.5257, + "step": 4099 + }, + { + "epoch": 1.1369938990571269, + "grad_norm": 0.19522307813167572, + "learning_rate": 1.0069494825307554e-05, + "loss": 0.5109, + "step": 4100 + }, + { + "epoch": 1.1372712146422628, + "grad_norm": 0.20600605010986328, + "learning_rate": 1.0064078806183956e-05, + "loss": 0.5222, + "step": 4101 + }, + { + "epoch": 1.1375485302273987, + "grad_norm": 0.19125322997570038, + "learning_rate": 1.0058663262373125e-05, + "loss": 0.5145, + "step": 4102 + }, + { + "epoch": 1.1378258458125345, + "grad_norm": 0.19589276611804962, + "learning_rate": 1.005324819493177e-05, + "loss": 0.5181, + "step": 4103 + }, + { + "epoch": 1.1381031613976704, + "grad_norm": 0.182989239692688, + "learning_rate": 1.0047833604916515e-05, + "loss": 0.5209, + "step": 4104 + }, + { + "epoch": 1.1383804769828063, + "grad_norm": 0.18455688655376434, + "learning_rate": 1.0042419493383896e-05, + "loss": 0.4888, + "step": 4105 + }, + { + "epoch": 1.1386577925679422, + "grad_norm": 0.19933606684207916, + "learning_rate": 1.0037005861390346e-05, + "loss": 0.5306, + "step": 4106 + }, + { + "epoch": 1.138935108153078, + "grad_norm": 0.19362375140190125, + "learning_rate": 1.0031592709992204e-05, + "loss": 0.4934, + "step": 4107 + }, + { + "epoch": 1.139212423738214, + "grad_norm": 0.1925627440214157, + "learning_rate": 1.0026180040245728e-05, + "loss": 0.5044, + "step": 4108 + }, + { + "epoch": 1.1394897393233498, + "grad_norm": 0.19093094766139984, + "learning_rate": 1.0020767853207069e-05, + "loss": 0.5083, + "step": 4109 + }, + { + "epoch": 1.1397670549084857, + "grad_norm": 0.18980608880519867, + "learning_rate": 1.0015356149932288e-05, + "loss": 0.5099, + "step": 4110 + }, + { + "epoch": 1.1400443704936216, + "grad_norm": 0.18821600079536438, + "learning_rate": 1.0009944931477346e-05, + "loss": 0.5194, + "step": 4111 + }, + { + "epoch": 1.1403216860787575, + "grad_norm": 0.1851465255022049, + "learning_rate": 1.0004534198898124e-05, + "loss": 0.503, + "step": 4112 + }, + { + "epoch": 1.1405990016638934, + "grad_norm": 0.19353441894054413, + "learning_rate": 9.9991239532504e-06, + "loss": 0.511, + "step": 4113 + }, + { + "epoch": 1.1408763172490295, + "grad_norm": 0.19810713827610016, + "learning_rate": 9.993714195589847e-06, + "loss": 0.5106, + "step": 4114 + }, + { + "epoch": 1.1411536328341654, + "grad_norm": 0.19608542323112488, + "learning_rate": 9.98830492697206e-06, + "loss": 0.5384, + "step": 4115 + }, + { + "epoch": 1.1414309484193013, + "grad_norm": 0.43584343791007996, + "learning_rate": 9.982896148452527e-06, + "loss": 0.5011, + "step": 4116 + }, + { + "epoch": 1.1417082640044371, + "grad_norm": 0.19775094091892242, + "learning_rate": 9.977487861086647e-06, + "loss": 0.5184, + "step": 4117 + }, + { + "epoch": 1.141985579589573, + "grad_norm": 0.2092062532901764, + "learning_rate": 9.97208006592971e-06, + "loss": 0.4917, + "step": 4118 + }, + { + "epoch": 1.142262895174709, + "grad_norm": 0.18967856466770172, + "learning_rate": 9.966672764036936e-06, + "loss": 0.4851, + "step": 4119 + }, + { + "epoch": 1.1425402107598448, + "grad_norm": 0.1816510409116745, + "learning_rate": 9.961265956463424e-06, + "loss": 0.4985, + "step": 4120 + }, + { + "epoch": 1.1428175263449807, + "grad_norm": 0.18744854629039764, + "learning_rate": 9.955859644264183e-06, + "loss": 0.4988, + "step": 4121 + }, + { + "epoch": 1.1430948419301166, + "grad_norm": 0.19459229707717896, + "learning_rate": 9.950453828494132e-06, + "loss": 0.5171, + "step": 4122 + }, + { + "epoch": 1.1433721575152525, + "grad_norm": 0.19057153165340424, + "learning_rate": 9.945048510208094e-06, + "loss": 0.5126, + "step": 4123 + }, + { + "epoch": 1.1436494731003883, + "grad_norm": 0.19819004833698273, + "learning_rate": 9.939643690460779e-06, + "loss": 0.5449, + "step": 4124 + }, + { + "epoch": 1.1439267886855242, + "grad_norm": 0.19458059966564178, + "learning_rate": 9.934239370306813e-06, + "loss": 0.4927, + "step": 4125 + }, + { + "epoch": 1.14420410427066, + "grad_norm": 0.20325776934623718, + "learning_rate": 9.928835550800727e-06, + "loss": 0.5221, + "step": 4126 + }, + { + "epoch": 1.144481419855796, + "grad_norm": 0.19814957678318024, + "learning_rate": 9.923432232996947e-06, + "loss": 0.5417, + "step": 4127 + }, + { + "epoch": 1.1447587354409319, + "grad_norm": 0.1916591227054596, + "learning_rate": 9.9180294179498e-06, + "loss": 0.4876, + "step": 4128 + }, + { + "epoch": 1.1450360510260678, + "grad_norm": 0.19258858263492584, + "learning_rate": 9.912627106713528e-06, + "loss": 0.4838, + "step": 4129 + }, + { + "epoch": 1.1453133666112036, + "grad_norm": 0.18850946426391602, + "learning_rate": 9.907225300342256e-06, + "loss": 0.4996, + "step": 4130 + }, + { + "epoch": 1.1455906821963395, + "grad_norm": 0.21922184526920319, + "learning_rate": 9.901823999890021e-06, + "loss": 0.5001, + "step": 4131 + }, + { + "epoch": 1.1458679977814754, + "grad_norm": 0.1909458041191101, + "learning_rate": 9.896423206410759e-06, + "loss": 0.5119, + "step": 4132 + }, + { + "epoch": 1.1461453133666113, + "grad_norm": 0.18811345100402832, + "learning_rate": 9.891022920958313e-06, + "loss": 0.5086, + "step": 4133 + }, + { + "epoch": 1.1464226289517472, + "grad_norm": 0.20187973976135254, + "learning_rate": 9.88562314458642e-06, + "loss": 0.5163, + "step": 4134 + }, + { + "epoch": 1.146699944536883, + "grad_norm": 0.1905764937400818, + "learning_rate": 9.880223878348713e-06, + "loss": 0.5092, + "step": 4135 + }, + { + "epoch": 1.146977260122019, + "grad_norm": 0.19595111906528473, + "learning_rate": 9.874825123298741e-06, + "loss": 0.5066, + "step": 4136 + }, + { + "epoch": 1.1472545757071548, + "grad_norm": 0.19632695615291595, + "learning_rate": 9.869426880489939e-06, + "loss": 0.5007, + "step": 4137 + }, + { + "epoch": 1.1475318912922907, + "grad_norm": 0.19036737084388733, + "learning_rate": 9.864029150975646e-06, + "loss": 0.4899, + "step": 4138 + }, + { + "epoch": 1.1478092068774266, + "grad_norm": 0.18314070999622345, + "learning_rate": 9.858631935809107e-06, + "loss": 0.4764, + "step": 4139 + }, + { + "epoch": 1.1480865224625625, + "grad_norm": 0.25677627325057983, + "learning_rate": 9.85323523604346e-06, + "loss": 0.5034, + "step": 4140 + }, + { + "epoch": 1.1483638380476984, + "grad_norm": 0.18910054862499237, + "learning_rate": 9.847839052731742e-06, + "loss": 0.5234, + "step": 4141 + }, + { + "epoch": 1.1486411536328343, + "grad_norm": 0.19723178446292877, + "learning_rate": 9.842443386926892e-06, + "loss": 0.5099, + "step": 4142 + }, + { + "epoch": 1.1489184692179701, + "grad_norm": 0.1976155936717987, + "learning_rate": 9.837048239681747e-06, + "loss": 0.498, + "step": 4143 + }, + { + "epoch": 1.149195784803106, + "grad_norm": 0.20631377398967743, + "learning_rate": 9.83165361204905e-06, + "loss": 0.4832, + "step": 4144 + }, + { + "epoch": 1.149473100388242, + "grad_norm": 0.204478919506073, + "learning_rate": 9.826259505081419e-06, + "loss": 0.494, + "step": 4145 + }, + { + "epoch": 1.1497504159733778, + "grad_norm": 0.19692255556583405, + "learning_rate": 9.820865919831406e-06, + "loss": 0.5219, + "step": 4146 + }, + { + "epoch": 1.1500277315585137, + "grad_norm": 0.1929834634065628, + "learning_rate": 9.815472857351433e-06, + "loss": 0.4878, + "step": 4147 + }, + { + "epoch": 1.1503050471436496, + "grad_norm": 0.1798853874206543, + "learning_rate": 9.810080318693832e-06, + "loss": 0.4949, + "step": 4148 + }, + { + "epoch": 1.1505823627287854, + "grad_norm": 0.19789017736911774, + "learning_rate": 9.804688304910824e-06, + "loss": 0.5227, + "step": 4149 + }, + { + "epoch": 1.1508596783139213, + "grad_norm": 0.19549022614955902, + "learning_rate": 9.799296817054542e-06, + "loss": 0.5191, + "step": 4150 + }, + { + "epoch": 1.1511369938990572, + "grad_norm": 0.19911247491836548, + "learning_rate": 9.793905856177008e-06, + "loss": 0.5172, + "step": 4151 + }, + { + "epoch": 1.151414309484193, + "grad_norm": 0.29837673902511597, + "learning_rate": 9.78851542333013e-06, + "loss": 0.5081, + "step": 4152 + }, + { + "epoch": 1.151691625069329, + "grad_norm": 0.1918451339006424, + "learning_rate": 9.783125519565737e-06, + "loss": 0.5116, + "step": 4153 + }, + { + "epoch": 1.1519689406544649, + "grad_norm": 0.19545188546180725, + "learning_rate": 9.777736145935538e-06, + "loss": 0.5042, + "step": 4154 + }, + { + "epoch": 1.1522462562396008, + "grad_norm": 0.19828790426254272, + "learning_rate": 9.772347303491144e-06, + "loss": 0.4944, + "step": 4155 + }, + { + "epoch": 1.1525235718247366, + "grad_norm": 0.19536136090755463, + "learning_rate": 9.766958993284051e-06, + "loss": 0.511, + "step": 4156 + }, + { + "epoch": 1.1528008874098725, + "grad_norm": 0.19933415949344635, + "learning_rate": 9.761571216365678e-06, + "loss": 0.5035, + "step": 4157 + }, + { + "epoch": 1.1530782029950084, + "grad_norm": 0.20315343141555786, + "learning_rate": 9.75618397378731e-06, + "loss": 0.5105, + "step": 4158 + }, + { + "epoch": 1.1533555185801443, + "grad_norm": 0.19544823467731476, + "learning_rate": 9.750797266600142e-06, + "loss": 0.4965, + "step": 4159 + }, + { + "epoch": 1.1536328341652802, + "grad_norm": 0.1914399266242981, + "learning_rate": 9.74541109585527e-06, + "loss": 0.4964, + "step": 4160 + }, + { + "epoch": 1.153910149750416, + "grad_norm": 0.19952736794948578, + "learning_rate": 9.740025462603675e-06, + "loss": 0.4937, + "step": 4161 + }, + { + "epoch": 1.154187465335552, + "grad_norm": 0.19488787651062012, + "learning_rate": 9.734640367896236e-06, + "loss": 0.5137, + "step": 4162 + }, + { + "epoch": 1.1544647809206878, + "grad_norm": 0.18852956593036652, + "learning_rate": 9.729255812783724e-06, + "loss": 0.5133, + "step": 4163 + }, + { + "epoch": 1.1547420965058237, + "grad_norm": 0.23788957297801971, + "learning_rate": 9.723871798316815e-06, + "loss": 0.5007, + "step": 4164 + }, + { + "epoch": 1.1550194120909596, + "grad_norm": 0.19701410830020905, + "learning_rate": 9.718488325546072e-06, + "loss": 0.4874, + "step": 4165 + }, + { + "epoch": 1.1552967276760955, + "grad_norm": 0.1924983710050583, + "learning_rate": 9.713105395521947e-06, + "loss": 0.491, + "step": 4166 + }, + { + "epoch": 1.1555740432612314, + "grad_norm": 0.19456344842910767, + "learning_rate": 9.707723009294802e-06, + "loss": 0.5032, + "step": 4167 + }, + { + "epoch": 1.1558513588463672, + "grad_norm": 0.2932334840297699, + "learning_rate": 9.702341167914875e-06, + "loss": 0.5158, + "step": 4168 + }, + { + "epoch": 1.1561286744315031, + "grad_norm": 0.19522738456726074, + "learning_rate": 9.696959872432311e-06, + "loss": 0.486, + "step": 4169 + }, + { + "epoch": 1.156405990016639, + "grad_norm": 0.19203148782253265, + "learning_rate": 9.691579123897137e-06, + "loss": 0.5187, + "step": 4170 + }, + { + "epoch": 1.156683305601775, + "grad_norm": 0.19738537073135376, + "learning_rate": 9.686198923359286e-06, + "loss": 0.4969, + "step": 4171 + }, + { + "epoch": 1.1569606211869108, + "grad_norm": 0.198639377951622, + "learning_rate": 9.680819271868578e-06, + "loss": 0.4974, + "step": 4172 + }, + { + "epoch": 1.1572379367720467, + "grad_norm": 0.19397403299808502, + "learning_rate": 9.675440170474717e-06, + "loss": 0.513, + "step": 4173 + }, + { + "epoch": 1.1575152523571826, + "grad_norm": 0.19262705743312836, + "learning_rate": 9.670061620227318e-06, + "loss": 0.5025, + "step": 4174 + }, + { + "epoch": 1.1577925679423184, + "grad_norm": 0.20281995832920074, + "learning_rate": 9.664683622175874e-06, + "loss": 0.5014, + "step": 4175 + }, + { + "epoch": 1.1580698835274543, + "grad_norm": 0.18004655838012695, + "learning_rate": 9.659306177369779e-06, + "loss": 0.5118, + "step": 4176 + }, + { + "epoch": 1.1583471991125902, + "grad_norm": 0.21135984361171722, + "learning_rate": 9.653929286858302e-06, + "loss": 0.4946, + "step": 4177 + }, + { + "epoch": 1.158624514697726, + "grad_norm": 0.2009570449590683, + "learning_rate": 9.648552951690635e-06, + "loss": 0.5502, + "step": 4178 + }, + { + "epoch": 1.158901830282862, + "grad_norm": 0.19548632204532623, + "learning_rate": 9.643177172915833e-06, + "loss": 0.5107, + "step": 4179 + }, + { + "epoch": 1.1591791458679979, + "grad_norm": 0.2423223853111267, + "learning_rate": 9.637801951582851e-06, + "loss": 0.4754, + "step": 4180 + }, + { + "epoch": 1.1594564614531337, + "grad_norm": 0.18860936164855957, + "learning_rate": 9.632427288740545e-06, + "loss": 0.4951, + "step": 4181 + }, + { + "epoch": 1.1597337770382696, + "grad_norm": 0.19608083367347717, + "learning_rate": 9.627053185437651e-06, + "loss": 0.4936, + "step": 4182 + }, + { + "epoch": 1.1600110926234055, + "grad_norm": 0.18897056579589844, + "learning_rate": 9.621679642722794e-06, + "loss": 0.507, + "step": 4183 + }, + { + "epoch": 1.1602884082085414, + "grad_norm": 0.18415631353855133, + "learning_rate": 9.616306661644497e-06, + "loss": 0.5157, + "step": 4184 + }, + { + "epoch": 1.1605657237936773, + "grad_norm": 0.300893098115921, + "learning_rate": 9.610934243251177e-06, + "loss": 0.5148, + "step": 4185 + }, + { + "epoch": 1.1608430393788132, + "grad_norm": 0.19000142812728882, + "learning_rate": 9.60556238859113e-06, + "loss": 0.5156, + "step": 4186 + }, + { + "epoch": 1.161120354963949, + "grad_norm": 0.20633526146411896, + "learning_rate": 9.60019109871254e-06, + "loss": 0.4992, + "step": 4187 + }, + { + "epoch": 1.161397670549085, + "grad_norm": 0.20525197684764862, + "learning_rate": 9.594820374663506e-06, + "loss": 0.4937, + "step": 4188 + }, + { + "epoch": 1.1616749861342208, + "grad_norm": 0.19740667939186096, + "learning_rate": 9.589450217491984e-06, + "loss": 0.4853, + "step": 4189 + }, + { + "epoch": 1.1619523017193567, + "grad_norm": 0.20025213062763214, + "learning_rate": 9.58408062824584e-06, + "loss": 0.5031, + "step": 4190 + }, + { + "epoch": 1.1622296173044926, + "grad_norm": 0.20170235633850098, + "learning_rate": 9.578711607972815e-06, + "loss": 0.5283, + "step": 4191 + }, + { + "epoch": 1.1625069328896285, + "grad_norm": 0.19781838357448578, + "learning_rate": 9.573343157720558e-06, + "loss": 0.5155, + "step": 4192 + }, + { + "epoch": 1.1627842484747644, + "grad_norm": 0.19260545074939728, + "learning_rate": 9.567975278536595e-06, + "loss": 0.4877, + "step": 4193 + }, + { + "epoch": 1.1630615640599002, + "grad_norm": 0.1921887993812561, + "learning_rate": 9.562607971468328e-06, + "loss": 0.5139, + "step": 4194 + }, + { + "epoch": 1.1633388796450361, + "grad_norm": 0.19691529870033264, + "learning_rate": 9.55724123756308e-06, + "loss": 0.5187, + "step": 4195 + }, + { + "epoch": 1.163616195230172, + "grad_norm": 0.1975255310535431, + "learning_rate": 9.551875077868028e-06, + "loss": 0.5256, + "step": 4196 + }, + { + "epoch": 1.163893510815308, + "grad_norm": 0.20154882967472076, + "learning_rate": 9.546509493430257e-06, + "loss": 0.5213, + "step": 4197 + }, + { + "epoch": 1.1641708264004438, + "grad_norm": 0.18560223281383514, + "learning_rate": 9.541144485296737e-06, + "loss": 0.501, + "step": 4198 + }, + { + "epoch": 1.1644481419855797, + "grad_norm": 0.18955029547214508, + "learning_rate": 9.535780054514325e-06, + "loss": 0.5273, + "step": 4199 + }, + { + "epoch": 1.1647254575707155, + "grad_norm": 0.1917003095149994, + "learning_rate": 9.530416202129756e-06, + "loss": 0.4887, + "step": 4200 + }, + { + "epoch": 1.1650027731558514, + "grad_norm": 0.20266690850257874, + "learning_rate": 9.525052929189661e-06, + "loss": 0.5112, + "step": 4201 + }, + { + "epoch": 1.1652800887409873, + "grad_norm": 0.18932059407234192, + "learning_rate": 9.519690236740563e-06, + "loss": 0.4984, + "step": 4202 + }, + { + "epoch": 1.1655574043261232, + "grad_norm": 0.19648674130439758, + "learning_rate": 9.51432812582886e-06, + "loss": 0.5004, + "step": 4203 + }, + { + "epoch": 1.165834719911259, + "grad_norm": 0.19848087430000305, + "learning_rate": 9.508966597500843e-06, + "loss": 0.5153, + "step": 4204 + }, + { + "epoch": 1.166112035496395, + "grad_norm": 0.2016671746969223, + "learning_rate": 9.50360565280269e-06, + "loss": 0.5134, + "step": 4205 + }, + { + "epoch": 1.1663893510815309, + "grad_norm": 0.22022029757499695, + "learning_rate": 9.498245292780463e-06, + "loss": 0.5048, + "step": 4206 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.19555556774139404, + "learning_rate": 9.49288551848011e-06, + "loss": 0.511, + "step": 4207 + }, + { + "epoch": 1.1669439822518026, + "grad_norm": 0.21261551976203918, + "learning_rate": 9.487526330947461e-06, + "loss": 0.5123, + "step": 4208 + }, + { + "epoch": 1.1672212978369385, + "grad_norm": 0.20261640846729279, + "learning_rate": 9.482167731228241e-06, + "loss": 0.5132, + "step": 4209 + }, + { + "epoch": 1.1674986134220744, + "grad_norm": 0.20545309782028198, + "learning_rate": 9.476809720368054e-06, + "loss": 0.5108, + "step": 4210 + }, + { + "epoch": 1.1677759290072103, + "grad_norm": 0.18970844149589539, + "learning_rate": 9.471452299412384e-06, + "loss": 0.5205, + "step": 4211 + }, + { + "epoch": 1.1680532445923462, + "grad_norm": 0.19278816878795624, + "learning_rate": 9.466095469406613e-06, + "loss": 0.5095, + "step": 4212 + }, + { + "epoch": 1.168330560177482, + "grad_norm": 0.19161520898342133, + "learning_rate": 9.460739231395999e-06, + "loss": 0.5151, + "step": 4213 + }, + { + "epoch": 1.168607875762618, + "grad_norm": 0.19239401817321777, + "learning_rate": 9.455383586425685e-06, + "loss": 0.4932, + "step": 4214 + }, + { + "epoch": 1.1688851913477538, + "grad_norm": 0.18713194131851196, + "learning_rate": 9.450028535540692e-06, + "loss": 0.4842, + "step": 4215 + }, + { + "epoch": 1.1691625069328897, + "grad_norm": 0.1931910663843155, + "learning_rate": 9.444674079785948e-06, + "loss": 0.4839, + "step": 4216 + }, + { + "epoch": 1.1694398225180256, + "grad_norm": 0.2027042806148529, + "learning_rate": 9.439320220206236e-06, + "loss": 0.4839, + "step": 4217 + }, + { + "epoch": 1.1697171381031615, + "grad_norm": 0.19624637067317963, + "learning_rate": 9.43396695784624e-06, + "loss": 0.5165, + "step": 4218 + }, + { + "epoch": 1.1699944536882974, + "grad_norm": 0.19271767139434814, + "learning_rate": 9.428614293750523e-06, + "loss": 0.5085, + "step": 4219 + }, + { + "epoch": 1.1702717692734332, + "grad_norm": 0.19581159949302673, + "learning_rate": 9.423262228963537e-06, + "loss": 0.5038, + "step": 4220 + }, + { + "epoch": 1.1705490848585691, + "grad_norm": 0.18047882616519928, + "learning_rate": 9.417910764529605e-06, + "loss": 0.5177, + "step": 4221 + }, + { + "epoch": 1.170826400443705, + "grad_norm": 0.19481384754180908, + "learning_rate": 9.412559901492935e-06, + "loss": 0.528, + "step": 4222 + }, + { + "epoch": 1.171103716028841, + "grad_norm": 0.2017330676317215, + "learning_rate": 9.407209640897635e-06, + "loss": 0.4888, + "step": 4223 + }, + { + "epoch": 1.1713810316139768, + "grad_norm": 0.2008255124092102, + "learning_rate": 9.401859983787674e-06, + "loss": 0.5369, + "step": 4224 + }, + { + "epoch": 1.1716583471991127, + "grad_norm": 0.19789865612983704, + "learning_rate": 9.396510931206912e-06, + "loss": 0.5095, + "step": 4225 + }, + { + "epoch": 1.1719356627842485, + "grad_norm": 0.18955931067466736, + "learning_rate": 9.391162484199097e-06, + "loss": 0.4998, + "step": 4226 + }, + { + "epoch": 1.1722129783693844, + "grad_norm": 0.19305095076560974, + "learning_rate": 9.385814643807845e-06, + "loss": 0.5198, + "step": 4227 + }, + { + "epoch": 1.1724902939545203, + "grad_norm": 0.19664254784584045, + "learning_rate": 9.380467411076667e-06, + "loss": 0.5123, + "step": 4228 + }, + { + "epoch": 1.1727676095396562, + "grad_norm": 0.1945103108882904, + "learning_rate": 9.375120787048944e-06, + "loss": 0.5063, + "step": 4229 + }, + { + "epoch": 1.173044925124792, + "grad_norm": 0.19993437826633453, + "learning_rate": 9.36977477276795e-06, + "loss": 0.502, + "step": 4230 + }, + { + "epoch": 1.173322240709928, + "grad_norm": 0.19845424592494965, + "learning_rate": 9.364429369276833e-06, + "loss": 0.4798, + "step": 4231 + }, + { + "epoch": 1.1735995562950639, + "grad_norm": 0.21802715957164764, + "learning_rate": 9.359084577618615e-06, + "loss": 0.5008, + "step": 4232 + }, + { + "epoch": 1.1738768718801997, + "grad_norm": 0.20177268981933594, + "learning_rate": 9.353740398836222e-06, + "loss": 0.5126, + "step": 4233 + }, + { + "epoch": 1.1741541874653356, + "grad_norm": 0.21453998982906342, + "learning_rate": 9.34839683397243e-06, + "loss": 0.5087, + "step": 4234 + }, + { + "epoch": 1.1744315030504715, + "grad_norm": 0.19861598312854767, + "learning_rate": 9.343053884069922e-06, + "loss": 0.5108, + "step": 4235 + }, + { + "epoch": 1.1747088186356074, + "grad_norm": 0.19748757779598236, + "learning_rate": 9.337711550171232e-06, + "loss": 0.5079, + "step": 4236 + }, + { + "epoch": 1.1749861342207433, + "grad_norm": 0.19762682914733887, + "learning_rate": 9.33236983331881e-06, + "loss": 0.4911, + "step": 4237 + }, + { + "epoch": 1.1752634498058792, + "grad_norm": 0.19971820712089539, + "learning_rate": 9.327028734554957e-06, + "loss": 0.5251, + "step": 4238 + }, + { + "epoch": 1.175540765391015, + "grad_norm": 0.20210981369018555, + "learning_rate": 9.321688254921862e-06, + "loss": 0.5057, + "step": 4239 + }, + { + "epoch": 1.175818080976151, + "grad_norm": 0.1958346962928772, + "learning_rate": 9.316348395461598e-06, + "loss": 0.4868, + "step": 4240 + }, + { + "epoch": 1.1760953965612868, + "grad_norm": 0.20058873295783997, + "learning_rate": 9.31100915721611e-06, + "loss": 0.5178, + "step": 4241 + }, + { + "epoch": 1.1763727121464227, + "grad_norm": 0.19912679493427277, + "learning_rate": 9.305670541227232e-06, + "loss": 0.5216, + "step": 4242 + }, + { + "epoch": 1.1766500277315586, + "grad_norm": 0.21662193536758423, + "learning_rate": 9.300332548536655e-06, + "loss": 0.5162, + "step": 4243 + }, + { + "epoch": 1.1769273433166945, + "grad_norm": 0.19769549369812012, + "learning_rate": 9.294995180185976e-06, + "loss": 0.5196, + "step": 4244 + }, + { + "epoch": 1.1772046589018303, + "grad_norm": 0.19544221460819244, + "learning_rate": 9.289658437216652e-06, + "loss": 0.5208, + "step": 4245 + }, + { + "epoch": 1.1774819744869662, + "grad_norm": 0.19446879625320435, + "learning_rate": 9.28432232067002e-06, + "loss": 0.5124, + "step": 4246 + }, + { + "epoch": 1.1777592900721021, + "grad_norm": 0.27411291003227234, + "learning_rate": 9.278986831587305e-06, + "loss": 0.4815, + "step": 4247 + }, + { + "epoch": 1.178036605657238, + "grad_norm": 0.19502323865890503, + "learning_rate": 9.273651971009599e-06, + "loss": 0.4944, + "step": 4248 + }, + { + "epoch": 1.1783139212423739, + "grad_norm": 0.2004517912864685, + "learning_rate": 9.268317739977872e-06, + "loss": 0.491, + "step": 4249 + }, + { + "epoch": 1.1785912368275098, + "grad_norm": 0.21437488496303558, + "learning_rate": 9.262984139532973e-06, + "loss": 0.4984, + "step": 4250 + }, + { + "epoch": 1.1788685524126457, + "grad_norm": 0.19150178134441376, + "learning_rate": 9.257651170715635e-06, + "loss": 0.4888, + "step": 4251 + }, + { + "epoch": 1.1791458679977815, + "grad_norm": 0.21565623581409454, + "learning_rate": 9.25231883456646e-06, + "loss": 0.5025, + "step": 4252 + }, + { + "epoch": 1.1794231835829174, + "grad_norm": 0.20012259483337402, + "learning_rate": 9.246987132125919e-06, + "loss": 0.533, + "step": 4253 + }, + { + "epoch": 1.1797004991680533, + "grad_norm": 0.1878737509250641, + "learning_rate": 9.241656064434382e-06, + "loss": 0.5027, + "step": 4254 + }, + { + "epoch": 1.1799778147531892, + "grad_norm": 0.19911964237689972, + "learning_rate": 9.236325632532074e-06, + "loss": 0.494, + "step": 4255 + }, + { + "epoch": 1.180255130338325, + "grad_norm": 0.208717480301857, + "learning_rate": 9.230995837459103e-06, + "loss": 0.5262, + "step": 4256 + }, + { + "epoch": 1.180532445923461, + "grad_norm": 0.19873370230197906, + "learning_rate": 9.225666680255452e-06, + "loss": 0.5128, + "step": 4257 + }, + { + "epoch": 1.1808097615085968, + "grad_norm": 0.19655872881412506, + "learning_rate": 9.22033816196099e-06, + "loss": 0.4987, + "step": 4258 + }, + { + "epoch": 1.1810870770937327, + "grad_norm": 0.2015899121761322, + "learning_rate": 9.215010283615443e-06, + "loss": 0.5177, + "step": 4259 + }, + { + "epoch": 1.1813643926788686, + "grad_norm": 0.19780929386615753, + "learning_rate": 9.20968304625842e-06, + "loss": 0.4959, + "step": 4260 + }, + { + "epoch": 1.1816417082640045, + "grad_norm": 0.18897369503974915, + "learning_rate": 9.204356450929413e-06, + "loss": 0.4847, + "step": 4261 + }, + { + "epoch": 1.1819190238491404, + "grad_norm": 0.19352437555789948, + "learning_rate": 9.19903049866778e-06, + "loss": 0.4902, + "step": 4262 + }, + { + "epoch": 1.1821963394342763, + "grad_norm": 0.2053721845149994, + "learning_rate": 9.19370519051275e-06, + "loss": 0.5054, + "step": 4263 + }, + { + "epoch": 1.1824736550194122, + "grad_norm": 0.19881947338581085, + "learning_rate": 9.188380527503443e-06, + "loss": 0.4888, + "step": 4264 + }, + { + "epoch": 1.182750970604548, + "grad_norm": 0.19415348768234253, + "learning_rate": 9.18305651067883e-06, + "loss": 0.4889, + "step": 4265 + }, + { + "epoch": 1.183028286189684, + "grad_norm": 0.1971830129623413, + "learning_rate": 9.177733141077775e-06, + "loss": 0.5026, + "step": 4266 + }, + { + "epoch": 1.1833056017748198, + "grad_norm": 0.19726091623306274, + "learning_rate": 9.172410419739e-06, + "loss": 0.5027, + "step": 4267 + }, + { + "epoch": 1.1835829173599557, + "grad_norm": 0.19601614773273468, + "learning_rate": 9.167088347701119e-06, + "loss": 0.495, + "step": 4268 + }, + { + "epoch": 1.1838602329450916, + "grad_norm": 0.18969157338142395, + "learning_rate": 9.16176692600261e-06, + "loss": 0.4859, + "step": 4269 + }, + { + "epoch": 1.1841375485302275, + "grad_norm": 0.18396854400634766, + "learning_rate": 9.156446155681811e-06, + "loss": 0.4981, + "step": 4270 + }, + { + "epoch": 1.1844148641153633, + "grad_norm": 0.19818167388439178, + "learning_rate": 9.151126037776955e-06, + "loss": 0.5199, + "step": 4271 + }, + { + "epoch": 1.1846921797004992, + "grad_norm": 0.20459376275539398, + "learning_rate": 9.145806573326137e-06, + "loss": 0.5215, + "step": 4272 + }, + { + "epoch": 1.184969495285635, + "grad_norm": 0.1927737444639206, + "learning_rate": 9.140487763367328e-06, + "loss": 0.5089, + "step": 4273 + }, + { + "epoch": 1.185246810870771, + "grad_norm": 0.18682962656021118, + "learning_rate": 9.135169608938354e-06, + "loss": 0.4812, + "step": 4274 + }, + { + "epoch": 1.1855241264559069, + "grad_norm": 0.2066759467124939, + "learning_rate": 9.12985211107695e-06, + "loss": 0.5111, + "step": 4275 + }, + { + "epoch": 1.1858014420410428, + "grad_norm": 0.20103536546230316, + "learning_rate": 9.124535270820685e-06, + "loss": 0.4976, + "step": 4276 + }, + { + "epoch": 1.1860787576261786, + "grad_norm": 0.1959027498960495, + "learning_rate": 9.119219089207017e-06, + "loss": 0.5396, + "step": 4277 + }, + { + "epoch": 1.1863560732113145, + "grad_norm": 0.1914219707250595, + "learning_rate": 9.11390356727328e-06, + "loss": 0.4949, + "step": 4278 + }, + { + "epoch": 1.1866333887964504, + "grad_norm": 0.20492805540561676, + "learning_rate": 9.108588706056673e-06, + "loss": 0.5225, + "step": 4279 + }, + { + "epoch": 1.1869107043815863, + "grad_norm": 0.19187003374099731, + "learning_rate": 9.10327450659426e-06, + "loss": 0.4893, + "step": 4280 + }, + { + "epoch": 1.1871880199667222, + "grad_norm": 0.2073322832584381, + "learning_rate": 9.097960969922983e-06, + "loss": 0.5182, + "step": 4281 + }, + { + "epoch": 1.187465335551858, + "grad_norm": 0.1949753612279892, + "learning_rate": 9.092648097079659e-06, + "loss": 0.5037, + "step": 4282 + }, + { + "epoch": 1.187742651136994, + "grad_norm": 0.20522314310073853, + "learning_rate": 9.087335889100967e-06, + "loss": 0.5164, + "step": 4283 + }, + { + "epoch": 1.1880199667221298, + "grad_norm": 0.19784744083881378, + "learning_rate": 9.082024347023457e-06, + "loss": 0.5207, + "step": 4284 + }, + { + "epoch": 1.1882972823072657, + "grad_norm": 0.20166823267936707, + "learning_rate": 9.076713471883557e-06, + "loss": 0.5184, + "step": 4285 + }, + { + "epoch": 1.1885745978924016, + "grad_norm": 0.20087933540344238, + "learning_rate": 9.07140326471756e-06, + "loss": 0.4829, + "step": 4286 + }, + { + "epoch": 1.1888519134775375, + "grad_norm": 0.2303885966539383, + "learning_rate": 9.066093726561622e-06, + "loss": 0.5022, + "step": 4287 + }, + { + "epoch": 1.1891292290626734, + "grad_norm": 0.19996869564056396, + "learning_rate": 9.060784858451774e-06, + "loss": 0.4958, + "step": 4288 + }, + { + "epoch": 1.1894065446478093, + "grad_norm": 0.19278928637504578, + "learning_rate": 9.055476661423925e-06, + "loss": 0.5143, + "step": 4289 + }, + { + "epoch": 1.1896838602329451, + "grad_norm": 0.19539318978786469, + "learning_rate": 9.050169136513842e-06, + "loss": 0.4935, + "step": 4290 + }, + { + "epoch": 1.189961175818081, + "grad_norm": 0.20165032148361206, + "learning_rate": 9.044862284757154e-06, + "loss": 0.484, + "step": 4291 + }, + { + "epoch": 1.190238491403217, + "grad_norm": 0.2109079658985138, + "learning_rate": 9.039556107189384e-06, + "loss": 0.5046, + "step": 4292 + }, + { + "epoch": 1.1905158069883528, + "grad_norm": 0.19363267719745636, + "learning_rate": 9.034250604845898e-06, + "loss": 0.4955, + "step": 4293 + }, + { + "epoch": 1.1907931225734887, + "grad_norm": 0.190843403339386, + "learning_rate": 9.028945778761942e-06, + "loss": 0.4936, + "step": 4294 + }, + { + "epoch": 1.1910704381586246, + "grad_norm": 0.1952875405550003, + "learning_rate": 9.023641629972626e-06, + "loss": 0.4903, + "step": 4295 + }, + { + "epoch": 1.1913477537437605, + "grad_norm": 0.1969594806432724, + "learning_rate": 9.018338159512937e-06, + "loss": 0.4951, + "step": 4296 + }, + { + "epoch": 1.1916250693288963, + "grad_norm": 0.19602110981941223, + "learning_rate": 9.013035368417716e-06, + "loss": 0.5064, + "step": 4297 + }, + { + "epoch": 1.1919023849140322, + "grad_norm": 0.1936318725347519, + "learning_rate": 9.007733257721679e-06, + "loss": 0.5272, + "step": 4298 + }, + { + "epoch": 1.192179700499168, + "grad_norm": 0.19315633177757263, + "learning_rate": 9.002431828459409e-06, + "loss": 0.5085, + "step": 4299 + }, + { + "epoch": 1.192457016084304, + "grad_norm": 0.20358605682849884, + "learning_rate": 8.997131081665357e-06, + "loss": 0.5105, + "step": 4300 + }, + { + "epoch": 1.1927343316694399, + "grad_norm": 0.194684237241745, + "learning_rate": 8.991831018373841e-06, + "loss": 0.5007, + "step": 4301 + }, + { + "epoch": 1.1930116472545758, + "grad_norm": 0.19725412130355835, + "learning_rate": 8.986531639619033e-06, + "loss": 0.4953, + "step": 4302 + }, + { + "epoch": 1.1932889628397116, + "grad_norm": 0.1992274820804596, + "learning_rate": 8.981232946434995e-06, + "loss": 0.4917, + "step": 4303 + }, + { + "epoch": 1.1935662784248475, + "grad_norm": 0.1960635781288147, + "learning_rate": 8.975934939855637e-06, + "loss": 0.4854, + "step": 4304 + }, + { + "epoch": 1.1938435940099834, + "grad_norm": 0.2065868228673935, + "learning_rate": 8.970637620914735e-06, + "loss": 0.497, + "step": 4305 + }, + { + "epoch": 1.1941209095951193, + "grad_norm": 0.19648823142051697, + "learning_rate": 8.965340990645947e-06, + "loss": 0.4887, + "step": 4306 + }, + { + "epoch": 1.1943982251802552, + "grad_norm": 0.20130853354930878, + "learning_rate": 8.960045050082783e-06, + "loss": 0.5177, + "step": 4307 + }, + { + "epoch": 1.194675540765391, + "grad_norm": 0.20241306722164154, + "learning_rate": 8.954749800258615e-06, + "loss": 0.5014, + "step": 4308 + }, + { + "epoch": 1.194952856350527, + "grad_norm": 0.1977449208498001, + "learning_rate": 8.94945524220669e-06, + "loss": 0.5068, + "step": 4309 + }, + { + "epoch": 1.1952301719356628, + "grad_norm": 0.19154322147369385, + "learning_rate": 8.944161376960119e-06, + "loss": 0.516, + "step": 4310 + }, + { + "epoch": 1.1955074875207987, + "grad_norm": 0.19326725602149963, + "learning_rate": 8.938868205551877e-06, + "loss": 0.493, + "step": 4311 + }, + { + "epoch": 1.1957848031059346, + "grad_norm": 0.1888340413570404, + "learning_rate": 8.933575729014788e-06, + "loss": 0.4899, + "step": 4312 + }, + { + "epoch": 1.1960621186910705, + "grad_norm": 0.19526329636573792, + "learning_rate": 8.928283948381575e-06, + "loss": 0.4789, + "step": 4313 + }, + { + "epoch": 1.1963394342762064, + "grad_norm": 0.1940467208623886, + "learning_rate": 8.922992864684791e-06, + "loss": 0.5252, + "step": 4314 + }, + { + "epoch": 1.1966167498613423, + "grad_norm": 0.20158347487449646, + "learning_rate": 8.917702478956872e-06, + "loss": 0.5069, + "step": 4315 + }, + { + "epoch": 1.1968940654464781, + "grad_norm": 0.20539873838424683, + "learning_rate": 8.912412792230104e-06, + "loss": 0.4972, + "step": 4316 + }, + { + "epoch": 1.197171381031614, + "grad_norm": 0.1870296597480774, + "learning_rate": 8.90712380553666e-06, + "loss": 0.4905, + "step": 4317 + }, + { + "epoch": 1.19744869661675, + "grad_norm": 0.1965525895357132, + "learning_rate": 8.90183551990855e-06, + "loss": 0.4966, + "step": 4318 + }, + { + "epoch": 1.1977260122018858, + "grad_norm": 0.20157171785831451, + "learning_rate": 8.896547936377658e-06, + "loss": 0.5305, + "step": 4319 + }, + { + "epoch": 1.1980033277870217, + "grad_norm": 0.2052687555551529, + "learning_rate": 8.89126105597574e-06, + "loss": 0.5178, + "step": 4320 + }, + { + "epoch": 1.1982806433721576, + "grad_norm": 0.19930841028690338, + "learning_rate": 8.885974879734399e-06, + "loss": 0.5181, + "step": 4321 + }, + { + "epoch": 1.1985579589572934, + "grad_norm": 0.19460836052894592, + "learning_rate": 8.880689408685114e-06, + "loss": 0.5114, + "step": 4322 + }, + { + "epoch": 1.1988352745424293, + "grad_norm": 0.19336189329624176, + "learning_rate": 8.87540464385921e-06, + "loss": 0.4967, + "step": 4323 + }, + { + "epoch": 1.1991125901275652, + "grad_norm": 0.19514434039592743, + "learning_rate": 8.8701205862879e-06, + "loss": 0.4995, + "step": 4324 + }, + { + "epoch": 1.199389905712701, + "grad_norm": 0.19089075922966003, + "learning_rate": 8.864837237002232e-06, + "loss": 0.4795, + "step": 4325 + }, + { + "epoch": 1.199667221297837, + "grad_norm": 0.2026674598455429, + "learning_rate": 8.85955459703313e-06, + "loss": 0.5033, + "step": 4326 + }, + { + "epoch": 1.1999445368829729, + "grad_norm": 0.2059757262468338, + "learning_rate": 8.854272667411379e-06, + "loss": 0.4985, + "step": 4327 + }, + { + "epoch": 1.2002218524681088, + "grad_norm": 0.20040294528007507, + "learning_rate": 8.848991449167623e-06, + "loss": 0.493, + "step": 4328 + }, + { + "epoch": 1.2004991680532446, + "grad_norm": 0.19929878413677216, + "learning_rate": 8.843710943332362e-06, + "loss": 0.495, + "step": 4329 + }, + { + "epoch": 1.2007764836383805, + "grad_norm": 0.1958535760641098, + "learning_rate": 8.838431150935975e-06, + "loss": 0.4971, + "step": 4330 + }, + { + "epoch": 1.2010537992235164, + "grad_norm": 0.18605239689350128, + "learning_rate": 8.83315207300868e-06, + "loss": 0.4861, + "step": 4331 + }, + { + "epoch": 1.2013311148086523, + "grad_norm": 0.43775662779808044, + "learning_rate": 8.827873710580564e-06, + "loss": 0.4962, + "step": 4332 + }, + { + "epoch": 1.2016084303937882, + "grad_norm": 0.1940368264913559, + "learning_rate": 8.822596064681577e-06, + "loss": 0.5247, + "step": 4333 + }, + { + "epoch": 1.201885745978924, + "grad_norm": 0.2012210190296173, + "learning_rate": 8.817319136341535e-06, + "loss": 0.5178, + "step": 4334 + }, + { + "epoch": 1.20216306156406, + "grad_norm": 0.18737319111824036, + "learning_rate": 8.812042926590098e-06, + "loss": 0.5166, + "step": 4335 + }, + { + "epoch": 1.2024403771491958, + "grad_norm": 0.19788531959056854, + "learning_rate": 8.806767436456792e-06, + "loss": 0.4959, + "step": 4336 + }, + { + "epoch": 1.2027176927343317, + "grad_norm": 0.1986195296049118, + "learning_rate": 8.801492666971012e-06, + "loss": 0.4669, + "step": 4337 + }, + { + "epoch": 1.2029950083194676, + "grad_norm": 0.21253184974193573, + "learning_rate": 8.796218619162004e-06, + "loss": 0.5126, + "step": 4338 + }, + { + "epoch": 1.2032723239046035, + "grad_norm": 0.19485290348529816, + "learning_rate": 8.790945294058876e-06, + "loss": 0.475, + "step": 4339 + }, + { + "epoch": 1.2035496394897394, + "grad_norm": 0.19246895611286163, + "learning_rate": 8.785672692690584e-06, + "loss": 0.5123, + "step": 4340 + }, + { + "epoch": 1.2038269550748752, + "grad_norm": 0.21028102934360504, + "learning_rate": 8.780400816085963e-06, + "loss": 0.5072, + "step": 4341 + }, + { + "epoch": 1.2041042706600111, + "grad_norm": 0.20249207317829132, + "learning_rate": 8.775129665273691e-06, + "loss": 0.4979, + "step": 4342 + }, + { + "epoch": 1.204381586245147, + "grad_norm": 0.19815127551555634, + "learning_rate": 8.769859241282307e-06, + "loss": 0.5116, + "step": 4343 + }, + { + "epoch": 1.204658901830283, + "grad_norm": 0.20030280947685242, + "learning_rate": 8.764589545140217e-06, + "loss": 0.5121, + "step": 4344 + }, + { + "epoch": 1.2049362174154188, + "grad_norm": 0.2132977396249771, + "learning_rate": 8.759320577875676e-06, + "loss": 0.5192, + "step": 4345 + }, + { + "epoch": 1.2052135330005547, + "grad_norm": 0.19407188892364502, + "learning_rate": 8.754052340516796e-06, + "loss": 0.4912, + "step": 4346 + }, + { + "epoch": 1.2054908485856906, + "grad_norm": 0.2085913121700287, + "learning_rate": 8.748784834091549e-06, + "loss": 0.4861, + "step": 4347 + }, + { + "epoch": 1.2057681641708264, + "grad_norm": 0.20071199536323547, + "learning_rate": 8.74351805962777e-06, + "loss": 0.5451, + "step": 4348 + }, + { + "epoch": 1.2060454797559623, + "grad_norm": 0.20174618065357208, + "learning_rate": 8.738252018153145e-06, + "loss": 0.5094, + "step": 4349 + }, + { + "epoch": 1.2063227953410982, + "grad_norm": 0.2093082219362259, + "learning_rate": 8.73298671069521e-06, + "loss": 0.5161, + "step": 4350 + }, + { + "epoch": 1.206600110926234, + "grad_norm": 0.21680103242397308, + "learning_rate": 8.727722138281381e-06, + "loss": 0.5058, + "step": 4351 + }, + { + "epoch": 1.20687742651137, + "grad_norm": 0.19285671412944794, + "learning_rate": 8.722458301938904e-06, + "loss": 0.4775, + "step": 4352 + }, + { + "epoch": 1.2071547420965059, + "grad_norm": 0.20253880321979523, + "learning_rate": 8.717195202694898e-06, + "loss": 0.501, + "step": 4353 + }, + { + "epoch": 1.2074320576816417, + "grad_norm": 0.19431787729263306, + "learning_rate": 8.711932841576325e-06, + "loss": 0.4927, + "step": 4354 + }, + { + "epoch": 1.2077093732667776, + "grad_norm": 0.20108050107955933, + "learning_rate": 8.706671219610027e-06, + "loss": 0.5135, + "step": 4355 + }, + { + "epoch": 1.2079866888519135, + "grad_norm": 0.20072422921657562, + "learning_rate": 8.70141033782267e-06, + "loss": 0.507, + "step": 4356 + }, + { + "epoch": 1.2082640044370494, + "grad_norm": 0.21369212865829468, + "learning_rate": 8.696150197240798e-06, + "loss": 0.5259, + "step": 4357 + }, + { + "epoch": 1.2085413200221853, + "grad_norm": 0.1956164538860321, + "learning_rate": 8.690890798890806e-06, + "loss": 0.5293, + "step": 4358 + }, + { + "epoch": 1.2088186356073212, + "grad_norm": 0.19677644968032837, + "learning_rate": 8.685632143798938e-06, + "loss": 0.5185, + "step": 4359 + }, + { + "epoch": 1.209095951192457, + "grad_norm": 0.19301322102546692, + "learning_rate": 8.680374232991304e-06, + "loss": 0.5092, + "step": 4360 + }, + { + "epoch": 1.209373266777593, + "grad_norm": 0.19476044178009033, + "learning_rate": 8.67511706749385e-06, + "loss": 0.5197, + "step": 4361 + }, + { + "epoch": 1.2096505823627288, + "grad_norm": 0.20564420521259308, + "learning_rate": 8.669860648332395e-06, + "loss": 0.519, + "step": 4362 + }, + { + "epoch": 1.2099278979478647, + "grad_norm": 0.20334792137145996, + "learning_rate": 8.664604976532605e-06, + "loss": 0.5407, + "step": 4363 + }, + { + "epoch": 1.2102052135330006, + "grad_norm": 0.1910853236913681, + "learning_rate": 8.659350053120003e-06, + "loss": 0.5174, + "step": 4364 + }, + { + "epoch": 1.2104825291181365, + "grad_norm": 0.1837315857410431, + "learning_rate": 8.65409587911996e-06, + "loss": 0.5198, + "step": 4365 + }, + { + "epoch": 1.2107598447032724, + "grad_norm": 0.19404159486293793, + "learning_rate": 8.64884245555771e-06, + "loss": 0.4944, + "step": 4366 + }, + { + "epoch": 1.2110371602884082, + "grad_norm": 0.18698063492774963, + "learning_rate": 8.643589783458328e-06, + "loss": 0.5151, + "step": 4367 + }, + { + "epoch": 1.2113144758735441, + "grad_norm": 0.2053699940443039, + "learning_rate": 8.638337863846752e-06, + "loss": 0.5182, + "step": 4368 + }, + { + "epoch": 1.21159179145868, + "grad_norm": 0.18703065812587738, + "learning_rate": 8.633086697747773e-06, + "loss": 0.4803, + "step": 4369 + }, + { + "epoch": 1.211869107043816, + "grad_norm": 0.19493581354618073, + "learning_rate": 8.627836286186035e-06, + "loss": 0.5313, + "step": 4370 + }, + { + "epoch": 1.2121464226289518, + "grad_norm": 0.1981756091117859, + "learning_rate": 8.622586630186019e-06, + "loss": 0.5012, + "step": 4371 + }, + { + "epoch": 1.2124237382140877, + "grad_norm": 0.18528394401073456, + "learning_rate": 8.61733773077209e-06, + "loss": 0.4845, + "step": 4372 + }, + { + "epoch": 1.2127010537992235, + "grad_norm": 0.20212890207767487, + "learning_rate": 8.612089588968437e-06, + "loss": 0.498, + "step": 4373 + }, + { + "epoch": 1.2129783693843594, + "grad_norm": 0.19095052778720856, + "learning_rate": 8.606842205799113e-06, + "loss": 0.4707, + "step": 4374 + }, + { + "epoch": 1.2132556849694953, + "grad_norm": 0.18967384099960327, + "learning_rate": 8.60159558228802e-06, + "loss": 0.5123, + "step": 4375 + }, + { + "epoch": 1.2135330005546312, + "grad_norm": 0.20997461676597595, + "learning_rate": 8.596349719458916e-06, + "loss": 0.4683, + "step": 4376 + }, + { + "epoch": 1.213810316139767, + "grad_norm": 0.19758398830890656, + "learning_rate": 8.591104618335413e-06, + "loss": 0.5227, + "step": 4377 + }, + { + "epoch": 1.214087631724903, + "grad_norm": 0.19900359213352203, + "learning_rate": 8.585860279940954e-06, + "loss": 0.4995, + "step": 4378 + }, + { + "epoch": 1.2143649473100389, + "grad_norm": 0.19332247972488403, + "learning_rate": 8.580616705298864e-06, + "loss": 0.5178, + "step": 4379 + }, + { + "epoch": 1.2146422628951747, + "grad_norm": 0.18881361186504364, + "learning_rate": 8.575373895432293e-06, + "loss": 0.493, + "step": 4380 + }, + { + "epoch": 1.2149195784803106, + "grad_norm": 0.222880020737648, + "learning_rate": 8.57013185136426e-06, + "loss": 0.4933, + "step": 4381 + }, + { + "epoch": 1.2151968940654465, + "grad_norm": 0.19780275225639343, + "learning_rate": 8.564890574117616e-06, + "loss": 0.4966, + "step": 4382 + }, + { + "epoch": 1.2154742096505824, + "grad_norm": 0.1969325840473175, + "learning_rate": 8.559650064715088e-06, + "loss": 0.5132, + "step": 4383 + }, + { + "epoch": 1.2157515252357183, + "grad_norm": 0.2127065658569336, + "learning_rate": 8.554410324179226e-06, + "loss": 0.496, + "step": 4384 + }, + { + "epoch": 1.2160288408208542, + "grad_norm": 0.20196539163589478, + "learning_rate": 8.549171353532443e-06, + "loss": 0.4894, + "step": 4385 + }, + { + "epoch": 1.21630615640599, + "grad_norm": 0.19963231682777405, + "learning_rate": 8.543933153797007e-06, + "loss": 0.5355, + "step": 4386 + }, + { + "epoch": 1.216583471991126, + "grad_norm": 0.20684166252613068, + "learning_rate": 8.53869572599503e-06, + "loss": 0.497, + "step": 4387 + }, + { + "epoch": 1.2168607875762618, + "grad_norm": 0.20197083055973053, + "learning_rate": 8.533459071148462e-06, + "loss": 0.4998, + "step": 4388 + }, + { + "epoch": 1.2171381031613977, + "grad_norm": 0.19906753301620483, + "learning_rate": 8.528223190279128e-06, + "loss": 0.4932, + "step": 4389 + }, + { + "epoch": 1.2174154187465336, + "grad_norm": 0.1864069402217865, + "learning_rate": 8.522988084408678e-06, + "loss": 0.4894, + "step": 4390 + }, + { + "epoch": 1.2176927343316695, + "grad_norm": 0.1982210874557495, + "learning_rate": 8.517753754558621e-06, + "loss": 0.4868, + "step": 4391 + }, + { + "epoch": 1.2179700499168054, + "grad_norm": 0.19526433944702148, + "learning_rate": 8.512520201750312e-06, + "loss": 0.5154, + "step": 4392 + }, + { + "epoch": 1.2182473655019412, + "grad_norm": 0.19613106548786163, + "learning_rate": 8.507287427004962e-06, + "loss": 0.5095, + "step": 4393 + }, + { + "epoch": 1.2185246810870771, + "grad_norm": 0.18780362606048584, + "learning_rate": 8.502055431343618e-06, + "loss": 0.5064, + "step": 4394 + }, + { + "epoch": 1.218801996672213, + "grad_norm": 0.20160475373268127, + "learning_rate": 8.49682421578718e-06, + "loss": 0.5269, + "step": 4395 + }, + { + "epoch": 1.219079312257349, + "grad_norm": 0.22338464856147766, + "learning_rate": 8.491593781356404e-06, + "loss": 0.5086, + "step": 4396 + }, + { + "epoch": 1.2193566278424848, + "grad_norm": 0.18976663053035736, + "learning_rate": 8.48636412907188e-06, + "loss": 0.5017, + "step": 4397 + }, + { + "epoch": 1.2196339434276207, + "grad_norm": 0.18721407651901245, + "learning_rate": 8.481135259954057e-06, + "loss": 0.4921, + "step": 4398 + }, + { + "epoch": 1.2199112590127565, + "grad_norm": 0.19557468593120575, + "learning_rate": 8.475907175023218e-06, + "loss": 0.495, + "step": 4399 + }, + { + "epoch": 1.2201885745978924, + "grad_norm": 0.1959930807352066, + "learning_rate": 8.470679875299507e-06, + "loss": 0.5033, + "step": 4400 + }, + { + "epoch": 1.2204658901830283, + "grad_norm": 0.19299939274787903, + "learning_rate": 8.465453361802907e-06, + "loss": 0.5275, + "step": 4401 + }, + { + "epoch": 1.2207432057681642, + "grad_norm": 0.19271641969680786, + "learning_rate": 8.460227635553247e-06, + "loss": 0.4899, + "step": 4402 + }, + { + "epoch": 1.2210205213533, + "grad_norm": 0.19292891025543213, + "learning_rate": 8.455002697570211e-06, + "loss": 0.4917, + "step": 4403 + }, + { + "epoch": 1.221297836938436, + "grad_norm": 0.19001443684101105, + "learning_rate": 8.44977854887332e-06, + "loss": 0.4953, + "step": 4404 + }, + { + "epoch": 1.2215751525235718, + "grad_norm": 0.2002885341644287, + "learning_rate": 8.44455519048194e-06, + "loss": 0.5099, + "step": 4405 + }, + { + "epoch": 1.2218524681087077, + "grad_norm": 0.1973315179347992, + "learning_rate": 8.439332623415287e-06, + "loss": 0.5097, + "step": 4406 + }, + { + "epoch": 1.2221297836938436, + "grad_norm": 0.1916041374206543, + "learning_rate": 8.434110848692427e-06, + "loss": 0.5177, + "step": 4407 + }, + { + "epoch": 1.2224070992789795, + "grad_norm": 0.2049477994441986, + "learning_rate": 8.428889867332268e-06, + "loss": 0.4952, + "step": 4408 + }, + { + "epoch": 1.2226844148641154, + "grad_norm": 0.2035469114780426, + "learning_rate": 8.423669680353549e-06, + "loss": 0.5438, + "step": 4409 + }, + { + "epoch": 1.2229617304492513, + "grad_norm": 0.1949811726808548, + "learning_rate": 8.418450288774884e-06, + "loss": 0.5214, + "step": 4410 + }, + { + "epoch": 1.2232390460343872, + "grad_norm": 0.22771196067333221, + "learning_rate": 8.413231693614704e-06, + "loss": 0.5108, + "step": 4411 + }, + { + "epoch": 1.223516361619523, + "grad_norm": 0.19894933700561523, + "learning_rate": 8.408013895891295e-06, + "loss": 0.4941, + "step": 4412 + }, + { + "epoch": 1.223793677204659, + "grad_norm": 0.19783443212509155, + "learning_rate": 8.40279689662279e-06, + "loss": 0.4934, + "step": 4413 + }, + { + "epoch": 1.2240709927897948, + "grad_norm": 0.1914057582616806, + "learning_rate": 8.397580696827166e-06, + "loss": 0.5151, + "step": 4414 + }, + { + "epoch": 1.2243483083749307, + "grad_norm": 0.19515813887119293, + "learning_rate": 8.392365297522243e-06, + "loss": 0.4888, + "step": 4415 + }, + { + "epoch": 1.2246256239600666, + "grad_norm": 0.19242678582668304, + "learning_rate": 8.387150699725673e-06, + "loss": 0.5101, + "step": 4416 + }, + { + "epoch": 1.2249029395452025, + "grad_norm": 0.2145782858133316, + "learning_rate": 8.381936904454973e-06, + "loss": 0.5157, + "step": 4417 + }, + { + "epoch": 1.2251802551303383, + "grad_norm": 0.19011950492858887, + "learning_rate": 8.376723912727488e-06, + "loss": 0.4891, + "step": 4418 + }, + { + "epoch": 1.2254575707154742, + "grad_norm": 0.19757391512393951, + "learning_rate": 8.371511725560416e-06, + "loss": 0.5173, + "step": 4419 + }, + { + "epoch": 1.2257348863006101, + "grad_norm": 0.2027006596326828, + "learning_rate": 8.36630034397078e-06, + "loss": 0.5116, + "step": 4420 + }, + { + "epoch": 1.226012201885746, + "grad_norm": 0.2004849761724472, + "learning_rate": 8.361089768975475e-06, + "loss": 0.5077, + "step": 4421 + }, + { + "epoch": 1.2262895174708819, + "grad_norm": 0.20179691910743713, + "learning_rate": 8.355880001591212e-06, + "loss": 0.4969, + "step": 4422 + }, + { + "epoch": 1.2265668330560178, + "grad_norm": 0.20845246315002441, + "learning_rate": 8.350671042834555e-06, + "loss": 0.4789, + "step": 4423 + }, + { + "epoch": 1.2268441486411537, + "grad_norm": 0.19270509481430054, + "learning_rate": 8.345462893721911e-06, + "loss": 0.4874, + "step": 4424 + }, + { + "epoch": 1.2271214642262895, + "grad_norm": 0.19383633136749268, + "learning_rate": 8.340255555269535e-06, + "loss": 0.5006, + "step": 4425 + }, + { + "epoch": 1.2273987798114254, + "grad_norm": 0.18180836737155914, + "learning_rate": 8.335049028493509e-06, + "loss": 0.5117, + "step": 4426 + }, + { + "epoch": 1.2276760953965613, + "grad_norm": 0.196406289935112, + "learning_rate": 8.32984331440976e-06, + "loss": 0.5224, + "step": 4427 + }, + { + "epoch": 1.2279534109816972, + "grad_norm": 0.19745458662509918, + "learning_rate": 8.324638414034069e-06, + "loss": 0.4721, + "step": 4428 + }, + { + "epoch": 1.228230726566833, + "grad_norm": 0.2035657912492752, + "learning_rate": 8.31943432838205e-06, + "loss": 0.4984, + "step": 4429 + }, + { + "epoch": 1.228508042151969, + "grad_norm": 0.204306960105896, + "learning_rate": 8.314231058469152e-06, + "loss": 0.4978, + "step": 4430 + }, + { + "epoch": 1.2287853577371048, + "grad_norm": 0.20982758700847626, + "learning_rate": 8.309028605310679e-06, + "loss": 0.4996, + "step": 4431 + }, + { + "epoch": 1.2290626733222407, + "grad_norm": 0.19134745001792908, + "learning_rate": 8.30382696992176e-06, + "loss": 0.4947, + "step": 4432 + }, + { + "epoch": 1.2293399889073766, + "grad_norm": 0.19805335998535156, + "learning_rate": 8.298626153317376e-06, + "loss": 0.5233, + "step": 4433 + }, + { + "epoch": 1.2296173044925125, + "grad_norm": 0.19368603825569153, + "learning_rate": 8.293426156512341e-06, + "loss": 0.5168, + "step": 4434 + }, + { + "epoch": 1.2298946200776484, + "grad_norm": 0.19627097249031067, + "learning_rate": 8.288226980521314e-06, + "loss": 0.4882, + "step": 4435 + }, + { + "epoch": 1.2301719356627843, + "grad_norm": 0.19731606543064117, + "learning_rate": 8.283028626358796e-06, + "loss": 0.5079, + "step": 4436 + }, + { + "epoch": 1.2304492512479202, + "grad_norm": 0.22327442467212677, + "learning_rate": 8.277831095039113e-06, + "loss": 0.5173, + "step": 4437 + }, + { + "epoch": 1.230726566833056, + "grad_norm": 0.2100764513015747, + "learning_rate": 8.272634387576453e-06, + "loss": 0.501, + "step": 4438 + }, + { + "epoch": 1.231003882418192, + "grad_norm": 0.19981728494167328, + "learning_rate": 8.267438504984823e-06, + "loss": 0.477, + "step": 4439 + }, + { + "epoch": 1.2312811980033278, + "grad_norm": 0.19193001091480255, + "learning_rate": 8.262243448278084e-06, + "loss": 0.5078, + "step": 4440 + }, + { + "epoch": 1.2315585135884637, + "grad_norm": 0.2083725780248642, + "learning_rate": 8.257049218469917e-06, + "loss": 0.5017, + "step": 4441 + }, + { + "epoch": 1.2318358291735996, + "grad_norm": 0.19924737513065338, + "learning_rate": 8.251855816573873e-06, + "loss": 0.5036, + "step": 4442 + }, + { + "epoch": 1.2321131447587355, + "grad_norm": 0.19425547122955322, + "learning_rate": 8.246663243603305e-06, + "loss": 0.5207, + "step": 4443 + }, + { + "epoch": 1.2323904603438713, + "grad_norm": 0.20682436227798462, + "learning_rate": 8.241471500571428e-06, + "loss": 0.5119, + "step": 4444 + }, + { + "epoch": 1.2326677759290072, + "grad_norm": 0.1909773349761963, + "learning_rate": 8.236280588491292e-06, + "loss": 0.4723, + "step": 4445 + }, + { + "epoch": 1.232945091514143, + "grad_norm": 0.2076360434293747, + "learning_rate": 8.231090508375777e-06, + "loss": 0.4959, + "step": 4446 + }, + { + "epoch": 1.233222407099279, + "grad_norm": 0.19352662563323975, + "learning_rate": 8.225901261237609e-06, + "loss": 0.5002, + "step": 4447 + }, + { + "epoch": 1.2334997226844149, + "grad_norm": 0.21294668316841125, + "learning_rate": 8.220712848089338e-06, + "loss": 0.5099, + "step": 4448 + }, + { + "epoch": 1.2337770382695508, + "grad_norm": 0.20197489857673645, + "learning_rate": 8.215525269943374e-06, + "loss": 0.4944, + "step": 4449 + }, + { + "epoch": 1.2340543538546866, + "grad_norm": 0.20826977491378784, + "learning_rate": 8.210338527811943e-06, + "loss": 0.5321, + "step": 4450 + }, + { + "epoch": 1.2343316694398225, + "grad_norm": 0.20179514586925507, + "learning_rate": 8.205152622707116e-06, + "loss": 0.5053, + "step": 4451 + }, + { + "epoch": 1.2346089850249584, + "grad_norm": 0.21204441785812378, + "learning_rate": 8.199967555640805e-06, + "loss": 0.4856, + "step": 4452 + }, + { + "epoch": 1.2348863006100943, + "grad_norm": 0.20789694786071777, + "learning_rate": 8.194783327624751e-06, + "loss": 0.5174, + "step": 4453 + }, + { + "epoch": 1.2351636161952302, + "grad_norm": 0.207809716463089, + "learning_rate": 8.189599939670531e-06, + "loss": 0.5028, + "step": 4454 + }, + { + "epoch": 1.235440931780366, + "grad_norm": 0.3477715849876404, + "learning_rate": 8.184417392789568e-06, + "loss": 0.4812, + "step": 4455 + }, + { + "epoch": 1.235718247365502, + "grad_norm": 0.20378254354000092, + "learning_rate": 8.179235687993108e-06, + "loss": 0.499, + "step": 4456 + }, + { + "epoch": 1.2359955629506378, + "grad_norm": 0.19878922402858734, + "learning_rate": 8.174054826292249e-06, + "loss": 0.508, + "step": 4457 + }, + { + "epoch": 1.2362728785357737, + "grad_norm": 0.21171467006206512, + "learning_rate": 8.168874808697896e-06, + "loss": 0.5037, + "step": 4458 + }, + { + "epoch": 1.2365501941209096, + "grad_norm": 0.1954033523797989, + "learning_rate": 8.163695636220828e-06, + "loss": 0.5014, + "step": 4459 + }, + { + "epoch": 1.2368275097060455, + "grad_norm": 0.19483618438243866, + "learning_rate": 8.158517309871626e-06, + "loss": 0.4929, + "step": 4460 + }, + { + "epoch": 1.2371048252911814, + "grad_norm": 0.19928301870822906, + "learning_rate": 8.153339830660719e-06, + "loss": 0.5078, + "step": 4461 + }, + { + "epoch": 1.2373821408763173, + "grad_norm": 0.20188100636005402, + "learning_rate": 8.148163199598379e-06, + "loss": 0.4913, + "step": 4462 + }, + { + "epoch": 1.2376594564614531, + "grad_norm": 0.21561436355113983, + "learning_rate": 8.142987417694699e-06, + "loss": 0.4676, + "step": 4463 + }, + { + "epoch": 1.237936772046589, + "grad_norm": 0.21500158309936523, + "learning_rate": 8.137812485959608e-06, + "loss": 0.4817, + "step": 4464 + }, + { + "epoch": 1.238214087631725, + "grad_norm": 0.20233996212482452, + "learning_rate": 8.132638405402874e-06, + "loss": 0.5101, + "step": 4465 + }, + { + "epoch": 1.2384914032168608, + "grad_norm": 0.20174197852611542, + "learning_rate": 8.1274651770341e-06, + "loss": 0.4972, + "step": 4466 + }, + { + "epoch": 1.2387687188019967, + "grad_norm": 0.190217986702919, + "learning_rate": 8.122292801862716e-06, + "loss": 0.498, + "step": 4467 + }, + { + "epoch": 1.2390460343871326, + "grad_norm": 0.20103545486927032, + "learning_rate": 8.11712128089799e-06, + "loss": 0.5011, + "step": 4468 + }, + { + "epoch": 1.2393233499722685, + "grad_norm": 0.20134317874908447, + "learning_rate": 8.111950615149031e-06, + "loss": 0.5089, + "step": 4469 + }, + { + "epoch": 1.2396006655574043, + "grad_norm": 0.20507292449474335, + "learning_rate": 8.106780805624764e-06, + "loss": 0.5092, + "step": 4470 + }, + { + "epoch": 1.2398779811425402, + "grad_norm": 0.19532889127731323, + "learning_rate": 8.101611853333955e-06, + "loss": 0.4975, + "step": 4471 + }, + { + "epoch": 1.240155296727676, + "grad_norm": 0.20147058367729187, + "learning_rate": 8.096443759285206e-06, + "loss": 0.5055, + "step": 4472 + }, + { + "epoch": 1.240432612312812, + "grad_norm": 0.20608854293823242, + "learning_rate": 8.09127652448695e-06, + "loss": 0.5094, + "step": 4473 + }, + { + "epoch": 1.2407099278979479, + "grad_norm": 0.20161250233650208, + "learning_rate": 8.086110149947457e-06, + "loss": 0.5182, + "step": 4474 + }, + { + "epoch": 1.2409872434830838, + "grad_norm": 0.19690173864364624, + "learning_rate": 8.080944636674811e-06, + "loss": 0.5211, + "step": 4475 + }, + { + "epoch": 1.2412645590682196, + "grad_norm": 0.1985633373260498, + "learning_rate": 8.075779985676949e-06, + "loss": 0.4994, + "step": 4476 + }, + { + "epoch": 1.2415418746533555, + "grad_norm": 0.1929730325937271, + "learning_rate": 8.070616197961631e-06, + "loss": 0.4815, + "step": 4477 + }, + { + "epoch": 1.2418191902384914, + "grad_norm": 0.19387488067150116, + "learning_rate": 8.065453274536447e-06, + "loss": 0.5091, + "step": 4478 + }, + { + "epoch": 1.2420965058236273, + "grad_norm": 0.19065174460411072, + "learning_rate": 8.060291216408814e-06, + "loss": 0.4868, + "step": 4479 + }, + { + "epoch": 1.2423738214087632, + "grad_norm": 0.20745143294334412, + "learning_rate": 8.055130024586e-06, + "loss": 0.4938, + "step": 4480 + }, + { + "epoch": 1.242651136993899, + "grad_norm": 0.19908511638641357, + "learning_rate": 8.04996970007508e-06, + "loss": 0.4939, + "step": 4481 + }, + { + "epoch": 1.242928452579035, + "grad_norm": 0.2046736627817154, + "learning_rate": 8.044810243882971e-06, + "loss": 0.5194, + "step": 4482 + }, + { + "epoch": 1.2432057681641708, + "grad_norm": 0.20767585933208466, + "learning_rate": 8.039651657016423e-06, + "loss": 0.5176, + "step": 4483 + }, + { + "epoch": 1.2434830837493067, + "grad_norm": 0.20827417075634003, + "learning_rate": 8.034493940482016e-06, + "loss": 0.4997, + "step": 4484 + }, + { + "epoch": 1.2437603993344426, + "grad_norm": 0.21914151310920715, + "learning_rate": 8.029337095286147e-06, + "loss": 0.4901, + "step": 4485 + }, + { + "epoch": 1.2440377149195785, + "grad_norm": 0.21400579810142517, + "learning_rate": 8.024181122435058e-06, + "loss": 0.4926, + "step": 4486 + }, + { + "epoch": 1.2443150305047144, + "grad_norm": 0.19826753437519073, + "learning_rate": 8.01902602293482e-06, + "loss": 0.5113, + "step": 4487 + }, + { + "epoch": 1.2445923460898503, + "grad_norm": 0.20881658792495728, + "learning_rate": 8.013871797791324e-06, + "loss": 0.4921, + "step": 4488 + }, + { + "epoch": 1.2448696616749861, + "grad_norm": 0.18899375200271606, + "learning_rate": 8.008718448010297e-06, + "loss": 0.5139, + "step": 4489 + }, + { + "epoch": 1.245146977260122, + "grad_norm": 0.1959080994129181, + "learning_rate": 8.003565974597298e-06, + "loss": 0.4864, + "step": 4490 + }, + { + "epoch": 1.245424292845258, + "grad_norm": 0.20414955914020538, + "learning_rate": 7.99841437855771e-06, + "loss": 0.4847, + "step": 4491 + }, + { + "epoch": 1.2457016084303938, + "grad_norm": 0.1957169771194458, + "learning_rate": 7.993263660896738e-06, + "loss": 0.5117, + "step": 4492 + }, + { + "epoch": 1.2459789240155297, + "grad_norm": 0.2063983678817749, + "learning_rate": 7.988113822619431e-06, + "loss": 0.4969, + "step": 4493 + }, + { + "epoch": 1.2462562396006656, + "grad_norm": 0.1924324780702591, + "learning_rate": 7.982964864730658e-06, + "loss": 0.4815, + "step": 4494 + }, + { + "epoch": 1.2465335551858014, + "grad_norm": 0.19338028132915497, + "learning_rate": 7.97781678823512e-06, + "loss": 0.4859, + "step": 4495 + }, + { + "epoch": 1.2468108707709373, + "grad_norm": 0.19696307182312012, + "learning_rate": 7.972669594137333e-06, + "loss": 0.5102, + "step": 4496 + }, + { + "epoch": 1.2470881863560732, + "grad_norm": 0.19288606941699982, + "learning_rate": 7.967523283441664e-06, + "loss": 0.4892, + "step": 4497 + }, + { + "epoch": 1.247365501941209, + "grad_norm": 0.19867482781410217, + "learning_rate": 7.962377857152284e-06, + "loss": 0.4955, + "step": 4498 + }, + { + "epoch": 1.247642817526345, + "grad_norm": 0.20236186683177948, + "learning_rate": 7.957233316273211e-06, + "loss": 0.5143, + "step": 4499 + }, + { + "epoch": 1.2479201331114809, + "grad_norm": 0.2065366804599762, + "learning_rate": 7.952089661808268e-06, + "loss": 0.4996, + "step": 4500 + }, + { + "epoch": 1.2481974486966168, + "grad_norm": 0.1983073204755783, + "learning_rate": 7.946946894761134e-06, + "loss": 0.4892, + "step": 4501 + }, + { + "epoch": 1.2484747642817526, + "grad_norm": 0.19757206737995148, + "learning_rate": 7.94180501613529e-06, + "loss": 0.5111, + "step": 4502 + }, + { + "epoch": 1.2487520798668885, + "grad_norm": 0.20565056800842285, + "learning_rate": 7.936664026934052e-06, + "loss": 0.4943, + "step": 4503 + }, + { + "epoch": 1.2490293954520244, + "grad_norm": 0.19800062477588654, + "learning_rate": 7.931523928160567e-06, + "loss": 0.4838, + "step": 4504 + }, + { + "epoch": 1.2493067110371603, + "grad_norm": 0.2019055336713791, + "learning_rate": 7.926384720817807e-06, + "loss": 0.5413, + "step": 4505 + }, + { + "epoch": 1.2495840266222962, + "grad_norm": 0.19655373692512512, + "learning_rate": 7.921246405908558e-06, + "loss": 0.5043, + "step": 4506 + }, + { + "epoch": 1.249861342207432, + "grad_norm": 0.1993308812379837, + "learning_rate": 7.916108984435448e-06, + "loss": 0.5122, + "step": 4507 + }, + { + "epoch": 1.250138657792568, + "grad_norm": 0.1926482617855072, + "learning_rate": 7.910972457400923e-06, + "loss": 0.5103, + "step": 4508 + }, + { + "epoch": 1.2504159733777038, + "grad_norm": 0.20536184310913086, + "learning_rate": 7.905836825807257e-06, + "loss": 0.5124, + "step": 4509 + }, + { + "epoch": 1.2506932889628397, + "grad_norm": 0.19252263009548187, + "learning_rate": 7.900702090656545e-06, + "loss": 0.5069, + "step": 4510 + }, + { + "epoch": 1.2509706045479756, + "grad_norm": 0.20172010362148285, + "learning_rate": 7.895568252950711e-06, + "loss": 0.5081, + "step": 4511 + }, + { + "epoch": 1.2512479201331115, + "grad_norm": 0.1947304904460907, + "learning_rate": 7.890435313691507e-06, + "loss": 0.5038, + "step": 4512 + }, + { + "epoch": 1.2515252357182474, + "grad_norm": 0.20323634147644043, + "learning_rate": 7.885303273880498e-06, + "loss": 0.5116, + "step": 4513 + }, + { + "epoch": 1.2518025513033832, + "grad_norm": 0.21063730120658875, + "learning_rate": 7.880172134519082e-06, + "loss": 0.5288, + "step": 4514 + }, + { + "epoch": 1.2520798668885191, + "grad_norm": 0.19582238793373108, + "learning_rate": 7.875041896608487e-06, + "loss": 0.507, + "step": 4515 + }, + { + "epoch": 1.252357182473655, + "grad_norm": 0.21959121525287628, + "learning_rate": 7.869912561149755e-06, + "loss": 0.5066, + "step": 4516 + }, + { + "epoch": 1.252634498058791, + "grad_norm": 0.21660704910755157, + "learning_rate": 7.864784129143747e-06, + "loss": 0.5058, + "step": 4517 + }, + { + "epoch": 1.2529118136439268, + "grad_norm": 0.20365749299526215, + "learning_rate": 7.85965660159117e-06, + "loss": 0.4966, + "step": 4518 + }, + { + "epoch": 1.2531891292290627, + "grad_norm": 0.21940533816814423, + "learning_rate": 7.85452997949253e-06, + "loss": 0.4925, + "step": 4519 + }, + { + "epoch": 1.2534664448141986, + "grad_norm": 0.2422044575214386, + "learning_rate": 7.84940426384817e-06, + "loss": 0.4982, + "step": 4520 + }, + { + "epoch": 1.2537437603993344, + "grad_norm": 0.1977468580007553, + "learning_rate": 7.844279455658257e-06, + "loss": 0.5266, + "step": 4521 + }, + { + "epoch": 1.2540210759844703, + "grad_norm": 0.20944328606128693, + "learning_rate": 7.839155555922773e-06, + "loss": 0.4822, + "step": 4522 + }, + { + "epoch": 1.2542983915696062, + "grad_norm": 0.1976504623889923, + "learning_rate": 7.834032565641525e-06, + "loss": 0.4964, + "step": 4523 + }, + { + "epoch": 1.254575707154742, + "grad_norm": 0.20402562618255615, + "learning_rate": 7.828910485814142e-06, + "loss": 0.5108, + "step": 4524 + }, + { + "epoch": 1.254853022739878, + "grad_norm": 0.19931097328662872, + "learning_rate": 7.823789317440086e-06, + "loss": 0.4945, + "step": 4525 + }, + { + "epoch": 1.2551303383250139, + "grad_norm": 0.22738957405090332, + "learning_rate": 7.818669061518628e-06, + "loss": 0.5012, + "step": 4526 + }, + { + "epoch": 1.2554076539101497, + "grad_norm": 0.20075024664402008, + "learning_rate": 7.813549719048862e-06, + "loss": 0.4728, + "step": 4527 + }, + { + "epoch": 1.2556849694952856, + "grad_norm": 0.18941983580589294, + "learning_rate": 7.808431291029717e-06, + "loss": 0.5031, + "step": 4528 + }, + { + "epoch": 1.2559622850804215, + "grad_norm": 0.1993735283613205, + "learning_rate": 7.803313778459925e-06, + "loss": 0.4867, + "step": 4529 + }, + { + "epoch": 1.2562396006655574, + "grad_norm": 0.20413918793201447, + "learning_rate": 7.798197182338051e-06, + "loss": 0.4906, + "step": 4530 + }, + { + "epoch": 1.2565169162506933, + "grad_norm": 0.19167618453502655, + "learning_rate": 7.793081503662477e-06, + "loss": 0.4974, + "step": 4531 + }, + { + "epoch": 1.2567942318358292, + "grad_norm": 0.20251625776290894, + "learning_rate": 7.78796674343141e-06, + "loss": 0.5196, + "step": 4532 + }, + { + "epoch": 1.257071547420965, + "grad_norm": 0.19917502999305725, + "learning_rate": 7.78285290264288e-06, + "loss": 0.5262, + "step": 4533 + }, + { + "epoch": 1.257348863006101, + "grad_norm": 0.23531167209148407, + "learning_rate": 7.777739982294719e-06, + "loss": 0.5052, + "step": 4534 + }, + { + "epoch": 1.2576261785912368, + "grad_norm": 0.20423223078250885, + "learning_rate": 7.772627983384604e-06, + "loss": 0.5154, + "step": 4535 + }, + { + "epoch": 1.2579034941763727, + "grad_norm": 0.19194969534873962, + "learning_rate": 7.767516906910018e-06, + "loss": 0.4841, + "step": 4536 + }, + { + "epoch": 1.2581808097615086, + "grad_norm": 0.20146818459033966, + "learning_rate": 7.762406753868273e-06, + "loss": 0.5213, + "step": 4537 + }, + { + "epoch": 1.2584581253466445, + "grad_norm": 0.2293848693370819, + "learning_rate": 7.757297525256482e-06, + "loss": 0.5098, + "step": 4538 + }, + { + "epoch": 1.2587354409317804, + "grad_norm": 0.201215922832489, + "learning_rate": 7.752189222071607e-06, + "loss": 0.5166, + "step": 4539 + }, + { + "epoch": 1.2590127565169162, + "grad_norm": 0.20330609381198883, + "learning_rate": 7.747081845310403e-06, + "loss": 0.478, + "step": 4540 + }, + { + "epoch": 1.2592900721020521, + "grad_norm": 0.19179701805114746, + "learning_rate": 7.741975395969456e-06, + "loss": 0.5067, + "step": 4541 + }, + { + "epoch": 1.259567387687188, + "grad_norm": 0.19922393560409546, + "learning_rate": 7.736869875045171e-06, + "loss": 0.519, + "step": 4542 + }, + { + "epoch": 1.259844703272324, + "grad_norm": 0.19848206639289856, + "learning_rate": 7.731765283533773e-06, + "loss": 0.4716, + "step": 4543 + }, + { + "epoch": 1.2601220188574598, + "grad_norm": 0.2069179117679596, + "learning_rate": 7.7266616224313e-06, + "loss": 0.5092, + "step": 4544 + }, + { + "epoch": 1.2603993344425957, + "grad_norm": 0.20515765249729156, + "learning_rate": 7.721558892733608e-06, + "loss": 0.5013, + "step": 4545 + }, + { + "epoch": 1.2606766500277315, + "grad_norm": 0.195294588804245, + "learning_rate": 7.716457095436378e-06, + "loss": 0.5055, + "step": 4546 + }, + { + "epoch": 1.2609539656128674, + "grad_norm": 0.1927175670862198, + "learning_rate": 7.711356231535111e-06, + "loss": 0.5105, + "step": 4547 + }, + { + "epoch": 1.2612312811980033, + "grad_norm": 0.19795742630958557, + "learning_rate": 7.706256302025109e-06, + "loss": 0.5169, + "step": 4548 + }, + { + "epoch": 1.2615085967831392, + "grad_norm": 0.2066584676504135, + "learning_rate": 7.701157307901515e-06, + "loss": 0.5266, + "step": 4549 + }, + { + "epoch": 1.261785912368275, + "grad_norm": 0.20272813737392426, + "learning_rate": 7.696059250159277e-06, + "loss": 0.4815, + "step": 4550 + }, + { + "epoch": 1.262063227953411, + "grad_norm": 0.20374014973640442, + "learning_rate": 7.690962129793153e-06, + "loss": 0.4925, + "step": 4551 + }, + { + "epoch": 1.2623405435385469, + "grad_norm": 0.19825245440006256, + "learning_rate": 7.68586594779773e-06, + "loss": 0.5236, + "step": 4552 + }, + { + "epoch": 1.2626178591236827, + "grad_norm": 0.19967573881149292, + "learning_rate": 7.68077070516741e-06, + "loss": 0.4796, + "step": 4553 + }, + { + "epoch": 1.2628951747088186, + "grad_norm": 0.19628724455833435, + "learning_rate": 7.67567640289641e-06, + "loss": 0.5064, + "step": 4554 + }, + { + "epoch": 1.2631724902939545, + "grad_norm": 0.20510771870613098, + "learning_rate": 7.670583041978754e-06, + "loss": 0.5365, + "step": 4555 + }, + { + "epoch": 1.2634498058790904, + "grad_norm": 0.1963818073272705, + "learning_rate": 7.665490623408308e-06, + "loss": 0.4989, + "step": 4556 + }, + { + "epoch": 1.2637271214642263, + "grad_norm": 0.2057606726884842, + "learning_rate": 7.660399148178727e-06, + "loss": 0.4998, + "step": 4557 + }, + { + "epoch": 1.2640044370493622, + "grad_norm": 0.18892644345760345, + "learning_rate": 7.655308617283493e-06, + "loss": 0.492, + "step": 4558 + }, + { + "epoch": 1.264281752634498, + "grad_norm": 0.1982104480266571, + "learning_rate": 7.650219031715906e-06, + "loss": 0.4747, + "step": 4559 + }, + { + "epoch": 1.264559068219634, + "grad_norm": 0.19365094602108002, + "learning_rate": 7.645130392469082e-06, + "loss": 0.491, + "step": 4560 + }, + { + "epoch": 1.2648363838047698, + "grad_norm": 0.1940053105354309, + "learning_rate": 7.640042700535944e-06, + "loss": 0.5134, + "step": 4561 + }, + { + "epoch": 1.2651136993899057, + "grad_norm": 0.19653920829296112, + "learning_rate": 7.634955956909234e-06, + "loss": 0.4746, + "step": 4562 + }, + { + "epoch": 1.2653910149750416, + "grad_norm": 0.19754603505134583, + "learning_rate": 7.629870162581516e-06, + "loss": 0.4839, + "step": 4563 + }, + { + "epoch": 1.2656683305601775, + "grad_norm": 0.1982675939798355, + "learning_rate": 7.62478531854516e-06, + "loss": 0.4898, + "step": 4564 + }, + { + "epoch": 1.2659456461453134, + "grad_norm": 0.20128501951694489, + "learning_rate": 7.61970142579236e-06, + "loss": 0.5171, + "step": 4565 + }, + { + "epoch": 1.2662229617304492, + "grad_norm": 0.20672714710235596, + "learning_rate": 7.6146184853151055e-06, + "loss": 0.51, + "step": 4566 + }, + { + "epoch": 1.2665002773155851, + "grad_norm": 0.1908549964427948, + "learning_rate": 7.609536498105224e-06, + "loss": 0.4792, + "step": 4567 + }, + { + "epoch": 1.266777592900721, + "grad_norm": 0.19964845478534698, + "learning_rate": 7.6044554651543424e-06, + "loss": 0.5005, + "step": 4568 + }, + { + "epoch": 1.267054908485857, + "grad_norm": 0.2022092193365097, + "learning_rate": 7.5993753874539015e-06, + "loss": 0.4815, + "step": 4569 + }, + { + "epoch": 1.2673322240709928, + "grad_norm": 0.20571796596050262, + "learning_rate": 7.594296265995164e-06, + "loss": 0.5162, + "step": 4570 + }, + { + "epoch": 1.2676095396561287, + "grad_norm": 0.2025289386510849, + "learning_rate": 7.589218101769202e-06, + "loss": 0.5145, + "step": 4571 + }, + { + "epoch": 1.2678868552412645, + "grad_norm": 0.18278613686561584, + "learning_rate": 7.584140895766895e-06, + "loss": 0.4959, + "step": 4572 + }, + { + "epoch": 1.2681641708264004, + "grad_norm": 0.20485979318618774, + "learning_rate": 7.579064648978939e-06, + "loss": 0.4931, + "step": 4573 + }, + { + "epoch": 1.2684414864115363, + "grad_norm": 0.20338453352451324, + "learning_rate": 7.5739893623958515e-06, + "loss": 0.501, + "step": 4574 + }, + { + "epoch": 1.2687188019966722, + "grad_norm": 0.1984669268131256, + "learning_rate": 7.5689150370079535e-06, + "loss": 0.5088, + "step": 4575 + }, + { + "epoch": 1.268996117581808, + "grad_norm": 0.19437134265899658, + "learning_rate": 7.563841673805372e-06, + "loss": 0.4721, + "step": 4576 + }, + { + "epoch": 1.269273433166944, + "grad_norm": 0.2029186487197876, + "learning_rate": 7.558769273778066e-06, + "loss": 0.5095, + "step": 4577 + }, + { + "epoch": 1.2695507487520798, + "grad_norm": 0.19555479288101196, + "learning_rate": 7.553697837915791e-06, + "loss": 0.4698, + "step": 4578 + }, + { + "epoch": 1.2698280643372157, + "grad_norm": 0.20628714561462402, + "learning_rate": 7.548627367208111e-06, + "loss": 0.4781, + "step": 4579 + }, + { + "epoch": 1.2701053799223516, + "grad_norm": 0.19298192858695984, + "learning_rate": 7.543557862644421e-06, + "loss": 0.4903, + "step": 4580 + }, + { + "epoch": 1.2703826955074875, + "grad_norm": 0.19302377104759216, + "learning_rate": 7.538489325213913e-06, + "loss": 0.5233, + "step": 4581 + }, + { + "epoch": 1.2706600110926234, + "grad_norm": 0.20301836729049683, + "learning_rate": 7.533421755905587e-06, + "loss": 0.5101, + "step": 4582 + }, + { + "epoch": 1.2709373266777593, + "grad_norm": 0.19993318617343903, + "learning_rate": 7.528355155708261e-06, + "loss": 0.4907, + "step": 4583 + }, + { + "epoch": 1.2712146422628952, + "grad_norm": 0.19807255268096924, + "learning_rate": 7.523289525610569e-06, + "loss": 0.4911, + "step": 4584 + }, + { + "epoch": 1.271491957848031, + "grad_norm": 0.20224188268184662, + "learning_rate": 7.518224866600945e-06, + "loss": 0.4961, + "step": 4585 + }, + { + "epoch": 1.271769273433167, + "grad_norm": 0.194349467754364, + "learning_rate": 7.513161179667636e-06, + "loss": 0.4848, + "step": 4586 + }, + { + "epoch": 1.2720465890183028, + "grad_norm": 0.19369667768478394, + "learning_rate": 7.508098465798707e-06, + "loss": 0.5079, + "step": 4587 + }, + { + "epoch": 1.2723239046034387, + "grad_norm": 0.19323889911174774, + "learning_rate": 7.50303672598203e-06, + "loss": 0.4986, + "step": 4588 + }, + { + "epoch": 1.2726012201885746, + "grad_norm": 0.19939887523651123, + "learning_rate": 7.4979759612052754e-06, + "loss": 0.5052, + "step": 4589 + }, + { + "epoch": 1.2728785357737105, + "grad_norm": 0.19758670032024384, + "learning_rate": 7.4929161724559355e-06, + "loss": 0.5028, + "step": 4590 + }, + { + "epoch": 1.2731558513588463, + "grad_norm": 0.19541890919208527, + "learning_rate": 7.487857360721312e-06, + "loss": 0.471, + "step": 4591 + }, + { + "epoch": 1.2734331669439822, + "grad_norm": 0.2024282068014145, + "learning_rate": 7.482799526988515e-06, + "loss": 0.5235, + "step": 4592 + }, + { + "epoch": 1.2737104825291181, + "grad_norm": 0.19754981994628906, + "learning_rate": 7.4777426722444505e-06, + "loss": 0.4934, + "step": 4593 + }, + { + "epoch": 1.273987798114254, + "grad_norm": 0.20427443087100983, + "learning_rate": 7.472686797475861e-06, + "loss": 0.4731, + "step": 4594 + }, + { + "epoch": 1.2742651136993899, + "grad_norm": 0.18958698213100433, + "learning_rate": 7.46763190366927e-06, + "loss": 0.4863, + "step": 4595 + }, + { + "epoch": 1.2745424292845258, + "grad_norm": 0.19229987263679504, + "learning_rate": 7.462577991811028e-06, + "loss": 0.4881, + "step": 4596 + }, + { + "epoch": 1.2748197448696617, + "grad_norm": 0.2041274905204773, + "learning_rate": 7.4575250628872745e-06, + "loss": 0.4933, + "step": 4597 + }, + { + "epoch": 1.2750970604547975, + "grad_norm": 0.1980563998222351, + "learning_rate": 7.452473117883989e-06, + "loss": 0.5078, + "step": 4598 + }, + { + "epoch": 1.2753743760399334, + "grad_norm": 0.19617661833763123, + "learning_rate": 7.4474221577869265e-06, + "loss": 0.5231, + "step": 4599 + }, + { + "epoch": 1.2756516916250693, + "grad_norm": 0.19393262267112732, + "learning_rate": 7.442372183581664e-06, + "loss": 0.4959, + "step": 4600 + }, + { + "epoch": 1.2759290072102052, + "grad_norm": 0.2189975529909134, + "learning_rate": 7.43732319625359e-06, + "loss": 0.501, + "step": 4601 + }, + { + "epoch": 1.276206322795341, + "grad_norm": 0.21371406316757202, + "learning_rate": 7.432275196787894e-06, + "loss": 0.5012, + "step": 4602 + }, + { + "epoch": 1.276483638380477, + "grad_norm": 0.20440706610679626, + "learning_rate": 7.427228186169575e-06, + "loss": 0.4698, + "step": 4603 + }, + { + "epoch": 1.2767609539656128, + "grad_norm": 0.19952230155467987, + "learning_rate": 7.422182165383434e-06, + "loss": 0.5198, + "step": 4604 + }, + { + "epoch": 1.2770382695507487, + "grad_norm": 0.20963895320892334, + "learning_rate": 7.417137135414088e-06, + "loss": 0.4953, + "step": 4605 + }, + { + "epoch": 1.2773155851358846, + "grad_norm": 0.210064098238945, + "learning_rate": 7.412093097245956e-06, + "loss": 0.5, + "step": 4606 + }, + { + "epoch": 1.2775929007210205, + "grad_norm": 0.20027229189872742, + "learning_rate": 7.4070500518632595e-06, + "loss": 0.499, + "step": 4607 + }, + { + "epoch": 1.2778702163061564, + "grad_norm": 0.19522671401500702, + "learning_rate": 7.4020080002500355e-06, + "loss": 0.4978, + "step": 4608 + }, + { + "epoch": 1.2781475318912923, + "grad_norm": 0.1936059147119522, + "learning_rate": 7.396966943390121e-06, + "loss": 0.4872, + "step": 4609 + }, + { + "epoch": 1.2784248474764282, + "grad_norm": 0.2015063762664795, + "learning_rate": 7.391926882267159e-06, + "loss": 0.5015, + "step": 4610 + }, + { + "epoch": 1.278702163061564, + "grad_norm": 0.20017270743846893, + "learning_rate": 7.386887817864592e-06, + "loss": 0.5063, + "step": 4611 + }, + { + "epoch": 1.2789794786467, + "grad_norm": 0.20706325769424438, + "learning_rate": 7.381849751165684e-06, + "loss": 0.5311, + "step": 4612 + }, + { + "epoch": 1.2792567942318358, + "grad_norm": 0.19370242953300476, + "learning_rate": 7.376812683153496e-06, + "loss": 0.4901, + "step": 4613 + }, + { + "epoch": 1.2795341098169717, + "grad_norm": 0.20481263101100922, + "learning_rate": 7.371776614810883e-06, + "loss": 0.5185, + "step": 4614 + }, + { + "epoch": 1.2798114254021076, + "grad_norm": 0.20902620255947113, + "learning_rate": 7.366741547120527e-06, + "loss": 0.5268, + "step": 4615 + }, + { + "epoch": 1.2800887409872435, + "grad_norm": 0.2147327959537506, + "learning_rate": 7.361707481064898e-06, + "loss": 0.5251, + "step": 4616 + }, + { + "epoch": 1.2803660565723793, + "grad_norm": 0.20069043338298798, + "learning_rate": 7.356674417626275e-06, + "loss": 0.5176, + "step": 4617 + }, + { + "epoch": 1.2806433721575152, + "grad_norm": 0.2038661241531372, + "learning_rate": 7.351642357786741e-06, + "loss": 0.5001, + "step": 4618 + }, + { + "epoch": 1.280920687742651, + "grad_norm": 0.20059525966644287, + "learning_rate": 7.34661130252819e-06, + "loss": 0.5081, + "step": 4619 + }, + { + "epoch": 1.281198003327787, + "grad_norm": 0.2055322527885437, + "learning_rate": 7.341581252832309e-06, + "loss": 0.5234, + "step": 4620 + }, + { + "epoch": 1.2814753189129229, + "grad_norm": 0.1974288672208786, + "learning_rate": 7.336552209680592e-06, + "loss": 0.4774, + "step": 4621 + }, + { + "epoch": 1.2817526344980588, + "grad_norm": 0.19128572940826416, + "learning_rate": 7.3315241740543455e-06, + "loss": 0.4954, + "step": 4622 + }, + { + "epoch": 1.2820299500831946, + "grad_norm": 0.3327876925468445, + "learning_rate": 7.326497146934669e-06, + "loss": 0.5184, + "step": 4623 + }, + { + "epoch": 1.2823072656683305, + "grad_norm": 0.19013381004333496, + "learning_rate": 7.3214711293024694e-06, + "loss": 0.4776, + "step": 4624 + }, + { + "epoch": 1.2825845812534664, + "grad_norm": 0.20403680205345154, + "learning_rate": 7.316446122138451e-06, + "loss": 0.522, + "step": 4625 + }, + { + "epoch": 1.2828618968386023, + "grad_norm": 0.19423431158065796, + "learning_rate": 7.311422126423131e-06, + "loss": 0.4802, + "step": 4626 + }, + { + "epoch": 1.2831392124237382, + "grad_norm": 0.20686981081962585, + "learning_rate": 7.306399143136825e-06, + "loss": 0.4908, + "step": 4627 + }, + { + "epoch": 1.283416528008874, + "grad_norm": 0.19685044884681702, + "learning_rate": 7.301377173259644e-06, + "loss": 0.4766, + "step": 4628 + }, + { + "epoch": 1.28369384359401, + "grad_norm": 0.20665518939495087, + "learning_rate": 7.296356217771515e-06, + "loss": 0.4733, + "step": 4629 + }, + { + "epoch": 1.2839711591791458, + "grad_norm": 0.20772451162338257, + "learning_rate": 7.291336277652158e-06, + "loss": 0.4944, + "step": 4630 + }, + { + "epoch": 1.2842484747642817, + "grad_norm": 0.20459213852882385, + "learning_rate": 7.286317353881094e-06, + "loss": 0.5046, + "step": 4631 + }, + { + "epoch": 1.2845257903494176, + "grad_norm": 0.198153555393219, + "learning_rate": 7.281299447437637e-06, + "loss": 0.5134, + "step": 4632 + }, + { + "epoch": 1.2848031059345535, + "grad_norm": 0.2004951536655426, + "learning_rate": 7.276282559300937e-06, + "loss": 0.4975, + "step": 4633 + }, + { + "epoch": 1.2850804215196894, + "grad_norm": 0.19988703727722168, + "learning_rate": 7.271266690449907e-06, + "loss": 0.5216, + "step": 4634 + }, + { + "epoch": 1.2853577371048253, + "grad_norm": 0.20041632652282715, + "learning_rate": 7.266251841863275e-06, + "loss": 0.4996, + "step": 4635 + }, + { + "epoch": 1.2856350526899611, + "grad_norm": 0.19913561642169952, + "learning_rate": 7.2612380145195735e-06, + "loss": 0.4907, + "step": 4636 + }, + { + "epoch": 1.285912368275097, + "grad_norm": 0.21043738722801208, + "learning_rate": 7.256225209397139e-06, + "loss": 0.5161, + "step": 4637 + }, + { + "epoch": 1.286189683860233, + "grad_norm": 0.2178940773010254, + "learning_rate": 7.2512134274740986e-06, + "loss": 0.5072, + "step": 4638 + }, + { + "epoch": 1.2864669994453688, + "grad_norm": 0.2040819674730301, + "learning_rate": 7.246202669728375e-06, + "loss": 0.522, + "step": 4639 + }, + { + "epoch": 1.2867443150305047, + "grad_norm": 0.20290911197662354, + "learning_rate": 7.241192937137708e-06, + "loss": 0.5249, + "step": 4640 + }, + { + "epoch": 1.2870216306156406, + "grad_norm": 0.2622935473918915, + "learning_rate": 7.2361842306796356e-06, + "loss": 0.5143, + "step": 4641 + }, + { + "epoch": 1.2872989462007765, + "grad_norm": 0.20093023777008057, + "learning_rate": 7.231176551331476e-06, + "loss": 0.5218, + "step": 4642 + }, + { + "epoch": 1.2875762617859123, + "grad_norm": 0.19977106153964996, + "learning_rate": 7.226169900070365e-06, + "loss": 0.5273, + "step": 4643 + }, + { + "epoch": 1.2878535773710482, + "grad_norm": 0.20094482600688934, + "learning_rate": 7.221164277873238e-06, + "loss": 0.5079, + "step": 4644 + }, + { + "epoch": 1.288130892956184, + "grad_norm": 0.2162494659423828, + "learning_rate": 7.216159685716817e-06, + "loss": 0.4924, + "step": 4645 + }, + { + "epoch": 1.28840820854132, + "grad_norm": 0.19907453656196594, + "learning_rate": 7.211156124577639e-06, + "loss": 0.4912, + "step": 4646 + }, + { + "epoch": 1.2886855241264559, + "grad_norm": 0.20266936719417572, + "learning_rate": 7.206153595432022e-06, + "loss": 0.4964, + "step": 4647 + }, + { + "epoch": 1.2889628397115918, + "grad_norm": 0.19707994163036346, + "learning_rate": 7.2011520992561e-06, + "loss": 0.5026, + "step": 4648 + }, + { + "epoch": 1.2892401552967276, + "grad_norm": 0.1953379213809967, + "learning_rate": 7.196151637025788e-06, + "loss": 0.5141, + "step": 4649 + }, + { + "epoch": 1.2895174708818635, + "grad_norm": 0.1917579025030136, + "learning_rate": 7.191152209716822e-06, + "loss": 0.4888, + "step": 4650 + }, + { + "epoch": 1.2897947864669994, + "grad_norm": 0.20420075953006744, + "learning_rate": 7.186153818304708e-06, + "loss": 0.5009, + "step": 4651 + }, + { + "epoch": 1.2900721020521353, + "grad_norm": 0.19985781610012054, + "learning_rate": 7.1811564637647734e-06, + "loss": 0.5, + "step": 4652 + }, + { + "epoch": 1.2903494176372712, + "grad_norm": 0.2002001255750656, + "learning_rate": 7.176160147072138e-06, + "loss": 0.5068, + "step": 4653 + }, + { + "epoch": 1.290626733222407, + "grad_norm": 0.2185979187488556, + "learning_rate": 7.171164869201709e-06, + "loss": 0.5064, + "step": 4654 + }, + { + "epoch": 1.290904048807543, + "grad_norm": 0.20650553703308105, + "learning_rate": 7.166170631128194e-06, + "loss": 0.5152, + "step": 4655 + }, + { + "epoch": 1.2911813643926788, + "grad_norm": 0.20761191844940186, + "learning_rate": 7.161177433826108e-06, + "loss": 0.4963, + "step": 4656 + }, + { + "epoch": 1.2914586799778147, + "grad_norm": 0.20723947882652283, + "learning_rate": 7.156185278269756e-06, + "loss": 0.5188, + "step": 4657 + }, + { + "epoch": 1.2917359955629506, + "grad_norm": 0.21793848276138306, + "learning_rate": 7.151194165433234e-06, + "loss": 0.4931, + "step": 4658 + }, + { + "epoch": 1.2920133111480865, + "grad_norm": 0.19249577820301056, + "learning_rate": 7.146204096290446e-06, + "loss": 0.5235, + "step": 4659 + }, + { + "epoch": 1.2922906267332224, + "grad_norm": 0.20929144322872162, + "learning_rate": 7.1412150718150884e-06, + "loss": 0.5141, + "step": 4660 + }, + { + "epoch": 1.2925679423183583, + "grad_norm": 0.3471876382827759, + "learning_rate": 7.136227092980649e-06, + "loss": 0.5095, + "step": 4661 + }, + { + "epoch": 1.2928452579034941, + "grad_norm": 0.1970401108264923, + "learning_rate": 7.131240160760408e-06, + "loss": 0.4993, + "step": 4662 + }, + { + "epoch": 1.29312257348863, + "grad_norm": 0.2073030173778534, + "learning_rate": 7.126254276127456e-06, + "loss": 0.5111, + "step": 4663 + }, + { + "epoch": 1.293399889073766, + "grad_norm": 0.19547441601753235, + "learning_rate": 7.1212694400546734e-06, + "loss": 0.5045, + "step": 4664 + }, + { + "epoch": 1.2936772046589018, + "grad_norm": 0.1951918601989746, + "learning_rate": 7.116285653514729e-06, + "loss": 0.4969, + "step": 4665 + }, + { + "epoch": 1.2939545202440377, + "grad_norm": 0.1991628259420395, + "learning_rate": 7.111302917480089e-06, + "loss": 0.4958, + "step": 4666 + }, + { + "epoch": 1.2942318358291736, + "grad_norm": 0.19393709301948547, + "learning_rate": 7.10632123292302e-06, + "loss": 0.4918, + "step": 4667 + }, + { + "epoch": 1.2945091514143094, + "grad_norm": 0.19460806250572205, + "learning_rate": 7.101340600815587e-06, + "loss": 0.4921, + "step": 4668 + }, + { + "epoch": 1.2947864669994453, + "grad_norm": 0.1952556073665619, + "learning_rate": 7.096361022129637e-06, + "loss": 0.4937, + "step": 4669 + }, + { + "epoch": 1.2950637825845812, + "grad_norm": 0.20183762907981873, + "learning_rate": 7.0913824978368075e-06, + "loss": 0.4777, + "step": 4670 + }, + { + "epoch": 1.295341098169717, + "grad_norm": 0.2849336564540863, + "learning_rate": 7.086405028908563e-06, + "loss": 0.4967, + "step": 4671 + }, + { + "epoch": 1.295618413754853, + "grad_norm": 0.19828931987285614, + "learning_rate": 7.081428616316127e-06, + "loss": 0.523, + "step": 4672 + }, + { + "epoch": 1.2958957293399889, + "grad_norm": 0.20423446595668793, + "learning_rate": 7.076453261030524e-06, + "loss": 0.4865, + "step": 4673 + }, + { + "epoch": 1.2961730449251248, + "grad_norm": 0.18961788713932037, + "learning_rate": 7.0714789640225865e-06, + "loss": 0.4901, + "step": 4674 + }, + { + "epoch": 1.2964503605102606, + "grad_norm": 0.2092195451259613, + "learning_rate": 7.0665057262629316e-06, + "loss": 0.5204, + "step": 4675 + }, + { + "epoch": 1.2967276760953965, + "grad_norm": 0.2078777551651001, + "learning_rate": 7.061533548721969e-06, + "loss": 0.5013, + "step": 4676 + }, + { + "epoch": 1.2970049916805324, + "grad_norm": 0.2117914855480194, + "learning_rate": 7.0565624323698955e-06, + "loss": 0.5268, + "step": 4677 + }, + { + "epoch": 1.2972823072656683, + "grad_norm": 0.19487378001213074, + "learning_rate": 7.051592378176711e-06, + "loss": 0.5368, + "step": 4678 + }, + { + "epoch": 1.2975596228508042, + "grad_norm": 0.19526413083076477, + "learning_rate": 7.046623387112212e-06, + "loss": 0.502, + "step": 4679 + }, + { + "epoch": 1.29783693843594, + "grad_norm": 0.19563087821006775, + "learning_rate": 7.041655460145971e-06, + "loss": 0.5248, + "step": 4680 + }, + { + "epoch": 1.298114254021076, + "grad_norm": 0.1995537430047989, + "learning_rate": 7.0366885982473635e-06, + "loss": 0.5115, + "step": 4681 + }, + { + "epoch": 1.2983915696062118, + "grad_norm": 0.20692096650600433, + "learning_rate": 7.0317228023855654e-06, + "loss": 0.4994, + "step": 4682 + }, + { + "epoch": 1.2986688851913477, + "grad_norm": 0.20720021426677704, + "learning_rate": 7.026758073529527e-06, + "loss": 0.4766, + "step": 4683 + }, + { + "epoch": 1.2989462007764836, + "grad_norm": 0.20226465165615082, + "learning_rate": 7.021794412647993e-06, + "loss": 0.5035, + "step": 4684 + }, + { + "epoch": 1.2992235163616195, + "grad_norm": 0.21326394379138947, + "learning_rate": 7.016831820709513e-06, + "loss": 0.4912, + "step": 4685 + }, + { + "epoch": 1.2995008319467554, + "grad_norm": 0.23130756616592407, + "learning_rate": 7.0118702986824225e-06, + "loss": 0.5091, + "step": 4686 + }, + { + "epoch": 1.2997781475318912, + "grad_norm": 0.19730432331562042, + "learning_rate": 7.006909847534837e-06, + "loss": 0.5001, + "step": 4687 + }, + { + "epoch": 1.3000554631170271, + "grad_norm": 0.21078179776668549, + "learning_rate": 7.0019504682346835e-06, + "loss": 0.4988, + "step": 4688 + }, + { + "epoch": 1.300332778702163, + "grad_norm": 0.2023860216140747, + "learning_rate": 6.996992161749656e-06, + "loss": 0.4996, + "step": 4689 + }, + { + "epoch": 1.300610094287299, + "grad_norm": 0.19739584624767303, + "learning_rate": 6.992034929047261e-06, + "loss": 0.4843, + "step": 4690 + }, + { + "epoch": 1.3008874098724348, + "grad_norm": 0.2066974639892578, + "learning_rate": 6.987078771094779e-06, + "loss": 0.5207, + "step": 4691 + }, + { + "epoch": 1.3011647254575707, + "grad_norm": 0.20626220107078552, + "learning_rate": 6.982123688859295e-06, + "loss": 0.5055, + "step": 4692 + }, + { + "epoch": 1.3014420410427066, + "grad_norm": 0.19193756580352783, + "learning_rate": 6.977169683307667e-06, + "loss": 0.5372, + "step": 4693 + }, + { + "epoch": 1.3017193566278424, + "grad_norm": 0.19858404994010925, + "learning_rate": 6.972216755406559e-06, + "loss": 0.4943, + "step": 4694 + }, + { + "epoch": 1.3019966722129783, + "grad_norm": 0.19214889407157898, + "learning_rate": 6.967264906122422e-06, + "loss": 0.4917, + "step": 4695 + }, + { + "epoch": 1.3022739877981142, + "grad_norm": 0.24965661764144897, + "learning_rate": 6.962314136421485e-06, + "loss": 0.5006, + "step": 4696 + }, + { + "epoch": 1.30255130338325, + "grad_norm": 0.20609916746616364, + "learning_rate": 6.957364447269785e-06, + "loss": 0.4954, + "step": 4697 + }, + { + "epoch": 1.302828618968386, + "grad_norm": 0.18501780927181244, + "learning_rate": 6.9524158396331225e-06, + "loss": 0.4895, + "step": 4698 + }, + { + "epoch": 1.3031059345535219, + "grad_norm": 0.19009456038475037, + "learning_rate": 6.947468314477115e-06, + "loss": 0.4812, + "step": 4699 + }, + { + "epoch": 1.3033832501386577, + "grad_norm": 0.20104162395000458, + "learning_rate": 6.942521872767148e-06, + "loss": 0.5101, + "step": 4700 + }, + { + "epoch": 1.3036605657237936, + "grad_norm": 0.21302615106105804, + "learning_rate": 6.937576515468405e-06, + "loss": 0.4959, + "step": 4701 + }, + { + "epoch": 1.3039378813089295, + "grad_norm": 0.19924134016036987, + "learning_rate": 6.932632243545864e-06, + "loss": 0.5155, + "step": 4702 + }, + { + "epoch": 1.3042151968940654, + "grad_norm": 0.2015244960784912, + "learning_rate": 6.927689057964274e-06, + "loss": 0.5192, + "step": 4703 + }, + { + "epoch": 1.3044925124792013, + "grad_norm": 0.21109920740127563, + "learning_rate": 6.9227469596881825e-06, + "loss": 0.5109, + "step": 4704 + }, + { + "epoch": 1.3047698280643372, + "grad_norm": 0.19857889413833618, + "learning_rate": 6.9178059496819246e-06, + "loss": 0.4955, + "step": 4705 + }, + { + "epoch": 1.305047143649473, + "grad_norm": 0.20074202120304108, + "learning_rate": 6.912866028909627e-06, + "loss": 0.4898, + "step": 4706 + }, + { + "epoch": 1.305324459234609, + "grad_norm": 0.20996864140033722, + "learning_rate": 6.907927198335197e-06, + "loss": 0.5067, + "step": 4707 + }, + { + "epoch": 1.3056017748197448, + "grad_norm": 0.20864242315292358, + "learning_rate": 6.902989458922319e-06, + "loss": 0.5017, + "step": 4708 + }, + { + "epoch": 1.3058790904048807, + "grad_norm": 0.2010875642299652, + "learning_rate": 6.898052811634498e-06, + "loss": 0.5122, + "step": 4709 + }, + { + "epoch": 1.3061564059900166, + "grad_norm": 0.1969454139471054, + "learning_rate": 6.893117257434994e-06, + "loss": 0.5032, + "step": 4710 + }, + { + "epoch": 1.3064337215751525, + "grad_norm": 0.2069738507270813, + "learning_rate": 6.88818279728686e-06, + "loss": 0.501, + "step": 4711 + }, + { + "epoch": 1.3067110371602884, + "grad_norm": 0.1900450736284256, + "learning_rate": 6.883249432152944e-06, + "loss": 0.5205, + "step": 4712 + }, + { + "epoch": 1.3069883527454242, + "grad_norm": 0.22067134082317352, + "learning_rate": 6.878317162995881e-06, + "loss": 0.4896, + "step": 4713 + }, + { + "epoch": 1.3072656683305601, + "grad_norm": 0.20373979210853577, + "learning_rate": 6.8733859907780865e-06, + "loss": 0.5217, + "step": 4714 + }, + { + "epoch": 1.307542983915696, + "grad_norm": 0.20570501685142517, + "learning_rate": 6.8684559164617525e-06, + "loss": 0.5074, + "step": 4715 + }, + { + "epoch": 1.307820299500832, + "grad_norm": 0.2102414071559906, + "learning_rate": 6.8635269410088725e-06, + "loss": 0.5142, + "step": 4716 + }, + { + "epoch": 1.3080976150859678, + "grad_norm": 0.20017775893211365, + "learning_rate": 6.8585990653812285e-06, + "loss": 0.5078, + "step": 4717 + }, + { + "epoch": 1.3083749306711037, + "grad_norm": 0.22755825519561768, + "learning_rate": 6.8536722905403666e-06, + "loss": 0.5103, + "step": 4718 + }, + { + "epoch": 1.3086522462562395, + "grad_norm": 0.20453383028507233, + "learning_rate": 6.848746617447644e-06, + "loss": 0.4962, + "step": 4719 + }, + { + "epoch": 1.3089295618413754, + "grad_norm": 0.21632753312587738, + "learning_rate": 6.8438220470641785e-06, + "loss": 0.5031, + "step": 4720 + }, + { + "epoch": 1.3092068774265113, + "grad_norm": 0.19868405163288116, + "learning_rate": 6.838898580350895e-06, + "loss": 0.4784, + "step": 4721 + }, + { + "epoch": 1.3094841930116472, + "grad_norm": 0.20900887250900269, + "learning_rate": 6.833976218268478e-06, + "loss": 0.5007, + "step": 4722 + }, + { + "epoch": 1.309761508596783, + "grad_norm": 0.20499762892723083, + "learning_rate": 6.829054961777423e-06, + "loss": 0.5004, + "step": 4723 + }, + { + "epoch": 1.310038824181919, + "grad_norm": 0.19857996702194214, + "learning_rate": 6.8241348118379966e-06, + "loss": 0.5087, + "step": 4724 + }, + { + "epoch": 1.3103161397670549, + "grad_norm": 0.1966305822134018, + "learning_rate": 6.819215769410243e-06, + "loss": 0.4808, + "step": 4725 + }, + { + "epoch": 1.3105934553521907, + "grad_norm": 0.2024124413728714, + "learning_rate": 6.814297835454009e-06, + "loss": 0.4972, + "step": 4726 + }, + { + "epoch": 1.3108707709373266, + "grad_norm": 0.21003521978855133, + "learning_rate": 6.8093810109289e-06, + "loss": 0.4985, + "step": 4727 + }, + { + "epoch": 1.3111480865224625, + "grad_norm": 0.19540292024612427, + "learning_rate": 6.804465296794332e-06, + "loss": 0.4898, + "step": 4728 + }, + { + "epoch": 1.3114254021075984, + "grad_norm": 0.2004072666168213, + "learning_rate": 6.799550694009479e-06, + "loss": 0.4926, + "step": 4729 + }, + { + "epoch": 1.3117027176927343, + "grad_norm": 0.19552730023860931, + "learning_rate": 6.794637203533321e-06, + "loss": 0.4841, + "step": 4730 + }, + { + "epoch": 1.3119800332778702, + "grad_norm": 0.20091231167316437, + "learning_rate": 6.789724826324602e-06, + "loss": 0.4953, + "step": 4731 + }, + { + "epoch": 1.312257348863006, + "grad_norm": 0.20696038007736206, + "learning_rate": 6.78481356334186e-06, + "loss": 0.5117, + "step": 4732 + }, + { + "epoch": 1.312534664448142, + "grad_norm": 0.19968093931674957, + "learning_rate": 6.779903415543418e-06, + "loss": 0.4942, + "step": 4733 + }, + { + "epoch": 1.3128119800332778, + "grad_norm": 0.19521543383598328, + "learning_rate": 6.7749943838873636e-06, + "loss": 0.4901, + "step": 4734 + }, + { + "epoch": 1.3130892956184137, + "grad_norm": 0.24511419236660004, + "learning_rate": 6.770086469331592e-06, + "loss": 0.5023, + "step": 4735 + }, + { + "epoch": 1.3133666112035496, + "grad_norm": 0.29729798436164856, + "learning_rate": 6.765179672833757e-06, + "loss": 0.4767, + "step": 4736 + }, + { + "epoch": 1.3136439267886855, + "grad_norm": 0.19325131177902222, + "learning_rate": 6.760273995351313e-06, + "loss": 0.4878, + "step": 4737 + }, + { + "epoch": 1.3139212423738214, + "grad_norm": 0.1989395171403885, + "learning_rate": 6.75536943784148e-06, + "loss": 0.4873, + "step": 4738 + }, + { + "epoch": 1.3141985579589572, + "grad_norm": 0.20601066946983337, + "learning_rate": 6.750466001261271e-06, + "loss": 0.4805, + "step": 4739 + }, + { + "epoch": 1.3144758735440931, + "grad_norm": 0.19977515935897827, + "learning_rate": 6.74556368656748e-06, + "loss": 0.5138, + "step": 4740 + }, + { + "epoch": 1.314753189129229, + "grad_norm": 0.19737781584262848, + "learning_rate": 6.740662494716675e-06, + "loss": 0.4954, + "step": 4741 + }, + { + "epoch": 1.315030504714365, + "grad_norm": 0.19452109932899475, + "learning_rate": 6.7357624266652044e-06, + "loss": 0.4723, + "step": 4742 + }, + { + "epoch": 1.3153078202995008, + "grad_norm": 0.19652196764945984, + "learning_rate": 6.730863483369203e-06, + "loss": 0.483, + "step": 4743 + }, + { + "epoch": 1.3155851358846367, + "grad_norm": 0.21024899184703827, + "learning_rate": 6.725965665784592e-06, + "loss": 0.5212, + "step": 4744 + }, + { + "epoch": 1.3158624514697725, + "grad_norm": 0.20697370171546936, + "learning_rate": 6.721068974867059e-06, + "loss": 0.4693, + "step": 4745 + }, + { + "epoch": 1.3161397670549084, + "grad_norm": 0.20207835733890533, + "learning_rate": 6.71617341157207e-06, + "loss": 0.5116, + "step": 4746 + }, + { + "epoch": 1.3164170826400443, + "grad_norm": 0.19856862723827362, + "learning_rate": 6.711278976854898e-06, + "loss": 0.5203, + "step": 4747 + }, + { + "epoch": 1.3166943982251802, + "grad_norm": 0.20049835741519928, + "learning_rate": 6.706385671670566e-06, + "loss": 0.485, + "step": 4748 + }, + { + "epoch": 1.316971713810316, + "grad_norm": 0.21266470849514008, + "learning_rate": 6.701493496973885e-06, + "loss": 0.5069, + "step": 4749 + }, + { + "epoch": 1.317249029395452, + "grad_norm": 0.20254302024841309, + "learning_rate": 6.69660245371945e-06, + "loss": 0.5294, + "step": 4750 + }, + { + "epoch": 1.3175263449805878, + "grad_norm": 0.19671931862831116, + "learning_rate": 6.691712542861639e-06, + "loss": 0.4901, + "step": 4751 + }, + { + "epoch": 1.3178036605657237, + "grad_norm": 0.2424955815076828, + "learning_rate": 6.686823765354599e-06, + "loss": 0.518, + "step": 4752 + }, + { + "epoch": 1.3180809761508596, + "grad_norm": 0.2099234014749527, + "learning_rate": 6.681936122152255e-06, + "loss": 0.5131, + "step": 4753 + }, + { + "epoch": 1.3183582917359955, + "grad_norm": 0.1978161633014679, + "learning_rate": 6.67704961420832e-06, + "loss": 0.4835, + "step": 4754 + }, + { + "epoch": 1.3186356073211314, + "grad_norm": 0.197959765791893, + "learning_rate": 6.6721642424762866e-06, + "loss": 0.4862, + "step": 4755 + }, + { + "epoch": 1.3189129229062673, + "grad_norm": 0.19919045269489288, + "learning_rate": 6.667280007909416e-06, + "loss": 0.4858, + "step": 4756 + }, + { + "epoch": 1.3191902384914032, + "grad_norm": 0.2116900086402893, + "learning_rate": 6.662396911460745e-06, + "loss": 0.5011, + "step": 4757 + }, + { + "epoch": 1.319467554076539, + "grad_norm": 0.20922990143299103, + "learning_rate": 6.657514954083099e-06, + "loss": 0.5122, + "step": 4758 + }, + { + "epoch": 1.319744869661675, + "grad_norm": 0.24156232178211212, + "learning_rate": 6.652634136729086e-06, + "loss": 0.4908, + "step": 4759 + }, + { + "epoch": 1.3200221852468108, + "grad_norm": 0.1963520497083664, + "learning_rate": 6.647754460351072e-06, + "loss": 0.5212, + "step": 4760 + }, + { + "epoch": 1.3202995008319467, + "grad_norm": 0.20835274457931519, + "learning_rate": 6.642875925901213e-06, + "loss": 0.5094, + "step": 4761 + }, + { + "epoch": 1.3205768164170826, + "grad_norm": 0.18857994675636292, + "learning_rate": 6.63799853433145e-06, + "loss": 0.4601, + "step": 4762 + }, + { + "epoch": 1.3208541320022185, + "grad_norm": 0.20583271980285645, + "learning_rate": 6.633122286593481e-06, + "loss": 0.5022, + "step": 4763 + }, + { + "epoch": 1.3211314475873543, + "grad_norm": 0.21066723763942719, + "learning_rate": 6.628247183638789e-06, + "loss": 0.4997, + "step": 4764 + }, + { + "epoch": 1.3214087631724902, + "grad_norm": 0.19231611490249634, + "learning_rate": 6.623373226418642e-06, + "loss": 0.4751, + "step": 4765 + }, + { + "epoch": 1.3216860787576261, + "grad_norm": 0.1879042237997055, + "learning_rate": 6.618500415884083e-06, + "loss": 0.4813, + "step": 4766 + }, + { + "epoch": 1.321963394342762, + "grad_norm": 0.20243264734745026, + "learning_rate": 6.613628752985912e-06, + "loss": 0.4951, + "step": 4767 + }, + { + "epoch": 1.3222407099278979, + "grad_norm": 0.20071490108966827, + "learning_rate": 6.608758238674733e-06, + "loss": 0.4892, + "step": 4768 + }, + { + "epoch": 1.3225180255130338, + "grad_norm": 0.213031604886055, + "learning_rate": 6.603888873900905e-06, + "loss": 0.5118, + "step": 4769 + }, + { + "epoch": 1.3227953410981697, + "grad_norm": 0.2158973515033722, + "learning_rate": 6.599020659614572e-06, + "loss": 0.4931, + "step": 4770 + }, + { + "epoch": 1.3230726566833055, + "grad_norm": 0.20497101545333862, + "learning_rate": 6.594153596765655e-06, + "loss": 0.5167, + "step": 4771 + }, + { + "epoch": 1.3233499722684414, + "grad_norm": 0.19923308491706848, + "learning_rate": 6.5892876863038385e-06, + "loss": 0.477, + "step": 4772 + }, + { + "epoch": 1.3236272878535773, + "grad_norm": 0.21297168731689453, + "learning_rate": 6.584422929178602e-06, + "loss": 0.5255, + "step": 4773 + }, + { + "epoch": 1.3239046034387132, + "grad_norm": 0.20591717958450317, + "learning_rate": 6.579559326339177e-06, + "loss": 0.5326, + "step": 4774 + }, + { + "epoch": 1.324181919023849, + "grad_norm": 0.20993812382221222, + "learning_rate": 6.574696878734592e-06, + "loss": 0.4997, + "step": 4775 + }, + { + "epoch": 1.324459234608985, + "grad_norm": 0.20869500935077667, + "learning_rate": 6.569835587313627e-06, + "loss": 0.5205, + "step": 4776 + }, + { + "epoch": 1.3247365501941208, + "grad_norm": 0.19962731003761292, + "learning_rate": 6.5649754530248575e-06, + "loss": 0.4994, + "step": 4777 + }, + { + "epoch": 1.3250138657792567, + "grad_norm": 0.20245380699634552, + "learning_rate": 6.560116476816627e-06, + "loss": 0.5062, + "step": 4778 + }, + { + "epoch": 1.3252911813643926, + "grad_norm": 0.20509305596351624, + "learning_rate": 6.5552586596370465e-06, + "loss": 0.501, + "step": 4779 + }, + { + "epoch": 1.3255684969495285, + "grad_norm": 0.19942280650138855, + "learning_rate": 6.5504020024340005e-06, + "loss": 0.4664, + "step": 4780 + }, + { + "epoch": 1.3258458125346644, + "grad_norm": 0.2001029998064041, + "learning_rate": 6.545546506155154e-06, + "loss": 0.4858, + "step": 4781 + }, + { + "epoch": 1.3261231281198003, + "grad_norm": 0.19130992889404297, + "learning_rate": 6.5406921717479474e-06, + "loss": 0.5081, + "step": 4782 + }, + { + "epoch": 1.3264004437049361, + "grad_norm": 0.20136502385139465, + "learning_rate": 6.53583900015959e-06, + "loss": 0.4889, + "step": 4783 + }, + { + "epoch": 1.326677759290072, + "grad_norm": 0.20278695225715637, + "learning_rate": 6.53098699233705e-06, + "loss": 0.5011, + "step": 4784 + }, + { + "epoch": 1.326955074875208, + "grad_norm": 0.1966562420129776, + "learning_rate": 6.5261361492271054e-06, + "loss": 0.4887, + "step": 4785 + }, + { + "epoch": 1.3272323904603438, + "grad_norm": 0.20651812851428986, + "learning_rate": 6.5212864717762696e-06, + "loss": 0.5006, + "step": 4786 + }, + { + "epoch": 1.3275097060454797, + "grad_norm": 0.210739865899086, + "learning_rate": 6.516437960930843e-06, + "loss": 0.5149, + "step": 4787 + }, + { + "epoch": 1.3277870216306156, + "grad_norm": 0.21981686353683472, + "learning_rate": 6.5115906176369025e-06, + "loss": 0.5234, + "step": 4788 + }, + { + "epoch": 1.3280643372157515, + "grad_norm": 0.19809675216674805, + "learning_rate": 6.506744442840296e-06, + "loss": 0.5058, + "step": 4789 + }, + { + "epoch": 1.3283416528008873, + "grad_norm": 0.19718238711357117, + "learning_rate": 6.501899437486637e-06, + "loss": 0.5016, + "step": 4790 + }, + { + "epoch": 1.3286189683860232, + "grad_norm": 0.19665461778640747, + "learning_rate": 6.4970556025213095e-06, + "loss": 0.4995, + "step": 4791 + }, + { + "epoch": 1.328896283971159, + "grad_norm": 0.1910347193479538, + "learning_rate": 6.492212938889481e-06, + "loss": 0.4835, + "step": 4792 + }, + { + "epoch": 1.329173599556295, + "grad_norm": 0.1989067643880844, + "learning_rate": 6.487371447536084e-06, + "loss": 0.4774, + "step": 4793 + }, + { + "epoch": 1.3294509151414309, + "grad_norm": 0.1986207813024521, + "learning_rate": 6.482531129405819e-06, + "loss": 0.5111, + "step": 4794 + }, + { + "epoch": 1.3297282307265668, + "grad_norm": 0.20336773991584778, + "learning_rate": 6.477691985443157e-06, + "loss": 0.5037, + "step": 4795 + }, + { + "epoch": 1.3300055463117026, + "grad_norm": 0.19637836515903473, + "learning_rate": 6.472854016592346e-06, + "loss": 0.5016, + "step": 4796 + }, + { + "epoch": 1.3302828618968385, + "grad_norm": 0.2004927545785904, + "learning_rate": 6.468017223797407e-06, + "loss": 0.509, + "step": 4797 + }, + { + "epoch": 1.3305601774819744, + "grad_norm": 0.2119477391242981, + "learning_rate": 6.463181608002118e-06, + "loss": 0.5365, + "step": 4798 + }, + { + "epoch": 1.3308374930671103, + "grad_norm": 0.20132261514663696, + "learning_rate": 6.4583471701500395e-06, + "loss": 0.5112, + "step": 4799 + }, + { + "epoch": 1.3311148086522462, + "grad_norm": 0.2103944718837738, + "learning_rate": 6.453513911184503e-06, + "loss": 0.4846, + "step": 4800 + }, + { + "epoch": 1.331392124237382, + "grad_norm": 0.18914499878883362, + "learning_rate": 6.448681832048603e-06, + "loss": 0.4821, + "step": 4801 + }, + { + "epoch": 1.331669439822518, + "grad_norm": 0.19525542855262756, + "learning_rate": 6.443850933685197e-06, + "loss": 0.5018, + "step": 4802 + }, + { + "epoch": 1.3319467554076538, + "grad_norm": 0.19284358620643616, + "learning_rate": 6.4390212170369305e-06, + "loss": 0.478, + "step": 4803 + }, + { + "epoch": 1.3322240709927897, + "grad_norm": 0.20030631124973297, + "learning_rate": 6.43419268304621e-06, + "loss": 0.5084, + "step": 4804 + }, + { + "epoch": 1.3325013865779256, + "grad_norm": 0.19547629356384277, + "learning_rate": 6.429365332655204e-06, + "loss": 0.4884, + "step": 4805 + }, + { + "epoch": 1.3327787021630615, + "grad_norm": 0.20050625503063202, + "learning_rate": 6.4245391668058655e-06, + "loss": 0.526, + "step": 4806 + }, + { + "epoch": 1.3330560177481974, + "grad_norm": 0.24080616235733032, + "learning_rate": 6.419714186439896e-06, + "loss": 0.4912, + "step": 4807 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.2031663954257965, + "learning_rate": 6.414890392498787e-06, + "loss": 0.5048, + "step": 4808 + }, + { + "epoch": 1.3336106489184691, + "grad_norm": 0.20676189661026, + "learning_rate": 6.410067785923779e-06, + "loss": 0.4905, + "step": 4809 + }, + { + "epoch": 1.333887964503605, + "grad_norm": 0.19812701642513275, + "learning_rate": 6.405246367655903e-06, + "loss": 0.5076, + "step": 4810 + }, + { + "epoch": 1.334165280088741, + "grad_norm": 0.20559370517730713, + "learning_rate": 6.4004261386359315e-06, + "loss": 0.4868, + "step": 4811 + }, + { + "epoch": 1.3344425956738768, + "grad_norm": 0.21239563822746277, + "learning_rate": 6.395607099804426e-06, + "loss": 0.4941, + "step": 4812 + }, + { + "epoch": 1.3347199112590127, + "grad_norm": 0.20791207253932953, + "learning_rate": 6.390789252101713e-06, + "loss": 0.5058, + "step": 4813 + }, + { + "epoch": 1.3349972268441486, + "grad_norm": 0.20702631771564484, + "learning_rate": 6.3859725964678735e-06, + "loss": 0.5146, + "step": 4814 + }, + { + "epoch": 1.3352745424292845, + "grad_norm": 0.1996176540851593, + "learning_rate": 6.381157133842772e-06, + "loss": 0.4976, + "step": 4815 + }, + { + "epoch": 1.3355518580144203, + "grad_norm": 0.20184668898582458, + "learning_rate": 6.376342865166024e-06, + "loss": 0.4912, + "step": 4816 + }, + { + "epoch": 1.3358291735995562, + "grad_norm": 0.20000986754894257, + "learning_rate": 6.371529791377031e-06, + "loss": 0.5215, + "step": 4817 + }, + { + "epoch": 1.336106489184692, + "grad_norm": 0.1989196389913559, + "learning_rate": 6.366717913414943e-06, + "loss": 0.4972, + "step": 4818 + }, + { + "epoch": 1.336383804769828, + "grad_norm": 0.19104993343353271, + "learning_rate": 6.361907232218689e-06, + "loss": 0.464, + "step": 4819 + }, + { + "epoch": 1.3366611203549639, + "grad_norm": 0.20721067488193512, + "learning_rate": 6.357097748726965e-06, + "loss": 0.5046, + "step": 4820 + }, + { + "epoch": 1.3369384359400998, + "grad_norm": 0.2302693873643875, + "learning_rate": 6.3522894638782204e-06, + "loss": 0.4778, + "step": 4821 + }, + { + "epoch": 1.3372157515252356, + "grad_norm": 0.20015782117843628, + "learning_rate": 6.347482378610678e-06, + "loss": 0.5031, + "step": 4822 + }, + { + "epoch": 1.3374930671103715, + "grad_norm": 0.19521984457969666, + "learning_rate": 6.342676493862332e-06, + "loss": 0.4701, + "step": 4823 + }, + { + "epoch": 1.3377703826955074, + "grad_norm": 0.20052647590637207, + "learning_rate": 6.337871810570943e-06, + "loss": 0.498, + "step": 4824 + }, + { + "epoch": 1.3380476982806433, + "grad_norm": 0.20175038278102875, + "learning_rate": 6.333068329674021e-06, + "loss": 0.5112, + "step": 4825 + }, + { + "epoch": 1.3383250138657792, + "grad_norm": 0.21088631451129913, + "learning_rate": 6.328266052108856e-06, + "loss": 0.5264, + "step": 4826 + }, + { + "epoch": 1.338602329450915, + "grad_norm": 0.20293080806732178, + "learning_rate": 6.323464978812507e-06, + "loss": 0.47, + "step": 4827 + }, + { + "epoch": 1.338879645036051, + "grad_norm": 0.22624416649341583, + "learning_rate": 6.318665110721786e-06, + "loss": 0.5097, + "step": 4828 + }, + { + "epoch": 1.3391569606211868, + "grad_norm": 0.20791591703891754, + "learning_rate": 6.3138664487732675e-06, + "loss": 0.4856, + "step": 4829 + }, + { + "epoch": 1.3394342762063227, + "grad_norm": 0.20461194217205048, + "learning_rate": 6.309068993903303e-06, + "loss": 0.4912, + "step": 4830 + }, + { + "epoch": 1.3397115917914586, + "grad_norm": 0.2024698108434677, + "learning_rate": 6.304272747048009e-06, + "loss": 0.4838, + "step": 4831 + }, + { + "epoch": 1.3399889073765945, + "grad_norm": 0.20816753804683685, + "learning_rate": 6.2994777091432535e-06, + "loss": 0.5021, + "step": 4832 + }, + { + "epoch": 1.3402662229617304, + "grad_norm": 0.2043784111738205, + "learning_rate": 6.2946838811246734e-06, + "loss": 0.5101, + "step": 4833 + }, + { + "epoch": 1.3405435385468663, + "grad_norm": 0.19572801887989044, + "learning_rate": 6.289891263927675e-06, + "loss": 0.4688, + "step": 4834 + }, + { + "epoch": 1.3408208541320021, + "grad_norm": 0.22099921107292175, + "learning_rate": 6.285099858487428e-06, + "loss": 0.4935, + "step": 4835 + }, + { + "epoch": 1.341098169717138, + "grad_norm": 0.19714292883872986, + "learning_rate": 6.280309665738854e-06, + "loss": 0.496, + "step": 4836 + }, + { + "epoch": 1.341375485302274, + "grad_norm": 0.21856801211833954, + "learning_rate": 6.275520686616654e-06, + "loss": 0.4743, + "step": 4837 + }, + { + "epoch": 1.3416528008874098, + "grad_norm": 0.2152533382177353, + "learning_rate": 6.270732922055286e-06, + "loss": 0.499, + "step": 4838 + }, + { + "epoch": 1.3419301164725457, + "grad_norm": 0.201574444770813, + "learning_rate": 6.2659463729889665e-06, + "loss": 0.4826, + "step": 4839 + }, + { + "epoch": 1.3422074320576816, + "grad_norm": 0.18966776132583618, + "learning_rate": 6.261161040351673e-06, + "loss": 0.4745, + "step": 4840 + }, + { + "epoch": 1.3424847476428174, + "grad_norm": 0.20077745616436005, + "learning_rate": 6.256376925077155e-06, + "loss": 0.4785, + "step": 4841 + }, + { + "epoch": 1.3427620632279533, + "grad_norm": 0.19951693713665009, + "learning_rate": 6.251594028098925e-06, + "loss": 0.4906, + "step": 4842 + }, + { + "epoch": 1.3430393788130892, + "grad_norm": 0.20605219900608063, + "learning_rate": 6.246812350350245e-06, + "loss": 0.5253, + "step": 4843 + }, + { + "epoch": 1.343316694398225, + "grad_norm": 0.1993149369955063, + "learning_rate": 6.242031892764156e-06, + "loss": 0.5034, + "step": 4844 + }, + { + "epoch": 1.343594009983361, + "grad_norm": 0.20574994385242462, + "learning_rate": 6.237252656273439e-06, + "loss": 0.5055, + "step": 4845 + }, + { + "epoch": 1.343871325568497, + "grad_norm": 0.20431512594223022, + "learning_rate": 6.232474641810664e-06, + "loss": 0.4775, + "step": 4846 + }, + { + "epoch": 1.344148641153633, + "grad_norm": 0.1970827877521515, + "learning_rate": 6.2276978503081355e-06, + "loss": 0.4963, + "step": 4847 + }, + { + "epoch": 1.3444259567387689, + "grad_norm": 0.20088131725788116, + "learning_rate": 6.222922282697944e-06, + "loss": 0.5, + "step": 4848 + }, + { + "epoch": 1.3447032723239047, + "grad_norm": 0.2057836651802063, + "learning_rate": 6.218147939911917e-06, + "loss": 0.4928, + "step": 4849 + }, + { + "epoch": 1.3449805879090406, + "grad_norm": 0.20138069987297058, + "learning_rate": 6.213374822881661e-06, + "loss": 0.4929, + "step": 4850 + }, + { + "epoch": 1.3452579034941765, + "grad_norm": 0.20331014692783356, + "learning_rate": 6.208602932538545e-06, + "loss": 0.492, + "step": 4851 + }, + { + "epoch": 1.3455352190793124, + "grad_norm": 0.20663729310035706, + "learning_rate": 6.203832269813678e-06, + "loss": 0.5118, + "step": 4852 + }, + { + "epoch": 1.3458125346644483, + "grad_norm": 0.20441415905952454, + "learning_rate": 6.1990628356379535e-06, + "loss": 0.5004, + "step": 4853 + }, + { + "epoch": 1.3460898502495842, + "grad_norm": 0.2063564509153366, + "learning_rate": 6.194294630942006e-06, + "loss": 0.5033, + "step": 4854 + }, + { + "epoch": 1.34636716583472, + "grad_norm": 0.2034982442855835, + "learning_rate": 6.1895276566562465e-06, + "loss": 0.4578, + "step": 4855 + }, + { + "epoch": 1.346644481419856, + "grad_norm": 0.19431711733341217, + "learning_rate": 6.184761913710829e-06, + "loss": 0.4876, + "step": 4856 + }, + { + "epoch": 1.3469217970049918, + "grad_norm": 0.2182120680809021, + "learning_rate": 6.179997403035682e-06, + "loss": 0.4799, + "step": 4857 + }, + { + "epoch": 1.3471991125901277, + "grad_norm": 0.19659878313541412, + "learning_rate": 6.175234125560492e-06, + "loss": 0.4867, + "step": 4858 + }, + { + "epoch": 1.3474764281752636, + "grad_norm": 0.21084415912628174, + "learning_rate": 6.1704720822146955e-06, + "loss": 0.4893, + "step": 4859 + }, + { + "epoch": 1.3477537437603995, + "grad_norm": 0.20155581831932068, + "learning_rate": 6.165711273927488e-06, + "loss": 0.5106, + "step": 4860 + }, + { + "epoch": 1.3480310593455354, + "grad_norm": 0.20054039359092712, + "learning_rate": 6.160951701627836e-06, + "loss": 0.4754, + "step": 4861 + }, + { + "epoch": 1.3483083749306712, + "grad_norm": 0.2048387974500656, + "learning_rate": 6.15619336624446e-06, + "loss": 0.4928, + "step": 4862 + }, + { + "epoch": 1.3485856905158071, + "grad_norm": 0.19565939903259277, + "learning_rate": 6.151436268705831e-06, + "loss": 0.4936, + "step": 4863 + }, + { + "epoch": 1.348863006100943, + "grad_norm": 0.21946604549884796, + "learning_rate": 6.1466804099401874e-06, + "loss": 0.4924, + "step": 4864 + }, + { + "epoch": 1.349140321686079, + "grad_norm": 0.20226682722568512, + "learning_rate": 6.141925790875529e-06, + "loss": 0.4983, + "step": 4865 + }, + { + "epoch": 1.3494176372712148, + "grad_norm": 0.191081702709198, + "learning_rate": 6.137172412439601e-06, + "loss": 0.5019, + "step": 4866 + }, + { + "epoch": 1.3496949528563507, + "grad_norm": 0.1935385912656784, + "learning_rate": 6.132420275559912e-06, + "loss": 0.4948, + "step": 4867 + }, + { + "epoch": 1.3499722684414865, + "grad_norm": 0.20531508326530457, + "learning_rate": 6.127669381163734e-06, + "loss": 0.5112, + "step": 4868 + }, + { + "epoch": 1.3502495840266224, + "grad_norm": 0.22489938139915466, + "learning_rate": 6.122919730178095e-06, + "loss": 0.5146, + "step": 4869 + }, + { + "epoch": 1.3505268996117583, + "grad_norm": 0.20186354219913483, + "learning_rate": 6.118171323529774e-06, + "loss": 0.4934, + "step": 4870 + }, + { + "epoch": 1.3508042151968942, + "grad_norm": 0.20553670823574066, + "learning_rate": 6.113424162145307e-06, + "loss": 0.5131, + "step": 4871 + }, + { + "epoch": 1.35108153078203, + "grad_norm": 0.20304057002067566, + "learning_rate": 6.108678246950994e-06, + "loss": 0.5157, + "step": 4872 + }, + { + "epoch": 1.351358846367166, + "grad_norm": 0.2151353359222412, + "learning_rate": 6.103933578872896e-06, + "loss": 0.4806, + "step": 4873 + }, + { + "epoch": 1.3516361619523019, + "grad_norm": 0.1945381909608841, + "learning_rate": 6.099190158836816e-06, + "loss": 0.5012, + "step": 4874 + }, + { + "epoch": 1.3519134775374377, + "grad_norm": 0.20316410064697266, + "learning_rate": 6.094447987768315e-06, + "loss": 0.4882, + "step": 4875 + }, + { + "epoch": 1.3521907931225736, + "grad_norm": 0.2029489129781723, + "learning_rate": 6.08970706659273e-06, + "loss": 0.5021, + "step": 4876 + }, + { + "epoch": 1.3524681087077095, + "grad_norm": 0.21302883327007294, + "learning_rate": 6.084967396235136e-06, + "loss": 0.527, + "step": 4877 + }, + { + "epoch": 1.3527454242928454, + "grad_norm": 0.204268679022789, + "learning_rate": 6.08022897762036e-06, + "loss": 0.521, + "step": 4878 + }, + { + "epoch": 1.3530227398779813, + "grad_norm": 0.20695167779922485, + "learning_rate": 6.0754918116730004e-06, + "loss": 0.4753, + "step": 4879 + }, + { + "epoch": 1.3533000554631172, + "grad_norm": 0.2412513643503189, + "learning_rate": 6.070755899317407e-06, + "loss": 0.4799, + "step": 4880 + }, + { + "epoch": 1.353577371048253, + "grad_norm": 0.1893726885318756, + "learning_rate": 6.066021241477676e-06, + "loss": 0.4823, + "step": 4881 + }, + { + "epoch": 1.353854686633389, + "grad_norm": 0.20982672274112701, + "learning_rate": 6.061287839077661e-06, + "loss": 0.5285, + "step": 4882 + }, + { + "epoch": 1.3541320022185248, + "grad_norm": 0.19612151384353638, + "learning_rate": 6.056555693040981e-06, + "loss": 0.5243, + "step": 4883 + }, + { + "epoch": 1.3544093178036607, + "grad_norm": 0.20173022150993347, + "learning_rate": 6.051824804291005e-06, + "loss": 0.5089, + "step": 4884 + }, + { + "epoch": 1.3546866333887966, + "grad_norm": 0.20177814364433289, + "learning_rate": 6.047095173750846e-06, + "loss": 0.508, + "step": 4885 + }, + { + "epoch": 1.3549639489739325, + "grad_norm": 0.21217969059944153, + "learning_rate": 6.042366802343389e-06, + "loss": 0.4961, + "step": 4886 + }, + { + "epoch": 1.3552412645590683, + "grad_norm": 0.21045343577861786, + "learning_rate": 6.0376396909912575e-06, + "loss": 0.471, + "step": 4887 + }, + { + "epoch": 1.3555185801442042, + "grad_norm": 0.20939841866493225, + "learning_rate": 6.032913840616843e-06, + "loss": 0.5066, + "step": 4888 + }, + { + "epoch": 1.3557958957293401, + "grad_norm": 0.20125700533390045, + "learning_rate": 6.028189252142276e-06, + "loss": 0.4869, + "step": 4889 + }, + { + "epoch": 1.356073211314476, + "grad_norm": 0.19696246087551117, + "learning_rate": 6.023465926489453e-06, + "loss": 0.5013, + "step": 4890 + }, + { + "epoch": 1.3563505268996119, + "grad_norm": 0.19802086055278778, + "learning_rate": 6.018743864580025e-06, + "loss": 0.4694, + "step": 4891 + }, + { + "epoch": 1.3566278424847478, + "grad_norm": 0.19850695133209229, + "learning_rate": 6.014023067335382e-06, + "loss": 0.5089, + "step": 4892 + }, + { + "epoch": 1.3569051580698837, + "grad_norm": 0.20037485659122467, + "learning_rate": 6.009303535676686e-06, + "loss": 0.4823, + "step": 4893 + }, + { + "epoch": 1.3571824736550195, + "grad_norm": 0.19831885397434235, + "learning_rate": 6.004585270524833e-06, + "loss": 0.4821, + "step": 4894 + }, + { + "epoch": 1.3574597892401554, + "grad_norm": 0.20965071022510529, + "learning_rate": 5.999868272800492e-06, + "loss": 0.4821, + "step": 4895 + }, + { + "epoch": 1.3577371048252913, + "grad_norm": 0.19735604524612427, + "learning_rate": 5.995152543424064e-06, + "loss": 0.472, + "step": 4896 + }, + { + "epoch": 1.3580144204104272, + "grad_norm": 0.20253437757492065, + "learning_rate": 5.990438083315721e-06, + "loss": 0.5013, + "step": 4897 + }, + { + "epoch": 1.358291735995563, + "grad_norm": 0.19695337116718292, + "learning_rate": 5.985724893395371e-06, + "loss": 0.4982, + "step": 4898 + }, + { + "epoch": 1.358569051580699, + "grad_norm": 0.2138115018606186, + "learning_rate": 5.981012974582688e-06, + "loss": 0.511, + "step": 4899 + }, + { + "epoch": 1.3588463671658348, + "grad_norm": 0.20029625296592712, + "learning_rate": 5.976302327797096e-06, + "loss": 0.5215, + "step": 4900 + }, + { + "epoch": 1.3591236827509707, + "grad_norm": 0.20940783619880676, + "learning_rate": 5.9715929539577595e-06, + "loss": 0.5094, + "step": 4901 + }, + { + "epoch": 1.3594009983361066, + "grad_norm": 0.2135714590549469, + "learning_rate": 5.966884853983597e-06, + "loss": 0.5108, + "step": 4902 + }, + { + "epoch": 1.3596783139212425, + "grad_norm": 0.19404084980487823, + "learning_rate": 5.9621780287932995e-06, + "loss": 0.4732, + "step": 4903 + }, + { + "epoch": 1.3599556295063784, + "grad_norm": 0.20225512981414795, + "learning_rate": 5.957472479305286e-06, + "loss": 0.4923, + "step": 4904 + }, + { + "epoch": 1.3602329450915143, + "grad_norm": 0.20398668944835663, + "learning_rate": 5.952768206437727e-06, + "loss": 0.478, + "step": 4905 + }, + { + "epoch": 1.3605102606766502, + "grad_norm": 0.2068692445755005, + "learning_rate": 5.9480652111085566e-06, + "loss": 0.5464, + "step": 4906 + }, + { + "epoch": 1.360787576261786, + "grad_norm": 0.20321063697338104, + "learning_rate": 5.9433634942354595e-06, + "loss": 0.4954, + "step": 4907 + }, + { + "epoch": 1.361064891846922, + "grad_norm": 0.21151407063007355, + "learning_rate": 5.938663056735859e-06, + "loss": 0.5176, + "step": 4908 + }, + { + "epoch": 1.3613422074320578, + "grad_norm": 0.20232000946998596, + "learning_rate": 5.93396389952693e-06, + "loss": 0.524, + "step": 4909 + }, + { + "epoch": 1.3616195230171937, + "grad_norm": 0.21282248198986053, + "learning_rate": 5.92926602352561e-06, + "loss": 0.5043, + "step": 4910 + }, + { + "epoch": 1.3618968386023296, + "grad_norm": 0.20285063982009888, + "learning_rate": 5.92456942964858e-06, + "loss": 0.4926, + "step": 4911 + }, + { + "epoch": 1.3621741541874655, + "grad_norm": 0.26317086815834045, + "learning_rate": 5.9198741188122675e-06, + "loss": 0.4847, + "step": 4912 + }, + { + "epoch": 1.3624514697726013, + "grad_norm": 0.20316633582115173, + "learning_rate": 5.915180091932843e-06, + "loss": 0.521, + "step": 4913 + }, + { + "epoch": 1.3627287853577372, + "grad_norm": 0.2053111493587494, + "learning_rate": 5.910487349926251e-06, + "loss": 0.4979, + "step": 4914 + }, + { + "epoch": 1.3630061009428731, + "grad_norm": 1.0747162103652954, + "learning_rate": 5.905795893708166e-06, + "loss": 0.4933, + "step": 4915 + }, + { + "epoch": 1.363283416528009, + "grad_norm": 0.19813428819179535, + "learning_rate": 5.901105724194006e-06, + "loss": 0.5115, + "step": 4916 + }, + { + "epoch": 1.3635607321131449, + "grad_norm": 0.2020360231399536, + "learning_rate": 5.896416842298953e-06, + "loss": 0.4945, + "step": 4917 + }, + { + "epoch": 1.3638380476982808, + "grad_norm": 0.20236392319202423, + "learning_rate": 5.891729248937938e-06, + "loss": 0.4844, + "step": 4918 + }, + { + "epoch": 1.3641153632834166, + "grad_norm": 0.20511947572231293, + "learning_rate": 5.8870429450256295e-06, + "loss": 0.4872, + "step": 4919 + }, + { + "epoch": 1.3643926788685525, + "grad_norm": 0.2103184312582016, + "learning_rate": 5.882357931476446e-06, + "loss": 0.519, + "step": 4920 + }, + { + "epoch": 1.3646699944536884, + "grad_norm": 0.20535695552825928, + "learning_rate": 5.877674209204559e-06, + "loss": 0.5072, + "step": 4921 + }, + { + "epoch": 1.3649473100388243, + "grad_norm": 0.20085355639457703, + "learning_rate": 5.872991779123894e-06, + "loss": 0.4931, + "step": 4922 + }, + { + "epoch": 1.3652246256239602, + "grad_norm": 0.19907495379447937, + "learning_rate": 5.8683106421481084e-06, + "loss": 0.5263, + "step": 4923 + }, + { + "epoch": 1.365501941209096, + "grad_norm": 0.2058345228433609, + "learning_rate": 5.863630799190624e-06, + "loss": 0.5022, + "step": 4924 + }, + { + "epoch": 1.365779256794232, + "grad_norm": 0.19591206312179565, + "learning_rate": 5.8589522511645944e-06, + "loss": 0.4979, + "step": 4925 + }, + { + "epoch": 1.3660565723793678, + "grad_norm": 0.212891086935997, + "learning_rate": 5.854274998982935e-06, + "loss": 0.5211, + "step": 4926 + }, + { + "epoch": 1.3663338879645037, + "grad_norm": 0.20310524106025696, + "learning_rate": 5.8495990435582945e-06, + "loss": 0.4923, + "step": 4927 + }, + { + "epoch": 1.3666112035496396, + "grad_norm": 0.2070825695991516, + "learning_rate": 5.844924385803078e-06, + "loss": 0.5088, + "step": 4928 + }, + { + "epoch": 1.3668885191347755, + "grad_norm": 0.1995309591293335, + "learning_rate": 5.8402510266294435e-06, + "loss": 0.4637, + "step": 4929 + }, + { + "epoch": 1.3671658347199114, + "grad_norm": 0.19737666845321655, + "learning_rate": 5.835578966949276e-06, + "loss": 0.5073, + "step": 4930 + }, + { + "epoch": 1.3674431503050473, + "grad_norm": 0.2089546024799347, + "learning_rate": 5.830908207674225e-06, + "loss": 0.4981, + "step": 4931 + }, + { + "epoch": 1.3677204658901831, + "grad_norm": 0.20078176259994507, + "learning_rate": 5.826238749715675e-06, + "loss": 0.4811, + "step": 4932 + }, + { + "epoch": 1.367997781475319, + "grad_norm": 0.20612461864948273, + "learning_rate": 5.821570593984765e-06, + "loss": 0.488, + "step": 4933 + }, + { + "epoch": 1.368275097060455, + "grad_norm": 0.19565999507904053, + "learning_rate": 5.816903741392371e-06, + "loss": 0.4613, + "step": 4934 + }, + { + "epoch": 1.3685524126455908, + "grad_norm": 0.20337355136871338, + "learning_rate": 5.812238192849126e-06, + "loss": 0.5004, + "step": 4935 + }, + { + "epoch": 1.3688297282307267, + "grad_norm": 0.1981084942817688, + "learning_rate": 5.8075739492653936e-06, + "loss": 0.4963, + "step": 4936 + }, + { + "epoch": 1.3691070438158626, + "grad_norm": 0.21550583839416504, + "learning_rate": 5.8029110115512975e-06, + "loss": 0.5181, + "step": 4937 + }, + { + "epoch": 1.3693843594009985, + "grad_norm": 0.2106407731771469, + "learning_rate": 5.7982493806167025e-06, + "loss": 0.5001, + "step": 4938 + }, + { + "epoch": 1.3696616749861343, + "grad_norm": 0.20755064487457275, + "learning_rate": 5.793589057371214e-06, + "loss": 0.4996, + "step": 4939 + }, + { + "epoch": 1.3699389905712702, + "grad_norm": 0.20589278638362885, + "learning_rate": 5.788930042724178e-06, + "loss": 0.4731, + "step": 4940 + }, + { + "epoch": 1.370216306156406, + "grad_norm": 0.19447284936904907, + "learning_rate": 5.7842723375846964e-06, + "loss": 0.4805, + "step": 4941 + }, + { + "epoch": 1.370493621741542, + "grad_norm": 0.18845054507255554, + "learning_rate": 5.779615942861617e-06, + "loss": 0.5183, + "step": 4942 + }, + { + "epoch": 1.3707709373266779, + "grad_norm": 0.2057630866765976, + "learning_rate": 5.774960859463516e-06, + "loss": 0.4762, + "step": 4943 + }, + { + "epoch": 1.3710482529118138, + "grad_norm": 0.20433704555034637, + "learning_rate": 5.770307088298728e-06, + "loss": 0.5085, + "step": 4944 + }, + { + "epoch": 1.3713255684969496, + "grad_norm": 0.20006833970546722, + "learning_rate": 5.76565463027533e-06, + "loss": 0.5217, + "step": 4945 + }, + { + "epoch": 1.3716028840820855, + "grad_norm": 0.1979144960641861, + "learning_rate": 5.761003486301138e-06, + "loss": 0.4903, + "step": 4946 + }, + { + "epoch": 1.3718801996672214, + "grad_norm": 0.20525884628295898, + "learning_rate": 5.756353657283707e-06, + "loss": 0.5108, + "step": 4947 + }, + { + "epoch": 1.3721575152523573, + "grad_norm": 0.20061984658241272, + "learning_rate": 5.7517051441303486e-06, + "loss": 0.5068, + "step": 4948 + }, + { + "epoch": 1.3724348308374932, + "grad_norm": 0.19711720943450928, + "learning_rate": 5.747057947748112e-06, + "loss": 0.5206, + "step": 4949 + }, + { + "epoch": 1.372712146422629, + "grad_norm": 0.20400893688201904, + "learning_rate": 5.742412069043786e-06, + "loss": 0.4879, + "step": 4950 + }, + { + "epoch": 1.372989462007765, + "grad_norm": 0.20518803596496582, + "learning_rate": 5.737767508923896e-06, + "loss": 0.5172, + "step": 4951 + }, + { + "epoch": 1.3732667775929008, + "grad_norm": 0.20601531863212585, + "learning_rate": 5.733124268294734e-06, + "loss": 0.535, + "step": 4952 + }, + { + "epoch": 1.3735440931780367, + "grad_norm": 0.21886709332466125, + "learning_rate": 5.728482348062314e-06, + "loss": 0.4945, + "step": 4953 + }, + { + "epoch": 1.3738214087631726, + "grad_norm": 0.2085854411125183, + "learning_rate": 5.723841749132395e-06, + "loss": 0.5162, + "step": 4954 + }, + { + "epoch": 1.3740987243483085, + "grad_norm": 0.2051091194152832, + "learning_rate": 5.719202472410475e-06, + "loss": 0.4917, + "step": 4955 + }, + { + "epoch": 1.3743760399334444, + "grad_norm": 0.20153647661209106, + "learning_rate": 5.714564518801813e-06, + "loss": 0.4679, + "step": 4956 + }, + { + "epoch": 1.3746533555185803, + "grad_norm": 0.20284593105316162, + "learning_rate": 5.709927889211391e-06, + "loss": 0.4813, + "step": 4957 + }, + { + "epoch": 1.3749306711037161, + "grad_norm": 0.20113231241703033, + "learning_rate": 5.705292584543932e-06, + "loss": 0.4958, + "step": 4958 + }, + { + "epoch": 1.375207986688852, + "grad_norm": 0.21188965439796448, + "learning_rate": 5.700658605703912e-06, + "loss": 0.5014, + "step": 4959 + }, + { + "epoch": 1.375485302273988, + "grad_norm": 0.20827704668045044, + "learning_rate": 5.696025953595549e-06, + "loss": 0.505, + "step": 4960 + }, + { + "epoch": 1.3757626178591238, + "grad_norm": 0.2128673493862152, + "learning_rate": 5.691394629122786e-06, + "loss": 0.5019, + "step": 4961 + }, + { + "epoch": 1.3760399334442597, + "grad_norm": 0.20192976295948029, + "learning_rate": 5.686764633189325e-06, + "loss": 0.4714, + "step": 4962 + }, + { + "epoch": 1.3763172490293956, + "grad_norm": 0.20420150458812714, + "learning_rate": 5.6821359666985925e-06, + "loss": 0.5052, + "step": 4963 + }, + { + "epoch": 1.3765945646145314, + "grad_norm": 0.19757725298404694, + "learning_rate": 5.677508630553774e-06, + "loss": 0.5136, + "step": 4964 + }, + { + "epoch": 1.3768718801996673, + "grad_norm": 0.19914944469928741, + "learning_rate": 5.672882625657776e-06, + "loss": 0.4925, + "step": 4965 + }, + { + "epoch": 1.3771491957848032, + "grad_norm": 0.20814815163612366, + "learning_rate": 5.668257952913259e-06, + "loss": 0.4955, + "step": 4966 + }, + { + "epoch": 1.377426511369939, + "grad_norm": 0.19726327061653137, + "learning_rate": 5.663634613222623e-06, + "loss": 0.4675, + "step": 4967 + }, + { + "epoch": 1.377703826955075, + "grad_norm": 0.20900078117847443, + "learning_rate": 5.659012607487994e-06, + "loss": 0.4912, + "step": 4968 + }, + { + "epoch": 1.3779811425402109, + "grad_norm": 0.21374890208244324, + "learning_rate": 5.65439193661126e-06, + "loss": 0.5166, + "step": 4969 + }, + { + "epoch": 1.3782584581253468, + "grad_norm": 0.21223044395446777, + "learning_rate": 5.649772601494026e-06, + "loss": 0.494, + "step": 4970 + }, + { + "epoch": 1.3785357737104826, + "grad_norm": 0.20285436511039734, + "learning_rate": 5.645154603037654e-06, + "loss": 0.5161, + "step": 4971 + }, + { + "epoch": 1.3788130892956185, + "grad_norm": 0.20432451367378235, + "learning_rate": 5.64053794214323e-06, + "loss": 0.4738, + "step": 4972 + }, + { + "epoch": 1.3790904048807544, + "grad_norm": 0.20761069655418396, + "learning_rate": 5.635922619711598e-06, + "loss": 0.5038, + "step": 4973 + }, + { + "epoch": 1.3793677204658903, + "grad_norm": 0.21306075155735016, + "learning_rate": 5.631308636643317e-06, + "loss": 0.4948, + "step": 4974 + }, + { + "epoch": 1.3796450360510262, + "grad_norm": 0.20453637838363647, + "learning_rate": 5.626695993838704e-06, + "loss": 0.5061, + "step": 4975 + }, + { + "epoch": 1.379922351636162, + "grad_norm": 0.20153799653053284, + "learning_rate": 5.622084692197811e-06, + "loss": 0.4923, + "step": 4976 + }, + { + "epoch": 1.380199667221298, + "grad_norm": 0.2005978673696518, + "learning_rate": 5.617474732620423e-06, + "loss": 0.4906, + "step": 4977 + }, + { + "epoch": 1.3804769828064338, + "grad_norm": 0.20294132828712463, + "learning_rate": 5.612866116006059e-06, + "loss": 0.4977, + "step": 4978 + }, + { + "epoch": 1.3807542983915697, + "grad_norm": 0.19424274563789368, + "learning_rate": 5.608258843253985e-06, + "loss": 0.4766, + "step": 4979 + }, + { + "epoch": 1.3810316139767056, + "grad_norm": 0.20564329624176025, + "learning_rate": 5.60365291526321e-06, + "loss": 0.5037, + "step": 4980 + }, + { + "epoch": 1.3813089295618415, + "grad_norm": 0.19803635776042938, + "learning_rate": 5.599048332932462e-06, + "loss": 0.4728, + "step": 4981 + }, + { + "epoch": 1.3815862451469774, + "grad_norm": 0.20995113253593445, + "learning_rate": 5.594445097160221e-06, + "loss": 0.5107, + "step": 4982 + }, + { + "epoch": 1.3818635607321132, + "grad_norm": 0.20244790613651276, + "learning_rate": 5.589843208844707e-06, + "loss": 0.4971, + "step": 4983 + }, + { + "epoch": 1.3821408763172491, + "grad_norm": 0.2114204615354538, + "learning_rate": 5.5852426688838625e-06, + "loss": 0.5059, + "step": 4984 + }, + { + "epoch": 1.382418191902385, + "grad_norm": 0.2145579308271408, + "learning_rate": 5.580643478175372e-06, + "loss": 0.4949, + "step": 4985 + }, + { + "epoch": 1.382695507487521, + "grad_norm": 0.1978285163640976, + "learning_rate": 5.576045637616663e-06, + "loss": 0.4941, + "step": 4986 + }, + { + "epoch": 1.3829728230726568, + "grad_norm": 0.1951082944869995, + "learning_rate": 5.571449148104903e-06, + "loss": 0.4821, + "step": 4987 + }, + { + "epoch": 1.3832501386577927, + "grad_norm": 0.4343617260456085, + "learning_rate": 5.5668540105369815e-06, + "loss": 0.5103, + "step": 4988 + }, + { + "epoch": 1.3835274542429286, + "grad_norm": 0.19941391050815582, + "learning_rate": 5.562260225809524e-06, + "loss": 0.4847, + "step": 4989 + }, + { + "epoch": 1.3838047698280644, + "grad_norm": 0.2054002434015274, + "learning_rate": 5.557667794818917e-06, + "loss": 0.5059, + "step": 4990 + }, + { + "epoch": 1.3840820854132003, + "grad_norm": 0.19655835628509521, + "learning_rate": 5.5530767184612584e-06, + "loss": 0.5045, + "step": 4991 + }, + { + "epoch": 1.3843594009983362, + "grad_norm": 0.2060847282409668, + "learning_rate": 5.548486997632386e-06, + "loss": 0.5122, + "step": 4992 + }, + { + "epoch": 1.384636716583472, + "grad_norm": 0.20054815709590912, + "learning_rate": 5.543898633227869e-06, + "loss": 0.4945, + "step": 4993 + }, + { + "epoch": 1.384914032168608, + "grad_norm": 0.20712971687316895, + "learning_rate": 5.539311626143034e-06, + "loss": 0.4962, + "step": 4994 + }, + { + "epoch": 1.3851913477537439, + "grad_norm": 0.20592884719371796, + "learning_rate": 5.534725977272923e-06, + "loss": 0.5046, + "step": 4995 + }, + { + "epoch": 1.3854686633388797, + "grad_norm": 0.19674254953861237, + "learning_rate": 5.530141687512311e-06, + "loss": 0.5111, + "step": 4996 + }, + { + "epoch": 1.3857459789240156, + "grad_norm": 0.20650383830070496, + "learning_rate": 5.525558757755716e-06, + "loss": 0.4801, + "step": 4997 + }, + { + "epoch": 1.3860232945091515, + "grad_norm": 0.20828871428966522, + "learning_rate": 5.520977188897398e-06, + "loss": 0.4849, + "step": 4998 + }, + { + "epoch": 1.3863006100942874, + "grad_norm": 0.20563003420829773, + "learning_rate": 5.516396981831337e-06, + "loss": 0.528, + "step": 4999 + }, + { + "epoch": 1.3865779256794233, + "grad_norm": 0.19634945690631866, + "learning_rate": 5.511818137451247e-06, + "loss": 0.482, + "step": 5000 + }, + { + "epoch": 1.3868552412645592, + "grad_norm": 0.19331537187099457, + "learning_rate": 5.507240656650586e-06, + "loss": 0.5174, + "step": 5001 + }, + { + "epoch": 1.387132556849695, + "grad_norm": 0.19204163551330566, + "learning_rate": 5.502664540322547e-06, + "loss": 0.4646, + "step": 5002 + }, + { + "epoch": 1.387409872434831, + "grad_norm": 0.19761690497398376, + "learning_rate": 5.498089789360043e-06, + "loss": 0.4901, + "step": 5003 + }, + { + "epoch": 1.3876871880199668, + "grad_norm": 0.22188632190227509, + "learning_rate": 5.493516404655733e-06, + "loss": 0.5084, + "step": 5004 + }, + { + "epoch": 1.3879645036051027, + "grad_norm": 0.20775288343429565, + "learning_rate": 5.48894438710201e-06, + "loss": 0.4741, + "step": 5005 + }, + { + "epoch": 1.3882418191902386, + "grad_norm": 0.2092396765947342, + "learning_rate": 5.484373737590992e-06, + "loss": 0.5163, + "step": 5006 + }, + { + "epoch": 1.3885191347753745, + "grad_norm": 0.20691785216331482, + "learning_rate": 5.479804457014528e-06, + "loss": 0.4954, + "step": 5007 + }, + { + "epoch": 1.3887964503605104, + "grad_norm": 0.20370697975158691, + "learning_rate": 5.4752365462642115e-06, + "loss": 0.492, + "step": 5008 + }, + { + "epoch": 1.3890737659456462, + "grad_norm": 0.19262473285198212, + "learning_rate": 5.4706700062313686e-06, + "loss": 0.5066, + "step": 5009 + }, + { + "epoch": 1.3893510815307821, + "grad_norm": 0.20191557705402374, + "learning_rate": 5.466104837807038e-06, + "loss": 0.4935, + "step": 5010 + }, + { + "epoch": 1.389628397115918, + "grad_norm": 0.2161003202199936, + "learning_rate": 5.461541041882021e-06, + "loss": 0.5021, + "step": 5011 + }, + { + "epoch": 1.389905712701054, + "grad_norm": 0.2024545818567276, + "learning_rate": 5.456978619346821e-06, + "loss": 0.4968, + "step": 5012 + }, + { + "epoch": 1.3901830282861898, + "grad_norm": 0.19628916680812836, + "learning_rate": 5.452417571091699e-06, + "loss": 0.5084, + "step": 5013 + }, + { + "epoch": 1.3904603438713257, + "grad_norm": 0.21482454240322113, + "learning_rate": 5.447857898006625e-06, + "loss": 0.5142, + "step": 5014 + }, + { + "epoch": 1.3907376594564616, + "grad_norm": 0.19978967308998108, + "learning_rate": 5.4432996009813235e-06, + "loss": 0.448, + "step": 5015 + }, + { + "epoch": 1.3910149750415974, + "grad_norm": 0.21920114755630493, + "learning_rate": 5.43874268090523e-06, + "loss": 0.5065, + "step": 5016 + }, + { + "epoch": 1.3912922906267333, + "grad_norm": 0.1980915367603302, + "learning_rate": 5.434187138667522e-06, + "loss": 0.5073, + "step": 5017 + }, + { + "epoch": 1.3915696062118692, + "grad_norm": 0.19898997247219086, + "learning_rate": 5.429632975157115e-06, + "loss": 0.5052, + "step": 5018 + }, + { + "epoch": 1.391846921797005, + "grad_norm": 0.20365798473358154, + "learning_rate": 5.425080191262634e-06, + "loss": 0.4903, + "step": 5019 + }, + { + "epoch": 1.392124237382141, + "grad_norm": 0.21267344057559967, + "learning_rate": 5.42052878787246e-06, + "loss": 0.5094, + "step": 5020 + }, + { + "epoch": 1.3924015529672769, + "grad_norm": 0.1986275613307953, + "learning_rate": 5.415978765874681e-06, + "loss": 0.5039, + "step": 5021 + }, + { + "epoch": 1.3926788685524127, + "grad_norm": 0.20506128668785095, + "learning_rate": 5.411430126157138e-06, + "loss": 0.5119, + "step": 5022 + }, + { + "epoch": 1.3929561841375486, + "grad_norm": 0.20554526150226593, + "learning_rate": 5.406882869607381e-06, + "loss": 0.4979, + "step": 5023 + }, + { + "epoch": 1.3932334997226845, + "grad_norm": 0.19483192265033722, + "learning_rate": 5.402336997112703e-06, + "loss": 0.4768, + "step": 5024 + }, + { + "epoch": 1.3935108153078204, + "grad_norm": 0.18851692974567413, + "learning_rate": 5.397792509560132e-06, + "loss": 0.4797, + "step": 5025 + }, + { + "epoch": 1.3937881308929563, + "grad_norm": 0.19844412803649902, + "learning_rate": 5.3932494078364125e-06, + "loss": 0.501, + "step": 5026 + }, + { + "epoch": 1.3940654464780922, + "grad_norm": 0.20119161903858185, + "learning_rate": 5.388707692828013e-06, + "loss": 0.4795, + "step": 5027 + }, + { + "epoch": 1.394342762063228, + "grad_norm": 0.1947944462299347, + "learning_rate": 5.384167365421161e-06, + "loss": 0.4779, + "step": 5028 + }, + { + "epoch": 1.394620077648364, + "grad_norm": 0.20363706350326538, + "learning_rate": 5.379628426501789e-06, + "loss": 0.4925, + "step": 5029 + }, + { + "epoch": 1.3948973932334998, + "grad_norm": 0.21294178068637848, + "learning_rate": 5.375090876955559e-06, + "loss": 0.4978, + "step": 5030 + }, + { + "epoch": 1.3951747088186357, + "grad_norm": 0.19777782261371613, + "learning_rate": 5.370554717667861e-06, + "loss": 0.4888, + "step": 5031 + }, + { + "epoch": 1.3954520244037716, + "grad_norm": 0.201828733086586, + "learning_rate": 5.36601994952384e-06, + "loss": 0.5061, + "step": 5032 + }, + { + "epoch": 1.3957293399889075, + "grad_norm": 0.20095033943653107, + "learning_rate": 5.3614865734083365e-06, + "loss": 0.4768, + "step": 5033 + }, + { + "epoch": 1.3960066555740434, + "grad_norm": 0.20809856057167053, + "learning_rate": 5.3569545902059285e-06, + "loss": 0.5032, + "step": 5034 + }, + { + "epoch": 1.3962839711591792, + "grad_norm": 0.20099328458309174, + "learning_rate": 5.352424000800934e-06, + "loss": 0.4971, + "step": 5035 + }, + { + "epoch": 1.3965612867443151, + "grad_norm": 0.20865300297737122, + "learning_rate": 5.34789480607739e-06, + "loss": 0.49, + "step": 5036 + }, + { + "epoch": 1.396838602329451, + "grad_norm": 0.28519347310066223, + "learning_rate": 5.3433670069190616e-06, + "loss": 0.4885, + "step": 5037 + }, + { + "epoch": 1.397115917914587, + "grad_norm": 0.19800283014774323, + "learning_rate": 5.338840604209438e-06, + "loss": 0.5033, + "step": 5038 + }, + { + "epoch": 1.3973932334997228, + "grad_norm": 0.2006731927394867, + "learning_rate": 5.334315598831743e-06, + "loss": 0.5022, + "step": 5039 + }, + { + "epoch": 1.3976705490848587, + "grad_norm": 0.200321763753891, + "learning_rate": 5.329791991668931e-06, + "loss": 0.4846, + "step": 5040 + }, + { + "epoch": 1.3979478646699945, + "grad_norm": 0.21275892853736877, + "learning_rate": 5.3252697836036675e-06, + "loss": 0.503, + "step": 5041 + }, + { + "epoch": 1.3982251802551304, + "grad_norm": 0.19826599955558777, + "learning_rate": 5.320748975518361e-06, + "loss": 0.4912, + "step": 5042 + }, + { + "epoch": 1.3985024958402663, + "grad_norm": 0.20074287056922913, + "learning_rate": 5.316229568295143e-06, + "loss": 0.5036, + "step": 5043 + }, + { + "epoch": 1.3987798114254022, + "grad_norm": 0.19842980802059174, + "learning_rate": 5.311711562815869e-06, + "loss": 0.5017, + "step": 5044 + }, + { + "epoch": 1.399057127010538, + "grad_norm": 0.19995532929897308, + "learning_rate": 5.307194959962112e-06, + "loss": 0.4696, + "step": 5045 + }, + { + "epoch": 1.399334442595674, + "grad_norm": 0.20109152793884277, + "learning_rate": 5.302679760615189e-06, + "loss": 0.4875, + "step": 5046 + }, + { + "epoch": 1.3996117581808099, + "grad_norm": 0.19944603741168976, + "learning_rate": 5.298165965656139e-06, + "loss": 0.485, + "step": 5047 + }, + { + "epoch": 1.3998890737659457, + "grad_norm": 0.19794021546840668, + "learning_rate": 5.293653575965714e-06, + "loss": 0.508, + "step": 5048 + }, + { + "epoch": 1.4001663893510816, + "grad_norm": 0.20664082467556, + "learning_rate": 5.2891425924244095e-06, + "loss": 0.498, + "step": 5049 + }, + { + "epoch": 1.4004437049362175, + "grad_norm": 0.2035246342420578, + "learning_rate": 5.284633015912428e-06, + "loss": 0.5023, + "step": 5050 + }, + { + "epoch": 1.4007210205213534, + "grad_norm": 0.2174990326166153, + "learning_rate": 5.280124847309717e-06, + "loss": 0.483, + "step": 5051 + }, + { + "epoch": 1.4009983361064893, + "grad_norm": 0.1980149745941162, + "learning_rate": 5.275618087495932e-06, + "loss": 0.4739, + "step": 5052 + }, + { + "epoch": 1.4012756516916252, + "grad_norm": 0.19647562503814697, + "learning_rate": 5.271112737350467e-06, + "loss": 0.4788, + "step": 5053 + }, + { + "epoch": 1.401552967276761, + "grad_norm": 0.1995796263217926, + "learning_rate": 5.266608797752429e-06, + "loss": 0.4577, + "step": 5054 + }, + { + "epoch": 1.401830282861897, + "grad_norm": 0.2159973829984665, + "learning_rate": 5.26210626958066e-06, + "loss": 0.5022, + "step": 5055 + }, + { + "epoch": 1.4021075984470328, + "grad_norm": 0.20078851282596588, + "learning_rate": 5.257605153713727e-06, + "loss": 0.4864, + "step": 5056 + }, + { + "epoch": 1.4023849140321687, + "grad_norm": 0.20937801897525787, + "learning_rate": 5.253105451029908e-06, + "loss": 0.4945, + "step": 5057 + }, + { + "epoch": 1.4026622296173046, + "grad_norm": 0.2047879546880722, + "learning_rate": 5.248607162407221e-06, + "loss": 0.5019, + "step": 5058 + }, + { + "epoch": 1.4029395452024405, + "grad_norm": 0.19715817272663116, + "learning_rate": 5.244110288723396e-06, + "loss": 0.492, + "step": 5059 + }, + { + "epoch": 1.4032168607875763, + "grad_norm": 0.20772488415241241, + "learning_rate": 5.2396148308558976e-06, + "loss": 0.5004, + "step": 5060 + }, + { + "epoch": 1.4034941763727122, + "grad_norm": 0.19815978407859802, + "learning_rate": 5.235120789681902e-06, + "loss": 0.452, + "step": 5061 + }, + { + "epoch": 1.4037714919578481, + "grad_norm": 0.2069665491580963, + "learning_rate": 5.23062816607832e-06, + "loss": 0.4955, + "step": 5062 + }, + { + "epoch": 1.404048807542984, + "grad_norm": 0.20308996737003326, + "learning_rate": 5.226136960921786e-06, + "loss": 0.5137, + "step": 5063 + }, + { + "epoch": 1.4043261231281199, + "grad_norm": 0.2034929245710373, + "learning_rate": 5.221647175088648e-06, + "loss": 0.4617, + "step": 5064 + }, + { + "epoch": 1.4046034387132558, + "grad_norm": 0.21139517426490784, + "learning_rate": 5.217158809454979e-06, + "loss": 0.5068, + "step": 5065 + }, + { + "epoch": 1.4048807542983917, + "grad_norm": 0.21125884354114532, + "learning_rate": 5.212671864896581e-06, + "loss": 0.537, + "step": 5066 + }, + { + "epoch": 1.4051580698835275, + "grad_norm": 0.21030350029468536, + "learning_rate": 5.208186342288979e-06, + "loss": 0.5214, + "step": 5067 + }, + { + "epoch": 1.4054353854686634, + "grad_norm": 0.19781124591827393, + "learning_rate": 5.203702242507416e-06, + "loss": 0.4858, + "step": 5068 + }, + { + "epoch": 1.4057127010537993, + "grad_norm": 0.20567484200000763, + "learning_rate": 5.199219566426848e-06, + "loss": 0.4904, + "step": 5069 + }, + { + "epoch": 1.4059900166389352, + "grad_norm": 0.267501562833786, + "learning_rate": 5.194738314921982e-06, + "loss": 0.4828, + "step": 5070 + }, + { + "epoch": 1.406267332224071, + "grad_norm": 0.2026844173669815, + "learning_rate": 5.1902584888672206e-06, + "loss": 0.4974, + "step": 5071 + }, + { + "epoch": 1.406544647809207, + "grad_norm": 0.20914383232593536, + "learning_rate": 5.185780089136691e-06, + "loss": 0.4762, + "step": 5072 + }, + { + "epoch": 1.4068219633943428, + "grad_norm": 0.20183809101581573, + "learning_rate": 5.181303116604253e-06, + "loss": 0.4782, + "step": 5073 + }, + { + "epoch": 1.4070992789794787, + "grad_norm": 0.20599482953548431, + "learning_rate": 5.176827572143486e-06, + "loss": 0.4911, + "step": 5074 + }, + { + "epoch": 1.4073765945646146, + "grad_norm": 0.1961178332567215, + "learning_rate": 5.172353456627683e-06, + "loss": 0.4784, + "step": 5075 + }, + { + "epoch": 1.4076539101497505, + "grad_norm": 0.2111572027206421, + "learning_rate": 5.1678807709298605e-06, + "loss": 0.4856, + "step": 5076 + }, + { + "epoch": 1.4079312257348864, + "grad_norm": 0.19795557856559753, + "learning_rate": 5.163409515922758e-06, + "loss": 0.4855, + "step": 5077 + }, + { + "epoch": 1.4082085413200223, + "grad_norm": 0.2135738879442215, + "learning_rate": 5.158939692478845e-06, + "loss": 0.5018, + "step": 5078 + }, + { + "epoch": 1.4084858569051582, + "grad_norm": 0.2012612372636795, + "learning_rate": 5.154471301470294e-06, + "loss": 0.4999, + "step": 5079 + }, + { + "epoch": 1.408763172490294, + "grad_norm": 0.20429253578186035, + "learning_rate": 5.150004343769001e-06, + "loss": 0.4955, + "step": 5080 + }, + { + "epoch": 1.40904048807543, + "grad_norm": 0.2064649909734726, + "learning_rate": 5.1455388202466025e-06, + "loss": 0.5041, + "step": 5081 + }, + { + "epoch": 1.4093178036605658, + "grad_norm": 0.19653840363025665, + "learning_rate": 5.141074731774433e-06, + "loss": 0.5023, + "step": 5082 + }, + { + "epoch": 1.4095951192457017, + "grad_norm": 0.20743335783481598, + "learning_rate": 5.13661207922355e-06, + "loss": 0.5033, + "step": 5083 + }, + { + "epoch": 1.4098724348308376, + "grad_norm": 0.21355509757995605, + "learning_rate": 5.13215086346474e-06, + "loss": 0.5117, + "step": 5084 + }, + { + "epoch": 1.4101497504159735, + "grad_norm": 0.21359796822071075, + "learning_rate": 5.127691085368508e-06, + "loss": 0.4959, + "step": 5085 + }, + { + "epoch": 1.4104270660011093, + "grad_norm": 0.21122997999191284, + "learning_rate": 5.123232745805067e-06, + "loss": 0.5095, + "step": 5086 + }, + { + "epoch": 1.4107043815862452, + "grad_norm": 0.20164433121681213, + "learning_rate": 5.118775845644365e-06, + "loss": 0.5069, + "step": 5087 + }, + { + "epoch": 1.410981697171381, + "grad_norm": 0.205289825797081, + "learning_rate": 5.114320385756052e-06, + "loss": 0.5191, + "step": 5088 + }, + { + "epoch": 1.411259012756517, + "grad_norm": 0.20195280015468597, + "learning_rate": 5.109866367009518e-06, + "loss": 0.4861, + "step": 5089 + }, + { + "epoch": 1.4115363283416529, + "grad_norm": 0.2103511095046997, + "learning_rate": 5.105413790273848e-06, + "loss": 0.5086, + "step": 5090 + }, + { + "epoch": 1.4118136439267888, + "grad_norm": 0.2056579887866974, + "learning_rate": 5.1009626564178685e-06, + "loss": 0.5043, + "step": 5091 + }, + { + "epoch": 1.4120909595119246, + "grad_norm": 0.22724954783916473, + "learning_rate": 5.096512966310103e-06, + "loss": 0.5088, + "step": 5092 + }, + { + "epoch": 1.4123682750970605, + "grad_norm": 0.20662304759025574, + "learning_rate": 5.0920647208188105e-06, + "loss": 0.4832, + "step": 5093 + }, + { + "epoch": 1.4126455906821964, + "grad_norm": 0.20837345719337463, + "learning_rate": 5.087617920811966e-06, + "loss": 0.5003, + "step": 5094 + }, + { + "epoch": 1.4129229062673323, + "grad_norm": 0.20730285346508026, + "learning_rate": 5.083172567157246e-06, + "loss": 0.4922, + "step": 5095 + }, + { + "epoch": 1.4132002218524682, + "grad_norm": 0.20552845299243927, + "learning_rate": 5.078728660722068e-06, + "loss": 0.5003, + "step": 5096 + }, + { + "epoch": 1.413477537437604, + "grad_norm": 0.21342763304710388, + "learning_rate": 5.074286202373547e-06, + "loss": 0.5156, + "step": 5097 + }, + { + "epoch": 1.41375485302274, + "grad_norm": 0.20104481279850006, + "learning_rate": 5.069845192978534e-06, + "loss": 0.5034, + "step": 5098 + }, + { + "epoch": 1.4140321686078758, + "grad_norm": 0.1914806067943573, + "learning_rate": 5.065405633403576e-06, + "loss": 0.474, + "step": 5099 + }, + { + "epoch": 1.4143094841930117, + "grad_norm": 0.19734273850917816, + "learning_rate": 5.060967524514956e-06, + "loss": 0.4821, + "step": 5100 + }, + { + "epoch": 1.4145867997781476, + "grad_norm": 0.2332550585269928, + "learning_rate": 5.05653086717867e-06, + "loss": 0.4929, + "step": 5101 + }, + { + "epoch": 1.4148641153632835, + "grad_norm": 0.20123951137065887, + "learning_rate": 5.052095662260421e-06, + "loss": 0.4948, + "step": 5102 + }, + { + "epoch": 1.4151414309484194, + "grad_norm": 0.20322628319263458, + "learning_rate": 5.047661910625634e-06, + "loss": 0.5161, + "step": 5103 + }, + { + "epoch": 1.4154187465335553, + "grad_norm": 0.2106594741344452, + "learning_rate": 5.043229613139454e-06, + "loss": 0.4567, + "step": 5104 + }, + { + "epoch": 1.4156960621186911, + "grad_norm": 0.21148160099983215, + "learning_rate": 5.038798770666744e-06, + "loss": 0.5119, + "step": 5105 + }, + { + "epoch": 1.415973377703827, + "grad_norm": 0.20894502103328705, + "learning_rate": 5.034369384072075e-06, + "loss": 0.4947, + "step": 5106 + }, + { + "epoch": 1.416250693288963, + "grad_norm": 0.20449240505695343, + "learning_rate": 5.029941454219728e-06, + "loss": 0.5044, + "step": 5107 + }, + { + "epoch": 1.4165280088740988, + "grad_norm": 0.21614056825637817, + "learning_rate": 5.025514981973728e-06, + "loss": 0.4952, + "step": 5108 + }, + { + "epoch": 1.4168053244592347, + "grad_norm": 0.19919288158416748, + "learning_rate": 5.0210899681977865e-06, + "loss": 0.5098, + "step": 5109 + }, + { + "epoch": 1.4170826400443706, + "grad_norm": 0.20138102769851685, + "learning_rate": 5.01666641375534e-06, + "loss": 0.4817, + "step": 5110 + }, + { + "epoch": 1.4173599556295065, + "grad_norm": 0.20684263110160828, + "learning_rate": 5.0122443195095416e-06, + "loss": 0.4934, + "step": 5111 + }, + { + "epoch": 1.4176372712146423, + "grad_norm": 0.20800793170928955, + "learning_rate": 5.007823686323267e-06, + "loss": 0.4999, + "step": 5112 + }, + { + "epoch": 1.4179145867997782, + "grad_norm": 0.20006629824638367, + "learning_rate": 5.0034045150590905e-06, + "loss": 0.4677, + "step": 5113 + }, + { + "epoch": 1.418191902384914, + "grad_norm": 0.20559728145599365, + "learning_rate": 4.998986806579309e-06, + "loss": 0.473, + "step": 5114 + }, + { + "epoch": 1.41846921797005, + "grad_norm": 0.2054857760667801, + "learning_rate": 4.994570561745936e-06, + "loss": 0.5104, + "step": 5115 + }, + { + "epoch": 1.4187465335551859, + "grad_norm": 0.21354462206363678, + "learning_rate": 4.990155781420704e-06, + "loss": 0.5001, + "step": 5116 + }, + { + "epoch": 1.4190238491403218, + "grad_norm": 0.1943047046661377, + "learning_rate": 4.985742466465047e-06, + "loss": 0.4839, + "step": 5117 + }, + { + "epoch": 1.4193011647254576, + "grad_norm": 0.2034555971622467, + "learning_rate": 4.981330617740118e-06, + "loss": 0.5107, + "step": 5118 + }, + { + "epoch": 1.4195784803105935, + "grad_norm": 0.2111237347126007, + "learning_rate": 4.9769202361067895e-06, + "loss": 0.5202, + "step": 5119 + }, + { + "epoch": 1.4198557958957294, + "grad_norm": 0.19280794262886047, + "learning_rate": 4.972511322425648e-06, + "loss": 0.5044, + "step": 5120 + }, + { + "epoch": 1.4201331114808653, + "grad_norm": 0.198442280292511, + "learning_rate": 4.968103877556979e-06, + "loss": 0.4788, + "step": 5121 + }, + { + "epoch": 1.4204104270660012, + "grad_norm": 0.20548267662525177, + "learning_rate": 4.963697902360798e-06, + "loss": 0.4731, + "step": 5122 + }, + { + "epoch": 1.420687742651137, + "grad_norm": 0.22825707495212555, + "learning_rate": 4.959293397696831e-06, + "loss": 0.4998, + "step": 5123 + }, + { + "epoch": 1.420965058236273, + "grad_norm": 0.20349940657615662, + "learning_rate": 4.954890364424508e-06, + "loss": 0.5058, + "step": 5124 + }, + { + "epoch": 1.4212423738214088, + "grad_norm": 0.20583848655223846, + "learning_rate": 4.950488803402975e-06, + "loss": 0.484, + "step": 5125 + }, + { + "epoch": 1.4215196894065447, + "grad_norm": 0.20237183570861816, + "learning_rate": 4.9460887154910985e-06, + "loss": 0.5071, + "step": 5126 + }, + { + "epoch": 1.4217970049916806, + "grad_norm": 0.20790375769138336, + "learning_rate": 4.941690101547454e-06, + "loss": 0.4937, + "step": 5127 + }, + { + "epoch": 1.4220743205768165, + "grad_norm": 0.22605054080486298, + "learning_rate": 4.9372929624303205e-06, + "loss": 0.4915, + "step": 5128 + }, + { + "epoch": 1.4223516361619524, + "grad_norm": 0.20419445633888245, + "learning_rate": 4.932897298997703e-06, + "loss": 0.4946, + "step": 5129 + }, + { + "epoch": 1.4226289517470883, + "grad_norm": 0.21240122616291046, + "learning_rate": 4.928503112107306e-06, + "loss": 0.5002, + "step": 5130 + }, + { + "epoch": 1.4229062673322241, + "grad_norm": 0.2051856964826584, + "learning_rate": 4.92411040261656e-06, + "loss": 0.4913, + "step": 5131 + }, + { + "epoch": 1.42318358291736, + "grad_norm": 0.20038112998008728, + "learning_rate": 4.919719171382588e-06, + "loss": 0.5047, + "step": 5132 + }, + { + "epoch": 1.423460898502496, + "grad_norm": 0.20497171580791473, + "learning_rate": 4.915329419262242e-06, + "loss": 0.5028, + "step": 5133 + }, + { + "epoch": 1.4237382140876318, + "grad_norm": 0.1987651139497757, + "learning_rate": 4.910941147112083e-06, + "loss": 0.4977, + "step": 5134 + }, + { + "epoch": 1.4240155296727677, + "grad_norm": 0.28679215908050537, + "learning_rate": 4.906554355788369e-06, + "loss": 0.4989, + "step": 5135 + }, + { + "epoch": 1.4242928452579036, + "grad_norm": 0.202947735786438, + "learning_rate": 4.90216904614709e-06, + "loss": 0.4802, + "step": 5136 + }, + { + "epoch": 1.4245701608430394, + "grad_norm": 0.2093580663204193, + "learning_rate": 4.897785219043927e-06, + "loss": 0.4787, + "step": 5137 + }, + { + "epoch": 1.4248474764281753, + "grad_norm": 0.23357978463172913, + "learning_rate": 4.893402875334288e-06, + "loss": 0.4944, + "step": 5138 + }, + { + "epoch": 1.4251247920133112, + "grad_norm": 0.20304684340953827, + "learning_rate": 4.889022015873277e-06, + "loss": 0.4846, + "step": 5139 + }, + { + "epoch": 1.425402107598447, + "grad_norm": 0.20766004920005798, + "learning_rate": 4.884642641515724e-06, + "loss": 0.4871, + "step": 5140 + }, + { + "epoch": 1.425679423183583, + "grad_norm": 0.20664039254188538, + "learning_rate": 4.880264753116153e-06, + "loss": 0.481, + "step": 5141 + }, + { + "epoch": 1.4259567387687189, + "grad_norm": 0.20463590323925018, + "learning_rate": 4.875888351528808e-06, + "loss": 0.5147, + "step": 5142 + }, + { + "epoch": 1.4262340543538548, + "grad_norm": 0.20236745476722717, + "learning_rate": 4.871513437607648e-06, + "loss": 0.4744, + "step": 5143 + }, + { + "epoch": 1.4265113699389906, + "grad_norm": 0.19818803668022156, + "learning_rate": 4.867140012206331e-06, + "loss": 0.5014, + "step": 5144 + }, + { + "epoch": 1.4267886855241265, + "grad_norm": 0.22346088290214539, + "learning_rate": 4.86276807617822e-06, + "loss": 0.6068, + "step": 5145 + }, + { + "epoch": 1.4270660011092624, + "grad_norm": 0.21238963305950165, + "learning_rate": 4.858397630376402e-06, + "loss": 0.4987, + "step": 5146 + }, + { + "epoch": 1.4273433166943983, + "grad_norm": 0.20392395555973053, + "learning_rate": 4.854028675653673e-06, + "loss": 0.4896, + "step": 5147 + }, + { + "epoch": 1.4276206322795342, + "grad_norm": 0.20558500289916992, + "learning_rate": 4.849661212862519e-06, + "loss": 0.485, + "step": 5148 + }, + { + "epoch": 1.42789794786467, + "grad_norm": 0.20195691287517548, + "learning_rate": 4.8452952428551555e-06, + "loss": 0.4898, + "step": 5149 + }, + { + "epoch": 1.428175263449806, + "grad_norm": 0.20537883043289185, + "learning_rate": 4.8409307664835005e-06, + "loss": 0.51, + "step": 5150 + }, + { + "epoch": 1.4284525790349418, + "grad_norm": 0.1957499235868454, + "learning_rate": 4.836567784599177e-06, + "loss": 0.4805, + "step": 5151 + }, + { + "epoch": 1.4287298946200777, + "grad_norm": 0.20477133989334106, + "learning_rate": 4.832206298053514e-06, + "loss": 0.5113, + "step": 5152 + }, + { + "epoch": 1.4290072102052136, + "grad_norm": 0.20341917872428894, + "learning_rate": 4.827846307697556e-06, + "loss": 0.4972, + "step": 5153 + }, + { + "epoch": 1.4292845257903495, + "grad_norm": 0.20808421075344086, + "learning_rate": 4.823487814382058e-06, + "loss": 0.5044, + "step": 5154 + }, + { + "epoch": 1.4295618413754854, + "grad_norm": 0.19239382445812225, + "learning_rate": 4.819130818957472e-06, + "loss": 0.47, + "step": 5155 + }, + { + "epoch": 1.4298391569606212, + "grad_norm": 0.20638294517993927, + "learning_rate": 4.814775322273961e-06, + "loss": 0.5008, + "step": 5156 + }, + { + "epoch": 1.4301164725457571, + "grad_norm": 0.20612825453281403, + "learning_rate": 4.8104213251814e-06, + "loss": 0.4738, + "step": 5157 + }, + { + "epoch": 1.430393788130893, + "grad_norm": 0.20103727281093597, + "learning_rate": 4.806068828529374e-06, + "loss": 0.4854, + "step": 5158 + }, + { + "epoch": 1.430671103716029, + "grad_norm": 0.20140987634658813, + "learning_rate": 4.801717833167162e-06, + "loss": 0.498, + "step": 5159 + }, + { + "epoch": 1.4309484193011648, + "grad_norm": 0.20108608901500702, + "learning_rate": 4.797368339943763e-06, + "loss": 0.4587, + "step": 5160 + }, + { + "epoch": 1.4312257348863007, + "grad_norm": 0.20430995523929596, + "learning_rate": 4.793020349707883e-06, + "loss": 0.502, + "step": 5161 + }, + { + "epoch": 1.4315030504714366, + "grad_norm": 0.1962215155363083, + "learning_rate": 4.7886738633079254e-06, + "loss": 0.4873, + "step": 5162 + }, + { + "epoch": 1.4317803660565724, + "grad_norm": 0.2041066735982895, + "learning_rate": 4.784328881592e-06, + "loss": 0.4807, + "step": 5163 + }, + { + "epoch": 1.4320576816417083, + "grad_norm": 0.2137051373720169, + "learning_rate": 4.779985405407933e-06, + "loss": 0.4924, + "step": 5164 + }, + { + "epoch": 1.4323349972268442, + "grad_norm": 0.20929081737995148, + "learning_rate": 4.775643435603255e-06, + "loss": 0.5052, + "step": 5165 + }, + { + "epoch": 1.43261231281198, + "grad_norm": 0.2020164579153061, + "learning_rate": 4.7713029730251925e-06, + "loss": 0.5179, + "step": 5166 + }, + { + "epoch": 1.432889628397116, + "grad_norm": 0.21634337306022644, + "learning_rate": 4.766964018520691e-06, + "loss": 0.5043, + "step": 5167 + }, + { + "epoch": 1.4331669439822519, + "grad_norm": 0.20789554715156555, + "learning_rate": 4.762626572936389e-06, + "loss": 0.4949, + "step": 5168 + }, + { + "epoch": 1.4334442595673877, + "grad_norm": 0.20186547935009003, + "learning_rate": 4.7582906371186435e-06, + "loss": 0.4865, + "step": 5169 + }, + { + "epoch": 1.4337215751525236, + "grad_norm": 0.20182502269744873, + "learning_rate": 4.753956211913504e-06, + "loss": 0.4684, + "step": 5170 + }, + { + "epoch": 1.4339988907376595, + "grad_norm": 0.20346789062023163, + "learning_rate": 4.749623298166736e-06, + "loss": 0.498, + "step": 5171 + }, + { + "epoch": 1.4342762063227954, + "grad_norm": 0.21296192705631256, + "learning_rate": 4.745291896723808e-06, + "loss": 0.4982, + "step": 5172 + }, + { + "epoch": 1.4345535219079313, + "grad_norm": 0.21046607196331024, + "learning_rate": 4.740962008429885e-06, + "loss": 0.5046, + "step": 5173 + }, + { + "epoch": 1.4348308374930672, + "grad_norm": 0.20590196549892426, + "learning_rate": 4.736633634129849e-06, + "loss": 0.4919, + "step": 5174 + }, + { + "epoch": 1.435108153078203, + "grad_norm": 0.18597187101840973, + "learning_rate": 4.732306774668274e-06, + "loss": 0.492, + "step": 5175 + }, + { + "epoch": 1.435385468663339, + "grad_norm": 0.20251090824604034, + "learning_rate": 4.727981430889452e-06, + "loss": 0.534, + "step": 5176 + }, + { + "epoch": 1.4356627842484748, + "grad_norm": 0.19426211714744568, + "learning_rate": 4.723657603637364e-06, + "loss": 0.4626, + "step": 5177 + }, + { + "epoch": 1.4359400998336107, + "grad_norm": 0.20881786942481995, + "learning_rate": 4.7193352937557125e-06, + "loss": 0.4843, + "step": 5178 + }, + { + "epoch": 1.4362174154187466, + "grad_norm": 0.19851280748844147, + "learning_rate": 4.7150145020878865e-06, + "loss": 0.5019, + "step": 5179 + }, + { + "epoch": 1.4364947310038825, + "grad_norm": 0.2078125774860382, + "learning_rate": 4.71069522947699e-06, + "loss": 0.4671, + "step": 5180 + }, + { + "epoch": 1.4367720465890184, + "grad_norm": 0.20463737845420837, + "learning_rate": 4.706377476765832e-06, + "loss": 0.5038, + "step": 5181 + }, + { + "epoch": 1.4370493621741542, + "grad_norm": 0.20316970348358154, + "learning_rate": 4.702061244796916e-06, + "loss": 0.4726, + "step": 5182 + }, + { + "epoch": 1.4373266777592901, + "grad_norm": 0.1965150386095047, + "learning_rate": 4.69774653441245e-06, + "loss": 0.4753, + "step": 5183 + }, + { + "epoch": 1.437603993344426, + "grad_norm": 0.19861292839050293, + "learning_rate": 4.693433346454352e-06, + "loss": 0.491, + "step": 5184 + }, + { + "epoch": 1.437881308929562, + "grad_norm": 0.2055879682302475, + "learning_rate": 4.689121681764243e-06, + "loss": 0.496, + "step": 5185 + }, + { + "epoch": 1.4381586245146978, + "grad_norm": 0.20754766464233398, + "learning_rate": 4.684811541183436e-06, + "loss": 0.5039, + "step": 5186 + }, + { + "epoch": 1.4384359400998337, + "grad_norm": 0.2045608013868332, + "learning_rate": 4.680502925552956e-06, + "loss": 0.4964, + "step": 5187 + }, + { + "epoch": 1.4387132556849695, + "grad_norm": 0.20918679237365723, + "learning_rate": 4.676195835713533e-06, + "loss": 0.505, + "step": 5188 + }, + { + "epoch": 1.4389905712701054, + "grad_norm": 0.21245136857032776, + "learning_rate": 4.67189027250559e-06, + "loss": 0.4968, + "step": 5189 + }, + { + "epoch": 1.4392678868552413, + "grad_norm": 0.30057552456855774, + "learning_rate": 4.667586236769253e-06, + "loss": 0.4925, + "step": 5190 + }, + { + "epoch": 1.4395452024403772, + "grad_norm": 0.2126583307981491, + "learning_rate": 4.6632837293443575e-06, + "loss": 0.4906, + "step": 5191 + }, + { + "epoch": 1.439822518025513, + "grad_norm": 0.20761823654174805, + "learning_rate": 4.65898275107044e-06, + "loss": 0.507, + "step": 5192 + }, + { + "epoch": 1.440099833610649, + "grad_norm": 0.19843849539756775, + "learning_rate": 4.65468330278673e-06, + "loss": 0.4824, + "step": 5193 + }, + { + "epoch": 1.4403771491957849, + "grad_norm": 0.2050592452287674, + "learning_rate": 4.650385385332163e-06, + "loss": 0.4696, + "step": 5194 + }, + { + "epoch": 1.4406544647809207, + "grad_norm": 0.20630647242069244, + "learning_rate": 4.646088999545378e-06, + "loss": 0.514, + "step": 5195 + }, + { + "epoch": 1.4409317803660566, + "grad_norm": 0.20964011549949646, + "learning_rate": 4.6417941462647206e-06, + "loss": 0.5106, + "step": 5196 + }, + { + "epoch": 1.4412090959511925, + "grad_norm": 0.20389756560325623, + "learning_rate": 4.637500826328223e-06, + "loss": 0.5036, + "step": 5197 + }, + { + "epoch": 1.4414864115363284, + "grad_norm": 0.21767324209213257, + "learning_rate": 4.633209040573619e-06, + "loss": 0.5197, + "step": 5198 + }, + { + "epoch": 1.4417637271214643, + "grad_norm": 0.22654587030410767, + "learning_rate": 4.628918789838367e-06, + "loss": 0.5067, + "step": 5199 + }, + { + "epoch": 1.4420410427066002, + "grad_norm": 0.19964653253555298, + "learning_rate": 4.624630074959599e-06, + "loss": 0.4778, + "step": 5200 + }, + { + "epoch": 1.442318358291736, + "grad_norm": 0.20028294622898102, + "learning_rate": 4.620342896774152e-06, + "loss": 0.4896, + "step": 5201 + }, + { + "epoch": 1.442595673876872, + "grad_norm": 0.20315535366535187, + "learning_rate": 4.616057256118575e-06, + "loss": 0.495, + "step": 5202 + }, + { + "epoch": 1.4428729894620078, + "grad_norm": 0.23156633973121643, + "learning_rate": 4.611773153829111e-06, + "loss": 0.4772, + "step": 5203 + }, + { + "epoch": 1.4431503050471437, + "grad_norm": 0.22991783916950226, + "learning_rate": 4.607490590741702e-06, + "loss": 0.5134, + "step": 5204 + }, + { + "epoch": 1.4434276206322796, + "grad_norm": 0.20309005677700043, + "learning_rate": 4.603209567691979e-06, + "loss": 0.4889, + "step": 5205 + }, + { + "epoch": 1.4437049362174155, + "grad_norm": 0.20435002446174622, + "learning_rate": 4.598930085515293e-06, + "loss": 0.4943, + "step": 5206 + }, + { + "epoch": 1.4439822518025514, + "grad_norm": 0.19982177019119263, + "learning_rate": 4.594652145046688e-06, + "loss": 0.4995, + "step": 5207 + }, + { + "epoch": 1.4442595673876872, + "grad_norm": 0.22092601656913757, + "learning_rate": 4.5903757471208914e-06, + "loss": 0.5108, + "step": 5208 + }, + { + "epoch": 1.4445368829728231, + "grad_norm": 0.2120787650346756, + "learning_rate": 4.586100892572352e-06, + "loss": 0.48, + "step": 5209 + }, + { + "epoch": 1.444814198557959, + "grad_norm": 0.209842249751091, + "learning_rate": 4.5818275822352e-06, + "loss": 0.4888, + "step": 5210 + }, + { + "epoch": 1.445091514143095, + "grad_norm": 0.1947280764579773, + "learning_rate": 4.577555816943279e-06, + "loss": 0.5057, + "step": 5211 + }, + { + "epoch": 1.4453688297282308, + "grad_norm": 0.20627637207508087, + "learning_rate": 4.573285597530114e-06, + "loss": 0.5058, + "step": 5212 + }, + { + "epoch": 1.4456461453133667, + "grad_norm": 0.2139754295349121, + "learning_rate": 4.569016924828945e-06, + "loss": 0.4867, + "step": 5213 + }, + { + "epoch": 1.4459234608985025, + "grad_norm": 0.21095266938209534, + "learning_rate": 4.564749799672705e-06, + "loss": 0.5132, + "step": 5214 + }, + { + "epoch": 1.4462007764836384, + "grad_norm": 0.21572549641132355, + "learning_rate": 4.560484222894014e-06, + "loss": 0.4752, + "step": 5215 + }, + { + "epoch": 1.4464780920687743, + "grad_norm": 0.21467210352420807, + "learning_rate": 4.55622019532521e-06, + "loss": 0.5311, + "step": 5216 + }, + { + "epoch": 1.4467554076539102, + "grad_norm": 0.21415258944034576, + "learning_rate": 4.551957717798308e-06, + "loss": 0.4949, + "step": 5217 + }, + { + "epoch": 1.447032723239046, + "grad_norm": 0.2082109898328781, + "learning_rate": 4.5476967911450345e-06, + "loss": 0.501, + "step": 5218 + }, + { + "epoch": 1.447310038824182, + "grad_norm": 0.22765327990055084, + "learning_rate": 4.543437416196814e-06, + "loss": 0.4966, + "step": 5219 + }, + { + "epoch": 1.4475873544093179, + "grad_norm": 0.20551997423171997, + "learning_rate": 4.539179593784758e-06, + "loss": 0.4974, + "step": 5220 + }, + { + "epoch": 1.4478646699944537, + "grad_norm": 0.20596256852149963, + "learning_rate": 4.534923324739677e-06, + "loss": 0.4995, + "step": 5221 + }, + { + "epoch": 1.4481419855795896, + "grad_norm": 0.1983633041381836, + "learning_rate": 4.530668609892087e-06, + "loss": 0.4959, + "step": 5222 + }, + { + "epoch": 1.4484193011647255, + "grad_norm": 0.19731079041957855, + "learning_rate": 4.526415450072198e-06, + "loss": 0.4974, + "step": 5223 + }, + { + "epoch": 1.4486966167498614, + "grad_norm": 0.21283119916915894, + "learning_rate": 4.522163846109908e-06, + "loss": 0.521, + "step": 5224 + }, + { + "epoch": 1.4489739323349973, + "grad_norm": 0.21046127378940582, + "learning_rate": 4.517913798834818e-06, + "loss": 0.483, + "step": 5225 + }, + { + "epoch": 1.4492512479201332, + "grad_norm": 0.22251440584659576, + "learning_rate": 4.513665309076233e-06, + "loss": 0.4978, + "step": 5226 + }, + { + "epoch": 1.449528563505269, + "grad_norm": 0.20175869762897491, + "learning_rate": 4.50941837766314e-06, + "loss": 0.5356, + "step": 5227 + }, + { + "epoch": 1.449805879090405, + "grad_norm": 0.21276865899562836, + "learning_rate": 4.505173005424224e-06, + "loss": 0.4813, + "step": 5228 + }, + { + "epoch": 1.4500831946755408, + "grad_norm": 0.20545652508735657, + "learning_rate": 4.500929193187872e-06, + "loss": 0.4697, + "step": 5229 + }, + { + "epoch": 1.4503605102606767, + "grad_norm": 0.21193096041679382, + "learning_rate": 4.496686941782172e-06, + "loss": 0.5156, + "step": 5230 + }, + { + "epoch": 1.4506378258458126, + "grad_norm": 0.2051508128643036, + "learning_rate": 4.492446252034893e-06, + "loss": 0.5015, + "step": 5231 + }, + { + "epoch": 1.4509151414309485, + "grad_norm": 0.2122107744216919, + "learning_rate": 4.488207124773501e-06, + "loss": 0.4912, + "step": 5232 + }, + { + "epoch": 1.4511924570160843, + "grad_norm": 0.19480577111244202, + "learning_rate": 4.483969560825169e-06, + "loss": 0.4876, + "step": 5233 + }, + { + "epoch": 1.4514697726012202, + "grad_norm": 0.20153845846652985, + "learning_rate": 4.479733561016759e-06, + "loss": 0.5092, + "step": 5234 + }, + { + "epoch": 1.4517470881863561, + "grad_norm": 0.21945208311080933, + "learning_rate": 4.475499126174826e-06, + "loss": 0.5147, + "step": 5235 + }, + { + "epoch": 1.452024403771492, + "grad_norm": 0.215095654129982, + "learning_rate": 4.471266257125609e-06, + "loss": 0.5222, + "step": 5236 + }, + { + "epoch": 1.4523017193566279, + "grad_norm": 0.20627257227897644, + "learning_rate": 4.467034954695071e-06, + "loss": 0.4668, + "step": 5237 + }, + { + "epoch": 1.4525790349417638, + "grad_norm": 0.2098441869020462, + "learning_rate": 4.462805219708843e-06, + "loss": 0.4668, + "step": 5238 + }, + { + "epoch": 1.4528563505268997, + "grad_norm": 0.22378282248973846, + "learning_rate": 4.4585770529922535e-06, + "loss": 0.4728, + "step": 5239 + }, + { + "epoch": 1.4531336661120355, + "grad_norm": 0.21106509864330292, + "learning_rate": 4.454350455370334e-06, + "loss": 0.5006, + "step": 5240 + }, + { + "epoch": 1.4534109816971714, + "grad_norm": 0.21677586436271667, + "learning_rate": 4.450125427667812e-06, + "loss": 0.4977, + "step": 5241 + }, + { + "epoch": 1.4536882972823073, + "grad_norm": 0.2078382968902588, + "learning_rate": 4.4459019707090956e-06, + "loss": 0.5159, + "step": 5242 + }, + { + "epoch": 1.4539656128674432, + "grad_norm": 0.19706477224826813, + "learning_rate": 4.441680085318289e-06, + "loss": 0.4935, + "step": 5243 + }, + { + "epoch": 1.454242928452579, + "grad_norm": 0.19929809868335724, + "learning_rate": 4.437459772319201e-06, + "loss": 0.4725, + "step": 5244 + }, + { + "epoch": 1.454520244037715, + "grad_norm": 0.19897820055484772, + "learning_rate": 4.4332410325353265e-06, + "loss": 0.4693, + "step": 5245 + }, + { + "epoch": 1.4547975596228508, + "grad_norm": 0.19526724517345428, + "learning_rate": 4.429023866789848e-06, + "loss": 0.4856, + "step": 5246 + }, + { + "epoch": 1.4550748752079867, + "grad_norm": 0.2073434442281723, + "learning_rate": 4.424808275905654e-06, + "loss": 0.4909, + "step": 5247 + }, + { + "epoch": 1.4553521907931226, + "grad_norm": 0.2015712559223175, + "learning_rate": 4.420594260705309e-06, + "loss": 0.4805, + "step": 5248 + }, + { + "epoch": 1.4556295063782585, + "grad_norm": 0.20347121357917786, + "learning_rate": 4.416381822011087e-06, + "loss": 0.5058, + "step": 5249 + }, + { + "epoch": 1.4559068219633944, + "grad_norm": 0.20566286146640778, + "learning_rate": 4.412170960644939e-06, + "loss": 0.4688, + "step": 5250 + }, + { + "epoch": 1.4561841375485303, + "grad_norm": 0.2034814953804016, + "learning_rate": 4.407961677428521e-06, + "loss": 0.5307, + "step": 5251 + }, + { + "epoch": 1.4564614531336662, + "grad_norm": 0.21038807928562164, + "learning_rate": 4.403753973183177e-06, + "loss": 0.5287, + "step": 5252 + }, + { + "epoch": 1.456738768718802, + "grad_norm": 0.2552982270717621, + "learning_rate": 4.399547848729935e-06, + "loss": 0.4838, + "step": 5253 + }, + { + "epoch": 1.457016084303938, + "grad_norm": 0.194267138838768, + "learning_rate": 4.395343304889529e-06, + "loss": 0.4981, + "step": 5254 + }, + { + "epoch": 1.4572933998890738, + "grad_norm": 0.20130032300949097, + "learning_rate": 4.391140342482369e-06, + "loss": 0.4863, + "step": 5255 + }, + { + "epoch": 1.4575707154742097, + "grad_norm": 0.21114356815814972, + "learning_rate": 4.3869389623285725e-06, + "loss": 0.511, + "step": 5256 + }, + { + "epoch": 1.4578480310593456, + "grad_norm": 0.19720643758773804, + "learning_rate": 4.382739165247933e-06, + "loss": 0.4973, + "step": 5257 + }, + { + "epoch": 1.4581253466444815, + "grad_norm": 0.1979367434978485, + "learning_rate": 4.378540952059948e-06, + "loss": 0.5119, + "step": 5258 + }, + { + "epoch": 1.4584026622296173, + "grad_norm": 0.1983027458190918, + "learning_rate": 4.374344323583793e-06, + "loss": 0.5073, + "step": 5259 + }, + { + "epoch": 1.4586799778147532, + "grad_norm": 0.20961186289787292, + "learning_rate": 4.370149280638347e-06, + "loss": 0.5181, + "step": 5260 + }, + { + "epoch": 1.458957293399889, + "grad_norm": 0.1928398460149765, + "learning_rate": 4.3659558240421755e-06, + "loss": 0.504, + "step": 5261 + }, + { + "epoch": 1.459234608985025, + "grad_norm": 0.20917226374149323, + "learning_rate": 4.361763954613526e-06, + "loss": 0.5018, + "step": 5262 + }, + { + "epoch": 1.4595119245701609, + "grad_norm": 0.19321578741073608, + "learning_rate": 4.357573673170352e-06, + "loss": 0.4699, + "step": 5263 + }, + { + "epoch": 1.4597892401552968, + "grad_norm": 0.19718189537525177, + "learning_rate": 4.35338498053028e-06, + "loss": 0.4754, + "step": 5264 + }, + { + "epoch": 1.4600665557404326, + "grad_norm": 0.20244868099689484, + "learning_rate": 4.349197877510643e-06, + "loss": 0.4869, + "step": 5265 + }, + { + "epoch": 1.4603438713255685, + "grad_norm": 0.20539158582687378, + "learning_rate": 4.345012364928447e-06, + "loss": 0.5209, + "step": 5266 + }, + { + "epoch": 1.4606211869107044, + "grad_norm": 0.19396114349365234, + "learning_rate": 4.340828443600401e-06, + "loss": 0.4901, + "step": 5267 + }, + { + "epoch": 1.4608985024958403, + "grad_norm": 0.21171221137046814, + "learning_rate": 4.336646114342903e-06, + "loss": 0.49, + "step": 5268 + }, + { + "epoch": 1.4611758180809762, + "grad_norm": 0.20075739920139313, + "learning_rate": 4.332465377972031e-06, + "loss": 0.5127, + "step": 5269 + }, + { + "epoch": 1.461453133666112, + "grad_norm": 0.2100810706615448, + "learning_rate": 4.328286235303555e-06, + "loss": 0.506, + "step": 5270 + }, + { + "epoch": 1.461730449251248, + "grad_norm": 0.224534273147583, + "learning_rate": 4.324108687152941e-06, + "loss": 0.5033, + "step": 5271 + }, + { + "epoch": 1.4620077648363838, + "grad_norm": 0.20113049447536469, + "learning_rate": 4.3199327343353415e-06, + "loss": 0.4867, + "step": 5272 + }, + { + "epoch": 1.4622850804215197, + "grad_norm": 0.20711010694503784, + "learning_rate": 4.315758377665592e-06, + "loss": 0.4818, + "step": 5273 + }, + { + "epoch": 1.4625623960066556, + "grad_norm": 0.20650602877140045, + "learning_rate": 4.311585617958214e-06, + "loss": 0.4794, + "step": 5274 + }, + { + "epoch": 1.4628397115917915, + "grad_norm": 0.19701525568962097, + "learning_rate": 4.307414456027437e-06, + "loss": 0.5037, + "step": 5275 + }, + { + "epoch": 1.4631170271769274, + "grad_norm": 0.2053639143705368, + "learning_rate": 4.303244892687157e-06, + "loss": 0.4924, + "step": 5276 + }, + { + "epoch": 1.4633943427620633, + "grad_norm": 0.20747579634189606, + "learning_rate": 4.299076928750964e-06, + "loss": 0.4847, + "step": 5277 + }, + { + "epoch": 1.4636716583471991, + "grad_norm": 0.19542749226093292, + "learning_rate": 4.294910565032143e-06, + "loss": 0.4799, + "step": 5278 + }, + { + "epoch": 1.463948973932335, + "grad_norm": 0.20481598377227783, + "learning_rate": 4.290745802343663e-06, + "loss": 0.4998, + "step": 5279 + }, + { + "epoch": 1.464226289517471, + "grad_norm": 0.20202511548995972, + "learning_rate": 4.286582641498177e-06, + "loss": 0.5196, + "step": 5280 + }, + { + "epoch": 1.4645036051026068, + "grad_norm": 0.20724746584892273, + "learning_rate": 4.282421083308024e-06, + "loss": 0.4854, + "step": 5281 + }, + { + "epoch": 1.4647809206877427, + "grad_norm": 0.22080543637275696, + "learning_rate": 4.2782611285852386e-06, + "loss": 0.5355, + "step": 5282 + }, + { + "epoch": 1.4650582362728786, + "grad_norm": 0.20695441961288452, + "learning_rate": 4.274102778141542e-06, + "loss": 0.4853, + "step": 5283 + }, + { + "epoch": 1.4653355518580145, + "grad_norm": 0.20421764254570007, + "learning_rate": 4.26994603278833e-06, + "loss": 0.4865, + "step": 5284 + }, + { + "epoch": 1.4656128674431503, + "grad_norm": 0.19895236194133759, + "learning_rate": 4.265790893336702e-06, + "loss": 0.5041, + "step": 5285 + }, + { + "epoch": 1.4658901830282862, + "grad_norm": 0.20470494031906128, + "learning_rate": 4.261637360597428e-06, + "loss": 0.4982, + "step": 5286 + }, + { + "epoch": 1.466167498613422, + "grad_norm": 0.20590339601039886, + "learning_rate": 4.257485435380981e-06, + "loss": 0.4576, + "step": 5287 + }, + { + "epoch": 1.466444814198558, + "grad_norm": 0.20538951456546783, + "learning_rate": 4.253335118497503e-06, + "loss": 0.513, + "step": 5288 + }, + { + "epoch": 1.4667221297836939, + "grad_norm": 0.19989731907844543, + "learning_rate": 4.249186410756834e-06, + "loss": 0.4728, + "step": 5289 + }, + { + "epoch": 1.4669994453688298, + "grad_norm": 0.19829894602298737, + "learning_rate": 4.245039312968502e-06, + "loss": 0.5188, + "step": 5290 + }, + { + "epoch": 1.4672767609539656, + "grad_norm": 0.2066100686788559, + "learning_rate": 4.240893825941707e-06, + "loss": 0.5106, + "step": 5291 + }, + { + "epoch": 1.4675540765391015, + "grad_norm": 0.2136521190404892, + "learning_rate": 4.236749950485351e-06, + "loss": 0.4995, + "step": 5292 + }, + { + "epoch": 1.4678313921242374, + "grad_norm": 0.2011151909828186, + "learning_rate": 4.232607687408007e-06, + "loss": 0.4716, + "step": 5293 + }, + { + "epoch": 1.4681087077093733, + "grad_norm": 0.20019252598285675, + "learning_rate": 4.228467037517945e-06, + "loss": 0.5025, + "step": 5294 + }, + { + "epoch": 1.4683860232945092, + "grad_norm": 0.21552008390426636, + "learning_rate": 4.224328001623114e-06, + "loss": 0.4994, + "step": 5295 + }, + { + "epoch": 1.468663338879645, + "grad_norm": 0.2044316530227661, + "learning_rate": 4.2201905805311515e-06, + "loss": 0.4901, + "step": 5296 + }, + { + "epoch": 1.468940654464781, + "grad_norm": 0.22182868421077728, + "learning_rate": 4.216054775049372e-06, + "loss": 0.533, + "step": 5297 + }, + { + "epoch": 1.4692179700499168, + "grad_norm": 0.20859482884407043, + "learning_rate": 4.211920585984786e-06, + "loss": 0.5134, + "step": 5298 + }, + { + "epoch": 1.4694952856350527, + "grad_norm": 0.2023824155330658, + "learning_rate": 4.207788014144084e-06, + "loss": 0.5222, + "step": 5299 + }, + { + "epoch": 1.4697726012201886, + "grad_norm": 0.23869916796684265, + "learning_rate": 4.203657060333641e-06, + "loss": 0.5025, + "step": 5300 + }, + { + "epoch": 1.4700499168053245, + "grad_norm": 0.23270224034786224, + "learning_rate": 4.199527725359508e-06, + "loss": 0.5166, + "step": 5301 + }, + { + "epoch": 1.4703272323904604, + "grad_norm": 0.22280660271644592, + "learning_rate": 4.195400010027432e-06, + "loss": 0.4891, + "step": 5302 + }, + { + "epoch": 1.4706045479755963, + "grad_norm": 0.21119210124015808, + "learning_rate": 4.191273915142846e-06, + "loss": 0.5021, + "step": 5303 + }, + { + "epoch": 1.4708818635607321, + "grad_norm": 0.21453891694545746, + "learning_rate": 4.187149441510848e-06, + "loss": 0.52, + "step": 5304 + }, + { + "epoch": 1.471159179145868, + "grad_norm": 0.20360562205314636, + "learning_rate": 4.183026589936241e-06, + "loss": 0.5032, + "step": 5305 + }, + { + "epoch": 1.471436494731004, + "grad_norm": 0.20575298368930817, + "learning_rate": 4.178905361223505e-06, + "loss": 0.4982, + "step": 5306 + }, + { + "epoch": 1.4717138103161398, + "grad_norm": 0.20205079019069672, + "learning_rate": 4.174785756176794e-06, + "loss": 0.5021, + "step": 5307 + }, + { + "epoch": 1.4719911259012757, + "grad_norm": 0.22563353180885315, + "learning_rate": 4.170667775599951e-06, + "loss": 0.4909, + "step": 5308 + }, + { + "epoch": 1.4722684414864116, + "grad_norm": 0.19961145520210266, + "learning_rate": 4.166551420296508e-06, + "loss": 0.5033, + "step": 5309 + }, + { + "epoch": 1.4725457570715474, + "grad_norm": 0.20603464543819427, + "learning_rate": 4.162436691069676e-06, + "loss": 0.4849, + "step": 5310 + }, + { + "epoch": 1.4728230726566833, + "grad_norm": 0.20677094161510468, + "learning_rate": 4.1583235887223455e-06, + "loss": 0.481, + "step": 5311 + }, + { + "epoch": 1.4731003882418192, + "grad_norm": 0.20227313041687012, + "learning_rate": 4.154212114057082e-06, + "loss": 0.5013, + "step": 5312 + }, + { + "epoch": 1.473377703826955, + "grad_norm": 0.2114361971616745, + "learning_rate": 4.150102267876163e-06, + "loss": 0.4746, + "step": 5313 + }, + { + "epoch": 1.473655019412091, + "grad_norm": 0.20628516376018524, + "learning_rate": 4.145994050981516e-06, + "loss": 0.5033, + "step": 5314 + }, + { + "epoch": 1.4739323349972269, + "grad_norm": 0.1993936002254486, + "learning_rate": 4.141887464174763e-06, + "loss": 0.4915, + "step": 5315 + }, + { + "epoch": 1.4742096505823628, + "grad_norm": 0.200067400932312, + "learning_rate": 4.137782508257207e-06, + "loss": 0.4849, + "step": 5316 + }, + { + "epoch": 1.4744869661674986, + "grad_norm": 0.20835205912590027, + "learning_rate": 4.1336791840298425e-06, + "loss": 0.5053, + "step": 5317 + }, + { + "epoch": 1.4747642817526345, + "grad_norm": 0.207503080368042, + "learning_rate": 4.1295774922933295e-06, + "loss": 0.4963, + "step": 5318 + }, + { + "epoch": 1.4750415973377704, + "grad_norm": 0.20725563168525696, + "learning_rate": 4.125477433848014e-06, + "loss": 0.5063, + "step": 5319 + }, + { + "epoch": 1.4753189129229063, + "grad_norm": 0.20372618734836578, + "learning_rate": 4.121379009493931e-06, + "loss": 0.4971, + "step": 5320 + }, + { + "epoch": 1.4755962285080422, + "grad_norm": 0.20692381262779236, + "learning_rate": 4.117282220030794e-06, + "loss": 0.5218, + "step": 5321 + }, + { + "epoch": 1.475873544093178, + "grad_norm": 0.19848695397377014, + "learning_rate": 4.113187066257991e-06, + "loss": 0.5201, + "step": 5322 + }, + { + "epoch": 1.476150859678314, + "grad_norm": 0.20803263783454895, + "learning_rate": 4.109093548974592e-06, + "loss": 0.5094, + "step": 5323 + }, + { + "epoch": 1.4764281752634498, + "grad_norm": 0.20934459567070007, + "learning_rate": 4.105001668979355e-06, + "loss": 0.4812, + "step": 5324 + }, + { + "epoch": 1.4767054908485857, + "grad_norm": 0.22413517534732819, + "learning_rate": 4.100911427070718e-06, + "loss": 0.5026, + "step": 5325 + }, + { + "epoch": 1.4769828064337216, + "grad_norm": 0.19864612817764282, + "learning_rate": 4.096822824046787e-06, + "loss": 0.5218, + "step": 5326 + }, + { + "epoch": 1.4772601220188575, + "grad_norm": 0.20662501454353333, + "learning_rate": 4.09273586070536e-06, + "loss": 0.5018, + "step": 5327 + }, + { + "epoch": 1.4775374376039934, + "grad_norm": 0.20286211371421814, + "learning_rate": 4.088650537843919e-06, + "loss": 0.4921, + "step": 5328 + }, + { + "epoch": 1.4778147531891292, + "grad_norm": 0.21929144859313965, + "learning_rate": 4.084566856259611e-06, + "loss": 0.483, + "step": 5329 + }, + { + "epoch": 1.4780920687742651, + "grad_norm": 0.20432603359222412, + "learning_rate": 4.080484816749268e-06, + "loss": 0.4813, + "step": 5330 + }, + { + "epoch": 1.478369384359401, + "grad_norm": 0.19982703030109406, + "learning_rate": 4.076404420109409e-06, + "loss": 0.4792, + "step": 5331 + }, + { + "epoch": 1.478646699944537, + "grad_norm": 0.20570825040340424, + "learning_rate": 4.072325667136228e-06, + "loss": 0.4854, + "step": 5332 + }, + { + "epoch": 1.4789240155296728, + "grad_norm": 0.2098425328731537, + "learning_rate": 4.068248558625595e-06, + "loss": 0.484, + "step": 5333 + }, + { + "epoch": 1.4792013311148087, + "grad_norm": 0.20449112355709076, + "learning_rate": 4.064173095373067e-06, + "loss": 0.5152, + "step": 5334 + }, + { + "epoch": 1.4794786466999446, + "grad_norm": 0.2116931974887848, + "learning_rate": 4.060099278173867e-06, + "loss": 0.4783, + "step": 5335 + }, + { + "epoch": 1.4797559622850804, + "grad_norm": 0.1945703774690628, + "learning_rate": 4.056027107822911e-06, + "loss": 0.4861, + "step": 5336 + }, + { + "epoch": 1.4800332778702163, + "grad_norm": 0.20104999840259552, + "learning_rate": 4.051956585114783e-06, + "loss": 0.4833, + "step": 5337 + }, + { + "epoch": 1.4803105934553522, + "grad_norm": 0.2020789533853531, + "learning_rate": 4.047887710843756e-06, + "loss": 0.4986, + "step": 5338 + }, + { + "epoch": 1.480587909040488, + "grad_norm": 0.2103864550590515, + "learning_rate": 4.043820485803766e-06, + "loss": 0.4997, + "step": 5339 + }, + { + "epoch": 1.480865224625624, + "grad_norm": 0.21654918789863586, + "learning_rate": 4.039754910788442e-06, + "loss": 0.4647, + "step": 5340 + }, + { + "epoch": 1.4811425402107599, + "grad_norm": 0.2058803290128708, + "learning_rate": 4.0356909865910895e-06, + "loss": 0.5266, + "step": 5341 + }, + { + "epoch": 1.4814198557958957, + "grad_norm": 0.20043110847473145, + "learning_rate": 4.031628714004678e-06, + "loss": 0.514, + "step": 5342 + }, + { + "epoch": 1.4816971713810316, + "grad_norm": 0.2056841254234314, + "learning_rate": 4.0275680938218705e-06, + "loss": 0.5026, + "step": 5343 + }, + { + "epoch": 1.4819744869661675, + "grad_norm": 0.2606765627861023, + "learning_rate": 4.023509126835004e-06, + "loss": 0.4976, + "step": 5344 + }, + { + "epoch": 1.4822518025513034, + "grad_norm": 0.20417386293411255, + "learning_rate": 4.019451813836088e-06, + "loss": 0.5086, + "step": 5345 + }, + { + "epoch": 1.4825291181364393, + "grad_norm": 0.21264471113681793, + "learning_rate": 4.015396155616806e-06, + "loss": 0.4835, + "step": 5346 + }, + { + "epoch": 1.4828064337215752, + "grad_norm": 0.19947127997875214, + "learning_rate": 4.011342152968531e-06, + "loss": 0.4885, + "step": 5347 + }, + { + "epoch": 1.483083749306711, + "grad_norm": 0.20617523789405823, + "learning_rate": 4.007289806682307e-06, + "loss": 0.5123, + "step": 5348 + }, + { + "epoch": 1.483361064891847, + "grad_norm": 0.21049796044826508, + "learning_rate": 4.003239117548853e-06, + "loss": 0.5122, + "step": 5349 + }, + { + "epoch": 1.4836383804769828, + "grad_norm": 0.22191189229488373, + "learning_rate": 3.999190086358556e-06, + "loss": 0.5074, + "step": 5350 + }, + { + "epoch": 1.4839156960621187, + "grad_norm": 0.20709733664989471, + "learning_rate": 3.995142713901506e-06, + "loss": 0.507, + "step": 5351 + }, + { + "epoch": 1.4841930116472546, + "grad_norm": 0.21513330936431885, + "learning_rate": 3.9910970009674445e-06, + "loss": 0.5193, + "step": 5352 + }, + { + "epoch": 1.4844703272323905, + "grad_norm": 0.2153635323047638, + "learning_rate": 3.987052948345797e-06, + "loss": 0.4891, + "step": 5353 + }, + { + "epoch": 1.4847476428175264, + "grad_norm": 0.21485386788845062, + "learning_rate": 3.983010556825658e-06, + "loss": 0.5012, + "step": 5354 + }, + { + "epoch": 1.4850249584026622, + "grad_norm": 0.20851318538188934, + "learning_rate": 3.978969827195821e-06, + "loss": 0.4958, + "step": 5355 + }, + { + "epoch": 1.4853022739877981, + "grad_norm": 0.2146236151456833, + "learning_rate": 3.97493076024473e-06, + "loss": 0.4802, + "step": 5356 + }, + { + "epoch": 1.485579589572934, + "grad_norm": 0.22128230333328247, + "learning_rate": 3.970893356760512e-06, + "loss": 0.52, + "step": 5357 + }, + { + "epoch": 1.48585690515807, + "grad_norm": 0.20890596508979797, + "learning_rate": 3.966857617530974e-06, + "loss": 0.4722, + "step": 5358 + }, + { + "epoch": 1.4861342207432058, + "grad_norm": 0.20855508744716644, + "learning_rate": 3.9628235433436e-06, + "loss": 0.4897, + "step": 5359 + }, + { + "epoch": 1.4864115363283417, + "grad_norm": 0.21735462546348572, + "learning_rate": 3.958791134985541e-06, + "loss": 0.5027, + "step": 5360 + }, + { + "epoch": 1.4866888519134775, + "grad_norm": 0.20674702525138855, + "learning_rate": 3.954760393243623e-06, + "loss": 0.4892, + "step": 5361 + }, + { + "epoch": 1.4869661674986134, + "grad_norm": 0.20703266561031342, + "learning_rate": 3.950731318904355e-06, + "loss": 0.5145, + "step": 5362 + }, + { + "epoch": 1.4872434830837493, + "grad_norm": 0.20337079465389252, + "learning_rate": 3.946703912753917e-06, + "loss": 0.5017, + "step": 5363 + }, + { + "epoch": 1.4875207986688852, + "grad_norm": 0.2117006629705429, + "learning_rate": 3.942678175578159e-06, + "loss": 0.5023, + "step": 5364 + }, + { + "epoch": 1.487798114254021, + "grad_norm": 0.20295751094818115, + "learning_rate": 3.938654108162611e-06, + "loss": 0.4966, + "step": 5365 + }, + { + "epoch": 1.488075429839157, + "grad_norm": 0.20612047612667084, + "learning_rate": 3.934631711292477e-06, + "loss": 0.4974, + "step": 5366 + }, + { + "epoch": 1.4883527454242929, + "grad_norm": 0.1999398022890091, + "learning_rate": 3.930610985752633e-06, + "loss": 0.5003, + "step": 5367 + }, + { + "epoch": 1.4886300610094287, + "grad_norm": 0.20393617451190948, + "learning_rate": 3.926591932327622e-06, + "loss": 0.499, + "step": 5368 + }, + { + "epoch": 1.4889073765945646, + "grad_norm": 0.20448175072669983, + "learning_rate": 3.922574551801675e-06, + "loss": 0.4882, + "step": 5369 + }, + { + "epoch": 1.4891846921797005, + "grad_norm": 0.20072409510612488, + "learning_rate": 3.918558844958691e-06, + "loss": 0.4902, + "step": 5370 + }, + { + "epoch": 1.4894620077648364, + "grad_norm": 0.2017168551683426, + "learning_rate": 3.9145448125822325e-06, + "loss": 0.4926, + "step": 5371 + }, + { + "epoch": 1.4897393233499723, + "grad_norm": 0.20040208101272583, + "learning_rate": 3.910532455455553e-06, + "loss": 0.5075, + "step": 5372 + }, + { + "epoch": 1.4900166389351082, + "grad_norm": 0.19632667303085327, + "learning_rate": 3.9065217743615605e-06, + "loss": 0.4889, + "step": 5373 + }, + { + "epoch": 1.490293954520244, + "grad_norm": 0.21401472389698029, + "learning_rate": 3.902512770082854e-06, + "loss": 0.4837, + "step": 5374 + }, + { + "epoch": 1.49057127010538, + "grad_norm": 0.21348023414611816, + "learning_rate": 3.898505443401689e-06, + "loss": 0.5165, + "step": 5375 + }, + { + "epoch": 1.4908485856905158, + "grad_norm": 0.20388244092464447, + "learning_rate": 3.8944997951000066e-06, + "loss": 0.5116, + "step": 5376 + }, + { + "epoch": 1.4911259012756517, + "grad_norm": 0.20777450501918793, + "learning_rate": 3.89049582595941e-06, + "loss": 0.5097, + "step": 5377 + }, + { + "epoch": 1.4914032168607876, + "grad_norm": 0.20261326432228088, + "learning_rate": 3.886493536761182e-06, + "loss": 0.5027, + "step": 5378 + }, + { + "epoch": 1.4916805324459235, + "grad_norm": 0.20818550884723663, + "learning_rate": 3.882492928286279e-06, + "loss": 0.4847, + "step": 5379 + }, + { + "epoch": 1.4919578480310594, + "grad_norm": 0.1958109438419342, + "learning_rate": 3.878494001315319e-06, + "loss": 0.4782, + "step": 5380 + }, + { + "epoch": 1.4922351636161952, + "grad_norm": 0.20695748925209045, + "learning_rate": 3.874496756628606e-06, + "loss": 0.4889, + "step": 5381 + }, + { + "epoch": 1.4925124792013311, + "grad_norm": 0.20474323630332947, + "learning_rate": 3.8705011950060985e-06, + "loss": 0.4796, + "step": 5382 + }, + { + "epoch": 1.492789794786467, + "grad_norm": 0.20276211202144623, + "learning_rate": 3.8665073172274465e-06, + "loss": 0.4914, + "step": 5383 + }, + { + "epoch": 1.493067110371603, + "grad_norm": 0.20544062554836273, + "learning_rate": 3.862515124071954e-06, + "loss": 0.5119, + "step": 5384 + }, + { + "epoch": 1.4933444259567388, + "grad_norm": 0.19975237548351288, + "learning_rate": 3.858524616318607e-06, + "loss": 0.4804, + "step": 5385 + }, + { + "epoch": 1.4936217415418747, + "grad_norm": 0.20508365333080292, + "learning_rate": 3.8545357947460625e-06, + "loss": 0.4953, + "step": 5386 + }, + { + "epoch": 1.4938990571270105, + "grad_norm": 0.20914685726165771, + "learning_rate": 3.850548660132641e-06, + "loss": 0.4684, + "step": 5387 + }, + { + "epoch": 1.4941763727121464, + "grad_norm": 0.23906511068344116, + "learning_rate": 3.846563213256335e-06, + "loss": 0.5059, + "step": 5388 + }, + { + "epoch": 1.4944536882972823, + "grad_norm": 0.240016907453537, + "learning_rate": 3.842579454894815e-06, + "loss": 0.5089, + "step": 5389 + }, + { + "epoch": 1.4947310038824182, + "grad_norm": 0.2028864324092865, + "learning_rate": 3.838597385825421e-06, + "loss": 0.4835, + "step": 5390 + }, + { + "epoch": 1.495008319467554, + "grad_norm": 0.21252113580703735, + "learning_rate": 3.834617006825157e-06, + "loss": 0.4773, + "step": 5391 + }, + { + "epoch": 1.49528563505269, + "grad_norm": 0.20465324819087982, + "learning_rate": 3.830638318670691e-06, + "loss": 0.4668, + "step": 5392 + }, + { + "epoch": 1.4955629506378258, + "grad_norm": 0.20127691328525543, + "learning_rate": 3.826661322138389e-06, + "loss": 0.5119, + "step": 5393 + }, + { + "epoch": 1.4958402662229617, + "grad_norm": 0.2034253180027008, + "learning_rate": 3.822686018004258e-06, + "loss": 0.5025, + "step": 5394 + }, + { + "epoch": 1.4961175818080976, + "grad_norm": 0.2095792442560196, + "learning_rate": 3.8187124070439815e-06, + "loss": 0.4719, + "step": 5395 + }, + { + "epoch": 1.4963948973932335, + "grad_norm": 0.21357692778110504, + "learning_rate": 3.814740490032921e-06, + "loss": 0.5078, + "step": 5396 + }, + { + "epoch": 1.4966722129783694, + "grad_norm": 0.2135164737701416, + "learning_rate": 3.8107702677461067e-06, + "loss": 0.4999, + "step": 5397 + }, + { + "epoch": 1.4969495285635053, + "grad_norm": 0.21254348754882812, + "learning_rate": 3.8068017409582294e-06, + "loss": 0.4971, + "step": 5398 + }, + { + "epoch": 1.4972268441486412, + "grad_norm": 0.2134813517332077, + "learning_rate": 3.802834910443652e-06, + "loss": 0.5112, + "step": 5399 + }, + { + "epoch": 1.497504159733777, + "grad_norm": 0.20528295636177063, + "learning_rate": 3.798869776976409e-06, + "loss": 0.4773, + "step": 5400 + }, + { + "epoch": 1.497781475318913, + "grad_norm": 0.20741385221481323, + "learning_rate": 3.7949063413302093e-06, + "loss": 0.5007, + "step": 5401 + }, + { + "epoch": 1.4980587909040488, + "grad_norm": 0.2020622044801712, + "learning_rate": 3.7909446042784193e-06, + "loss": 0.4935, + "step": 5402 + }, + { + "epoch": 1.4983361064891847, + "grad_norm": 0.20889051258563995, + "learning_rate": 3.786984566594071e-06, + "loss": 0.471, + "step": 5403 + }, + { + "epoch": 1.4986134220743206, + "grad_norm": 0.19613035023212433, + "learning_rate": 3.7830262290498896e-06, + "loss": 0.5077, + "step": 5404 + }, + { + "epoch": 1.4988907376594565, + "grad_norm": 0.2048519104719162, + "learning_rate": 3.7790695924182413e-06, + "loss": 0.4703, + "step": 5405 + }, + { + "epoch": 1.4991680532445923, + "grad_norm": 0.22040173411369324, + "learning_rate": 3.775114657471168e-06, + "loss": 0.4972, + "step": 5406 + }, + { + "epoch": 1.4994453688297282, + "grad_norm": 0.2038019597530365, + "learning_rate": 3.771161424980388e-06, + "loss": 0.5195, + "step": 5407 + }, + { + "epoch": 1.4997226844148641, + "grad_norm": 0.20490224659442902, + "learning_rate": 3.7672098957172846e-06, + "loss": 0.4884, + "step": 5408 + }, + { + "epoch": 1.5, + "grad_norm": 0.20087428390979767, + "learning_rate": 3.763260070452895e-06, + "loss": 0.4781, + "step": 5409 + }, + { + "epoch": 1.5002773155851359, + "grad_norm": 0.21157024800777435, + "learning_rate": 3.759311949957947e-06, + "loss": 0.5003, + "step": 5410 + }, + { + "epoch": 1.5005546311702718, + "grad_norm": 0.23590406775474548, + "learning_rate": 3.755365535002814e-06, + "loss": 0.4798, + "step": 5411 + }, + { + "epoch": 1.5008319467554077, + "grad_norm": 0.21228978037834167, + "learning_rate": 3.751420826357553e-06, + "loss": 0.5178, + "step": 5412 + }, + { + "epoch": 1.5011092623405435, + "grad_norm": 0.21098542213439941, + "learning_rate": 3.7474778247918743e-06, + "loss": 0.4764, + "step": 5413 + }, + { + "epoch": 1.5013865779256794, + "grad_norm": 0.20355211198329926, + "learning_rate": 3.743536531075169e-06, + "loss": 0.4938, + "step": 5414 + }, + { + "epoch": 1.5016638935108153, + "grad_norm": 0.20924827456474304, + "learning_rate": 3.739596945976481e-06, + "loss": 0.5048, + "step": 5415 + }, + { + "epoch": 1.5019412090959512, + "grad_norm": 0.20819483697414398, + "learning_rate": 3.7356590702645327e-06, + "loss": 0.4889, + "step": 5416 + }, + { + "epoch": 1.502218524681087, + "grad_norm": 0.20322836935520172, + "learning_rate": 3.7317229047077086e-06, + "loss": 0.4886, + "step": 5417 + }, + { + "epoch": 1.502495840266223, + "grad_norm": 0.20074166357517242, + "learning_rate": 3.7277884500740543e-06, + "loss": 0.5019, + "step": 5418 + }, + { + "epoch": 1.5027731558513588, + "grad_norm": 0.21029093861579895, + "learning_rate": 3.723855707131292e-06, + "loss": 0.4973, + "step": 5419 + }, + { + "epoch": 1.5030504714364947, + "grad_norm": 0.20609499514102936, + "learning_rate": 3.7199246766467964e-06, + "loss": 0.5115, + "step": 5420 + }, + { + "epoch": 1.5033277870216306, + "grad_norm": 0.21178114414215088, + "learning_rate": 3.715995359387625e-06, + "loss": 0.5378, + "step": 5421 + }, + { + "epoch": 1.5036051026067665, + "grad_norm": 0.2018701434135437, + "learning_rate": 3.7120677561204823e-06, + "loss": 0.4945, + "step": 5422 + }, + { + "epoch": 1.5038824181919024, + "grad_norm": 0.2033861130475998, + "learning_rate": 3.708141867611753e-06, + "loss": 0.4795, + "step": 5423 + }, + { + "epoch": 1.5041597337770383, + "grad_norm": 0.20554760098457336, + "learning_rate": 3.7042176946274846e-06, + "loss": 0.4864, + "step": 5424 + }, + { + "epoch": 1.5044370493621742, + "grad_norm": 0.20681151747703552, + "learning_rate": 3.7002952379333837e-06, + "loss": 0.4868, + "step": 5425 + }, + { + "epoch": 1.50471436494731, + "grad_norm": 0.1939842700958252, + "learning_rate": 3.6963744982948216e-06, + "loss": 0.4813, + "step": 5426 + }, + { + "epoch": 1.504991680532446, + "grad_norm": 0.20737822353839874, + "learning_rate": 3.6924554764768428e-06, + "loss": 0.5158, + "step": 5427 + }, + { + "epoch": 1.5052689961175818, + "grad_norm": 0.20026959478855133, + "learning_rate": 3.688538173244156e-06, + "loss": 0.493, + "step": 5428 + }, + { + "epoch": 1.5055463117027177, + "grad_norm": 0.20325352251529694, + "learning_rate": 3.6846225893611265e-06, + "loss": 0.5011, + "step": 5429 + }, + { + "epoch": 1.5058236272878536, + "grad_norm": 0.20845289528369904, + "learning_rate": 3.680708725591782e-06, + "loss": 0.4736, + "step": 5430 + }, + { + "epoch": 1.5061009428729895, + "grad_norm": 0.20399589836597443, + "learning_rate": 3.6767965826998345e-06, + "loss": 0.5033, + "step": 5431 + }, + { + "epoch": 1.5063782584581253, + "grad_norm": 0.19747604429721832, + "learning_rate": 3.672886161448641e-06, + "loss": 0.4793, + "step": 5432 + }, + { + "epoch": 1.5066555740432612, + "grad_norm": 0.2051791548728943, + "learning_rate": 3.6689774626012224e-06, + "loss": 0.5145, + "step": 5433 + }, + { + "epoch": 1.506932889628397, + "grad_norm": 0.1973484754562378, + "learning_rate": 3.665070486920276e-06, + "loss": 0.4764, + "step": 5434 + }, + { + "epoch": 1.507210205213533, + "grad_norm": 0.195805624127388, + "learning_rate": 3.6611652351681568e-06, + "loss": 0.4818, + "step": 5435 + }, + { + "epoch": 1.5074875207986689, + "grad_norm": 0.20444104075431824, + "learning_rate": 3.657261708106882e-06, + "loss": 0.4878, + "step": 5436 + }, + { + "epoch": 1.5077648363838048, + "grad_norm": 0.20944221317768097, + "learning_rate": 3.653359906498127e-06, + "loss": 0.5183, + "step": 5437 + }, + { + "epoch": 1.5080421519689406, + "grad_norm": 0.2081797569990158, + "learning_rate": 3.6494598311032415e-06, + "loss": 0.4885, + "step": 5438 + }, + { + "epoch": 1.5083194675540765, + "grad_norm": 0.20127315819263458, + "learning_rate": 3.645561482683238e-06, + "loss": 0.4935, + "step": 5439 + }, + { + "epoch": 1.5085967831392124, + "grad_norm": 0.20761513710021973, + "learning_rate": 3.6416648619987837e-06, + "loss": 0.4706, + "step": 5440 + }, + { + "epoch": 1.5088740987243483, + "grad_norm": 0.2054562270641327, + "learning_rate": 3.6377699698102035e-06, + "loss": 0.5033, + "step": 5441 + }, + { + "epoch": 1.5091514143094842, + "grad_norm": 0.21789734065532684, + "learning_rate": 3.6338768068775104e-06, + "loss": 0.4885, + "step": 5442 + }, + { + "epoch": 1.50942872989462, + "grad_norm": 0.21268032491207123, + "learning_rate": 3.6299853739603555e-06, + "loss": 0.4779, + "step": 5443 + }, + { + "epoch": 1.509706045479756, + "grad_norm": 0.19880391657352448, + "learning_rate": 3.6260956718180568e-06, + "loss": 0.4655, + "step": 5444 + }, + { + "epoch": 1.5099833610648918, + "grad_norm": 0.21325719356536865, + "learning_rate": 3.6222077012096026e-06, + "loss": 0.5004, + "step": 5445 + }, + { + "epoch": 1.5102606766500277, + "grad_norm": 0.20476926863193512, + "learning_rate": 3.6183214628936425e-06, + "loss": 0.4965, + "step": 5446 + }, + { + "epoch": 1.5105379922351636, + "grad_norm": 0.20836953818798065, + "learning_rate": 3.614436957628481e-06, + "loss": 0.4832, + "step": 5447 + }, + { + "epoch": 1.5108153078202995, + "grad_norm": 0.2125098705291748, + "learning_rate": 3.6105541861720826e-06, + "loss": 0.5014, + "step": 5448 + }, + { + "epoch": 1.5110926234054354, + "grad_norm": 0.2126840204000473, + "learning_rate": 3.6066731492820844e-06, + "loss": 0.5311, + "step": 5449 + }, + { + "epoch": 1.5113699389905713, + "grad_norm": 0.2019418478012085, + "learning_rate": 3.6027938477157838e-06, + "loss": 0.5007, + "step": 5450 + }, + { + "epoch": 1.5116472545757071, + "grad_norm": 0.25397753715515137, + "learning_rate": 3.598916282230126e-06, + "loss": 0.5272, + "step": 5451 + }, + { + "epoch": 1.511924570160843, + "grad_norm": 0.20074354112148285, + "learning_rate": 3.595040453581734e-06, + "loss": 0.4877, + "step": 5452 + }, + { + "epoch": 1.512201885745979, + "grad_norm": 0.2004215121269226, + "learning_rate": 3.5911663625268792e-06, + "loss": 0.4733, + "step": 5453 + }, + { + "epoch": 1.5124792013311148, + "grad_norm": 0.21349315345287323, + "learning_rate": 3.587294009821507e-06, + "loss": 0.5166, + "step": 5454 + }, + { + "epoch": 1.5127565169162507, + "grad_norm": 0.20270389318466187, + "learning_rate": 3.5834233962212056e-06, + "loss": 0.5115, + "step": 5455 + }, + { + "epoch": 1.5130338325013866, + "grad_norm": 0.20797984302043915, + "learning_rate": 3.5795545224812405e-06, + "loss": 0.4746, + "step": 5456 + }, + { + "epoch": 1.5133111480865225, + "grad_norm": 0.2141650915145874, + "learning_rate": 3.575687389356534e-06, + "loss": 0.5169, + "step": 5457 + }, + { + "epoch": 1.5135884636716583, + "grad_norm": 0.20348010957241058, + "learning_rate": 3.5718219976016614e-06, + "loss": 0.5052, + "step": 5458 + }, + { + "epoch": 1.5138657792567942, + "grad_norm": 0.2087784856557846, + "learning_rate": 3.5679583479708664e-06, + "loss": 0.5135, + "step": 5459 + }, + { + "epoch": 1.51414309484193, + "grad_norm": 0.2124669849872589, + "learning_rate": 3.564096441218044e-06, + "loss": 0.513, + "step": 5460 + }, + { + "epoch": 1.514420410427066, + "grad_norm": 0.20236825942993164, + "learning_rate": 3.5602362780967624e-06, + "loss": 0.4963, + "step": 5461 + }, + { + "epoch": 1.5146977260122019, + "grad_norm": 0.1994817554950714, + "learning_rate": 3.556377859360234e-06, + "loss": 0.4795, + "step": 5462 + }, + { + "epoch": 1.5149750415973378, + "grad_norm": 0.20891886949539185, + "learning_rate": 3.5525211857613454e-06, + "loss": 0.5117, + "step": 5463 + }, + { + "epoch": 1.5152523571824736, + "grad_norm": 0.19349178671836853, + "learning_rate": 3.5486662580526285e-06, + "loss": 0.4596, + "step": 5464 + }, + { + "epoch": 1.5155296727676095, + "grad_norm": 0.21563927829265594, + "learning_rate": 3.5448130769862864e-06, + "loss": 0.4859, + "step": 5465 + }, + { + "epoch": 1.5158069883527454, + "grad_norm": 0.2011108696460724, + "learning_rate": 3.5409616433141793e-06, + "loss": 0.4963, + "step": 5466 + }, + { + "epoch": 1.5160843039378813, + "grad_norm": 0.19179309904575348, + "learning_rate": 3.537111957787821e-06, + "loss": 0.4804, + "step": 5467 + }, + { + "epoch": 1.5163616195230172, + "grad_norm": 0.194498673081398, + "learning_rate": 3.5332640211583807e-06, + "loss": 0.4814, + "step": 5468 + }, + { + "epoch": 1.516638935108153, + "grad_norm": 0.2188744843006134, + "learning_rate": 3.5294178341767043e-06, + "loss": 0.5059, + "step": 5469 + }, + { + "epoch": 1.516916250693289, + "grad_norm": 0.2129710465669632, + "learning_rate": 3.525573397593282e-06, + "loss": 0.4924, + "step": 5470 + }, + { + "epoch": 1.5171935662784248, + "grad_norm": 0.21179741621017456, + "learning_rate": 3.521730712158257e-06, + "loss": 0.4987, + "step": 5471 + }, + { + "epoch": 1.5174708818635607, + "grad_norm": 0.20567160844802856, + "learning_rate": 3.517889778621446e-06, + "loss": 0.5164, + "step": 5472 + }, + { + "epoch": 1.5177481974486966, + "grad_norm": 0.2281576544046402, + "learning_rate": 3.5140505977323186e-06, + "loss": 0.47, + "step": 5473 + }, + { + "epoch": 1.5180255130338325, + "grad_norm": 0.20740900933742523, + "learning_rate": 3.5102131702399983e-06, + "loss": 0.4853, + "step": 5474 + }, + { + "epoch": 1.5183028286189684, + "grad_norm": 0.20481230318546295, + "learning_rate": 3.506377496893265e-06, + "loss": 0.5248, + "step": 5475 + }, + { + "epoch": 1.5185801442041043, + "grad_norm": 0.20815472304821014, + "learning_rate": 3.502543578440562e-06, + "loss": 0.4855, + "step": 5476 + }, + { + "epoch": 1.5188574597892401, + "grad_norm": 0.20264998078346252, + "learning_rate": 3.498711415629996e-06, + "loss": 0.4765, + "step": 5477 + }, + { + "epoch": 1.519134775374376, + "grad_norm": 0.2170276939868927, + "learning_rate": 3.494881009209315e-06, + "loss": 0.5036, + "step": 5478 + }, + { + "epoch": 1.519412090959512, + "grad_norm": 0.2021361142396927, + "learning_rate": 3.4910523599259294e-06, + "loss": 0.4986, + "step": 5479 + }, + { + "epoch": 1.5196894065446478, + "grad_norm": 0.195895254611969, + "learning_rate": 3.487225468526924e-06, + "loss": 0.4801, + "step": 5480 + }, + { + "epoch": 1.5199667221297837, + "grad_norm": 0.21378639340400696, + "learning_rate": 3.483400335759017e-06, + "loss": 0.5152, + "step": 5481 + }, + { + "epoch": 1.5202440377149196, + "grad_norm": 0.213482066988945, + "learning_rate": 3.4795769623685912e-06, + "loss": 0.5034, + "step": 5482 + }, + { + "epoch": 1.5205213533000554, + "grad_norm": 0.2165437638759613, + "learning_rate": 3.475755349101692e-06, + "loss": 0.4841, + "step": 5483 + }, + { + "epoch": 1.5207986688851913, + "grad_norm": 0.20139548182487488, + "learning_rate": 3.471935496704022e-06, + "loss": 0.4711, + "step": 5484 + }, + { + "epoch": 1.5210759844703272, + "grad_norm": 0.2036607712507248, + "learning_rate": 3.4681174059209288e-06, + "loss": 0.4757, + "step": 5485 + }, + { + "epoch": 1.521353300055463, + "grad_norm": 0.19882218539714813, + "learning_rate": 3.4643010774974226e-06, + "loss": 0.5082, + "step": 5486 + }, + { + "epoch": 1.521630615640599, + "grad_norm": 0.21727705001831055, + "learning_rate": 3.460486512178171e-06, + "loss": 0.474, + "step": 5487 + }, + { + "epoch": 1.5219079312257349, + "grad_norm": 0.21604587137699127, + "learning_rate": 3.456673710707503e-06, + "loss": 0.4752, + "step": 5488 + }, + { + "epoch": 1.5221852468108708, + "grad_norm": 0.20462778210639954, + "learning_rate": 3.4528626738293893e-06, + "loss": 0.5011, + "step": 5489 + }, + { + "epoch": 1.5224625623960066, + "grad_norm": 0.20358699560165405, + "learning_rate": 3.4490534022874706e-06, + "loss": 0.4666, + "step": 5490 + }, + { + "epoch": 1.5227398779811425, + "grad_norm": 0.2057787925004959, + "learning_rate": 3.4452458968250293e-06, + "loss": 0.5309, + "step": 5491 + }, + { + "epoch": 1.5230171935662784, + "grad_norm": 0.1948128193616867, + "learning_rate": 3.441440158185018e-06, + "loss": 0.486, + "step": 5492 + }, + { + "epoch": 1.5232945091514143, + "grad_norm": 0.22251975536346436, + "learning_rate": 3.4376361871100313e-06, + "loss": 0.505, + "step": 5493 + }, + { + "epoch": 1.5235718247365502, + "grad_norm": 0.20903460681438446, + "learning_rate": 3.433833984342326e-06, + "loss": 0.5135, + "step": 5494 + }, + { + "epoch": 1.523849140321686, + "grad_norm": 0.20052765309810638, + "learning_rate": 3.430033550623818e-06, + "loss": 0.5181, + "step": 5495 + }, + { + "epoch": 1.524126455906822, + "grad_norm": 0.20799602568149567, + "learning_rate": 3.4262348866960655e-06, + "loss": 0.5056, + "step": 5496 + }, + { + "epoch": 1.5244037714919578, + "grad_norm": 0.21278195083141327, + "learning_rate": 3.4224379933002953e-06, + "loss": 0.4942, + "step": 5497 + }, + { + "epoch": 1.5246810870770937, + "grad_norm": 0.30083736777305603, + "learning_rate": 3.418642871177373e-06, + "loss": 0.4991, + "step": 5498 + }, + { + "epoch": 1.5249584026622296, + "grad_norm": 0.20429125428199768, + "learning_rate": 3.4148495210678366e-06, + "loss": 0.4891, + "step": 5499 + }, + { + "epoch": 1.5252357182473655, + "grad_norm": 0.2550550401210785, + "learning_rate": 3.4110579437118624e-06, + "loss": 0.4804, + "step": 5500 + }, + { + "epoch": 1.5255130338325014, + "grad_norm": 0.20311662554740906, + "learning_rate": 3.4072681398492942e-06, + "loss": 0.4673, + "step": 5501 + }, + { + "epoch": 1.5257903494176372, + "grad_norm": 0.1889600306749344, + "learning_rate": 3.4034801102196146e-06, + "loss": 0.4716, + "step": 5502 + }, + { + "epoch": 1.5260676650027731, + "grad_norm": 0.21437427401542664, + "learning_rate": 3.399693855561974e-06, + "loss": 0.5053, + "step": 5503 + }, + { + "epoch": 1.526344980587909, + "grad_norm": 0.21157221496105194, + "learning_rate": 3.395909376615172e-06, + "loss": 0.4717, + "step": 5504 + }, + { + "epoch": 1.526622296173045, + "grad_norm": 0.2132854014635086, + "learning_rate": 3.3921266741176614e-06, + "loss": 0.4751, + "step": 5505 + }, + { + "epoch": 1.5268996117581808, + "grad_norm": 0.21224236488342285, + "learning_rate": 3.3883457488075406e-06, + "loss": 0.4927, + "step": 5506 + }, + { + "epoch": 1.5271769273433167, + "grad_norm": 0.20786736905574799, + "learning_rate": 3.384566601422573e-06, + "loss": 0.4666, + "step": 5507 + }, + { + "epoch": 1.5274542429284526, + "grad_norm": 0.21545729041099548, + "learning_rate": 3.3807892327001737e-06, + "loss": 0.4783, + "step": 5508 + }, + { + "epoch": 1.5277315585135884, + "grad_norm": 0.20959782600402832, + "learning_rate": 3.377013643377401e-06, + "loss": 0.5053, + "step": 5509 + }, + { + "epoch": 1.5280088740987243, + "grad_norm": 0.20486906170845032, + "learning_rate": 3.373239834190975e-06, + "loss": 0.4873, + "step": 5510 + }, + { + "epoch": 1.5282861896838602, + "grad_norm": 0.2007468342781067, + "learning_rate": 3.3694678058772704e-06, + "loss": 0.4899, + "step": 5511 + }, + { + "epoch": 1.528563505268996, + "grad_norm": 0.19690293073654175, + "learning_rate": 3.365697559172304e-06, + "loss": 0.4941, + "step": 5512 + }, + { + "epoch": 1.528840820854132, + "grad_norm": 0.2142125815153122, + "learning_rate": 3.3619290948117513e-06, + "loss": 0.4746, + "step": 5513 + }, + { + "epoch": 1.5291181364392679, + "grad_norm": 0.20775122940540314, + "learning_rate": 3.3581624135309395e-06, + "loss": 0.5111, + "step": 5514 + }, + { + "epoch": 1.5293954520244037, + "grad_norm": 0.21404637396335602, + "learning_rate": 3.3543975160648526e-06, + "loss": 0.4836, + "step": 5515 + }, + { + "epoch": 1.5296727676095396, + "grad_norm": 0.20010797679424286, + "learning_rate": 3.3506344031481187e-06, + "loss": 0.5035, + "step": 5516 + }, + { + "epoch": 1.5299500831946755, + "grad_norm": 0.20686432719230652, + "learning_rate": 3.346873075515014e-06, + "loss": 0.4676, + "step": 5517 + }, + { + "epoch": 1.5302273987798114, + "grad_norm": 0.1945461481809616, + "learning_rate": 3.3431135338994864e-06, + "loss": 0.491, + "step": 5518 + }, + { + "epoch": 1.5305047143649473, + "grad_norm": 0.20243096351623535, + "learning_rate": 3.3393557790351167e-06, + "loss": 0.5112, + "step": 5519 + }, + { + "epoch": 1.5307820299500832, + "grad_norm": 0.19688890874385834, + "learning_rate": 3.3355998116551395e-06, + "loss": 0.4847, + "step": 5520 + }, + { + "epoch": 1.531059345535219, + "grad_norm": 0.20034383237361908, + "learning_rate": 3.331845632492439e-06, + "loss": 0.4914, + "step": 5521 + }, + { + "epoch": 1.531336661120355, + "grad_norm": 0.21226158738136292, + "learning_rate": 3.328093242279569e-06, + "loss": 0.5064, + "step": 5522 + }, + { + "epoch": 1.5316139767054908, + "grad_norm": 0.20493634045124054, + "learning_rate": 3.3243426417487107e-06, + "loss": 0.5008, + "step": 5523 + }, + { + "epoch": 1.5318912922906267, + "grad_norm": 0.2078971415758133, + "learning_rate": 3.3205938316317047e-06, + "loss": 0.4862, + "step": 5524 + }, + { + "epoch": 1.5321686078757626, + "grad_norm": 0.19189336895942688, + "learning_rate": 3.316846812660046e-06, + "loss": 0.4914, + "step": 5525 + }, + { + "epoch": 1.5324459234608985, + "grad_norm": 0.18428561091423035, + "learning_rate": 3.313101585564882e-06, + "loss": 0.4801, + "step": 5526 + }, + { + "epoch": 1.5327232390460344, + "grad_norm": 0.19429829716682434, + "learning_rate": 3.3093581510769995e-06, + "loss": 0.4672, + "step": 5527 + }, + { + "epoch": 1.5330005546311702, + "grad_norm": 0.20795489847660065, + "learning_rate": 3.3056165099268398e-06, + "loss": 0.5051, + "step": 5528 + }, + { + "epoch": 1.5332778702163061, + "grad_norm": 0.20628464221954346, + "learning_rate": 3.3018766628445e-06, + "loss": 0.5073, + "step": 5529 + }, + { + "epoch": 1.533555185801442, + "grad_norm": 0.1983129382133484, + "learning_rate": 3.2981386105597256e-06, + "loss": 0.4834, + "step": 5530 + }, + { + "epoch": 1.533832501386578, + "grad_norm": 0.20056569576263428, + "learning_rate": 3.294402353801905e-06, + "loss": 0.4883, + "step": 5531 + }, + { + "epoch": 1.5341098169717138, + "grad_norm": 0.20827257633209229, + "learning_rate": 3.2906678933000813e-06, + "loss": 0.4962, + "step": 5532 + }, + { + "epoch": 1.5343871325568497, + "grad_norm": 0.20605547726154327, + "learning_rate": 3.2869352297829532e-06, + "loss": 0.4792, + "step": 5533 + }, + { + "epoch": 1.5346644481419855, + "grad_norm": 0.20513512194156647, + "learning_rate": 3.283204363978852e-06, + "loss": 0.507, + "step": 5534 + }, + { + "epoch": 1.5349417637271214, + "grad_norm": 0.2062227725982666, + "learning_rate": 3.2794752966157793e-06, + "loss": 0.4994, + "step": 5535 + }, + { + "epoch": 1.5352190793122573, + "grad_norm": 0.2113962173461914, + "learning_rate": 3.2757480284213646e-06, + "loss": 0.4754, + "step": 5536 + }, + { + "epoch": 1.5354963948973932, + "grad_norm": 0.21231204271316528, + "learning_rate": 3.272022560122906e-06, + "loss": 0.488, + "step": 5537 + }, + { + "epoch": 1.535773710482529, + "grad_norm": 0.19757109880447388, + "learning_rate": 3.2682988924473316e-06, + "loss": 0.4951, + "step": 5538 + }, + { + "epoch": 1.536051026067665, + "grad_norm": 0.20043742656707764, + "learning_rate": 3.264577026121238e-06, + "loss": 0.5039, + "step": 5539 + }, + { + "epoch": 1.5363283416528009, + "grad_norm": 0.1949152946472168, + "learning_rate": 3.2608569618708494e-06, + "loss": 0.5021, + "step": 5540 + }, + { + "epoch": 1.5366056572379367, + "grad_norm": 0.20842109620571136, + "learning_rate": 3.257138700422055e-06, + "loss": 0.5008, + "step": 5541 + }, + { + "epoch": 1.5368829728230726, + "grad_norm": 0.21400795876979828, + "learning_rate": 3.253422242500388e-06, + "loss": 0.5048, + "step": 5542 + }, + { + "epoch": 1.5371602884082085, + "grad_norm": 0.21231138706207275, + "learning_rate": 3.249707588831025e-06, + "loss": 0.4919, + "step": 5543 + }, + { + "epoch": 1.5374376039933444, + "grad_norm": 0.2112995684146881, + "learning_rate": 3.24599474013879e-06, + "loss": 0.5135, + "step": 5544 + }, + { + "epoch": 1.5377149195784803, + "grad_norm": 0.20431232452392578, + "learning_rate": 3.2422836971481616e-06, + "loss": 0.5028, + "step": 5545 + }, + { + "epoch": 1.5379922351636162, + "grad_norm": 0.20688453316688538, + "learning_rate": 3.238574460583266e-06, + "loss": 0.4814, + "step": 5546 + }, + { + "epoch": 1.538269550748752, + "grad_norm": 0.21401961147785187, + "learning_rate": 3.234867031167865e-06, + "loss": 0.5026, + "step": 5547 + }, + { + "epoch": 1.538546866333888, + "grad_norm": 0.20731821656227112, + "learning_rate": 3.231161409625383e-06, + "loss": 0.5069, + "step": 5548 + }, + { + "epoch": 1.5388241819190238, + "grad_norm": 0.20831851661205292, + "learning_rate": 3.227457596678886e-06, + "loss": 0.4883, + "step": 5549 + }, + { + "epoch": 1.5391014975041597, + "grad_norm": 0.2025631219148636, + "learning_rate": 3.223755593051084e-06, + "loss": 0.4687, + "step": 5550 + }, + { + "epoch": 1.5393788130892956, + "grad_norm": 0.21386443078517914, + "learning_rate": 3.2200553994643307e-06, + "loss": 0.4807, + "step": 5551 + }, + { + "epoch": 1.5396561286744315, + "grad_norm": 0.22764462232589722, + "learning_rate": 3.2163570166406366e-06, + "loss": 0.4884, + "step": 5552 + }, + { + "epoch": 1.5399334442595674, + "grad_norm": 0.20263278484344482, + "learning_rate": 3.2126604453016574e-06, + "loss": 0.5059, + "step": 5553 + }, + { + "epoch": 1.5402107598447032, + "grad_norm": 0.22203831374645233, + "learning_rate": 3.20896568616869e-06, + "loss": 0.5071, + "step": 5554 + }, + { + "epoch": 1.5404880754298391, + "grad_norm": 0.21261858940124512, + "learning_rate": 3.205272739962674e-06, + "loss": 0.4967, + "step": 5555 + }, + { + "epoch": 1.540765391014975, + "grad_norm": 0.2063218355178833, + "learning_rate": 3.2015816074042052e-06, + "loss": 0.4983, + "step": 5556 + }, + { + "epoch": 1.541042706600111, + "grad_norm": 0.21330100297927856, + "learning_rate": 3.197892289213526e-06, + "loss": 0.4947, + "step": 5557 + }, + { + "epoch": 1.5413200221852468, + "grad_norm": 0.2076280117034912, + "learning_rate": 3.1942047861105163e-06, + "loss": 0.5006, + "step": 5558 + }, + { + "epoch": 1.5415973377703827, + "grad_norm": 0.2099052220582962, + "learning_rate": 3.190519098814697e-06, + "loss": 0.4869, + "step": 5559 + }, + { + "epoch": 1.5418746533555185, + "grad_norm": 0.20259883999824524, + "learning_rate": 3.1868352280452595e-06, + "loss": 0.4829, + "step": 5560 + }, + { + "epoch": 1.5421519689406544, + "grad_norm": 0.20924805104732513, + "learning_rate": 3.1831531745210168e-06, + "loss": 0.4715, + "step": 5561 + }, + { + "epoch": 1.5424292845257903, + "grad_norm": 0.2099631428718567, + "learning_rate": 3.1794729389604304e-06, + "loss": 0.491, + "step": 5562 + }, + { + "epoch": 1.5427066001109262, + "grad_norm": 0.2203865945339203, + "learning_rate": 3.1757945220816167e-06, + "loss": 0.5099, + "step": 5563 + }, + { + "epoch": 1.542983915696062, + "grad_norm": 0.20883382856845856, + "learning_rate": 3.1721179246023356e-06, + "loss": 0.4756, + "step": 5564 + }, + { + "epoch": 1.543261231281198, + "grad_norm": 0.24059654772281647, + "learning_rate": 3.1684431472399856e-06, + "loss": 0.4966, + "step": 5565 + }, + { + "epoch": 1.5435385468663338, + "grad_norm": 0.20498476922512054, + "learning_rate": 3.164770190711608e-06, + "loss": 0.5164, + "step": 5566 + }, + { + "epoch": 1.5438158624514697, + "grad_norm": 0.22245949506759644, + "learning_rate": 3.1610990557338987e-06, + "loss": 0.502, + "step": 5567 + }, + { + "epoch": 1.5440931780366056, + "grad_norm": 0.2156648188829422, + "learning_rate": 3.1574297430231977e-06, + "loss": 0.5193, + "step": 5568 + }, + { + "epoch": 1.5443704936217415, + "grad_norm": 0.22104239463806152, + "learning_rate": 3.153762253295475e-06, + "loss": 0.5048, + "step": 5569 + }, + { + "epoch": 1.5446478092068774, + "grad_norm": 0.20960746705532074, + "learning_rate": 3.1500965872663628e-06, + "loss": 0.5105, + "step": 5570 + }, + { + "epoch": 1.5449251247920133, + "grad_norm": 0.20040079951286316, + "learning_rate": 3.1464327456511288e-06, + "loss": 0.5048, + "step": 5571 + }, + { + "epoch": 1.5452024403771492, + "grad_norm": 0.20150801539421082, + "learning_rate": 3.142770729164686e-06, + "loss": 0.4976, + "step": 5572 + }, + { + "epoch": 1.545479755962285, + "grad_norm": 0.20172013342380524, + "learning_rate": 3.1391105385215847e-06, + "loss": 0.4958, + "step": 5573 + }, + { + "epoch": 1.545757071547421, + "grad_norm": 0.2224036306142807, + "learning_rate": 3.1354521744360295e-06, + "loss": 0.5085, + "step": 5574 + }, + { + "epoch": 1.5460343871325568, + "grad_norm": 0.21245881915092468, + "learning_rate": 3.131795637621868e-06, + "loss": 0.4828, + "step": 5575 + }, + { + "epoch": 1.5463117027176927, + "grad_norm": 0.21667277812957764, + "learning_rate": 3.1281409287925793e-06, + "loss": 0.492, + "step": 5576 + }, + { + "epoch": 1.5465890183028286, + "grad_norm": 0.2037910372018814, + "learning_rate": 3.1244880486613017e-06, + "loss": 0.5103, + "step": 5577 + }, + { + "epoch": 1.5468663338879645, + "grad_norm": 0.3222982585430145, + "learning_rate": 3.1208369979408013e-06, + "loss": 0.4977, + "step": 5578 + }, + { + "epoch": 1.5471436494731003, + "grad_norm": 0.2243606150150299, + "learning_rate": 3.117187777343504e-06, + "loss": 0.4867, + "step": 5579 + }, + { + "epoch": 1.5474209650582362, + "grad_norm": 0.2110065221786499, + "learning_rate": 3.1135403875814593e-06, + "loss": 0.5011, + "step": 5580 + }, + { + "epoch": 1.5476982806433721, + "grad_norm": 0.1963575929403305, + "learning_rate": 3.109894829366379e-06, + "loss": 0.4776, + "step": 5581 + }, + { + "epoch": 1.547975596228508, + "grad_norm": 0.20565971732139587, + "learning_rate": 3.1062511034095993e-06, + "loss": 0.4844, + "step": 5582 + }, + { + "epoch": 1.5482529118136439, + "grad_norm": 0.20696739852428436, + "learning_rate": 3.1026092104221124e-06, + "loss": 0.4766, + "step": 5583 + }, + { + "epoch": 1.5485302273987798, + "grad_norm": 0.27509939670562744, + "learning_rate": 3.098969151114552e-06, + "loss": 0.4783, + "step": 5584 + }, + { + "epoch": 1.5488075429839157, + "grad_norm": 0.20480765402317047, + "learning_rate": 3.0953309261971804e-06, + "loss": 0.4953, + "step": 5585 + }, + { + "epoch": 1.5490848585690515, + "grad_norm": 0.38247838616371155, + "learning_rate": 3.0916945363799214e-06, + "loss": 0.4516, + "step": 5586 + }, + { + "epoch": 1.5493621741541874, + "grad_norm": 0.21146711707115173, + "learning_rate": 3.088059982372324e-06, + "loss": 0.4855, + "step": 5587 + }, + { + "epoch": 1.5496394897393233, + "grad_norm": 0.2040729522705078, + "learning_rate": 3.0844272648835908e-06, + "loss": 0.5106, + "step": 5588 + }, + { + "epoch": 1.5499168053244592, + "grad_norm": 0.2061615288257599, + "learning_rate": 3.080796384622557e-06, + "loss": 0.4986, + "step": 5589 + }, + { + "epoch": 1.550194120909595, + "grad_norm": 0.19639424979686737, + "learning_rate": 3.0771673422977044e-06, + "loss": 0.5096, + "step": 5590 + }, + { + "epoch": 1.550471436494731, + "grad_norm": 0.22984932363033295, + "learning_rate": 3.0735401386171618e-06, + "loss": 0.4803, + "step": 5591 + }, + { + "epoch": 1.5507487520798668, + "grad_norm": 0.23276609182357788, + "learning_rate": 3.0699147742886862e-06, + "loss": 0.4857, + "step": 5592 + }, + { + "epoch": 1.5510260676650027, + "grad_norm": 0.22025956213474274, + "learning_rate": 3.0662912500196804e-06, + "loss": 0.5067, + "step": 5593 + }, + { + "epoch": 1.5513033832501386, + "grad_norm": 0.19536203145980835, + "learning_rate": 3.0626695665171927e-06, + "loss": 0.4849, + "step": 5594 + }, + { + "epoch": 1.5515806988352745, + "grad_norm": 0.2104620635509491, + "learning_rate": 3.059049724487914e-06, + "loss": 0.5033, + "step": 5595 + }, + { + "epoch": 1.5518580144204104, + "grad_norm": 0.2038435935974121, + "learning_rate": 3.0554317246381664e-06, + "loss": 0.5164, + "step": 5596 + }, + { + "epoch": 1.5521353300055463, + "grad_norm": 0.23535074293613434, + "learning_rate": 3.051815567673913e-06, + "loss": 0.4933, + "step": 5597 + }, + { + "epoch": 1.5524126455906821, + "grad_norm": 0.20647317171096802, + "learning_rate": 3.0482012543007725e-06, + "loss": 0.5152, + "step": 5598 + }, + { + "epoch": 1.552689961175818, + "grad_norm": 0.19642433524131775, + "learning_rate": 3.0445887852239897e-06, + "loss": 0.4896, + "step": 5599 + }, + { + "epoch": 1.552967276760954, + "grad_norm": 0.20343849062919617, + "learning_rate": 3.040978161148446e-06, + "loss": 0.4754, + "step": 5600 + }, + { + "epoch": 1.5532445923460898, + "grad_norm": 0.20736227929592133, + "learning_rate": 3.0373693827786766e-06, + "loss": 0.4728, + "step": 5601 + }, + { + "epoch": 1.5535219079312257, + "grad_norm": 0.2885718047618866, + "learning_rate": 3.03376245081885e-06, + "loss": 0.4844, + "step": 5602 + }, + { + "epoch": 1.5537992235163616, + "grad_norm": 0.21258218586444855, + "learning_rate": 3.0301573659727746e-06, + "loss": 0.4675, + "step": 5603 + }, + { + "epoch": 1.5540765391014975, + "grad_norm": 0.2054533213376999, + "learning_rate": 3.026554128943891e-06, + "loss": 0.4944, + "step": 5604 + }, + { + "epoch": 1.5543538546866333, + "grad_norm": 0.21457917988300323, + "learning_rate": 3.022952740435292e-06, + "loss": 0.4545, + "step": 5605 + }, + { + "epoch": 1.5546311702717692, + "grad_norm": 0.2189054936170578, + "learning_rate": 3.019353201149705e-06, + "loss": 0.4932, + "step": 5606 + }, + { + "epoch": 1.554908485856905, + "grad_norm": 0.19639572501182556, + "learning_rate": 3.0157555117894907e-06, + "loss": 0.4758, + "step": 5607 + }, + { + "epoch": 1.555185801442041, + "grad_norm": 0.21322979032993317, + "learning_rate": 3.012159673056661e-06, + "loss": 0.4928, + "step": 5608 + }, + { + "epoch": 1.5554631170271769, + "grad_norm": 0.2102145254611969, + "learning_rate": 3.008565685652849e-06, + "loss": 0.515, + "step": 5609 + }, + { + "epoch": 1.5557404326123128, + "grad_norm": 0.22351384162902832, + "learning_rate": 3.004973550279348e-06, + "loss": 0.5084, + "step": 5610 + }, + { + "epoch": 1.5560177481974486, + "grad_norm": 0.2032230645418167, + "learning_rate": 3.001383267637069e-06, + "loss": 0.4957, + "step": 5611 + }, + { + "epoch": 1.5562950637825845, + "grad_norm": 0.2006012350320816, + "learning_rate": 2.997794838426575e-06, + "loss": 0.481, + "step": 5612 + }, + { + "epoch": 1.5565723793677204, + "grad_norm": 0.20619052648544312, + "learning_rate": 2.9942082633480696e-06, + "loss": 0.4755, + "step": 5613 + }, + { + "epoch": 1.5568496949528563, + "grad_norm": 0.21772894263267517, + "learning_rate": 2.990623543101377e-06, + "loss": 0.4795, + "step": 5614 + }, + { + "epoch": 1.5571270105379922, + "grad_norm": 0.19506295025348663, + "learning_rate": 2.9870406783859827e-06, + "loss": 0.4655, + "step": 5615 + }, + { + "epoch": 1.557404326123128, + "grad_norm": 0.20197400450706482, + "learning_rate": 2.98345966990099e-06, + "loss": 0.5146, + "step": 5616 + }, + { + "epoch": 1.557681641708264, + "grad_norm": 0.20348283648490906, + "learning_rate": 2.979880518345153e-06, + "loss": 0.4377, + "step": 5617 + }, + { + "epoch": 1.5579589572933998, + "grad_norm": 0.20409370958805084, + "learning_rate": 2.976303224416856e-06, + "loss": 0.5216, + "step": 5618 + }, + { + "epoch": 1.5582362728785357, + "grad_norm": 0.21259050071239471, + "learning_rate": 2.972727788814128e-06, + "loss": 0.4988, + "step": 5619 + }, + { + "epoch": 1.5585135884636716, + "grad_norm": 0.19778795540332794, + "learning_rate": 2.9691542122346262e-06, + "loss": 0.5165, + "step": 5620 + }, + { + "epoch": 1.5587909040488075, + "grad_norm": 0.27131521701812744, + "learning_rate": 2.9655824953756517e-06, + "loss": 0.4913, + "step": 5621 + }, + { + "epoch": 1.5590682196339434, + "grad_norm": 0.2034967541694641, + "learning_rate": 2.962012638934146e-06, + "loss": 0.4725, + "step": 5622 + }, + { + "epoch": 1.5593455352190793, + "grad_norm": 0.19683615863323212, + "learning_rate": 2.9584446436066752e-06, + "loss": 0.4949, + "step": 5623 + }, + { + "epoch": 1.5596228508042151, + "grad_norm": 0.21673937141895294, + "learning_rate": 2.9548785100894565e-06, + "loss": 0.5125, + "step": 5624 + }, + { + "epoch": 1.559900166389351, + "grad_norm": 0.22336265444755554, + "learning_rate": 2.9513142390783303e-06, + "loss": 0.4803, + "step": 5625 + }, + { + "epoch": 1.560177481974487, + "grad_norm": 0.21071960031986237, + "learning_rate": 2.947751831268787e-06, + "loss": 0.5069, + "step": 5626 + }, + { + "epoch": 1.5604547975596228, + "grad_norm": 0.2002846896648407, + "learning_rate": 2.9441912873559406e-06, + "loss": 0.4857, + "step": 5627 + }, + { + "epoch": 1.5607321131447587, + "grad_norm": 0.20336318016052246, + "learning_rate": 2.940632608034549e-06, + "loss": 0.5015, + "step": 5628 + }, + { + "epoch": 1.5610094287298946, + "grad_norm": 0.199977844953537, + "learning_rate": 2.9370757939990114e-06, + "loss": 0.5078, + "step": 5629 + }, + { + "epoch": 1.5612867443150305, + "grad_norm": 0.20519019663333893, + "learning_rate": 2.933520845943351e-06, + "loss": 0.5052, + "step": 5630 + }, + { + "epoch": 1.5615640599001663, + "grad_norm": 0.21687155961990356, + "learning_rate": 2.9299677645612282e-06, + "loss": 0.4759, + "step": 5631 + }, + { + "epoch": 1.5618413754853022, + "grad_norm": 0.2080409973859787, + "learning_rate": 2.9264165505459476e-06, + "loss": 0.4918, + "step": 5632 + }, + { + "epoch": 1.562118691070438, + "grad_norm": 0.2009446918964386, + "learning_rate": 2.92286720459045e-06, + "loss": 0.4763, + "step": 5633 + }, + { + "epoch": 1.562396006655574, + "grad_norm": 0.22965089976787567, + "learning_rate": 2.9193197273873014e-06, + "loss": 0.483, + "step": 5634 + }, + { + "epoch": 1.5626733222407099, + "grad_norm": 0.20917722582817078, + "learning_rate": 2.9157741196287017e-06, + "loss": 0.496, + "step": 5635 + }, + { + "epoch": 1.5629506378258458, + "grad_norm": 0.23011514544487, + "learning_rate": 2.9122303820065087e-06, + "loss": 0.4844, + "step": 5636 + }, + { + "epoch": 1.5632279534109816, + "grad_norm": 0.2208072394132614, + "learning_rate": 2.908688515212191e-06, + "loss": 0.4847, + "step": 5637 + }, + { + "epoch": 1.5635052689961175, + "grad_norm": 0.2128543108701706, + "learning_rate": 2.905148519936857e-06, + "loss": 0.5051, + "step": 5638 + }, + { + "epoch": 1.5637825845812534, + "grad_norm": 0.2076890915632248, + "learning_rate": 2.9016103968712584e-06, + "loss": 0.5162, + "step": 5639 + }, + { + "epoch": 1.5640599001663893, + "grad_norm": 0.20999306440353394, + "learning_rate": 2.8980741467057786e-06, + "loss": 0.4999, + "step": 5640 + }, + { + "epoch": 1.5643372157515252, + "grad_norm": 0.22848869860172272, + "learning_rate": 2.8945397701304304e-06, + "loss": 0.4806, + "step": 5641 + }, + { + "epoch": 1.564614531336661, + "grad_norm": 0.23232677578926086, + "learning_rate": 2.8910072678348625e-06, + "loss": 0.4944, + "step": 5642 + }, + { + "epoch": 1.564891846921797, + "grad_norm": 0.2025647908449173, + "learning_rate": 2.887476640508363e-06, + "loss": 0.4707, + "step": 5643 + }, + { + "epoch": 1.5651691625069328, + "grad_norm": 0.20394767820835114, + "learning_rate": 2.8839478888398523e-06, + "loss": 0.5031, + "step": 5644 + }, + { + "epoch": 1.5654464780920687, + "grad_norm": 0.2066267430782318, + "learning_rate": 2.880421013517881e-06, + "loss": 0.4955, + "step": 5645 + }, + { + "epoch": 1.5657237936772046, + "grad_norm": 0.20270580053329468, + "learning_rate": 2.876896015230632e-06, + "loss": 0.474, + "step": 5646 + }, + { + "epoch": 1.5660011092623405, + "grad_norm": 0.2070900797843933, + "learning_rate": 2.8733728946659293e-06, + "loss": 0.4942, + "step": 5647 + }, + { + "epoch": 1.5662784248474764, + "grad_norm": 0.20855194330215454, + "learning_rate": 2.8698516525112324e-06, + "loss": 0.4801, + "step": 5648 + }, + { + "epoch": 1.5665557404326123, + "grad_norm": 0.2164842188358307, + "learning_rate": 2.86633228945362e-06, + "loss": 0.4946, + "step": 5649 + }, + { + "epoch": 1.5668330560177481, + "grad_norm": 0.21440735459327698, + "learning_rate": 2.862814806179817e-06, + "loss": 0.5068, + "step": 5650 + }, + { + "epoch": 1.567110371602884, + "grad_norm": 0.2091311663389206, + "learning_rate": 2.8592992033761814e-06, + "loss": 0.4693, + "step": 5651 + }, + { + "epoch": 1.56738768718802, + "grad_norm": 0.20943525433540344, + "learning_rate": 2.855785481728697e-06, + "loss": 0.5165, + "step": 5652 + }, + { + "epoch": 1.5676650027731558, + "grad_norm": 0.20559410750865936, + "learning_rate": 2.85227364192298e-06, + "loss": 0.5084, + "step": 5653 + }, + { + "epoch": 1.5679423183582917, + "grad_norm": 0.20715293288230896, + "learning_rate": 2.8487636846442873e-06, + "loss": 0.5199, + "step": 5654 + }, + { + "epoch": 1.5682196339434276, + "grad_norm": 0.21139737963676453, + "learning_rate": 2.8452556105775073e-06, + "loss": 0.5422, + "step": 5655 + }, + { + "epoch": 1.5684969495285634, + "grad_norm": 0.20338012278079987, + "learning_rate": 2.8417494204071526e-06, + "loss": 0.5006, + "step": 5656 + }, + { + "epoch": 1.5687742651136993, + "grad_norm": 0.21125024557113647, + "learning_rate": 2.8382451148173798e-06, + "loss": 0.5144, + "step": 5657 + }, + { + "epoch": 1.5690515806988352, + "grad_norm": 0.2064410001039505, + "learning_rate": 2.8347426944919637e-06, + "loss": 0.4919, + "step": 5658 + }, + { + "epoch": 1.569328896283971, + "grad_norm": 0.19719727337360382, + "learning_rate": 2.8312421601143267e-06, + "loss": 0.4966, + "step": 5659 + }, + { + "epoch": 1.569606211869107, + "grad_norm": 0.2056899517774582, + "learning_rate": 2.8277435123675144e-06, + "loss": 0.5109, + "step": 5660 + }, + { + "epoch": 1.5698835274542429, + "grad_norm": 0.20407631993293762, + "learning_rate": 2.8242467519342022e-06, + "loss": 0.476, + "step": 5661 + }, + { + "epoch": 1.5701608430393788, + "grad_norm": 0.2034579962491989, + "learning_rate": 2.8207518794967054e-06, + "loss": 0.5173, + "step": 5662 + }, + { + "epoch": 1.5704381586245146, + "grad_norm": 0.21342724561691284, + "learning_rate": 2.81725889573696e-06, + "loss": 0.4815, + "step": 5663 + }, + { + "epoch": 1.5707154742096505, + "grad_norm": 0.2078382819890976, + "learning_rate": 2.813767801336549e-06, + "loss": 0.5132, + "step": 5664 + }, + { + "epoch": 1.5709927897947864, + "grad_norm": 0.20430020987987518, + "learning_rate": 2.810278596976666e-06, + "loss": 0.5055, + "step": 5665 + }, + { + "epoch": 1.5712701053799223, + "grad_norm": 0.2025614082813263, + "learning_rate": 2.806791283338155e-06, + "loss": 0.4966, + "step": 5666 + }, + { + "epoch": 1.5715474209650582, + "grad_norm": 0.20510238409042358, + "learning_rate": 2.8033058611014838e-06, + "loss": 0.489, + "step": 5667 + }, + { + "epoch": 1.571824736550194, + "grad_norm": 0.2076101154088974, + "learning_rate": 2.7998223309467484e-06, + "loss": 0.5016, + "step": 5668 + }, + { + "epoch": 1.57210205213533, + "grad_norm": 0.19899241626262665, + "learning_rate": 2.7963406935536728e-06, + "loss": 0.5041, + "step": 5669 + }, + { + "epoch": 1.5723793677204658, + "grad_norm": 0.22239477932453156, + "learning_rate": 2.7928609496016235e-06, + "loss": 0.5068, + "step": 5670 + }, + { + "epoch": 1.5726566833056017, + "grad_norm": 0.20445238053798676, + "learning_rate": 2.789383099769591e-06, + "loss": 0.4886, + "step": 5671 + }, + { + "epoch": 1.5729339988907376, + "grad_norm": 0.20267120003700256, + "learning_rate": 2.785907144736194e-06, + "loss": 0.4763, + "step": 5672 + }, + { + "epoch": 1.5732113144758735, + "grad_norm": 0.20123063027858734, + "learning_rate": 2.7824330851796755e-06, + "loss": 0.4932, + "step": 5673 + }, + { + "epoch": 1.5734886300610094, + "grad_norm": 0.20822611451148987, + "learning_rate": 2.7789609217779316e-06, + "loss": 0.47, + "step": 5674 + }, + { + "epoch": 1.5737659456461452, + "grad_norm": 0.21520821750164032, + "learning_rate": 2.7754906552084667e-06, + "loss": 0.4951, + "step": 5675 + }, + { + "epoch": 1.5740432612312811, + "grad_norm": 0.20591013133525848, + "learning_rate": 2.7720222861484167e-06, + "loss": 0.4938, + "step": 5676 + }, + { + "epoch": 1.574320576816417, + "grad_norm": 0.22870129346847534, + "learning_rate": 2.768555815274557e-06, + "loss": 0.4814, + "step": 5677 + }, + { + "epoch": 1.574597892401553, + "grad_norm": 0.1999206393957138, + "learning_rate": 2.76509124326329e-06, + "loss": 0.5062, + "step": 5678 + }, + { + "epoch": 1.5748752079866888, + "grad_norm": 0.21404382586479187, + "learning_rate": 2.7616285707906447e-06, + "loss": 0.5205, + "step": 5679 + }, + { + "epoch": 1.5751525235718247, + "grad_norm": 0.22264225780963898, + "learning_rate": 2.7581677985322742e-06, + "loss": 0.4951, + "step": 5680 + }, + { + "epoch": 1.5754298391569606, + "grad_norm": 0.21465402841567993, + "learning_rate": 2.75470892716347e-06, + "loss": 0.4647, + "step": 5681 + }, + { + "epoch": 1.5757071547420964, + "grad_norm": 0.21319536864757538, + "learning_rate": 2.751251957359155e-06, + "loss": 0.5177, + "step": 5682 + }, + { + "epoch": 1.5759844703272323, + "grad_norm": 0.21107381582260132, + "learning_rate": 2.7477968897938717e-06, + "loss": 0.4781, + "step": 5683 + }, + { + "epoch": 1.5762617859123682, + "grad_norm": 0.22035937011241913, + "learning_rate": 2.744343725141792e-06, + "loss": 0.4566, + "step": 5684 + }, + { + "epoch": 1.576539101497504, + "grad_norm": 0.21590951085090637, + "learning_rate": 2.7408924640767218e-06, + "loss": 0.5025, + "step": 5685 + }, + { + "epoch": 1.57681641708264, + "grad_norm": 0.20270653069019318, + "learning_rate": 2.7374431072720975e-06, + "loss": 0.529, + "step": 5686 + }, + { + "epoch": 1.5770937326677759, + "grad_norm": 0.2021293044090271, + "learning_rate": 2.733995655400974e-06, + "loss": 0.4969, + "step": 5687 + }, + { + "epoch": 1.5773710482529117, + "grad_norm": 0.19878076016902924, + "learning_rate": 2.7305501091360444e-06, + "loss": 0.476, + "step": 5688 + }, + { + "epoch": 1.5776483638380476, + "grad_norm": 0.2047017365694046, + "learning_rate": 2.7271064691496277e-06, + "loss": 0.4912, + "step": 5689 + }, + { + "epoch": 1.5779256794231835, + "grad_norm": 0.20984232425689697, + "learning_rate": 2.723664736113668e-06, + "loss": 0.4956, + "step": 5690 + }, + { + "epoch": 1.5782029950083194, + "grad_norm": 0.2142646759748459, + "learning_rate": 2.720224910699733e-06, + "loss": 0.5026, + "step": 5691 + }, + { + "epoch": 1.5784803105934553, + "grad_norm": 0.2094864547252655, + "learning_rate": 2.7167869935790276e-06, + "loss": 0.4846, + "step": 5692 + }, + { + "epoch": 1.5787576261785912, + "grad_norm": 0.21176081895828247, + "learning_rate": 2.713350985422386e-06, + "loss": 0.5049, + "step": 5693 + }, + { + "epoch": 1.579034941763727, + "grad_norm": 0.1998070776462555, + "learning_rate": 2.7099168869002543e-06, + "loss": 0.4521, + "step": 5694 + }, + { + "epoch": 1.579312257348863, + "grad_norm": 0.20673444867134094, + "learning_rate": 2.7064846986827264e-06, + "loss": 0.511, + "step": 5695 + }, + { + "epoch": 1.5795895729339988, + "grad_norm": 0.21131543815135956, + "learning_rate": 2.7030544214395035e-06, + "loss": 0.4869, + "step": 5696 + }, + { + "epoch": 1.5798668885191347, + "grad_norm": 0.21434102952480316, + "learning_rate": 2.6996260558399324e-06, + "loss": 0.4945, + "step": 5697 + }, + { + "epoch": 1.5801442041042706, + "grad_norm": 0.2118789553642273, + "learning_rate": 2.696199602552971e-06, + "loss": 0.4696, + "step": 5698 + }, + { + "epoch": 1.5804215196894065, + "grad_norm": 0.21816542744636536, + "learning_rate": 2.6927750622472157e-06, + "loss": 0.4896, + "step": 5699 + }, + { + "epoch": 1.5806988352745424, + "grad_norm": 0.2012248933315277, + "learning_rate": 2.6893524355908804e-06, + "loss": 0.4781, + "step": 5700 + }, + { + "epoch": 1.5809761508596782, + "grad_norm": 0.1999814510345459, + "learning_rate": 2.685931723251814e-06, + "loss": 0.507, + "step": 5701 + }, + { + "epoch": 1.5812534664448141, + "grad_norm": 0.21244245767593384, + "learning_rate": 2.682512925897489e-06, + "loss": 0.4788, + "step": 5702 + }, + { + "epoch": 1.58153078202995, + "grad_norm": 0.19930778443813324, + "learning_rate": 2.6790960441949996e-06, + "loss": 0.5082, + "step": 5703 + }, + { + "epoch": 1.581808097615086, + "grad_norm": 0.19802865386009216, + "learning_rate": 2.675681078811075e-06, + "loss": 0.4891, + "step": 5704 + }, + { + "epoch": 1.5820854132002218, + "grad_norm": 0.2145322859287262, + "learning_rate": 2.6722680304120575e-06, + "loss": 0.5113, + "step": 5705 + }, + { + "epoch": 1.5823627287853577, + "grad_norm": 0.20376405119895935, + "learning_rate": 2.6688568996639337e-06, + "loss": 0.4751, + "step": 5706 + }, + { + "epoch": 1.5826400443704935, + "grad_norm": 0.21193109452724457, + "learning_rate": 2.6654476872322948e-06, + "loss": 0.4846, + "step": 5707 + }, + { + "epoch": 1.5829173599556294, + "grad_norm": 0.21111689507961273, + "learning_rate": 2.662040393782375e-06, + "loss": 0.4803, + "step": 5708 + }, + { + "epoch": 1.5831946755407653, + "grad_norm": 0.204023540019989, + "learning_rate": 2.658635019979029e-06, + "loss": 0.5126, + "step": 5709 + }, + { + "epoch": 1.5834719911259012, + "grad_norm": 0.20971745252609253, + "learning_rate": 2.655231566486732e-06, + "loss": 0.5089, + "step": 5710 + }, + { + "epoch": 1.583749306711037, + "grad_norm": 0.19988863170146942, + "learning_rate": 2.6518300339695865e-06, + "loss": 0.5121, + "step": 5711 + }, + { + "epoch": 1.584026622296173, + "grad_norm": 0.21533909440040588, + "learning_rate": 2.6484304230913236e-06, + "loss": 0.4912, + "step": 5712 + }, + { + "epoch": 1.5843039378813089, + "grad_norm": 0.20329493284225464, + "learning_rate": 2.6450327345153e-06, + "loss": 0.4747, + "step": 5713 + }, + { + "epoch": 1.5845812534664447, + "grad_norm": 0.2205643355846405, + "learning_rate": 2.6416369689044903e-06, + "loss": 0.5216, + "step": 5714 + }, + { + "epoch": 1.5848585690515806, + "grad_norm": 0.20560097694396973, + "learning_rate": 2.638243126921498e-06, + "loss": 0.4795, + "step": 5715 + }, + { + "epoch": 1.5851358846367165, + "grad_norm": 0.19872471690177917, + "learning_rate": 2.6348512092285603e-06, + "loss": 0.516, + "step": 5716 + }, + { + "epoch": 1.5854132002218524, + "grad_norm": 0.2076081484556198, + "learning_rate": 2.6314612164875213e-06, + "loss": 0.4939, + "step": 5717 + }, + { + "epoch": 1.5856905158069883, + "grad_norm": 0.19979238510131836, + "learning_rate": 2.6280731493598596e-06, + "loss": 0.4665, + "step": 5718 + }, + { + "epoch": 1.5859678313921242, + "grad_norm": 0.23124083876609802, + "learning_rate": 2.6246870085066764e-06, + "loss": 0.5132, + "step": 5719 + }, + { + "epoch": 1.58624514697726, + "grad_norm": 0.22173020243644714, + "learning_rate": 2.6213027945887035e-06, + "loss": 0.5085, + "step": 5720 + }, + { + "epoch": 1.586522462562396, + "grad_norm": 0.19916951656341553, + "learning_rate": 2.6179205082662862e-06, + "loss": 0.4924, + "step": 5721 + }, + { + "epoch": 1.5867997781475318, + "grad_norm": 0.21253123879432678, + "learning_rate": 2.614540150199396e-06, + "loss": 0.5012, + "step": 5722 + }, + { + "epoch": 1.5870770937326677, + "grad_norm": 0.21266770362854004, + "learning_rate": 2.611161721047632e-06, + "loss": 0.51, + "step": 5723 + }, + { + "epoch": 1.5873544093178036, + "grad_norm": 0.20849168300628662, + "learning_rate": 2.60778522147022e-06, + "loss": 0.5161, + "step": 5724 + }, + { + "epoch": 1.5876317249029395, + "grad_norm": 0.20374304056167603, + "learning_rate": 2.6044106521259963e-06, + "loss": 0.4899, + "step": 5725 + }, + { + "epoch": 1.5879090404880754, + "grad_norm": 0.2087586522102356, + "learning_rate": 2.6010380136734347e-06, + "loss": 0.5035, + "step": 5726 + }, + { + "epoch": 1.5881863560732112, + "grad_norm": 0.20148667693138123, + "learning_rate": 2.5976673067706262e-06, + "loss": 0.504, + "step": 5727 + }, + { + "epoch": 1.5884636716583471, + "grad_norm": 0.2141498327255249, + "learning_rate": 2.5942985320752843e-06, + "loss": 0.5007, + "step": 5728 + }, + { + "epoch": 1.588740987243483, + "grad_norm": 0.20762969553470612, + "learning_rate": 2.5909316902447426e-06, + "loss": 0.4928, + "step": 5729 + }, + { + "epoch": 1.589018302828619, + "grad_norm": 0.20631739497184753, + "learning_rate": 2.5875667819359626e-06, + "loss": 0.4968, + "step": 5730 + }, + { + "epoch": 1.5892956184137548, + "grad_norm": 0.2064218521118164, + "learning_rate": 2.5842038078055327e-06, + "loss": 0.4878, + "step": 5731 + }, + { + "epoch": 1.5895729339988907, + "grad_norm": 0.20431697368621826, + "learning_rate": 2.5808427685096505e-06, + "loss": 0.4973, + "step": 5732 + }, + { + "epoch": 1.5898502495840265, + "grad_norm": 0.2072911262512207, + "learning_rate": 2.5774836647041515e-06, + "loss": 0.5057, + "step": 5733 + }, + { + "epoch": 1.5901275651691624, + "grad_norm": 0.22611966729164124, + "learning_rate": 2.5741264970444768e-06, + "loss": 0.4918, + "step": 5734 + }, + { + "epoch": 1.5904048807542983, + "grad_norm": 0.20940294861793518, + "learning_rate": 2.570771266185708e-06, + "loss": 0.4741, + "step": 5735 + }, + { + "epoch": 1.5906821963394342, + "grad_norm": 0.21070222556591034, + "learning_rate": 2.5674179727825307e-06, + "loss": 0.4841, + "step": 5736 + }, + { + "epoch": 1.59095951192457, + "grad_norm": 0.21693362295627594, + "learning_rate": 2.564066617489269e-06, + "loss": 0.4725, + "step": 5737 + }, + { + "epoch": 1.591236827509706, + "grad_norm": 0.20319637656211853, + "learning_rate": 2.5607172009598556e-06, + "loss": 0.5041, + "step": 5738 + }, + { + "epoch": 1.5915141430948418, + "grad_norm": 0.2054455578327179, + "learning_rate": 2.5573697238478523e-06, + "loss": 0.4975, + "step": 5739 + }, + { + "epoch": 1.5917914586799777, + "grad_norm": 0.20702321827411652, + "learning_rate": 2.5540241868064434e-06, + "loss": 0.4761, + "step": 5740 + }, + { + "epoch": 1.5920687742651136, + "grad_norm": 0.23018468916416168, + "learning_rate": 2.5506805904884272e-06, + "loss": 0.5258, + "step": 5741 + }, + { + "epoch": 1.5923460898502495, + "grad_norm": 0.2070692479610443, + "learning_rate": 2.5473389355462325e-06, + "loss": 0.5011, + "step": 5742 + }, + { + "epoch": 1.5926234054353854, + "grad_norm": 0.21105840802192688, + "learning_rate": 2.543999222631899e-06, + "loss": 0.491, + "step": 5743 + }, + { + "epoch": 1.5929007210205213, + "grad_norm": 0.21551688015460968, + "learning_rate": 2.5406614523971e-06, + "loss": 0.4878, + "step": 5744 + }, + { + "epoch": 1.5931780366056572, + "grad_norm": 0.21947219967842102, + "learning_rate": 2.537325625493116e-06, + "loss": 0.4982, + "step": 5745 + }, + { + "epoch": 1.593455352190793, + "grad_norm": 0.21670997142791748, + "learning_rate": 2.5339917425708584e-06, + "loss": 0.5216, + "step": 5746 + }, + { + "epoch": 1.593732667775929, + "grad_norm": 0.22349973022937775, + "learning_rate": 2.5306598042808592e-06, + "loss": 0.5216, + "step": 5747 + }, + { + "epoch": 1.5940099833610648, + "grad_norm": 0.20010913908481598, + "learning_rate": 2.5273298112732657e-06, + "loss": 0.467, + "step": 5748 + }, + { + "epoch": 1.5942872989462007, + "grad_norm": 0.2064429521560669, + "learning_rate": 2.5240017641978435e-06, + "loss": 0.4931, + "step": 5749 + }, + { + "epoch": 1.5945646145313366, + "grad_norm": 0.19768312573432922, + "learning_rate": 2.520675663703985e-06, + "loss": 0.4852, + "step": 5750 + }, + { + "epoch": 1.5948419301164725, + "grad_norm": 0.21115142107009888, + "learning_rate": 2.517351510440706e-06, + "loss": 0.5125, + "step": 5751 + }, + { + "epoch": 1.5951192457016083, + "grad_norm": 0.19108986854553223, + "learning_rate": 2.5140293050566295e-06, + "loss": 0.4878, + "step": 5752 + }, + { + "epoch": 1.5953965612867442, + "grad_norm": 0.20708754658699036, + "learning_rate": 2.510709048200009e-06, + "loss": 0.4871, + "step": 5753 + }, + { + "epoch": 1.5956738768718801, + "grad_norm": 0.20507743954658508, + "learning_rate": 2.507390740518717e-06, + "loss": 0.5031, + "step": 5754 + }, + { + "epoch": 1.595951192457016, + "grad_norm": 0.2134825587272644, + "learning_rate": 2.5040743826602405e-06, + "loss": 0.4753, + "step": 5755 + }, + { + "epoch": 1.5962285080421519, + "grad_norm": 0.20486515760421753, + "learning_rate": 2.5007599752716867e-06, + "loss": 0.5022, + "step": 5756 + }, + { + "epoch": 1.5965058236272878, + "grad_norm": 0.20787964761257172, + "learning_rate": 2.4974475189997862e-06, + "loss": 0.498, + "step": 5757 + }, + { + "epoch": 1.5967831392124237, + "grad_norm": 0.20221605896949768, + "learning_rate": 2.494137014490891e-06, + "loss": 0.518, + "step": 5758 + }, + { + "epoch": 1.5970604547975595, + "grad_norm": 0.2075101137161255, + "learning_rate": 2.4908284623909638e-06, + "loss": 0.5026, + "step": 5759 + }, + { + "epoch": 1.5973377703826954, + "grad_norm": 0.21558356285095215, + "learning_rate": 2.487521863345589e-06, + "loss": 0.488, + "step": 5760 + }, + { + "epoch": 1.5976150859678313, + "grad_norm": 0.20336772501468658, + "learning_rate": 2.4842172179999736e-06, + "loss": 0.4776, + "step": 5761 + }, + { + "epoch": 1.5978924015529672, + "grad_norm": 0.2211672067642212, + "learning_rate": 2.480914526998945e-06, + "loss": 0.4935, + "step": 5762 + }, + { + "epoch": 1.598169717138103, + "grad_norm": 0.2278861552476883, + "learning_rate": 2.4776137909869434e-06, + "loss": 0.4833, + "step": 5763 + }, + { + "epoch": 1.598447032723239, + "grad_norm": 0.20449037849903107, + "learning_rate": 2.4743150106080233e-06, + "loss": 0.4874, + "step": 5764 + }, + { + "epoch": 1.5987243483083748, + "grad_norm": 0.20173722505569458, + "learning_rate": 2.471018186505876e-06, + "loss": 0.4963, + "step": 5765 + }, + { + "epoch": 1.5990016638935107, + "grad_norm": 0.2080666571855545, + "learning_rate": 2.4677233193237945e-06, + "loss": 0.4766, + "step": 5766 + }, + { + "epoch": 1.5992789794786466, + "grad_norm": 0.2013276368379593, + "learning_rate": 2.4644304097046892e-06, + "loss": 0.4742, + "step": 5767 + }, + { + "epoch": 1.5995562950637825, + "grad_norm": 0.21188431978225708, + "learning_rate": 2.461139458291098e-06, + "loss": 0.4795, + "step": 5768 + }, + { + "epoch": 1.5998336106489184, + "grad_norm": 0.20740027725696564, + "learning_rate": 2.457850465725177e-06, + "loss": 0.4796, + "step": 5769 + }, + { + "epoch": 1.6001109262340543, + "grad_norm": 0.20362474024295807, + "learning_rate": 2.454563432648692e-06, + "loss": 0.5114, + "step": 5770 + }, + { + "epoch": 1.6003882418191901, + "grad_norm": 0.3291257619857788, + "learning_rate": 2.451278359703027e-06, + "loss": 0.4933, + "step": 5771 + }, + { + "epoch": 1.600665557404326, + "grad_norm": 0.20942147076129913, + "learning_rate": 2.447995247529189e-06, + "loss": 0.4955, + "step": 5772 + }, + { + "epoch": 1.600942872989462, + "grad_norm": 0.20903481543064117, + "learning_rate": 2.4447140967678057e-06, + "loss": 0.4991, + "step": 5773 + }, + { + "epoch": 1.6012201885745978, + "grad_norm": 0.20149755477905273, + "learning_rate": 2.4414349080591064e-06, + "loss": 0.4864, + "step": 5774 + }, + { + "epoch": 1.6014975041597337, + "grad_norm": 0.20722270011901855, + "learning_rate": 2.4381576820429588e-06, + "loss": 0.4994, + "step": 5775 + }, + { + "epoch": 1.6017748197448696, + "grad_norm": 0.21165424585342407, + "learning_rate": 2.434882419358826e-06, + "loss": 0.4968, + "step": 5776 + }, + { + "epoch": 1.6020521353300055, + "grad_norm": 0.2054947316646576, + "learning_rate": 2.4316091206458073e-06, + "loss": 0.48, + "step": 5777 + }, + { + "epoch": 1.6023294509151413, + "grad_norm": 0.2067498415708542, + "learning_rate": 2.428337786542603e-06, + "loss": 0.4804, + "step": 5778 + }, + { + "epoch": 1.6026067665002772, + "grad_norm": 0.19922077655792236, + "learning_rate": 2.42506841768754e-06, + "loss": 0.4946, + "step": 5779 + }, + { + "epoch": 1.602884082085413, + "grad_norm": 0.202239990234375, + "learning_rate": 2.4218010147185625e-06, + "loss": 0.4929, + "step": 5780 + }, + { + "epoch": 1.603161397670549, + "grad_norm": 0.2116711288690567, + "learning_rate": 2.4185355782732205e-06, + "loss": 0.4702, + "step": 5781 + }, + { + "epoch": 1.6034387132556849, + "grad_norm": 0.1949220895767212, + "learning_rate": 2.4152721089886933e-06, + "loss": 0.4611, + "step": 5782 + }, + { + "epoch": 1.6037160288408208, + "grad_norm": 0.20025965571403503, + "learning_rate": 2.412010607501765e-06, + "loss": 0.5017, + "step": 5783 + }, + { + "epoch": 1.6039933444259566, + "grad_norm": 0.22020164132118225, + "learning_rate": 2.4087510744488465e-06, + "loss": 0.4937, + "step": 5784 + }, + { + "epoch": 1.6042706600110925, + "grad_norm": 0.21315798163414001, + "learning_rate": 2.4054935104659533e-06, + "loss": 0.4847, + "step": 5785 + }, + { + "epoch": 1.6045479755962284, + "grad_norm": 0.22190795838832855, + "learning_rate": 2.4022379161887265e-06, + "loss": 0.4754, + "step": 5786 + }, + { + "epoch": 1.6048252911813643, + "grad_norm": 0.21109223365783691, + "learning_rate": 2.3989842922524154e-06, + "loss": 0.5015, + "step": 5787 + }, + { + "epoch": 1.6051026067665002, + "grad_norm": 0.21155421435832977, + "learning_rate": 2.3957326392918906e-06, + "loss": 0.4992, + "step": 5788 + }, + { + "epoch": 1.605379922351636, + "grad_norm": 0.2093130350112915, + "learning_rate": 2.392482957941637e-06, + "loss": 0.5007, + "step": 5789 + }, + { + "epoch": 1.605657237936772, + "grad_norm": 0.20884621143341064, + "learning_rate": 2.389235248835754e-06, + "loss": 0.5155, + "step": 5790 + }, + { + "epoch": 1.6059345535219078, + "grad_norm": 0.20897847414016724, + "learning_rate": 2.385989512607946e-06, + "loss": 0.5158, + "step": 5791 + }, + { + "epoch": 1.6062118691070437, + "grad_norm": 0.2077287882566452, + "learning_rate": 2.382745749891556e-06, + "loss": 0.4918, + "step": 5792 + }, + { + "epoch": 1.6064891846921796, + "grad_norm": 0.2046043872833252, + "learning_rate": 2.379503961319522e-06, + "loss": 0.4631, + "step": 5793 + }, + { + "epoch": 1.6067665002773155, + "grad_norm": 0.22556503117084503, + "learning_rate": 2.3762641475244e-06, + "loss": 0.4901, + "step": 5794 + }, + { + "epoch": 1.6070438158624514, + "grad_norm": 0.20669564604759216, + "learning_rate": 2.3730263091383654e-06, + "loss": 0.5145, + "step": 5795 + }, + { + "epoch": 1.6073211314475873, + "grad_norm": 0.20489123463630676, + "learning_rate": 2.3697904467932104e-06, + "loss": 0.49, + "step": 5796 + }, + { + "epoch": 1.6075984470327231, + "grad_norm": 0.2056889533996582, + "learning_rate": 2.366556561120334e-06, + "loss": 0.5178, + "step": 5797 + }, + { + "epoch": 1.607875762617859, + "grad_norm": 0.22281832993030548, + "learning_rate": 2.3633246527507507e-06, + "loss": 0.4962, + "step": 5798 + }, + { + "epoch": 1.608153078202995, + "grad_norm": 0.20517009496688843, + "learning_rate": 2.3600947223150926e-06, + "loss": 0.517, + "step": 5799 + }, + { + "epoch": 1.6084303937881308, + "grad_norm": 0.2156504988670349, + "learning_rate": 2.3568667704436096e-06, + "loss": 0.4901, + "step": 5800 + }, + { + "epoch": 1.6087077093732667, + "grad_norm": 0.20395740866661072, + "learning_rate": 2.3536407977661573e-06, + "loss": 0.4848, + "step": 5801 + }, + { + "epoch": 1.6089850249584026, + "grad_norm": 0.20527851581573486, + "learning_rate": 2.3504168049122006e-06, + "loss": 0.5139, + "step": 5802 + }, + { + "epoch": 1.6092623405435384, + "grad_norm": 0.20943453907966614, + "learning_rate": 2.34719479251084e-06, + "loss": 0.5047, + "step": 5803 + }, + { + "epoch": 1.6095396561286743, + "grad_norm": 0.19371294975280762, + "learning_rate": 2.34397476119077e-06, + "loss": 0.4913, + "step": 5804 + }, + { + "epoch": 1.6098169717138102, + "grad_norm": 0.20842455327510834, + "learning_rate": 2.3407567115802983e-06, + "loss": 0.5115, + "step": 5805 + }, + { + "epoch": 1.610094287298946, + "grad_norm": 0.21230155229568481, + "learning_rate": 2.337540644307358e-06, + "loss": 0.4878, + "step": 5806 + }, + { + "epoch": 1.610371602884082, + "grad_norm": 0.20435665547847748, + "learning_rate": 2.334326559999489e-06, + "loss": 0.5108, + "step": 5807 + }, + { + "epoch": 1.6106489184692179, + "grad_norm": 0.20020738244056702, + "learning_rate": 2.3311144592838425e-06, + "loss": 0.4754, + "step": 5808 + }, + { + "epoch": 1.6109262340543538, + "grad_norm": 0.20572920143604279, + "learning_rate": 2.327904342787182e-06, + "loss": 0.5035, + "step": 5809 + }, + { + "epoch": 1.6112035496394896, + "grad_norm": 0.2309185266494751, + "learning_rate": 2.324696211135889e-06, + "loss": 0.5065, + "step": 5810 + }, + { + "epoch": 1.6114808652246255, + "grad_norm": 0.22252123057842255, + "learning_rate": 2.3214900649559572e-06, + "loss": 0.5174, + "step": 5811 + }, + { + "epoch": 1.6117581808097614, + "grad_norm": 0.1993841528892517, + "learning_rate": 2.3182859048729856e-06, + "loss": 0.4873, + "step": 5812 + }, + { + "epoch": 1.6120354963948973, + "grad_norm": 0.2186397761106491, + "learning_rate": 2.3150837315121966e-06, + "loss": 0.5177, + "step": 5813 + }, + { + "epoch": 1.6123128119800332, + "grad_norm": 0.21579131484031677, + "learning_rate": 2.3118835454984126e-06, + "loss": 0.4963, + "step": 5814 + }, + { + "epoch": 1.612590127565169, + "grad_norm": 0.23498544096946716, + "learning_rate": 2.3086853474560814e-06, + "loss": 0.4963, + "step": 5815 + }, + { + "epoch": 1.612867443150305, + "grad_norm": 0.20521143078804016, + "learning_rate": 2.305489138009248e-06, + "loss": 0.4848, + "step": 5816 + }, + { + "epoch": 1.6131447587354408, + "grad_norm": 0.1934400200843811, + "learning_rate": 2.302294917781583e-06, + "loss": 0.4906, + "step": 5817 + }, + { + "epoch": 1.6134220743205767, + "grad_norm": 0.20979085564613342, + "learning_rate": 2.2991026873963676e-06, + "loss": 0.4736, + "step": 5818 + }, + { + "epoch": 1.6136993899057126, + "grad_norm": 0.2139722853899002, + "learning_rate": 2.295912447476481e-06, + "loss": 0.4949, + "step": 5819 + }, + { + "epoch": 1.6139767054908485, + "grad_norm": 0.21068985760211945, + "learning_rate": 2.2927241986444303e-06, + "loss": 0.4978, + "step": 5820 + }, + { + "epoch": 1.6142540210759844, + "grad_norm": 0.20163820683956146, + "learning_rate": 2.289537941522324e-06, + "loss": 0.5178, + "step": 5821 + }, + { + "epoch": 1.6145313366611203, + "grad_norm": 0.20904479920864105, + "learning_rate": 2.286353676731888e-06, + "loss": 0.5101, + "step": 5822 + }, + { + "epoch": 1.6148086522462561, + "grad_norm": 0.2054595649242401, + "learning_rate": 2.283171404894452e-06, + "loss": 0.4829, + "step": 5823 + }, + { + "epoch": 1.615085967831392, + "grad_norm": 0.21114782989025116, + "learning_rate": 2.279991126630969e-06, + "loss": 0.4995, + "step": 5824 + }, + { + "epoch": 1.615363283416528, + "grad_norm": 0.20004698634147644, + "learning_rate": 2.2768128425619858e-06, + "loss": 0.4823, + "step": 5825 + }, + { + "epoch": 1.6156405990016638, + "grad_norm": 0.21248266100883484, + "learning_rate": 2.273636553307677e-06, + "loss": 0.5148, + "step": 5826 + }, + { + "epoch": 1.6159179145867997, + "grad_norm": 0.21467168629169464, + "learning_rate": 2.2704622594878225e-06, + "loss": 0.4923, + "step": 5827 + }, + { + "epoch": 1.6161952301719356, + "grad_norm": 0.21232888102531433, + "learning_rate": 2.2672899617218065e-06, + "loss": 0.5014, + "step": 5828 + }, + { + "epoch": 1.6164725457570714, + "grad_norm": 0.2272992879152298, + "learning_rate": 2.2641196606286274e-06, + "loss": 0.5026, + "step": 5829 + }, + { + "epoch": 1.6167498613422073, + "grad_norm": 0.2556884288787842, + "learning_rate": 2.2609513568268958e-06, + "loss": 0.5019, + "step": 5830 + }, + { + "epoch": 1.6170271769273432, + "grad_norm": 0.2090773582458496, + "learning_rate": 2.257785050934838e-06, + "loss": 0.5294, + "step": 5831 + }, + { + "epoch": 1.617304492512479, + "grad_norm": 0.2042931616306305, + "learning_rate": 2.2546207435702738e-06, + "loss": 0.507, + "step": 5832 + }, + { + "epoch": 1.617581808097615, + "grad_norm": 0.21114638447761536, + "learning_rate": 2.2514584353506505e-06, + "loss": 0.4889, + "step": 5833 + }, + { + "epoch": 1.6178591236827509, + "grad_norm": 0.21107198297977448, + "learning_rate": 2.2482981268930183e-06, + "loss": 0.4965, + "step": 5834 + }, + { + "epoch": 1.6181364392678868, + "grad_norm": 0.22077830135822296, + "learning_rate": 2.2451398188140365e-06, + "loss": 0.4955, + "step": 5835 + }, + { + "epoch": 1.6184137548530226, + "grad_norm": 0.20404209196567535, + "learning_rate": 2.2419835117299682e-06, + "loss": 0.4862, + "step": 5836 + }, + { + "epoch": 1.6186910704381585, + "grad_norm": 0.2278210073709488, + "learning_rate": 2.2388292062567e-06, + "loss": 0.4899, + "step": 5837 + }, + { + "epoch": 1.6189683860232944, + "grad_norm": 0.20038627088069916, + "learning_rate": 2.2356769030097203e-06, + "loss": 0.4979, + "step": 5838 + }, + { + "epoch": 1.6192457016084303, + "grad_norm": 0.2179410755634308, + "learning_rate": 2.232526602604125e-06, + "loss": 0.4971, + "step": 5839 + }, + { + "epoch": 1.6195230171935662, + "grad_norm": 0.2113763988018036, + "learning_rate": 2.2293783056546156e-06, + "loss": 0.5079, + "step": 5840 + }, + { + "epoch": 1.619800332778702, + "grad_norm": 0.2023729383945465, + "learning_rate": 2.2262320127755184e-06, + "loss": 0.5081, + "step": 5841 + }, + { + "epoch": 1.620077648363838, + "grad_norm": 0.2042120099067688, + "learning_rate": 2.2230877245807553e-06, + "loss": 0.4965, + "step": 5842 + }, + { + "epoch": 1.6203549639489738, + "grad_norm": 0.21033427119255066, + "learning_rate": 2.2199454416838585e-06, + "loss": 0.4882, + "step": 5843 + }, + { + "epoch": 1.6206322795341097, + "grad_norm": 0.2538556456565857, + "learning_rate": 2.2168051646979647e-06, + "loss": 0.4803, + "step": 5844 + }, + { + "epoch": 1.6209095951192456, + "grad_norm": 0.20632927119731903, + "learning_rate": 2.2136668942358373e-06, + "loss": 0.4971, + "step": 5845 + }, + { + "epoch": 1.6211869107043815, + "grad_norm": 0.20020337402820587, + "learning_rate": 2.210530630909831e-06, + "loss": 0.4935, + "step": 5846 + }, + { + "epoch": 1.6214642262895174, + "grad_norm": 0.21856513619422913, + "learning_rate": 2.2073963753319076e-06, + "loss": 0.4761, + "step": 5847 + }, + { + "epoch": 1.6217415418746532, + "grad_norm": 0.2104158103466034, + "learning_rate": 2.2042641281136508e-06, + "loss": 0.4971, + "step": 5848 + }, + { + "epoch": 1.6220188574597891, + "grad_norm": 0.20630767941474915, + "learning_rate": 2.2011338898662458e-06, + "loss": 0.4878, + "step": 5849 + }, + { + "epoch": 1.622296173044925, + "grad_norm": 0.20518629252910614, + "learning_rate": 2.198005661200478e-06, + "loss": 0.5087, + "step": 5850 + }, + { + "epoch": 1.622573488630061, + "grad_norm": 0.25231507420539856, + "learning_rate": 2.1948794427267565e-06, + "loss": 0.514, + "step": 5851 + }, + { + "epoch": 1.6228508042151968, + "grad_norm": 0.21174240112304688, + "learning_rate": 2.1917552350550803e-06, + "loss": 0.4977, + "step": 5852 + }, + { + "epoch": 1.6231281198003327, + "grad_norm": 0.2215232104063034, + "learning_rate": 2.1886330387950737e-06, + "loss": 0.4799, + "step": 5853 + }, + { + "epoch": 1.6234054353854686, + "grad_norm": 0.2032269835472107, + "learning_rate": 2.1855128545559517e-06, + "loss": 0.4942, + "step": 5854 + }, + { + "epoch": 1.6236827509706044, + "grad_norm": 0.2067827433347702, + "learning_rate": 2.1823946829465496e-06, + "loss": 0.5164, + "step": 5855 + }, + { + "epoch": 1.6239600665557403, + "grad_norm": 0.19948525726795197, + "learning_rate": 2.1792785245753082e-06, + "loss": 0.4838, + "step": 5856 + }, + { + "epoch": 1.6242373821408762, + "grad_norm": 0.21956472098827362, + "learning_rate": 2.176164380050265e-06, + "loss": 0.4999, + "step": 5857 + }, + { + "epoch": 1.624514697726012, + "grad_norm": 0.20982100069522858, + "learning_rate": 2.1730522499790793e-06, + "loss": 0.4661, + "step": 5858 + }, + { + "epoch": 1.624792013311148, + "grad_norm": 0.20440787076950073, + "learning_rate": 2.169942134969004e-06, + "loss": 0.4907, + "step": 5859 + }, + { + "epoch": 1.6250693288962839, + "grad_norm": 0.21445617079734802, + "learning_rate": 2.1668340356269107e-06, + "loss": 0.5016, + "step": 5860 + }, + { + "epoch": 1.6253466444814197, + "grad_norm": 0.25135523080825806, + "learning_rate": 2.163727952559266e-06, + "loss": 0.4958, + "step": 5861 + }, + { + "epoch": 1.6256239600665556, + "grad_norm": 0.20553235709667206, + "learning_rate": 2.1606238863721568e-06, + "loss": 0.4758, + "step": 5862 + }, + { + "epoch": 1.6259012756516915, + "grad_norm": 0.1968127191066742, + "learning_rate": 2.157521837671259e-06, + "loss": 0.4919, + "step": 5863 + }, + { + "epoch": 1.6261785912368274, + "grad_norm": 0.28879934549331665, + "learning_rate": 2.1544218070618695e-06, + "loss": 0.4916, + "step": 5864 + }, + { + "epoch": 1.6264559068219633, + "grad_norm": 0.2082158625125885, + "learning_rate": 2.1513237951488907e-06, + "loss": 0.4969, + "step": 5865 + }, + { + "epoch": 1.6267332224070992, + "grad_norm": 0.20625808835029602, + "learning_rate": 2.1482278025368214e-06, + "loss": 0.4714, + "step": 5866 + }, + { + "epoch": 1.627010537992235, + "grad_norm": 0.2138909250497818, + "learning_rate": 2.1451338298297706e-06, + "loss": 0.4911, + "step": 5867 + }, + { + "epoch": 1.627287853577371, + "grad_norm": 0.20718157291412354, + "learning_rate": 2.1420418776314565e-06, + "loss": 0.4918, + "step": 5868 + }, + { + "epoch": 1.6275651691625068, + "grad_norm": 0.2110764980316162, + "learning_rate": 2.1389519465452035e-06, + "loss": 0.4946, + "step": 5869 + }, + { + "epoch": 1.6278424847476427, + "grad_norm": 0.21094083786010742, + "learning_rate": 2.135864037173933e-06, + "loss": 0.4799, + "step": 5870 + }, + { + "epoch": 1.6281198003327786, + "grad_norm": 0.20451469719409943, + "learning_rate": 2.132778150120182e-06, + "loss": 0.4805, + "step": 5871 + }, + { + "epoch": 1.6283971159179145, + "grad_norm": 0.20834794640541077, + "learning_rate": 2.129694285986092e-06, + "loss": 0.5009, + "step": 5872 + }, + { + "epoch": 1.6286744315030504, + "grad_norm": 0.20821170508861542, + "learning_rate": 2.126612445373402e-06, + "loss": 0.5267, + "step": 5873 + }, + { + "epoch": 1.6289517470881862, + "grad_norm": 0.2115286886692047, + "learning_rate": 2.1235326288834595e-06, + "loss": 0.5084, + "step": 5874 + }, + { + "epoch": 1.6292290626733221, + "grad_norm": 0.2055329829454422, + "learning_rate": 2.1204548371172194e-06, + "loss": 0.4826, + "step": 5875 + }, + { + "epoch": 1.629506378258458, + "grad_norm": 0.20218388736248016, + "learning_rate": 2.117379070675245e-06, + "loss": 0.5188, + "step": 5876 + }, + { + "epoch": 1.629783693843594, + "grad_norm": 0.22304154932498932, + "learning_rate": 2.1143053301576954e-06, + "loss": 0.5008, + "step": 5877 + }, + { + "epoch": 1.6300610094287298, + "grad_norm": 0.1911652535200119, + "learning_rate": 2.1112336161643347e-06, + "loss": 0.4933, + "step": 5878 + }, + { + "epoch": 1.6303383250138657, + "grad_norm": 0.20941410958766937, + "learning_rate": 2.108163929294546e-06, + "loss": 0.5065, + "step": 5879 + }, + { + "epoch": 1.6306156405990015, + "grad_norm": 0.20953522622585297, + "learning_rate": 2.1050962701473014e-06, + "loss": 0.4801, + "step": 5880 + }, + { + "epoch": 1.6308929561841374, + "grad_norm": 0.2114289551973343, + "learning_rate": 2.102030639321183e-06, + "loss": 0.4981, + "step": 5881 + }, + { + "epoch": 1.6311702717692733, + "grad_norm": 0.21842321753501892, + "learning_rate": 2.0989670374143693e-06, + "loss": 0.5037, + "step": 5882 + }, + { + "epoch": 1.6314475873544092, + "grad_norm": 0.2003999501466751, + "learning_rate": 2.0959054650246626e-06, + "loss": 0.4975, + "step": 5883 + }, + { + "epoch": 1.631724902939545, + "grad_norm": 0.21246321499347687, + "learning_rate": 2.0928459227494505e-06, + "loss": 0.4951, + "step": 5884 + }, + { + "epoch": 1.632002218524681, + "grad_norm": 0.1984589397907257, + "learning_rate": 2.0897884111857292e-06, + "loss": 0.4977, + "step": 5885 + }, + { + "epoch": 1.6322795341098169, + "grad_norm": 0.20242911577224731, + "learning_rate": 2.086732930930102e-06, + "loss": 0.4695, + "step": 5886 + }, + { + "epoch": 1.6325568496949527, + "grad_norm": 0.2037610560655594, + "learning_rate": 2.0836794825787763e-06, + "loss": 0.4774, + "step": 5887 + }, + { + "epoch": 1.6328341652800886, + "grad_norm": 0.23012907803058624, + "learning_rate": 2.0806280667275594e-06, + "loss": 0.4965, + "step": 5888 + }, + { + "epoch": 1.6331114808652245, + "grad_norm": 0.21143199503421783, + "learning_rate": 2.0775786839718607e-06, + "loss": 0.4985, + "step": 5889 + }, + { + "epoch": 1.6333887964503604, + "grad_norm": 0.20616495609283447, + "learning_rate": 2.074531334906696e-06, + "loss": 0.5273, + "step": 5890 + }, + { + "epoch": 1.6336661120354963, + "grad_norm": 0.2136596441268921, + "learning_rate": 2.0714860201266895e-06, + "loss": 0.4846, + "step": 5891 + }, + { + "epoch": 1.6339434276206322, + "grad_norm": 0.22074300050735474, + "learning_rate": 2.068442740226055e-06, + "loss": 0.5065, + "step": 5892 + }, + { + "epoch": 1.634220743205768, + "grad_norm": 0.20592841506004333, + "learning_rate": 2.065401495798622e-06, + "loss": 0.4774, + "step": 5893 + }, + { + "epoch": 1.634498058790904, + "grad_norm": 0.20553089678287506, + "learning_rate": 2.062362287437818e-06, + "loss": 0.4521, + "step": 5894 + }, + { + "epoch": 1.6347753743760398, + "grad_norm": 0.21127671003341675, + "learning_rate": 2.0593251157366727e-06, + "loss": 0.5082, + "step": 5895 + }, + { + "epoch": 1.6350526899611757, + "grad_norm": 0.20608097314834595, + "learning_rate": 2.056289981287815e-06, + "loss": 0.4999, + "step": 5896 + }, + { + "epoch": 1.6353300055463116, + "grad_norm": 0.20263780653476715, + "learning_rate": 2.0532568846834825e-06, + "loss": 0.469, + "step": 5897 + }, + { + "epoch": 1.6356073211314475, + "grad_norm": 0.2072770744562149, + "learning_rate": 2.050225826515516e-06, + "loss": 0.476, + "step": 5898 + }, + { + "epoch": 1.6358846367165834, + "grad_norm": 0.20764364302158356, + "learning_rate": 2.0471968073753483e-06, + "loss": 0.4831, + "step": 5899 + }, + { + "epoch": 1.6361619523017192, + "grad_norm": 0.20213009417057037, + "learning_rate": 2.0441698278540296e-06, + "loss": 0.4584, + "step": 5900 + }, + { + "epoch": 1.6364392678868551, + "grad_norm": 0.19651676714420319, + "learning_rate": 2.041144888542196e-06, + "loss": 0.4947, + "step": 5901 + }, + { + "epoch": 1.636716583471991, + "grad_norm": 0.20257742702960968, + "learning_rate": 2.0381219900300993e-06, + "loss": 0.5024, + "step": 5902 + }, + { + "epoch": 1.6369938990571269, + "grad_norm": 0.20740802586078644, + "learning_rate": 2.0351011329075823e-06, + "loss": 0.5088, + "step": 5903 + }, + { + "epoch": 1.6372712146422628, + "grad_norm": 0.21403908729553223, + "learning_rate": 2.0320823177641e-06, + "loss": 0.4876, + "step": 5904 + }, + { + "epoch": 1.6375485302273987, + "grad_norm": 0.2182864248752594, + "learning_rate": 2.0290655451886965e-06, + "loss": 0.4772, + "step": 5905 + }, + { + "epoch": 1.6378258458125345, + "grad_norm": 0.21262075006961823, + "learning_rate": 2.0260508157700266e-06, + "loss": 0.4797, + "step": 5906 + }, + { + "epoch": 1.6381031613976704, + "grad_norm": 0.21089348196983337, + "learning_rate": 2.023038130096347e-06, + "loss": 0.4934, + "step": 5907 + }, + { + "epoch": 1.6383804769828063, + "grad_norm": 0.2085971236228943, + "learning_rate": 2.020027488755509e-06, + "loss": 0.4815, + "step": 5908 + }, + { + "epoch": 1.6386577925679422, + "grad_norm": 0.2214841991662979, + "learning_rate": 2.017018892334971e-06, + "loss": 0.5057, + "step": 5909 + }, + { + "epoch": 1.638935108153078, + "grad_norm": 0.2010078728199005, + "learning_rate": 2.0140123414217867e-06, + "loss": 0.4997, + "step": 5910 + }, + { + "epoch": 1.639212423738214, + "grad_norm": 0.21052546799182892, + "learning_rate": 2.0110078366026173e-06, + "loss": 0.4891, + "step": 5911 + }, + { + "epoch": 1.6394897393233498, + "grad_norm": 0.2032831907272339, + "learning_rate": 2.008005378463716e-06, + "loss": 0.5115, + "step": 5912 + }, + { + "epoch": 1.6397670549084857, + "grad_norm": 0.20649638772010803, + "learning_rate": 2.0050049675909467e-06, + "loss": 0.4735, + "step": 5913 + }, + { + "epoch": 1.6400443704936216, + "grad_norm": 0.20801763236522675, + "learning_rate": 2.0020066045697714e-06, + "loss": 0.4761, + "step": 5914 + }, + { + "epoch": 1.6403216860787575, + "grad_norm": 0.19369389116764069, + "learning_rate": 1.999010289985247e-06, + "loss": 0.5107, + "step": 5915 + }, + { + "epoch": 1.6405990016638934, + "grad_norm": 0.20587508380413055, + "learning_rate": 1.9960160244220263e-06, + "loss": 0.4925, + "step": 5916 + }, + { + "epoch": 1.6408763172490293, + "grad_norm": 0.20227836072444916, + "learning_rate": 1.993023808464382e-06, + "loss": 0.5093, + "step": 5917 + }, + { + "epoch": 1.6411536328341652, + "grad_norm": 0.18990160524845123, + "learning_rate": 1.990033642696172e-06, + "loss": 0.4542, + "step": 5918 + }, + { + "epoch": 1.641430948419301, + "grad_norm": 0.20094534754753113, + "learning_rate": 1.9870455277008536e-06, + "loss": 0.4991, + "step": 5919 + }, + { + "epoch": 1.641708264004437, + "grad_norm": 0.22286152839660645, + "learning_rate": 1.9840594640614816e-06, + "loss": 0.5301, + "step": 5920 + }, + { + "epoch": 1.6419855795895728, + "grad_norm": 0.19467279314994812, + "learning_rate": 1.9810754523607296e-06, + "loss": 0.4759, + "step": 5921 + }, + { + "epoch": 1.6422628951747087, + "grad_norm": 0.21738329529762268, + "learning_rate": 1.9780934931808506e-06, + "loss": 0.4728, + "step": 5922 + }, + { + "epoch": 1.6425402107598446, + "grad_norm": 0.21225155889987946, + "learning_rate": 1.9751135871036995e-06, + "loss": 0.4957, + "step": 5923 + }, + { + "epoch": 1.6428175263449805, + "grad_norm": 0.20787325501441956, + "learning_rate": 1.9721357347107406e-06, + "loss": 0.4905, + "step": 5924 + }, + { + "epoch": 1.6430948419301163, + "grad_norm": 0.21133233606815338, + "learning_rate": 1.9691599365830328e-06, + "loss": 0.4878, + "step": 5925 + }, + { + "epoch": 1.6433721575152522, + "grad_norm": 0.19063884019851685, + "learning_rate": 1.966186193301231e-06, + "loss": 0.4926, + "step": 5926 + }, + { + "epoch": 1.6436494731003881, + "grad_norm": 0.2176242172718048, + "learning_rate": 1.9632145054455873e-06, + "loss": 0.4622, + "step": 5927 + }, + { + "epoch": 1.643926788685524, + "grad_norm": 0.2013779729604721, + "learning_rate": 1.960244873595961e-06, + "loss": 0.4814, + "step": 5928 + }, + { + "epoch": 1.6442041042706599, + "grad_norm": 0.21530881524085999, + "learning_rate": 1.957277298331808e-06, + "loss": 0.5058, + "step": 5929 + }, + { + "epoch": 1.6444814198557958, + "grad_norm": 0.20872762799263, + "learning_rate": 1.9543117802321764e-06, + "loss": 0.4875, + "step": 5930 + }, + { + "epoch": 1.6447587354409317, + "grad_norm": 0.21378107368946075, + "learning_rate": 1.9513483198757176e-06, + "loss": 0.4703, + "step": 5931 + }, + { + "epoch": 1.6450360510260675, + "grad_norm": 0.22160130739212036, + "learning_rate": 1.9483869178406875e-06, + "loss": 0.4991, + "step": 5932 + }, + { + "epoch": 1.6453133666112034, + "grad_norm": 0.21556591987609863, + "learning_rate": 1.945427574704928e-06, + "loss": 0.5146, + "step": 5933 + }, + { + "epoch": 1.6455906821963393, + "grad_norm": 0.2028489112854004, + "learning_rate": 1.9424702910458837e-06, + "loss": 0.5054, + "step": 5934 + }, + { + "epoch": 1.6458679977814752, + "grad_norm": 0.2313559353351593, + "learning_rate": 1.939515067440603e-06, + "loss": 0.5166, + "step": 5935 + }, + { + "epoch": 1.646145313366611, + "grad_norm": 0.20201341807842255, + "learning_rate": 1.9365619044657306e-06, + "loss": 0.478, + "step": 5936 + }, + { + "epoch": 1.646422628951747, + "grad_norm": 0.20596764981746674, + "learning_rate": 1.9336108026975e-06, + "loss": 0.4794, + "step": 5937 + }, + { + "epoch": 1.6466999445368828, + "grad_norm": 0.20622918009757996, + "learning_rate": 1.9306617627117567e-06, + "loss": 0.4991, + "step": 5938 + }, + { + "epoch": 1.6469772601220187, + "grad_norm": 0.20188166201114655, + "learning_rate": 1.927714785083928e-06, + "loss": 0.5167, + "step": 5939 + }, + { + "epoch": 1.6472545757071546, + "grad_norm": 0.2264070063829422, + "learning_rate": 1.9247698703890566e-06, + "loss": 0.5025, + "step": 5940 + }, + { + "epoch": 1.6475318912922905, + "grad_norm": 0.20895101130008698, + "learning_rate": 1.921827019201766e-06, + "loss": 0.5183, + "step": 5941 + }, + { + "epoch": 1.6478092068774264, + "grad_norm": 0.2241641879081726, + "learning_rate": 1.91888623209629e-06, + "loss": 0.4804, + "step": 5942 + }, + { + "epoch": 1.6480865224625623, + "grad_norm": 0.1995384395122528, + "learning_rate": 1.9159475096464484e-06, + "loss": 0.4742, + "step": 5943 + }, + { + "epoch": 1.6483638380476981, + "grad_norm": 0.213314950466156, + "learning_rate": 1.9130108524256672e-06, + "loss": 0.4683, + "step": 5944 + }, + { + "epoch": 1.648641153632834, + "grad_norm": 0.220163494348526, + "learning_rate": 1.9100762610069684e-06, + "loss": 0.5043, + "step": 5945 + }, + { + "epoch": 1.64891846921797, + "grad_norm": 0.2084074467420578, + "learning_rate": 1.907143735962963e-06, + "loss": 0.4717, + "step": 5946 + }, + { + "epoch": 1.6491957848031058, + "grad_norm": 0.21064861118793488, + "learning_rate": 1.9042132778658698e-06, + "loss": 0.4935, + "step": 5947 + }, + { + "epoch": 1.6494731003882417, + "grad_norm": 0.20477135479450226, + "learning_rate": 1.9012848872874938e-06, + "loss": 0.4861, + "step": 5948 + }, + { + "epoch": 1.6497504159733776, + "grad_norm": 0.21649906039237976, + "learning_rate": 1.8983585647992463e-06, + "loss": 0.4822, + "step": 5949 + }, + { + "epoch": 1.6500277315585135, + "grad_norm": 0.21001774072647095, + "learning_rate": 1.8954343109721245e-06, + "loss": 0.4892, + "step": 5950 + }, + { + "epoch": 1.6503050471436493, + "grad_norm": 0.19450335204601288, + "learning_rate": 1.8925121263767317e-06, + "loss": 0.4973, + "step": 5951 + }, + { + "epoch": 1.6505823627287852, + "grad_norm": 0.21450437605381012, + "learning_rate": 1.8895920115832675e-06, + "loss": 0.4925, + "step": 5952 + }, + { + "epoch": 1.650859678313921, + "grad_norm": 0.21557024121284485, + "learning_rate": 1.8866739671615175e-06, + "loss": 0.4697, + "step": 5953 + }, + { + "epoch": 1.651136993899057, + "grad_norm": 0.21621811389923096, + "learning_rate": 1.8837579936808695e-06, + "loss": 0.4718, + "step": 5954 + }, + { + "epoch": 1.6514143094841929, + "grad_norm": 0.20242778956890106, + "learning_rate": 1.8808440917103085e-06, + "loss": 0.4827, + "step": 5955 + }, + { + "epoch": 1.6516916250693288, + "grad_norm": 0.21023069322109222, + "learning_rate": 1.8779322618184173e-06, + "loss": 0.514, + "step": 5956 + }, + { + "epoch": 1.6519689406544646, + "grad_norm": 0.20898282527923584, + "learning_rate": 1.8750225045733678e-06, + "loss": 0.5226, + "step": 5957 + }, + { + "epoch": 1.6522462562396005, + "grad_norm": 0.21227650344371796, + "learning_rate": 1.872114820542925e-06, + "loss": 0.5026, + "step": 5958 + }, + { + "epoch": 1.6525235718247364, + "grad_norm": 0.21009580790996552, + "learning_rate": 1.8692092102944674e-06, + "loss": 0.4846, + "step": 5959 + }, + { + "epoch": 1.6528008874098723, + "grad_norm": 0.2119254618883133, + "learning_rate": 1.8663056743949512e-06, + "loss": 0.4779, + "step": 5960 + }, + { + "epoch": 1.6530782029950082, + "grad_norm": 0.21257485449314117, + "learning_rate": 1.8634042134109285e-06, + "loss": 0.5017, + "step": 5961 + }, + { + "epoch": 1.653355518580144, + "grad_norm": 0.216257706284523, + "learning_rate": 1.860504827908556e-06, + "loss": 0.5239, + "step": 5962 + }, + { + "epoch": 1.65363283416528, + "grad_norm": 0.20020192861557007, + "learning_rate": 1.8576075184535815e-06, + "loss": 0.5074, + "step": 5963 + }, + { + "epoch": 1.6539101497504158, + "grad_norm": 0.21256963908672333, + "learning_rate": 1.8547122856113458e-06, + "loss": 0.5388, + "step": 5964 + }, + { + "epoch": 1.6541874653355517, + "grad_norm": 0.21079207956790924, + "learning_rate": 1.8518191299467815e-06, + "loss": 0.5012, + "step": 5965 + }, + { + "epoch": 1.6544647809206876, + "grad_norm": 0.21106182038784027, + "learning_rate": 1.8489280520244235e-06, + "loss": 0.4973, + "step": 5966 + }, + { + "epoch": 1.6547420965058235, + "grad_norm": 0.2153145670890808, + "learning_rate": 1.8460390524083992e-06, + "loss": 0.5018, + "step": 5967 + }, + { + "epoch": 1.6550194120909594, + "grad_norm": 0.2027408331632614, + "learning_rate": 1.843152131662429e-06, + "loss": 0.4777, + "step": 5968 + }, + { + "epoch": 1.6552967276760953, + "grad_norm": 0.2015599012374878, + "learning_rate": 1.8402672903498188e-06, + "loss": 0.4992, + "step": 5969 + }, + { + "epoch": 1.6555740432612311, + "grad_norm": 0.24570131301879883, + "learning_rate": 1.8373845290334896e-06, + "loss": 0.5061, + "step": 5970 + }, + { + "epoch": 1.655851358846367, + "grad_norm": 0.21255463361740112, + "learning_rate": 1.834503848275941e-06, + "loss": 0.4945, + "step": 5971 + }, + { + "epoch": 1.656128674431503, + "grad_norm": 0.21138375997543335, + "learning_rate": 1.8316252486392654e-06, + "loss": 0.4811, + "step": 5972 + }, + { + "epoch": 1.6564059900166388, + "grad_norm": 0.2128453552722931, + "learning_rate": 1.8287487306851564e-06, + "loss": 0.4978, + "step": 5973 + }, + { + "epoch": 1.6566833056017747, + "grad_norm": 0.20760896801948547, + "learning_rate": 1.8258742949749024e-06, + "loss": 0.4547, + "step": 5974 + }, + { + "epoch": 1.6569606211869106, + "grad_norm": 0.2109159529209137, + "learning_rate": 1.8230019420693758e-06, + "loss": 0.4891, + "step": 5975 + }, + { + "epoch": 1.6572379367720464, + "grad_norm": 0.21184983849525452, + "learning_rate": 1.820131672529056e-06, + "loss": 0.4846, + "step": 5976 + }, + { + "epoch": 1.6575152523571823, + "grad_norm": 0.20627717673778534, + "learning_rate": 1.8172634869140018e-06, + "loss": 0.4988, + "step": 5977 + }, + { + "epoch": 1.6577925679423182, + "grad_norm": 0.1983606368303299, + "learning_rate": 1.8143973857838768e-06, + "loss": 0.4799, + "step": 5978 + }, + { + "epoch": 1.658069883527454, + "grad_norm": 0.2059786021709442, + "learning_rate": 1.8115333696979293e-06, + "loss": 0.4843, + "step": 5979 + }, + { + "epoch": 1.65834719911259, + "grad_norm": 0.20751167833805084, + "learning_rate": 1.8086714392150096e-06, + "loss": 0.4832, + "step": 5980 + }, + { + "epoch": 1.6586245146977259, + "grad_norm": 0.20422019064426422, + "learning_rate": 1.8058115948935514e-06, + "loss": 0.488, + "step": 5981 + }, + { + "epoch": 1.6589018302828618, + "grad_norm": 0.2069856822490692, + "learning_rate": 1.8029538372915878e-06, + "loss": 0.5008, + "step": 5982 + }, + { + "epoch": 1.6591791458679976, + "grad_norm": 0.21730239689350128, + "learning_rate": 1.800098166966746e-06, + "loss": 0.4896, + "step": 5983 + }, + { + "epoch": 1.6594564614531335, + "grad_norm": 0.20635366439819336, + "learning_rate": 1.7972445844762376e-06, + "loss": 0.4924, + "step": 5984 + }, + { + "epoch": 1.6597337770382694, + "grad_norm": 0.21054959297180176, + "learning_rate": 1.794393090376878e-06, + "loss": 0.4737, + "step": 5985 + }, + { + "epoch": 1.6600110926234053, + "grad_norm": 0.22848398983478546, + "learning_rate": 1.7915436852250625e-06, + "loss": 0.5273, + "step": 5986 + }, + { + "epoch": 1.6602884082085412, + "grad_norm": 0.20154784619808197, + "learning_rate": 1.7886963695767921e-06, + "loss": 0.487, + "step": 5987 + }, + { + "epoch": 1.660565723793677, + "grad_norm": 0.213270902633667, + "learning_rate": 1.7858511439876491e-06, + "loss": 0.4962, + "step": 5988 + }, + { + "epoch": 1.660843039378813, + "grad_norm": 0.21345144510269165, + "learning_rate": 1.7830080090128127e-06, + "loss": 0.5156, + "step": 5989 + }, + { + "epoch": 1.6611203549639488, + "grad_norm": 0.1976012885570526, + "learning_rate": 1.7801669652070585e-06, + "loss": 0.4916, + "step": 5990 + }, + { + "epoch": 1.6613976705490847, + "grad_norm": 0.21380779147148132, + "learning_rate": 1.7773280131247461e-06, + "loss": 0.5073, + "step": 5991 + }, + { + "epoch": 1.6616749861342206, + "grad_norm": 0.20816531777381897, + "learning_rate": 1.7744911533198266e-06, + "loss": 0.4932, + "step": 5992 + }, + { + "epoch": 1.6619523017193565, + "grad_norm": 0.21241043508052826, + "learning_rate": 1.771656386345852e-06, + "loss": 0.4989, + "step": 5993 + }, + { + "epoch": 1.6622296173044924, + "grad_norm": 0.2513413727283478, + "learning_rate": 1.76882371275596e-06, + "loss": 0.494, + "step": 5994 + }, + { + "epoch": 1.6625069328896283, + "grad_norm": 0.20039229094982147, + "learning_rate": 1.7659931331028777e-06, + "loss": 0.5074, + "step": 5995 + }, + { + "epoch": 1.6627842484747641, + "grad_norm": 0.1997574418783188, + "learning_rate": 1.7631646479389224e-06, + "loss": 0.5249, + "step": 5996 + }, + { + "epoch": 1.6630615640599, + "grad_norm": 0.20580746233463287, + "learning_rate": 1.7603382578160174e-06, + "loss": 0.5056, + "step": 5997 + }, + { + "epoch": 1.663338879645036, + "grad_norm": 0.21222086250782013, + "learning_rate": 1.7575139632856604e-06, + "loss": 0.4943, + "step": 5998 + }, + { + "epoch": 1.6636161952301718, + "grad_norm": 0.21751612424850464, + "learning_rate": 1.7546917648989428e-06, + "loss": 0.4876, + "step": 5999 + }, + { + "epoch": 1.6638935108153077, + "grad_norm": 0.20124460756778717, + "learning_rate": 1.7518716632065544e-06, + "loss": 0.4757, + "step": 6000 + }, + { + "epoch": 1.6641708264004436, + "grad_norm": 0.20955534279346466, + "learning_rate": 1.7490536587587716e-06, + "loss": 0.5009, + "step": 6001 + }, + { + "epoch": 1.6644481419855794, + "grad_norm": 0.21060660481452942, + "learning_rate": 1.7462377521054633e-06, + "loss": 0.5176, + "step": 6002 + }, + { + "epoch": 1.6647254575707153, + "grad_norm": 0.2049666792154312, + "learning_rate": 1.7434239437960797e-06, + "loss": 0.5014, + "step": 6003 + }, + { + "epoch": 1.6650027731558512, + "grad_norm": 0.2187904566526413, + "learning_rate": 1.7406122343796766e-06, + "loss": 0.4969, + "step": 6004 + }, + { + "epoch": 1.665280088740987, + "grad_norm": 0.20527362823486328, + "learning_rate": 1.737802624404894e-06, + "loss": 0.469, + "step": 6005 + }, + { + "epoch": 1.665557404326123, + "grad_norm": 0.21435390412807465, + "learning_rate": 1.7349951144199572e-06, + "loss": 0.5112, + "step": 6006 + }, + { + "epoch": 1.6658347199112589, + "grad_norm": 0.2521720826625824, + "learning_rate": 1.732189704972685e-06, + "loss": 0.5026, + "step": 6007 + }, + { + "epoch": 1.6661120354963947, + "grad_norm": 0.21735185384750366, + "learning_rate": 1.7293863966104898e-06, + "loss": 0.5284, + "step": 6008 + }, + { + "epoch": 1.6663893510815306, + "grad_norm": 0.20416045188903809, + "learning_rate": 1.7265851898803725e-06, + "loss": 0.4943, + "step": 6009 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.19750310480594635, + "learning_rate": 1.7237860853289183e-06, + "loss": 0.4829, + "step": 6010 + }, + { + "epoch": 1.6669439822518024, + "grad_norm": 0.21233929693698883, + "learning_rate": 1.7209890835023086e-06, + "loss": 0.4907, + "step": 6011 + }, + { + "epoch": 1.6672212978369383, + "grad_norm": 0.20838162302970886, + "learning_rate": 1.7181941849463173e-06, + "loss": 0.4875, + "step": 6012 + }, + { + "epoch": 1.6674986134220742, + "grad_norm": 0.21194593608379364, + "learning_rate": 1.7154013902062977e-06, + "loss": 0.4902, + "step": 6013 + }, + { + "epoch": 1.66777592900721, + "grad_norm": 0.208678737282753, + "learning_rate": 1.7126106998271968e-06, + "loss": 0.4778, + "step": 6014 + }, + { + "epoch": 1.668053244592346, + "grad_norm": 0.2077416628599167, + "learning_rate": 1.7098221143535557e-06, + "loss": 0.4905, + "step": 6015 + }, + { + "epoch": 1.6683305601774818, + "grad_norm": 0.21134360134601593, + "learning_rate": 1.7070356343295026e-06, + "loss": 0.4539, + "step": 6016 + }, + { + "epoch": 1.6686078757626177, + "grad_norm": 0.21801994740962982, + "learning_rate": 1.7042512602987489e-06, + "loss": 0.4891, + "step": 6017 + }, + { + "epoch": 1.6688851913477536, + "grad_norm": 0.21269913017749786, + "learning_rate": 1.7014689928046056e-06, + "loss": 0.4759, + "step": 6018 + }, + { + "epoch": 1.6691625069328895, + "grad_norm": 0.21309658885002136, + "learning_rate": 1.6986888323899594e-06, + "loss": 0.477, + "step": 6019 + }, + { + "epoch": 1.6694398225180254, + "grad_norm": 0.21096886694431305, + "learning_rate": 1.6959107795973011e-06, + "loss": 0.5187, + "step": 6020 + }, + { + "epoch": 1.6697171381031612, + "grad_norm": 0.20901210606098175, + "learning_rate": 1.693134834968696e-06, + "loss": 0.4737, + "step": 6021 + }, + { + "epoch": 1.6699944536882971, + "grad_norm": 0.21308885514736176, + "learning_rate": 1.6903609990458063e-06, + "loss": 0.4976, + "step": 6022 + }, + { + "epoch": 1.670271769273433, + "grad_norm": 0.21009613573551178, + "learning_rate": 1.6875892723698855e-06, + "loss": 0.5045, + "step": 6023 + }, + { + "epoch": 1.670549084858569, + "grad_norm": 0.2145100235939026, + "learning_rate": 1.6848196554817633e-06, + "loss": 0.4772, + "step": 6024 + }, + { + "epoch": 1.6708264004437048, + "grad_norm": 0.21058829128742218, + "learning_rate": 1.6820521489218728e-06, + "loss": 0.5044, + "step": 6025 + }, + { + "epoch": 1.6711037160288407, + "grad_norm": 0.21063855290412903, + "learning_rate": 1.6792867532302207e-06, + "loss": 0.5194, + "step": 6026 + }, + { + "epoch": 1.6713810316139766, + "grad_norm": 0.21397702395915985, + "learning_rate": 1.6765234689464157e-06, + "loss": 0.4903, + "step": 6027 + }, + { + "epoch": 1.6716583471991124, + "grad_norm": 0.20812270045280457, + "learning_rate": 1.6737622966096405e-06, + "loss": 0.4869, + "step": 6028 + }, + { + "epoch": 1.6719356627842483, + "grad_norm": 0.19779033958911896, + "learning_rate": 1.671003236758681e-06, + "loss": 0.4884, + "step": 6029 + }, + { + "epoch": 1.6722129783693842, + "grad_norm": 0.2069951891899109, + "learning_rate": 1.6682462899318962e-06, + "loss": 0.5154, + "step": 6030 + }, + { + "epoch": 1.67249029395452, + "grad_norm": 0.20514649152755737, + "learning_rate": 1.665491456667241e-06, + "loss": 0.509, + "step": 6031 + }, + { + "epoch": 1.672767609539656, + "grad_norm": 0.20265156030654907, + "learning_rate": 1.6627387375022605e-06, + "loss": 0.5008, + "step": 6032 + }, + { + "epoch": 1.6730449251247919, + "grad_norm": 0.20257985591888428, + "learning_rate": 1.6599881329740817e-06, + "loss": 0.4914, + "step": 6033 + }, + { + "epoch": 1.6733222407099277, + "grad_norm": 0.19979064166545868, + "learning_rate": 1.657239643619414e-06, + "loss": 0.477, + "step": 6034 + }, + { + "epoch": 1.6735995562950636, + "grad_norm": 0.20505757629871368, + "learning_rate": 1.6544932699745663e-06, + "loss": 0.4945, + "step": 6035 + }, + { + "epoch": 1.6738768718801995, + "grad_norm": 0.21343204379081726, + "learning_rate": 1.6517490125754307e-06, + "loss": 0.4913, + "step": 6036 + }, + { + "epoch": 1.6741541874653354, + "grad_norm": 0.20039796829223633, + "learning_rate": 1.6490068719574787e-06, + "loss": 0.4874, + "step": 6037 + }, + { + "epoch": 1.6744315030504713, + "grad_norm": 0.3097711503505707, + "learning_rate": 1.646266848655778e-06, + "loss": 0.464, + "step": 6038 + }, + { + "epoch": 1.6747088186356072, + "grad_norm": 0.21286475658416748, + "learning_rate": 1.6435289432049818e-06, + "loss": 0.4889, + "step": 6039 + }, + { + "epoch": 1.674986134220743, + "grad_norm": 0.2054137885570526, + "learning_rate": 1.6407931561393253e-06, + "loss": 0.5036, + "step": 6040 + }, + { + "epoch": 1.675263449805879, + "grad_norm": 0.20179954171180725, + "learning_rate": 1.638059487992631e-06, + "loss": 0.5136, + "step": 6041 + }, + { + "epoch": 1.6755407653910148, + "grad_norm": 0.21015551686286926, + "learning_rate": 1.6353279392983117e-06, + "loss": 0.4785, + "step": 6042 + }, + { + "epoch": 1.6758180809761507, + "grad_norm": 0.2488449513912201, + "learning_rate": 1.63259851058937e-06, + "loss": 0.4889, + "step": 6043 + }, + { + "epoch": 1.6760953965612866, + "grad_norm": 0.20739984512329102, + "learning_rate": 1.6298712023983837e-06, + "loss": 0.4849, + "step": 6044 + }, + { + "epoch": 1.6763727121464225, + "grad_norm": 0.21058540046215057, + "learning_rate": 1.627146015257522e-06, + "loss": 0.522, + "step": 6045 + }, + { + "epoch": 1.6766500277315584, + "grad_norm": 0.1949351280927658, + "learning_rate": 1.6244229496985426e-06, + "loss": 0.4794, + "step": 6046 + }, + { + "epoch": 1.6769273433166942, + "grad_norm": 0.21308818459510803, + "learning_rate": 1.6217020062527927e-06, + "loss": 0.4913, + "step": 6047 + }, + { + "epoch": 1.6772046589018301, + "grad_norm": 0.20299099385738373, + "learning_rate": 1.6189831854511937e-06, + "loss": 0.4901, + "step": 6048 + }, + { + "epoch": 1.677481974486966, + "grad_norm": 0.21231001615524292, + "learning_rate": 1.616266487824261e-06, + "loss": 0.4885, + "step": 6049 + }, + { + "epoch": 1.677759290072102, + "grad_norm": 0.20558898150920868, + "learning_rate": 1.6135519139021005e-06, + "loss": 0.5032, + "step": 6050 + }, + { + "epoch": 1.6780366056572378, + "grad_norm": 0.2060549259185791, + "learning_rate": 1.6108394642143907e-06, + "loss": 0.5098, + "step": 6051 + }, + { + "epoch": 1.6783139212423737, + "grad_norm": 0.20870618522167206, + "learning_rate": 1.6081291392904027e-06, + "loss": 0.4791, + "step": 6052 + }, + { + "epoch": 1.6785912368275095, + "grad_norm": 0.20835687220096588, + "learning_rate": 1.6054209396589929e-06, + "loss": 0.4865, + "step": 6053 + }, + { + "epoch": 1.6788685524126454, + "grad_norm": 0.2111913561820984, + "learning_rate": 1.6027148658486077e-06, + "loss": 0.4962, + "step": 6054 + }, + { + "epoch": 1.6791458679977813, + "grad_norm": 0.21386343240737915, + "learning_rate": 1.600010918387268e-06, + "loss": 0.4999, + "step": 6055 + }, + { + "epoch": 1.6794231835829172, + "grad_norm": 0.20479026436805725, + "learning_rate": 1.5973090978025906e-06, + "loss": 0.4723, + "step": 6056 + }, + { + "epoch": 1.679700499168053, + "grad_norm": 0.20431126654148102, + "learning_rate": 1.5946094046217664e-06, + "loss": 0.4792, + "step": 6057 + }, + { + "epoch": 1.679977814753189, + "grad_norm": 0.21851181983947754, + "learning_rate": 1.5919118393715834e-06, + "loss": 0.4986, + "step": 6058 + }, + { + "epoch": 1.6802551303383249, + "grad_norm": 0.3242267668247223, + "learning_rate": 1.5892164025784015e-06, + "loss": 0.5032, + "step": 6059 + }, + { + "epoch": 1.6805324459234607, + "grad_norm": 0.2177019715309143, + "learning_rate": 1.5865230947681762e-06, + "loss": 0.4851, + "step": 6060 + }, + { + "epoch": 1.6808097615085966, + "grad_norm": 0.20672672986984253, + "learning_rate": 1.5838319164664438e-06, + "loss": 0.4711, + "step": 6061 + }, + { + "epoch": 1.6810870770937325, + "grad_norm": 0.1982792168855667, + "learning_rate": 1.581142868198321e-06, + "loss": 0.466, + "step": 6062 + }, + { + "epoch": 1.6813643926788684, + "grad_norm": 0.20962117612361908, + "learning_rate": 1.5784559504885166e-06, + "loss": 0.5069, + "step": 6063 + }, + { + "epoch": 1.6816417082640043, + "grad_norm": 0.2054154872894287, + "learning_rate": 1.5757711638613143e-06, + "loss": 0.5134, + "step": 6064 + }, + { + "epoch": 1.6819190238491402, + "grad_norm": 0.19651010632514954, + "learning_rate": 1.5730885088405922e-06, + "loss": 0.4967, + "step": 6065 + }, + { + "epoch": 1.682196339434276, + "grad_norm": 0.20710954070091248, + "learning_rate": 1.570407985949804e-06, + "loss": 0.4772, + "step": 6066 + }, + { + "epoch": 1.682473655019412, + "grad_norm": 0.2049458771944046, + "learning_rate": 1.5677295957119934e-06, + "loss": 0.4758, + "step": 6067 + }, + { + "epoch": 1.6827509706045478, + "grad_norm": 0.21262648701667786, + "learning_rate": 1.5650533386497801e-06, + "loss": 0.5058, + "step": 6068 + }, + { + "epoch": 1.6830282861896837, + "grad_norm": 0.20999044179916382, + "learning_rate": 1.5623792152853783e-06, + "loss": 0.5048, + "step": 6069 + }, + { + "epoch": 1.6833056017748196, + "grad_norm": 0.21131351590156555, + "learning_rate": 1.559707226140579e-06, + "loss": 0.4729, + "step": 6070 + }, + { + "epoch": 1.6835829173599555, + "grad_norm": 0.22507424652576447, + "learning_rate": 1.5570373717367594e-06, + "loss": 0.4815, + "step": 6071 + }, + { + "epoch": 1.6838602329450914, + "grad_norm": 0.2068501114845276, + "learning_rate": 1.5543696525948726e-06, + "loss": 0.5036, + "step": 6072 + }, + { + "epoch": 1.6841375485302272, + "grad_norm": 0.20177114009857178, + "learning_rate": 1.551704069235467e-06, + "loss": 0.4747, + "step": 6073 + }, + { + "epoch": 1.6844148641153631, + "grad_norm": 0.20704385638237, + "learning_rate": 1.5490406221786686e-06, + "loss": 0.4812, + "step": 6074 + }, + { + "epoch": 1.684692179700499, + "grad_norm": 0.21657253801822662, + "learning_rate": 1.5463793119441835e-06, + "loss": 0.4854, + "step": 6075 + }, + { + "epoch": 1.6849694952856349, + "grad_norm": 0.2125793993473053, + "learning_rate": 1.543720139051305e-06, + "loss": 0.4749, + "step": 6076 + }, + { + "epoch": 1.6852468108707708, + "grad_norm": 0.20819686353206635, + "learning_rate": 1.541063104018911e-06, + "loss": 0.5036, + "step": 6077 + }, + { + "epoch": 1.6855241264559067, + "grad_norm": 0.21309512853622437, + "learning_rate": 1.5384082073654564e-06, + "loss": 0.5012, + "step": 6078 + }, + { + "epoch": 1.6858014420410425, + "grad_norm": 0.20586422085762024, + "learning_rate": 1.5357554496089805e-06, + "loss": 0.4803, + "step": 6079 + }, + { + "epoch": 1.6860787576261784, + "grad_norm": 0.2012133002281189, + "learning_rate": 1.5331048312671085e-06, + "loss": 0.4771, + "step": 6080 + }, + { + "epoch": 1.6863560732113143, + "grad_norm": 0.2082866132259369, + "learning_rate": 1.5304563528570488e-06, + "loss": 0.4793, + "step": 6081 + }, + { + "epoch": 1.6866333887964502, + "grad_norm": 0.21011273562908173, + "learning_rate": 1.5278100148955865e-06, + "loss": 0.4703, + "step": 6082 + }, + { + "epoch": 1.686910704381586, + "grad_norm": 0.20084689557552338, + "learning_rate": 1.5251658178990908e-06, + "loss": 0.4969, + "step": 6083 + }, + { + "epoch": 1.687188019966722, + "grad_norm": 0.22017468512058258, + "learning_rate": 1.5225237623835167e-06, + "loss": 0.4982, + "step": 6084 + }, + { + "epoch": 1.6874653355518578, + "grad_norm": 0.21381786465644836, + "learning_rate": 1.5198838488644036e-06, + "loss": 0.4626, + "step": 6085 + }, + { + "epoch": 1.687742651136994, + "grad_norm": 0.2041536420583725, + "learning_rate": 1.5172460778568626e-06, + "loss": 0.4923, + "step": 6086 + }, + { + "epoch": 1.6880199667221298, + "grad_norm": 0.2171899676322937, + "learning_rate": 1.5146104498755891e-06, + "loss": 0.5064, + "step": 6087 + }, + { + "epoch": 1.6882972823072657, + "grad_norm": 0.217244952917099, + "learning_rate": 1.5119769654348748e-06, + "loss": 0.5021, + "step": 6088 + }, + { + "epoch": 1.6885745978924016, + "grad_norm": 0.20813634991645813, + "learning_rate": 1.5093456250485764e-06, + "loss": 0.4623, + "step": 6089 + }, + { + "epoch": 1.6888519134775375, + "grad_norm": 0.2243288904428482, + "learning_rate": 1.5067164292301358e-06, + "loss": 0.5251, + "step": 6090 + }, + { + "epoch": 1.6891292290626734, + "grad_norm": 0.21137182414531708, + "learning_rate": 1.504089378492582e-06, + "loss": 0.5257, + "step": 6091 + }, + { + "epoch": 1.6894065446478093, + "grad_norm": 0.20429569482803345, + "learning_rate": 1.501464473348524e-06, + "loss": 0.4699, + "step": 6092 + }, + { + "epoch": 1.6896838602329451, + "grad_norm": 0.20688970386981964, + "learning_rate": 1.498841714310148e-06, + "loss": 0.5066, + "step": 6093 + }, + { + "epoch": 1.689961175818081, + "grad_norm": 0.20542113482952118, + "learning_rate": 1.496221101889221e-06, + "loss": 0.4753, + "step": 6094 + }, + { + "epoch": 1.690238491403217, + "grad_norm": 0.20209279656410217, + "learning_rate": 1.4936026365970968e-06, + "loss": 0.5048, + "step": 6095 + }, + { + "epoch": 1.6905158069883528, + "grad_norm": 0.2007710188627243, + "learning_rate": 1.4909863189447093e-06, + "loss": 0.4646, + "step": 6096 + }, + { + "epoch": 1.6907931225734887, + "grad_norm": 0.19921134412288666, + "learning_rate": 1.488372149442567e-06, + "loss": 0.5055, + "step": 6097 + }, + { + "epoch": 1.6910704381586246, + "grad_norm": 0.2226364016532898, + "learning_rate": 1.485760128600769e-06, + "loss": 0.4918, + "step": 6098 + }, + { + "epoch": 1.6913477537437605, + "grad_norm": 0.21037879586219788, + "learning_rate": 1.4831502569289834e-06, + "loss": 0.4959, + "step": 6099 + }, + { + "epoch": 1.6916250693288963, + "grad_norm": 0.20453481376171112, + "learning_rate": 1.4805425349364716e-06, + "loss": 0.5035, + "step": 6100 + }, + { + "epoch": 1.6919023849140322, + "grad_norm": 0.2033025324344635, + "learning_rate": 1.4779369631320637e-06, + "loss": 0.4661, + "step": 6101 + }, + { + "epoch": 1.692179700499168, + "grad_norm": 0.20237600803375244, + "learning_rate": 1.475333542024178e-06, + "loss": 0.4769, + "step": 6102 + }, + { + "epoch": 1.692457016084304, + "grad_norm": 0.20004519820213318, + "learning_rate": 1.4727322721208136e-06, + "loss": 0.4957, + "step": 6103 + }, + { + "epoch": 1.6927343316694399, + "grad_norm": 0.2055417001247406, + "learning_rate": 1.4701331539295426e-06, + "loss": 0.4975, + "step": 6104 + }, + { + "epoch": 1.6930116472545758, + "grad_norm": 0.21751341223716736, + "learning_rate": 1.4675361879575271e-06, + "loss": 0.4675, + "step": 6105 + }, + { + "epoch": 1.6932889628397116, + "grad_norm": 0.21179702877998352, + "learning_rate": 1.4649413747114982e-06, + "loss": 0.5041, + "step": 6106 + }, + { + "epoch": 1.6935662784248475, + "grad_norm": 0.20675987005233765, + "learning_rate": 1.4623487146977754e-06, + "loss": 0.5094, + "step": 6107 + }, + { + "epoch": 1.6938435940099834, + "grad_norm": 0.20587822794914246, + "learning_rate": 1.4597582084222571e-06, + "loss": 0.5026, + "step": 6108 + }, + { + "epoch": 1.6941209095951193, + "grad_norm": 0.19946245849132538, + "learning_rate": 1.4571698563904196e-06, + "loss": 0.4706, + "step": 6109 + }, + { + "epoch": 1.6943982251802552, + "grad_norm": 0.2001856118440628, + "learning_rate": 1.4545836591073129e-06, + "loss": 0.5065, + "step": 6110 + }, + { + "epoch": 1.694675540765391, + "grad_norm": 0.21339738368988037, + "learning_rate": 1.4519996170775791e-06, + "loss": 0.5259, + "step": 6111 + }, + { + "epoch": 1.694952856350527, + "grad_norm": 0.2282184213399887, + "learning_rate": 1.4494177308054315e-06, + "loss": 0.4954, + "step": 6112 + }, + { + "epoch": 1.6952301719356628, + "grad_norm": 0.2113092988729477, + "learning_rate": 1.4468380007946633e-06, + "loss": 0.4853, + "step": 6113 + }, + { + "epoch": 1.6955074875207987, + "grad_norm": 0.20249804854393005, + "learning_rate": 1.4442604275486493e-06, + "loss": 0.4781, + "step": 6114 + }, + { + "epoch": 1.6957848031059346, + "grad_norm": 0.21009276807308197, + "learning_rate": 1.4416850115703442e-06, + "loss": 0.5062, + "step": 6115 + }, + { + "epoch": 1.6960621186910705, + "grad_norm": 0.21459093689918518, + "learning_rate": 1.439111753362278e-06, + "loss": 0.4612, + "step": 6116 + }, + { + "epoch": 1.6963394342762064, + "grad_norm": 0.20765627920627594, + "learning_rate": 1.4365406534265587e-06, + "loss": 0.4704, + "step": 6117 + }, + { + "epoch": 1.6966167498613423, + "grad_norm": 0.20445479452610016, + "learning_rate": 1.4339717122648797e-06, + "loss": 0.4637, + "step": 6118 + }, + { + "epoch": 1.6968940654464781, + "grad_norm": 0.19960498809814453, + "learning_rate": 1.431404930378509e-06, + "loss": 0.4579, + "step": 6119 + }, + { + "epoch": 1.697171381031614, + "grad_norm": 0.22552312910556793, + "learning_rate": 1.428840308268295e-06, + "loss": 0.4842, + "step": 6120 + }, + { + "epoch": 1.69744869661675, + "grad_norm": 0.2081850916147232, + "learning_rate": 1.4262778464346593e-06, + "loss": 0.4912, + "step": 6121 + }, + { + "epoch": 1.6977260122018858, + "grad_norm": 0.22280938923358917, + "learning_rate": 1.4237175453776077e-06, + "loss": 0.492, + "step": 6122 + }, + { + "epoch": 1.6980033277870217, + "grad_norm": 0.20667922496795654, + "learning_rate": 1.4211594055967252e-06, + "loss": 0.5025, + "step": 6123 + }, + { + "epoch": 1.6982806433721576, + "grad_norm": 0.20905707776546478, + "learning_rate": 1.4186034275911726e-06, + "loss": 0.4897, + "step": 6124 + }, + { + "epoch": 1.6985579589572934, + "grad_norm": 0.22128935158252716, + "learning_rate": 1.4160496118596823e-06, + "loss": 0.5004, + "step": 6125 + }, + { + "epoch": 1.6988352745424293, + "grad_norm": 0.225121408700943, + "learning_rate": 1.413497958900581e-06, + "loss": 0.4908, + "step": 6126 + }, + { + "epoch": 1.6991125901275652, + "grad_norm": 0.21258188784122467, + "learning_rate": 1.4109484692117592e-06, + "loss": 0.5058, + "step": 6127 + }, + { + "epoch": 1.699389905712701, + "grad_norm": 0.21206620335578918, + "learning_rate": 1.408401143290687e-06, + "loss": 0.5058, + "step": 6128 + }, + { + "epoch": 1.699667221297837, + "grad_norm": 0.21642054617404938, + "learning_rate": 1.4058559816344186e-06, + "loss": 0.47, + "step": 6129 + }, + { + "epoch": 1.6999445368829729, + "grad_norm": 0.20300422608852386, + "learning_rate": 1.403312984739584e-06, + "loss": 0.492, + "step": 6130 + }, + { + "epoch": 1.7002218524681088, + "grad_norm": 0.20064608752727509, + "learning_rate": 1.400772153102388e-06, + "loss": 0.4729, + "step": 6131 + }, + { + "epoch": 1.7004991680532446, + "grad_norm": 0.22698557376861572, + "learning_rate": 1.3982334872186101e-06, + "loss": 0.4651, + "step": 6132 + }, + { + "epoch": 1.7007764836383805, + "grad_norm": 0.21828146278858185, + "learning_rate": 1.3956969875836155e-06, + "loss": 0.4998, + "step": 6133 + }, + { + "epoch": 1.7010537992235164, + "grad_norm": 0.20743338763713837, + "learning_rate": 1.3931626546923426e-06, + "loss": 0.5038, + "step": 6134 + }, + { + "epoch": 1.7013311148086523, + "grad_norm": 0.19973209500312805, + "learning_rate": 1.3906304890393047e-06, + "loss": 0.475, + "step": 6135 + }, + { + "epoch": 1.7016084303937882, + "grad_norm": 0.20409800112247467, + "learning_rate": 1.3881004911185976e-06, + "loss": 0.4745, + "step": 6136 + }, + { + "epoch": 1.701885745978924, + "grad_norm": 0.21515069901943207, + "learning_rate": 1.3855726614238868e-06, + "loss": 0.5089, + "step": 6137 + }, + { + "epoch": 1.70216306156406, + "grad_norm": 0.20320101082324982, + "learning_rate": 1.383047000448423e-06, + "loss": 0.4882, + "step": 6138 + }, + { + "epoch": 1.7024403771491958, + "grad_norm": 0.20085285604000092, + "learning_rate": 1.3805235086850249e-06, + "loss": 0.4702, + "step": 6139 + }, + { + "epoch": 1.7027176927343317, + "grad_norm": 0.2055083066225052, + "learning_rate": 1.3780021866260955e-06, + "loss": 0.4807, + "step": 6140 + }, + { + "epoch": 1.7029950083194676, + "grad_norm": 0.21570569276809692, + "learning_rate": 1.3754830347636138e-06, + "loss": 0.4849, + "step": 6141 + }, + { + "epoch": 1.7032723239046035, + "grad_norm": 0.20811456441879272, + "learning_rate": 1.3729660535891282e-06, + "loss": 0.4693, + "step": 6142 + }, + { + "epoch": 1.7035496394897394, + "grad_norm": 0.2096460610628128, + "learning_rate": 1.3704512435937734e-06, + "loss": 0.5394, + "step": 6143 + }, + { + "epoch": 1.7038269550748752, + "grad_norm": 0.20185941457748413, + "learning_rate": 1.3679386052682499e-06, + "loss": 0.5063, + "step": 6144 + }, + { + "epoch": 1.7041042706600111, + "grad_norm": 0.26697927713394165, + "learning_rate": 1.365428139102845e-06, + "loss": 0.5086, + "step": 6145 + }, + { + "epoch": 1.704381586245147, + "grad_norm": 0.21113193035125732, + "learning_rate": 1.362919845587414e-06, + "loss": 0.5187, + "step": 6146 + }, + { + "epoch": 1.704658901830283, + "grad_norm": 0.21000999212265015, + "learning_rate": 1.360413725211393e-06, + "loss": 0.4919, + "step": 6147 + }, + { + "epoch": 1.7049362174154188, + "grad_norm": 0.21595942974090576, + "learning_rate": 1.3579097784637908e-06, + "loss": 0.4821, + "step": 6148 + }, + { + "epoch": 1.7052135330005547, + "grad_norm": 0.20245753228664398, + "learning_rate": 1.3554080058331947e-06, + "loss": 0.4946, + "step": 6149 + }, + { + "epoch": 1.7054908485856906, + "grad_norm": 0.19732367992401123, + "learning_rate": 1.3529084078077695e-06, + "loss": 0.4822, + "step": 6150 + }, + { + "epoch": 1.7057681641708264, + "grad_norm": 0.2121272087097168, + "learning_rate": 1.3504109848752485e-06, + "loss": 0.5231, + "step": 6151 + }, + { + "epoch": 1.7060454797559623, + "grad_norm": 0.21215039491653442, + "learning_rate": 1.3479157375229493e-06, + "loss": 0.4893, + "step": 6152 + }, + { + "epoch": 1.7063227953410982, + "grad_norm": 0.20783814787864685, + "learning_rate": 1.3454226662377555e-06, + "loss": 0.4933, + "step": 6153 + }, + { + "epoch": 1.706600110926234, + "grad_norm": 0.20154152810573578, + "learning_rate": 1.3429317715061367e-06, + "loss": 0.4944, + "step": 6154 + }, + { + "epoch": 1.70687742651137, + "grad_norm": 0.20301756262779236, + "learning_rate": 1.340443053814129e-06, + "loss": 0.4592, + "step": 6155 + }, + { + "epoch": 1.7071547420965059, + "grad_norm": 0.20453648269176483, + "learning_rate": 1.3379565136473482e-06, + "loss": 0.4897, + "step": 6156 + }, + { + "epoch": 1.7074320576816417, + "grad_norm": 0.21424759924411774, + "learning_rate": 1.3354721514909865e-06, + "loss": 0.4816, + "step": 6157 + }, + { + "epoch": 1.7077093732667776, + "grad_norm": 0.20235998928546906, + "learning_rate": 1.3329899678298063e-06, + "loss": 0.4907, + "step": 6158 + }, + { + "epoch": 1.7079866888519135, + "grad_norm": 0.20267461240291595, + "learning_rate": 1.3305099631481453e-06, + "loss": 0.4633, + "step": 6159 + }, + { + "epoch": 1.7082640044370494, + "grad_norm": 0.21311825513839722, + "learning_rate": 1.3280321379299215e-06, + "loss": 0.4865, + "step": 6160 + }, + { + "epoch": 1.7085413200221853, + "grad_norm": 0.2097369134426117, + "learning_rate": 1.325556492658625e-06, + "loss": 0.5052, + "step": 6161 + }, + { + "epoch": 1.7088186356073212, + "grad_norm": 0.2084108293056488, + "learning_rate": 1.3230830278173178e-06, + "loss": 0.496, + "step": 6162 + }, + { + "epoch": 1.709095951192457, + "grad_norm": 0.20339664816856384, + "learning_rate": 1.3206117438886333e-06, + "loss": 0.5049, + "step": 6163 + }, + { + "epoch": 1.709373266777593, + "grad_norm": 0.2198762446641922, + "learning_rate": 1.3181426413547955e-06, + "loss": 0.4874, + "step": 6164 + }, + { + "epoch": 1.7096505823627288, + "grad_norm": 0.21083226799964905, + "learning_rate": 1.3156757206975873e-06, + "loss": 0.4878, + "step": 6165 + }, + { + "epoch": 1.7099278979478647, + "grad_norm": 0.23222039639949799, + "learning_rate": 1.313210982398365e-06, + "loss": 0.4773, + "step": 6166 + }, + { + "epoch": 1.7102052135330006, + "grad_norm": 0.20829123258590698, + "learning_rate": 1.3107484269380688e-06, + "loss": 0.4823, + "step": 6167 + }, + { + "epoch": 1.7104825291181365, + "grad_norm": 0.20589673519134521, + "learning_rate": 1.3082880547972104e-06, + "loss": 0.4466, + "step": 6168 + }, + { + "epoch": 1.7107598447032724, + "grad_norm": 0.21213486790657043, + "learning_rate": 1.3058298664558725e-06, + "loss": 0.4807, + "step": 6169 + }, + { + "epoch": 1.7110371602884082, + "grad_norm": 0.2168436497449875, + "learning_rate": 1.3033738623937072e-06, + "loss": 0.4891, + "step": 6170 + }, + { + "epoch": 1.7113144758735441, + "grad_norm": 0.1989026516675949, + "learning_rate": 1.300920043089951e-06, + "loss": 0.5083, + "step": 6171 + }, + { + "epoch": 1.71159179145868, + "grad_norm": 0.19995690882205963, + "learning_rate": 1.2984684090234122e-06, + "loss": 0.4747, + "step": 6172 + }, + { + "epoch": 1.711869107043816, + "grad_norm": 0.20262081921100616, + "learning_rate": 1.2960189606724613e-06, + "loss": 0.4834, + "step": 6173 + }, + { + "epoch": 1.7121464226289518, + "grad_norm": 0.22770211100578308, + "learning_rate": 1.2935716985150587e-06, + "loss": 0.5197, + "step": 6174 + }, + { + "epoch": 1.7124237382140877, + "grad_norm": 0.21298766136169434, + "learning_rate": 1.2911266230287239e-06, + "loss": 0.5022, + "step": 6175 + }, + { + "epoch": 1.7127010537992235, + "grad_norm": 0.21138055622577667, + "learning_rate": 1.2886837346905615e-06, + "loss": 0.472, + "step": 6176 + }, + { + "epoch": 1.7129783693843594, + "grad_norm": 0.20885500311851501, + "learning_rate": 1.2862430339772372e-06, + "loss": 0.4925, + "step": 6177 + }, + { + "epoch": 1.7132556849694953, + "grad_norm": 0.2066037952899933, + "learning_rate": 1.2838045213650008e-06, + "loss": 0.5073, + "step": 6178 + }, + { + "epoch": 1.7135330005546312, + "grad_norm": 0.2201787531375885, + "learning_rate": 1.2813681973296714e-06, + "loss": 0.4771, + "step": 6179 + }, + { + "epoch": 1.713810316139767, + "grad_norm": 0.20949430763721466, + "learning_rate": 1.2789340623466358e-06, + "loss": 0.4767, + "step": 6180 + }, + { + "epoch": 1.714087631724903, + "grad_norm": 0.20830868184566498, + "learning_rate": 1.276502116890864e-06, + "loss": 0.4684, + "step": 6181 + }, + { + "epoch": 1.7143649473100389, + "grad_norm": 0.197309672832489, + "learning_rate": 1.2740723614368886e-06, + "loss": 0.4686, + "step": 6182 + }, + { + "epoch": 1.7146422628951747, + "grad_norm": 0.22104878723621368, + "learning_rate": 1.2716447964588222e-06, + "loss": 0.4738, + "step": 6183 + }, + { + "epoch": 1.7149195784803106, + "grad_norm": 0.20894010365009308, + "learning_rate": 1.2692194224303442e-06, + "loss": 0.4781, + "step": 6184 + }, + { + "epoch": 1.7151968940654465, + "grad_norm": 0.2051091194152832, + "learning_rate": 1.266796239824712e-06, + "loss": 0.4777, + "step": 6185 + }, + { + "epoch": 1.7154742096505824, + "grad_norm": 0.26800915598869324, + "learning_rate": 1.2643752491147505e-06, + "loss": 0.5087, + "step": 6186 + }, + { + "epoch": 1.7157515252357183, + "grad_norm": 0.20493799448013306, + "learning_rate": 1.2619564507728595e-06, + "loss": 0.4678, + "step": 6187 + }, + { + "epoch": 1.7160288408208542, + "grad_norm": 0.22277140617370605, + "learning_rate": 1.2595398452710128e-06, + "loss": 0.5145, + "step": 6188 + }, + { + "epoch": 1.71630615640599, + "grad_norm": 0.2096967250108719, + "learning_rate": 1.2571254330807538e-06, + "loss": 0.4936, + "step": 6189 + }, + { + "epoch": 1.716583471991126, + "grad_norm": 0.2071332186460495, + "learning_rate": 1.254713214673195e-06, + "loss": 0.4964, + "step": 6190 + }, + { + "epoch": 1.7168607875762618, + "grad_norm": 0.20031912624835968, + "learning_rate": 1.252303190519026e-06, + "loss": 0.5048, + "step": 6191 + }, + { + "epoch": 1.7171381031613977, + "grad_norm": 0.2249787598848343, + "learning_rate": 1.2498953610885087e-06, + "loss": 0.5219, + "step": 6192 + }, + { + "epoch": 1.7174154187465336, + "grad_norm": 0.20989800989627838, + "learning_rate": 1.2474897268514696e-06, + "loss": 0.4814, + "step": 6193 + }, + { + "epoch": 1.7176927343316695, + "grad_norm": 0.21632020175457, + "learning_rate": 1.2450862882773154e-06, + "loss": 0.4855, + "step": 6194 + }, + { + "epoch": 1.7179700499168054, + "grad_norm": 0.21728786826133728, + "learning_rate": 1.2426850458350208e-06, + "loss": 0.504, + "step": 6195 + }, + { + "epoch": 1.7182473655019412, + "grad_norm": 0.20542924106121063, + "learning_rate": 1.240285999993132e-06, + "loss": 0.5027, + "step": 6196 + }, + { + "epoch": 1.7185246810870771, + "grad_norm": 0.2109268456697464, + "learning_rate": 1.237889151219762e-06, + "loss": 0.5254, + "step": 6197 + }, + { + "epoch": 1.718801996672213, + "grad_norm": 0.21600918471813202, + "learning_rate": 1.2354944999826022e-06, + "loss": 0.5359, + "step": 6198 + }, + { + "epoch": 1.719079312257349, + "grad_norm": 0.21099236607551575, + "learning_rate": 1.2331020467489157e-06, + "loss": 0.5269, + "step": 6199 + }, + { + "epoch": 1.7193566278424848, + "grad_norm": 0.21038185060024261, + "learning_rate": 1.230711791985531e-06, + "loss": 0.5292, + "step": 6200 + }, + { + "epoch": 1.7196339434276207, + "grad_norm": 0.21021394431591034, + "learning_rate": 1.2283237361588442e-06, + "loss": 0.5175, + "step": 6201 + }, + { + "epoch": 1.7199112590127565, + "grad_norm": 0.20856258273124695, + "learning_rate": 1.2259378797348397e-06, + "loss": 0.5112, + "step": 6202 + }, + { + "epoch": 1.7201885745978924, + "grad_norm": 0.22244808077812195, + "learning_rate": 1.2235542231790548e-06, + "loss": 0.5144, + "step": 6203 + }, + { + "epoch": 1.7204658901830283, + "grad_norm": 0.20829738676548004, + "learning_rate": 1.2211727669566034e-06, + "loss": 0.5132, + "step": 6204 + }, + { + "epoch": 1.7207432057681642, + "grad_norm": 0.20939025282859802, + "learning_rate": 1.2187935115321708e-06, + "loss": 0.4784, + "step": 6205 + }, + { + "epoch": 1.7210205213533, + "grad_norm": 0.20607073605060577, + "learning_rate": 1.2164164573700162e-06, + "loss": 0.4863, + "step": 6206 + }, + { + "epoch": 1.721297836938436, + "grad_norm": 0.20456166565418243, + "learning_rate": 1.2140416049339644e-06, + "loss": 0.5155, + "step": 6207 + }, + { + "epoch": 1.7215751525235718, + "grad_norm": 0.21939820051193237, + "learning_rate": 1.2116689546874088e-06, + "loss": 0.4917, + "step": 6208 + }, + { + "epoch": 1.7218524681087077, + "grad_norm": 0.2015402466058731, + "learning_rate": 1.209298507093319e-06, + "loss": 0.4664, + "step": 6209 + }, + { + "epoch": 1.7221297836938436, + "grad_norm": 0.20761947333812714, + "learning_rate": 1.2069302626142352e-06, + "loss": 0.4844, + "step": 6210 + }, + { + "epoch": 1.7224070992789795, + "grad_norm": 0.21084575355052948, + "learning_rate": 1.2045642217122594e-06, + "loss": 0.5133, + "step": 6211 + }, + { + "epoch": 1.7226844148641154, + "grad_norm": 0.20690487325191498, + "learning_rate": 1.2022003848490699e-06, + "loss": 0.5062, + "step": 6212 + }, + { + "epoch": 1.7229617304492513, + "grad_norm": 0.2094261348247528, + "learning_rate": 1.1998387524859141e-06, + "loss": 0.5167, + "step": 6213 + }, + { + "epoch": 1.7232390460343872, + "grad_norm": 0.20428799092769623, + "learning_rate": 1.1974793250836128e-06, + "loss": 0.4953, + "step": 6214 + }, + { + "epoch": 1.723516361619523, + "grad_norm": 0.200530007481575, + "learning_rate": 1.1951221031025473e-06, + "loss": 0.5019, + "step": 6215 + }, + { + "epoch": 1.723793677204659, + "grad_norm": 0.21505190432071686, + "learning_rate": 1.1927670870026762e-06, + "loss": 0.5069, + "step": 6216 + }, + { + "epoch": 1.7240709927897948, + "grad_norm": 0.20491503179073334, + "learning_rate": 1.190414277243529e-06, + "loss": 0.4872, + "step": 6217 + }, + { + "epoch": 1.7243483083749307, + "grad_norm": 0.20874449610710144, + "learning_rate": 1.188063674284197e-06, + "loss": 0.4886, + "step": 6218 + }, + { + "epoch": 1.7246256239600666, + "grad_norm": 0.21290519833564758, + "learning_rate": 1.1857152785833451e-06, + "loss": 0.478, + "step": 6219 + }, + { + "epoch": 1.7249029395452025, + "grad_norm": 0.20770753920078278, + "learning_rate": 1.1833690905992081e-06, + "loss": 0.4946, + "step": 6220 + }, + { + "epoch": 1.7251802551303383, + "grad_norm": 0.21284344792366028, + "learning_rate": 1.1810251107895923e-06, + "loss": 0.4757, + "step": 6221 + }, + { + "epoch": 1.7254575707154742, + "grad_norm": 0.20383362472057343, + "learning_rate": 1.1786833396118664e-06, + "loss": 0.4774, + "step": 6222 + }, + { + "epoch": 1.7257348863006101, + "grad_norm": 0.2078067809343338, + "learning_rate": 1.1763437775229744e-06, + "loss": 0.4934, + "step": 6223 + }, + { + "epoch": 1.726012201885746, + "grad_norm": 0.2041560709476471, + "learning_rate": 1.174006424979425e-06, + "loss": 0.5165, + "step": 6224 + }, + { + "epoch": 1.7262895174708819, + "grad_norm": 0.21720266342163086, + "learning_rate": 1.1716712824373006e-06, + "loss": 0.4729, + "step": 6225 + }, + { + "epoch": 1.7265668330560178, + "grad_norm": 0.19432541728019714, + "learning_rate": 1.1693383503522435e-06, + "loss": 0.4889, + "step": 6226 + }, + { + "epoch": 1.7268441486411537, + "grad_norm": 0.20884351432323456, + "learning_rate": 1.1670076291794785e-06, + "loss": 0.4874, + "step": 6227 + }, + { + "epoch": 1.7271214642262895, + "grad_norm": 0.20704929530620575, + "learning_rate": 1.1646791193737848e-06, + "loss": 0.4934, + "step": 6228 + }, + { + "epoch": 1.7273987798114254, + "grad_norm": 0.2176746129989624, + "learning_rate": 1.1623528213895174e-06, + "loss": 0.4709, + "step": 6229 + }, + { + "epoch": 1.7276760953965613, + "grad_norm": 0.20258481800556183, + "learning_rate": 1.160028735680603e-06, + "loss": 0.4843, + "step": 6230 + }, + { + "epoch": 1.7279534109816972, + "grad_norm": 0.20934736728668213, + "learning_rate": 1.1577068627005264e-06, + "loss": 0.4852, + "step": 6231 + }, + { + "epoch": 1.728230726566833, + "grad_norm": 0.20559830963611603, + "learning_rate": 1.1553872029023498e-06, + "loss": 0.4676, + "step": 6232 + }, + { + "epoch": 1.728508042151969, + "grad_norm": 0.20282293856143951, + "learning_rate": 1.1530697567387019e-06, + "loss": 0.474, + "step": 6233 + }, + { + "epoch": 1.7287853577371048, + "grad_norm": 0.20379206538200378, + "learning_rate": 1.1507545246617763e-06, + "loss": 0.4831, + "step": 6234 + }, + { + "epoch": 1.7290626733222407, + "grad_norm": 0.20645445585250854, + "learning_rate": 1.1484415071233322e-06, + "loss": 0.4729, + "step": 6235 + }, + { + "epoch": 1.7293399889073766, + "grad_norm": 0.20266611874103546, + "learning_rate": 1.1461307045747035e-06, + "loss": 0.4816, + "step": 6236 + }, + { + "epoch": 1.7296173044925125, + "grad_norm": 0.20587818324565887, + "learning_rate": 1.1438221174667931e-06, + "loss": 0.5129, + "step": 6237 + }, + { + "epoch": 1.7298946200776484, + "grad_norm": 0.21184997260570526, + "learning_rate": 1.1415157462500631e-06, + "loss": 0.5172, + "step": 6238 + }, + { + "epoch": 1.7301719356627843, + "grad_norm": 0.21965019404888153, + "learning_rate": 1.1392115913745436e-06, + "loss": 0.5091, + "step": 6239 + }, + { + "epoch": 1.7304492512479202, + "grad_norm": 0.2110959142446518, + "learning_rate": 1.1369096532898458e-06, + "loss": 0.4752, + "step": 6240 + }, + { + "epoch": 1.730726566833056, + "grad_norm": 0.20495730638504028, + "learning_rate": 1.134609932445134e-06, + "loss": 0.4793, + "step": 6241 + }, + { + "epoch": 1.731003882418192, + "grad_norm": 0.20782862603664398, + "learning_rate": 1.132312429289145e-06, + "loss": 0.5282, + "step": 6242 + }, + { + "epoch": 1.7312811980033278, + "grad_norm": 0.20263516902923584, + "learning_rate": 1.1300171442701776e-06, + "loss": 0.4969, + "step": 6243 + }, + { + "epoch": 1.7315585135884637, + "grad_norm": 0.2159367799758911, + "learning_rate": 1.127724077836112e-06, + "loss": 0.489, + "step": 6244 + }, + { + "epoch": 1.7318358291735996, + "grad_norm": 0.2180434763431549, + "learning_rate": 1.1254332304343806e-06, + "loss": 0.4872, + "step": 6245 + }, + { + "epoch": 1.7321131447587355, + "grad_norm": 0.20098507404327393, + "learning_rate": 1.1231446025119885e-06, + "loss": 0.5097, + "step": 6246 + }, + { + "epoch": 1.7323904603438713, + "grad_norm": 0.20740365982055664, + "learning_rate": 1.1208581945155075e-06, + "loss": 0.4777, + "step": 6247 + }, + { + "epoch": 1.7326677759290072, + "grad_norm": 0.2009047120809555, + "learning_rate": 1.1185740068910807e-06, + "loss": 0.4916, + "step": 6248 + }, + { + "epoch": 1.732945091514143, + "grad_norm": 0.2049127072095871, + "learning_rate": 1.1162920400844102e-06, + "loss": 0.493, + "step": 6249 + }, + { + "epoch": 1.733222407099279, + "grad_norm": 0.21437804400920868, + "learning_rate": 1.1140122945407644e-06, + "loss": 0.4864, + "step": 6250 + }, + { + "epoch": 1.7334997226844149, + "grad_norm": 0.20494471490383148, + "learning_rate": 1.1117347707049876e-06, + "loss": 0.4845, + "step": 6251 + }, + { + "epoch": 1.7337770382695508, + "grad_norm": 0.2116573303937912, + "learning_rate": 1.1094594690214858e-06, + "loss": 0.4745, + "step": 6252 + }, + { + "epoch": 1.7340543538546866, + "grad_norm": 0.2070380300283432, + "learning_rate": 1.1071863899342255e-06, + "loss": 0.4956, + "step": 6253 + }, + { + "epoch": 1.7343316694398225, + "grad_norm": 0.2108084261417389, + "learning_rate": 1.1049155338867466e-06, + "loss": 0.4917, + "step": 6254 + }, + { + "epoch": 1.7346089850249584, + "grad_norm": 0.20470231771469116, + "learning_rate": 1.1026469013221574e-06, + "loss": 0.4788, + "step": 6255 + }, + { + "epoch": 1.7348863006100943, + "grad_norm": 0.20482300221920013, + "learning_rate": 1.1003804926831243e-06, + "loss": 0.4933, + "step": 6256 + }, + { + "epoch": 1.7351636161952302, + "grad_norm": 0.23574180901050568, + "learning_rate": 1.0981163084118825e-06, + "loss": 0.4953, + "step": 6257 + }, + { + "epoch": 1.735440931780366, + "grad_norm": 0.19923263788223267, + "learning_rate": 1.095854348950237e-06, + "loss": 0.4594, + "step": 6258 + }, + { + "epoch": 1.735718247365502, + "grad_norm": 0.23000024259090424, + "learning_rate": 1.0935946147395556e-06, + "loss": 0.4972, + "step": 6259 + }, + { + "epoch": 1.7359955629506378, + "grad_norm": 0.2056475728750229, + "learning_rate": 1.0913371062207702e-06, + "loss": 0.4875, + "step": 6260 + }, + { + "epoch": 1.7362728785357737, + "grad_norm": 0.21127453446388245, + "learning_rate": 1.0890818238343856e-06, + "loss": 0.4894, + "step": 6261 + }, + { + "epoch": 1.7365501941209096, + "grad_norm": 0.21046780049800873, + "learning_rate": 1.0868287680204606e-06, + "loss": 0.499, + "step": 6262 + }, + { + "epoch": 1.7368275097060455, + "grad_norm": 0.2049138844013214, + "learning_rate": 1.0845779392186311e-06, + "loss": 0.5112, + "step": 6263 + }, + { + "epoch": 1.7371048252911814, + "grad_norm": 0.20450134575366974, + "learning_rate": 1.0823293378680904e-06, + "loss": 0.4916, + "step": 6264 + }, + { + "epoch": 1.7373821408763173, + "grad_norm": 0.20806051790714264, + "learning_rate": 1.0800829644076027e-06, + "loss": 0.4837, + "step": 6265 + }, + { + "epoch": 1.7376594564614531, + "grad_norm": 0.20882248878479004, + "learning_rate": 1.0778388192754909e-06, + "loss": 0.4949, + "step": 6266 + }, + { + "epoch": 1.737936772046589, + "grad_norm": 0.2119029462337494, + "learning_rate": 1.0755969029096508e-06, + "loss": 0.4759, + "step": 6267 + }, + { + "epoch": 1.738214087631725, + "grad_norm": 0.20091450214385986, + "learning_rate": 1.0733572157475405e-06, + "loss": 0.4907, + "step": 6268 + }, + { + "epoch": 1.7384914032168608, + "grad_norm": 0.295280784368515, + "learning_rate": 1.0711197582261786e-06, + "loss": 0.5013, + "step": 6269 + }, + { + "epoch": 1.7387687188019967, + "grad_norm": 0.21129965782165527, + "learning_rate": 1.0688845307821577e-06, + "loss": 0.5146, + "step": 6270 + }, + { + "epoch": 1.7390460343871326, + "grad_norm": 0.20647571980953217, + "learning_rate": 1.0666515338516232e-06, + "loss": 0.4848, + "step": 6271 + }, + { + "epoch": 1.7393233499722685, + "grad_norm": 0.20426985621452332, + "learning_rate": 1.064420767870297e-06, + "loss": 0.4568, + "step": 6272 + }, + { + "epoch": 1.7396006655574043, + "grad_norm": 0.20671120285987854, + "learning_rate": 1.0621922332734568e-06, + "loss": 0.477, + "step": 6273 + }, + { + "epoch": 1.7398779811425402, + "grad_norm": 0.20746487379074097, + "learning_rate": 1.0599659304959514e-06, + "loss": 0.5096, + "step": 6274 + }, + { + "epoch": 1.740155296727676, + "grad_norm": 0.21173517405986786, + "learning_rate": 1.057741859972193e-06, + "loss": 0.4759, + "step": 6275 + }, + { + "epoch": 1.740432612312812, + "grad_norm": 0.20655114948749542, + "learning_rate": 1.0555200221361556e-06, + "loss": 0.4914, + "step": 6276 + }, + { + "epoch": 1.7407099278979479, + "grad_norm": 0.2177557796239853, + "learning_rate": 1.053300417421374e-06, + "loss": 0.4762, + "step": 6277 + }, + { + "epoch": 1.7409872434830838, + "grad_norm": 0.1989573985338211, + "learning_rate": 1.0510830462609561e-06, + "loss": 0.4646, + "step": 6278 + }, + { + "epoch": 1.7412645590682196, + "grad_norm": 0.20482690632343292, + "learning_rate": 1.0488679090875711e-06, + "loss": 0.4705, + "step": 6279 + }, + { + "epoch": 1.7415418746533555, + "grad_norm": 0.19922995567321777, + "learning_rate": 1.0466550063334497e-06, + "loss": 0.4826, + "step": 6280 + }, + { + "epoch": 1.7418191902384914, + "grad_norm": 0.2140817493200302, + "learning_rate": 1.0444443384303823e-06, + "loss": 0.4991, + "step": 6281 + }, + { + "epoch": 1.7420965058236273, + "grad_norm": 0.2041434794664383, + "learning_rate": 1.0422359058097382e-06, + "loss": 0.4831, + "step": 6282 + }, + { + "epoch": 1.7423738214087632, + "grad_norm": 0.2063605785369873, + "learning_rate": 1.0400297089024373e-06, + "loss": 0.5162, + "step": 6283 + }, + { + "epoch": 1.742651136993899, + "grad_norm": 0.21836838126182556, + "learning_rate": 1.0378257481389639e-06, + "loss": 0.4971, + "step": 6284 + }, + { + "epoch": 1.742928452579035, + "grad_norm": 0.2118816375732422, + "learning_rate": 1.0356240239493705e-06, + "loss": 0.4815, + "step": 6285 + }, + { + "epoch": 1.7432057681641708, + "grad_norm": 0.20386700332164764, + "learning_rate": 1.0334245367632764e-06, + "loss": 0.4998, + "step": 6286 + }, + { + "epoch": 1.7434830837493067, + "grad_norm": 0.2046600878238678, + "learning_rate": 1.031227287009856e-06, + "loss": 0.4958, + "step": 6287 + }, + { + "epoch": 1.7437603993344426, + "grad_norm": 0.20780619978904724, + "learning_rate": 1.0290322751178486e-06, + "loss": 0.5133, + "step": 6288 + }, + { + "epoch": 1.7440377149195785, + "grad_norm": 0.22479791939258575, + "learning_rate": 1.0268395015155613e-06, + "loss": 0.4936, + "step": 6289 + }, + { + "epoch": 1.7443150305047144, + "grad_norm": 0.2186380922794342, + "learning_rate": 1.0246489666308672e-06, + "loss": 0.5165, + "step": 6290 + }, + { + "epoch": 1.7445923460898503, + "grad_norm": 0.2129991352558136, + "learning_rate": 1.0224606708911913e-06, + "loss": 0.5162, + "step": 6291 + }, + { + "epoch": 1.7448696616749861, + "grad_norm": 0.2101704329252243, + "learning_rate": 1.0202746147235265e-06, + "loss": 0.5055, + "step": 6292 + }, + { + "epoch": 1.745146977260122, + "grad_norm": 0.21138980984687805, + "learning_rate": 1.0180907985544387e-06, + "loss": 0.4946, + "step": 6293 + }, + { + "epoch": 1.745424292845258, + "grad_norm": 0.20843324065208435, + "learning_rate": 1.0159092228100437e-06, + "loss": 0.4729, + "step": 6294 + }, + { + "epoch": 1.7457016084303938, + "grad_norm": 0.20566268265247345, + "learning_rate": 1.0137298879160221e-06, + "loss": 0.4515, + "step": 6295 + }, + { + "epoch": 1.7459789240155297, + "grad_norm": 0.2100181132555008, + "learning_rate": 1.0115527942976224e-06, + "loss": 0.4772, + "step": 6296 + }, + { + "epoch": 1.7462562396006656, + "grad_norm": 0.2049798220396042, + "learning_rate": 1.009377942379655e-06, + "loss": 0.4771, + "step": 6297 + }, + { + "epoch": 1.7465335551858014, + "grad_norm": 0.2053791582584381, + "learning_rate": 1.0072053325864875e-06, + "loss": 0.4837, + "step": 6298 + }, + { + "epoch": 1.7468108707709373, + "grad_norm": 0.20660558342933655, + "learning_rate": 1.005034965342057e-06, + "loss": 0.5195, + "step": 6299 + }, + { + "epoch": 1.7470881863560732, + "grad_norm": 0.21594339609146118, + "learning_rate": 1.0028668410698564e-06, + "loss": 0.4937, + "step": 6300 + }, + { + "epoch": 1.747365501941209, + "grad_norm": 0.21394188702106476, + "learning_rate": 1.000700960192949e-06, + "loss": 0.5009, + "step": 6301 + }, + { + "epoch": 1.747642817526345, + "grad_norm": 0.2106638103723526, + "learning_rate": 9.985373231339496e-07, + "loss": 0.4723, + "step": 6302 + }, + { + "epoch": 1.7479201331114809, + "grad_norm": 0.20718468725681305, + "learning_rate": 9.963759303150453e-07, + "loss": 0.4694, + "step": 6303 + }, + { + "epoch": 1.7481974486966168, + "grad_norm": 0.2988225817680359, + "learning_rate": 9.942167821579778e-07, + "loss": 0.5089, + "step": 6304 + }, + { + "epoch": 1.7484747642817526, + "grad_norm": 0.21094036102294922, + "learning_rate": 9.920598790840562e-07, + "loss": 0.4992, + "step": 6305 + }, + { + "epoch": 1.7487520798668885, + "grad_norm": 0.20664121210575104, + "learning_rate": 9.899052215141522e-07, + "loss": 0.4853, + "step": 6306 + }, + { + "epoch": 1.7490293954520244, + "grad_norm": 0.21847885847091675, + "learning_rate": 9.877528098686917e-07, + "loss": 0.5011, + "step": 6307 + }, + { + "epoch": 1.7493067110371603, + "grad_norm": 0.22634607553482056, + "learning_rate": 9.856026445676709e-07, + "loss": 0.508, + "step": 6308 + }, + { + "epoch": 1.7495840266222962, + "grad_norm": 0.21793290972709656, + "learning_rate": 9.8345472603064e-07, + "loss": 0.5396, + "step": 6309 + }, + { + "epoch": 1.749861342207432, + "grad_norm": 0.2086942344903946, + "learning_rate": 9.813090546767184e-07, + "loss": 0.5171, + "step": 6310 + }, + { + "epoch": 1.750138657792568, + "grad_norm": 0.21406447887420654, + "learning_rate": 9.79165630924582e-07, + "loss": 0.5021, + "step": 6311 + }, + { + "epoch": 1.7504159733777038, + "grad_norm": 0.20880061388015747, + "learning_rate": 9.770244551924683e-07, + "loss": 0.5135, + "step": 6312 + }, + { + "epoch": 1.7506932889628397, + "grad_norm": 0.2138943374156952, + "learning_rate": 9.748855278981811e-07, + "loss": 0.5149, + "step": 6313 + }, + { + "epoch": 1.7509706045479756, + "grad_norm": 0.20010744035243988, + "learning_rate": 9.727488494590805e-07, + "loss": 0.5055, + "step": 6314 + }, + { + "epoch": 1.7512479201331115, + "grad_norm": 0.2039041370153427, + "learning_rate": 9.706144202920848e-07, + "loss": 0.4796, + "step": 6315 + }, + { + "epoch": 1.7515252357182474, + "grad_norm": 0.1957438588142395, + "learning_rate": 9.684822408136813e-07, + "loss": 0.4513, + "step": 6316 + }, + { + "epoch": 1.7518025513033832, + "grad_norm": 0.20559154450893402, + "learning_rate": 9.663523114399179e-07, + "loss": 0.5066, + "step": 6317 + }, + { + "epoch": 1.7520798668885191, + "grad_norm": 0.20115143060684204, + "learning_rate": 9.642246325863954e-07, + "loss": 0.53, + "step": 6318 + }, + { + "epoch": 1.752357182473655, + "grad_norm": 0.22017818689346313, + "learning_rate": 9.620992046682778e-07, + "loss": 0.4881, + "step": 6319 + }, + { + "epoch": 1.752634498058791, + "grad_norm": 0.20881177484989166, + "learning_rate": 9.599760281002997e-07, + "loss": 0.4777, + "step": 6320 + }, + { + "epoch": 1.7529118136439268, + "grad_norm": 0.24889224767684937, + "learning_rate": 9.578551032967466e-07, + "loss": 0.4785, + "step": 6321 + }, + { + "epoch": 1.7531891292290627, + "grad_norm": 0.21844631433486938, + "learning_rate": 9.557364306714638e-07, + "loss": 0.4774, + "step": 6322 + }, + { + "epoch": 1.7534664448141986, + "grad_norm": 0.2035273164510727, + "learning_rate": 9.536200106378637e-07, + "loss": 0.4977, + "step": 6323 + }, + { + "epoch": 1.7537437603993344, + "grad_norm": 0.19780419766902924, + "learning_rate": 9.515058436089158e-07, + "loss": 0.4964, + "step": 6324 + }, + { + "epoch": 1.7540210759844703, + "grad_norm": 0.19891899824142456, + "learning_rate": 9.493939299971499e-07, + "loss": 0.4999, + "step": 6325 + }, + { + "epoch": 1.7542983915696062, + "grad_norm": 0.20503740012645721, + "learning_rate": 9.472842702146545e-07, + "loss": 0.4901, + "step": 6326 + }, + { + "epoch": 1.754575707154742, + "grad_norm": 0.22530865669250488, + "learning_rate": 9.451768646730808e-07, + "loss": 0.4997, + "step": 6327 + }, + { + "epoch": 1.754853022739878, + "grad_norm": 0.21066060662269592, + "learning_rate": 9.430717137836414e-07, + "loss": 0.4874, + "step": 6328 + }, + { + "epoch": 1.7551303383250139, + "grad_norm": 0.2237623631954193, + "learning_rate": 9.409688179571066e-07, + "loss": 0.4899, + "step": 6329 + }, + { + "epoch": 1.7554076539101497, + "grad_norm": 0.2304733246564865, + "learning_rate": 9.388681776038022e-07, + "loss": 0.4858, + "step": 6330 + }, + { + "epoch": 1.7556849694952856, + "grad_norm": 0.21019956469535828, + "learning_rate": 9.367697931336266e-07, + "loss": 0.5145, + "step": 6331 + }, + { + "epoch": 1.7559622850804215, + "grad_norm": 0.2206750512123108, + "learning_rate": 9.346736649560262e-07, + "loss": 0.5251, + "step": 6332 + }, + { + "epoch": 1.7562396006655574, + "grad_norm": 0.21418681740760803, + "learning_rate": 9.325797934800082e-07, + "loss": 0.4837, + "step": 6333 + }, + { + "epoch": 1.7565169162506933, + "grad_norm": 0.20882757008075714, + "learning_rate": 9.304881791141474e-07, + "loss": 0.5064, + "step": 6334 + }, + { + "epoch": 1.7567942318358292, + "grad_norm": 0.22608445584774017, + "learning_rate": 9.283988222665726e-07, + "loss": 0.5077, + "step": 6335 + }, + { + "epoch": 1.757071547420965, + "grad_norm": 0.20677009224891663, + "learning_rate": 9.263117233449706e-07, + "loss": 0.5135, + "step": 6336 + }, + { + "epoch": 1.757348863006101, + "grad_norm": 0.2091902196407318, + "learning_rate": 9.24226882756589e-07, + "loss": 0.5079, + "step": 6337 + }, + { + "epoch": 1.7576261785912368, + "grad_norm": 0.2035513073205948, + "learning_rate": 9.22144300908237e-07, + "loss": 0.4837, + "step": 6338 + }, + { + "epoch": 1.7579034941763727, + "grad_norm": 0.21183468401432037, + "learning_rate": 9.200639782062842e-07, + "loss": 0.4968, + "step": 6339 + }, + { + "epoch": 1.7581808097615086, + "grad_norm": 0.20480120182037354, + "learning_rate": 9.179859150566503e-07, + "loss": 0.5, + "step": 6340 + }, + { + "epoch": 1.7584581253466445, + "grad_norm": 0.20064327120780945, + "learning_rate": 9.159101118648276e-07, + "loss": 0.4982, + "step": 6341 + }, + { + "epoch": 1.7587354409317804, + "grad_norm": 0.22214539349079132, + "learning_rate": 9.13836569035853e-07, + "loss": 0.5245, + "step": 6342 + }, + { + "epoch": 1.7590127565169162, + "grad_norm": 0.23857805132865906, + "learning_rate": 9.117652869743365e-07, + "loss": 0.5035, + "step": 6343 + }, + { + "epoch": 1.7592900721020521, + "grad_norm": 0.21097144484519958, + "learning_rate": 9.096962660844352e-07, + "loss": 0.4751, + "step": 6344 + }, + { + "epoch": 1.759567387687188, + "grad_norm": 0.20220641791820526, + "learning_rate": 9.076295067698707e-07, + "loss": 0.4813, + "step": 6345 + }, + { + "epoch": 1.759844703272324, + "grad_norm": 0.20024532079696655, + "learning_rate": 9.055650094339257e-07, + "loss": 0.4912, + "step": 6346 + }, + { + "epoch": 1.7601220188574598, + "grad_norm": 0.2002626657485962, + "learning_rate": 9.035027744794353e-07, + "loss": 0.4897, + "step": 6347 + }, + { + "epoch": 1.7603993344425957, + "grad_norm": 0.20744280517101288, + "learning_rate": 9.014428023087981e-07, + "loss": 0.5057, + "step": 6348 + }, + { + "epoch": 1.7606766500277315, + "grad_norm": 0.21660728752613068, + "learning_rate": 8.993850933239664e-07, + "loss": 0.5014, + "step": 6349 + }, + { + "epoch": 1.7609539656128674, + "grad_norm": 0.22033831477165222, + "learning_rate": 8.973296479264564e-07, + "loss": 0.4623, + "step": 6350 + }, + { + "epoch": 1.7612312811980033, + "grad_norm": 0.1961023062467575, + "learning_rate": 8.952764665173378e-07, + "loss": 0.458, + "step": 6351 + }, + { + "epoch": 1.7615085967831392, + "grad_norm": 0.20602113008499146, + "learning_rate": 8.932255494972452e-07, + "loss": 0.487, + "step": 6352 + }, + { + "epoch": 1.761785912368275, + "grad_norm": 0.21287092566490173, + "learning_rate": 8.911768972663603e-07, + "loss": 0.5232, + "step": 6353 + }, + { + "epoch": 1.762063227953411, + "grad_norm": 0.20058588683605194, + "learning_rate": 8.891305102244326e-07, + "loss": 0.4714, + "step": 6354 + }, + { + "epoch": 1.7623405435385469, + "grad_norm": 0.22055946290493011, + "learning_rate": 8.870863887707678e-07, + "loss": 0.5222, + "step": 6355 + }, + { + "epoch": 1.7626178591236827, + "grad_norm": 0.2032008171081543, + "learning_rate": 8.850445333042287e-07, + "loss": 0.5174, + "step": 6356 + }, + { + "epoch": 1.7628951747088186, + "grad_norm": 0.22134113311767578, + "learning_rate": 8.830049442232285e-07, + "loss": 0.5117, + "step": 6357 + }, + { + "epoch": 1.7631724902939545, + "grad_norm": 0.2134564369916916, + "learning_rate": 8.809676219257546e-07, + "loss": 0.5069, + "step": 6358 + }, + { + "epoch": 1.7634498058790904, + "grad_norm": 0.21325929462909698, + "learning_rate": 8.789325668093374e-07, + "loss": 0.5023, + "step": 6359 + }, + { + "epoch": 1.7637271214642263, + "grad_norm": 0.20757299661636353, + "learning_rate": 8.768997792710689e-07, + "loss": 0.5152, + "step": 6360 + }, + { + "epoch": 1.7640044370493622, + "grad_norm": 0.2015492469072342, + "learning_rate": 8.74869259707603e-07, + "loss": 0.4596, + "step": 6361 + }, + { + "epoch": 1.764281752634498, + "grad_norm": 0.23414163291454315, + "learning_rate": 8.728410085151476e-07, + "loss": 0.4936, + "step": 6362 + }, + { + "epoch": 1.764559068219634, + "grad_norm": 0.21143180131912231, + "learning_rate": 8.708150260894666e-07, + "loss": 0.5004, + "step": 6363 + }, + { + "epoch": 1.7648363838047698, + "grad_norm": 0.2277495414018631, + "learning_rate": 8.687913128258826e-07, + "loss": 0.4884, + "step": 6364 + }, + { + "epoch": 1.7651136993899057, + "grad_norm": 0.21437212824821472, + "learning_rate": 8.667698691192758e-07, + "loss": 0.504, + "step": 6365 + }, + { + "epoch": 1.7653910149750416, + "grad_norm": 0.20440204441547394, + "learning_rate": 8.64750695364086e-07, + "loss": 0.4866, + "step": 6366 + }, + { + "epoch": 1.7656683305601775, + "grad_norm": 0.2076803594827652, + "learning_rate": 8.627337919543066e-07, + "loss": 0.5107, + "step": 6367 + }, + { + "epoch": 1.7659456461453134, + "grad_norm": 0.21426992118358612, + "learning_rate": 8.607191592834824e-07, + "loss": 0.4913, + "step": 6368 + }, + { + "epoch": 1.7662229617304492, + "grad_norm": 0.24520544707775116, + "learning_rate": 8.587067977447321e-07, + "loss": 0.5185, + "step": 6369 + }, + { + "epoch": 1.7665002773155851, + "grad_norm": 0.20242126286029816, + "learning_rate": 8.566967077307156e-07, + "loss": 0.4838, + "step": 6370 + }, + { + "epoch": 1.766777592900721, + "grad_norm": 0.2145501673221588, + "learning_rate": 8.546888896336536e-07, + "loss": 0.4861, + "step": 6371 + }, + { + "epoch": 1.767054908485857, + "grad_norm": 0.21382984519004822, + "learning_rate": 8.526833438453258e-07, + "loss": 0.4972, + "step": 6372 + }, + { + "epoch": 1.7673322240709928, + "grad_norm": 0.20749729871749878, + "learning_rate": 8.506800707570691e-07, + "loss": 0.4799, + "step": 6373 + }, + { + "epoch": 1.7676095396561287, + "grad_norm": 0.20935972034931183, + "learning_rate": 8.486790707597725e-07, + "loss": 0.4861, + "step": 6374 + }, + { + "epoch": 1.7678868552412645, + "grad_norm": 0.21835561096668243, + "learning_rate": 8.466803442438844e-07, + "loss": 0.5149, + "step": 6375 + }, + { + "epoch": 1.7681641708264004, + "grad_norm": 0.2116965502500534, + "learning_rate": 8.446838915994099e-07, + "loss": 0.5302, + "step": 6376 + }, + { + "epoch": 1.7684414864115363, + "grad_norm": 0.20836491882801056, + "learning_rate": 8.42689713215912e-07, + "loss": 0.5131, + "step": 6377 + }, + { + "epoch": 1.7687188019966722, + "grad_norm": 0.19809886813163757, + "learning_rate": 8.406978094825033e-07, + "loss": 0.5147, + "step": 6378 + }, + { + "epoch": 1.768996117581808, + "grad_norm": 0.20016132295131683, + "learning_rate": 8.387081807878602e-07, + "loss": 0.4531, + "step": 6379 + }, + { + "epoch": 1.769273433166944, + "grad_norm": 0.20489495992660522, + "learning_rate": 8.367208275202112e-07, + "loss": 0.4955, + "step": 6380 + }, + { + "epoch": 1.7695507487520798, + "grad_norm": 0.22036077082157135, + "learning_rate": 8.34735750067342e-07, + "loss": 0.4954, + "step": 6381 + }, + { + "epoch": 1.7698280643372157, + "grad_norm": 0.19575130939483643, + "learning_rate": 8.327529488165925e-07, + "loss": 0.4695, + "step": 6382 + }, + { + "epoch": 1.7701053799223516, + "grad_norm": 0.20686204731464386, + "learning_rate": 8.30772424154859e-07, + "loss": 0.4713, + "step": 6383 + }, + { + "epoch": 1.7703826955074875, + "grad_norm": 0.2088318020105362, + "learning_rate": 8.287941764685989e-07, + "loss": 0.5319, + "step": 6384 + }, + { + "epoch": 1.7706600110926234, + "grad_norm": 0.20468567311763763, + "learning_rate": 8.26818206143816e-07, + "loss": 0.5164, + "step": 6385 + }, + { + "epoch": 1.7709373266777593, + "grad_norm": 0.22691620886325836, + "learning_rate": 8.248445135660782e-07, + "loss": 0.5026, + "step": 6386 + }, + { + "epoch": 1.7712146422628952, + "grad_norm": 0.2066589593887329, + "learning_rate": 8.22873099120501e-07, + "loss": 0.5169, + "step": 6387 + }, + { + "epoch": 1.771491957848031, + "grad_norm": 0.2105691283941269, + "learning_rate": 8.209039631917631e-07, + "loss": 0.5144, + "step": 6388 + }, + { + "epoch": 1.771769273433167, + "grad_norm": 0.23009151220321655, + "learning_rate": 8.18937106164093e-07, + "loss": 0.5291, + "step": 6389 + }, + { + "epoch": 1.7720465890183028, + "grad_norm": 0.20932504534721375, + "learning_rate": 8.169725284212781e-07, + "loss": 0.5105, + "step": 6390 + }, + { + "epoch": 1.7723239046034387, + "grad_norm": 0.2107200175523758, + "learning_rate": 8.150102303466578e-07, + "loss": 0.4886, + "step": 6391 + }, + { + "epoch": 1.7726012201885746, + "grad_norm": 0.20422813296318054, + "learning_rate": 8.130502123231285e-07, + "loss": 0.5055, + "step": 6392 + }, + { + "epoch": 1.7728785357737105, + "grad_norm": 0.20422576367855072, + "learning_rate": 8.110924747331467e-07, + "loss": 0.5034, + "step": 6393 + }, + { + "epoch": 1.7731558513588463, + "grad_norm": 0.21044956147670746, + "learning_rate": 8.091370179587124e-07, + "loss": 0.5029, + "step": 6394 + }, + { + "epoch": 1.7734331669439822, + "grad_norm": 0.20264363288879395, + "learning_rate": 8.071838423813885e-07, + "loss": 0.5073, + "step": 6395 + }, + { + "epoch": 1.7737104825291181, + "grad_norm": 0.2157750427722931, + "learning_rate": 8.052329483822924e-07, + "loss": 0.4861, + "step": 6396 + }, + { + "epoch": 1.773987798114254, + "grad_norm": 0.20734168589115143, + "learning_rate": 8.032843363420972e-07, + "loss": 0.5013, + "step": 6397 + }, + { + "epoch": 1.7742651136993899, + "grad_norm": 0.23018255829811096, + "learning_rate": 8.013380066410253e-07, + "loss": 0.5124, + "step": 6398 + }, + { + "epoch": 1.7745424292845258, + "grad_norm": 0.19958136975765228, + "learning_rate": 7.993939596588576e-07, + "loss": 0.4944, + "step": 6399 + }, + { + "epoch": 1.7748197448696617, + "grad_norm": 0.2005843073129654, + "learning_rate": 7.974521957749309e-07, + "loss": 0.4886, + "step": 6400 + }, + { + "epoch": 1.7750970604547975, + "grad_norm": 0.20511426031589508, + "learning_rate": 7.955127153681352e-07, + "loss": 0.4816, + "step": 6401 + }, + { + "epoch": 1.7753743760399334, + "grad_norm": 0.19508033990859985, + "learning_rate": 7.935755188169122e-07, + "loss": 0.4817, + "step": 6402 + }, + { + "epoch": 1.7756516916250693, + "grad_norm": 0.2073792815208435, + "learning_rate": 7.916406064992596e-07, + "loss": 0.4902, + "step": 6403 + }, + { + "epoch": 1.7759290072102052, + "grad_norm": 0.2069348245859146, + "learning_rate": 7.89707978792735e-07, + "loss": 0.5026, + "step": 6404 + }, + { + "epoch": 1.776206322795341, + "grad_norm": 0.21495787799358368, + "learning_rate": 7.87777636074441e-07, + "loss": 0.4647, + "step": 6405 + }, + { + "epoch": 1.776483638380477, + "grad_norm": 0.20726212859153748, + "learning_rate": 7.858495787210373e-07, + "loss": 0.4705, + "step": 6406 + }, + { + "epoch": 1.7767609539656128, + "grad_norm": 0.2127843052148819, + "learning_rate": 7.83923807108744e-07, + "loss": 0.5182, + "step": 6407 + }, + { + "epoch": 1.7770382695507487, + "grad_norm": 0.20020431280136108, + "learning_rate": 7.820003216133284e-07, + "loss": 0.4982, + "step": 6408 + }, + { + "epoch": 1.7773155851358846, + "grad_norm": 0.21346312761306763, + "learning_rate": 7.800791226101112e-07, + "loss": 0.5011, + "step": 6409 + }, + { + "epoch": 1.7775929007210205, + "grad_norm": 0.20786842703819275, + "learning_rate": 7.781602104739674e-07, + "loss": 0.4825, + "step": 6410 + }, + { + "epoch": 1.7778702163061564, + "grad_norm": 0.21008695662021637, + "learning_rate": 7.762435855793352e-07, + "loss": 0.5017, + "step": 6411 + }, + { + "epoch": 1.7781475318912923, + "grad_norm": 0.22787503898143768, + "learning_rate": 7.743292483001944e-07, + "loss": 0.5009, + "step": 6412 + }, + { + "epoch": 1.7784248474764282, + "grad_norm": 0.21107859909534454, + "learning_rate": 7.724171990100809e-07, + "loss": 0.5056, + "step": 6413 + }, + { + "epoch": 1.778702163061564, + "grad_norm": 0.20022447407245636, + "learning_rate": 7.705074380820881e-07, + "loss": 0.4997, + "step": 6414 + }, + { + "epoch": 1.7789794786467, + "grad_norm": 0.20294740796089172, + "learning_rate": 7.685999658888637e-07, + "loss": 0.4944, + "step": 6415 + }, + { + "epoch": 1.7792567942318358, + "grad_norm": 0.21547819674015045, + "learning_rate": 7.66694782802603e-07, + "loss": 0.5007, + "step": 6416 + }, + { + "epoch": 1.7795341098169717, + "grad_norm": 0.20150557160377502, + "learning_rate": 7.647918891950559e-07, + "loss": 0.5125, + "step": 6417 + }, + { + "epoch": 1.7798114254021076, + "grad_norm": 0.22688019275665283, + "learning_rate": 7.628912854375308e-07, + "loss": 0.4976, + "step": 6418 + }, + { + "epoch": 1.7800887409872435, + "grad_norm": 0.20131336152553558, + "learning_rate": 7.609929719008854e-07, + "loss": 0.4804, + "step": 6419 + }, + { + "epoch": 1.7803660565723793, + "grad_norm": 0.20517846941947937, + "learning_rate": 7.590969489555289e-07, + "loss": 0.5062, + "step": 6420 + }, + { + "epoch": 1.7806433721575152, + "grad_norm": 0.21819156408309937, + "learning_rate": 7.572032169714277e-07, + "loss": 0.4709, + "step": 6421 + }, + { + "epoch": 1.780920687742651, + "grad_norm": 0.21262036263942719, + "learning_rate": 7.553117763180989e-07, + "loss": 0.4938, + "step": 6422 + }, + { + "epoch": 1.781198003327787, + "grad_norm": 0.2089860737323761, + "learning_rate": 7.534226273646111e-07, + "loss": 0.542, + "step": 6423 + }, + { + "epoch": 1.7814753189129229, + "grad_norm": 0.20706579089164734, + "learning_rate": 7.515357704795902e-07, + "loss": 0.5062, + "step": 6424 + }, + { + "epoch": 1.7817526344980588, + "grad_norm": 0.21182586252689362, + "learning_rate": 7.496512060312086e-07, + "loss": 0.4642, + "step": 6425 + }, + { + "epoch": 1.7820299500831946, + "grad_norm": 0.21045905351638794, + "learning_rate": 7.477689343871983e-07, + "loss": 0.5155, + "step": 6426 + }, + { + "epoch": 1.7823072656683305, + "grad_norm": 0.21174542605876923, + "learning_rate": 7.458889559148363e-07, + "loss": 0.5125, + "step": 6427 + }, + { + "epoch": 1.7825845812534664, + "grad_norm": 0.21431586146354675, + "learning_rate": 7.440112709809599e-07, + "loss": 0.5094, + "step": 6428 + }, + { + "epoch": 1.7828618968386023, + "grad_norm": 0.21488359570503235, + "learning_rate": 7.421358799519524e-07, + "loss": 0.5196, + "step": 6429 + }, + { + "epoch": 1.7831392124237382, + "grad_norm": 0.21055498719215393, + "learning_rate": 7.402627831937528e-07, + "loss": 0.5338, + "step": 6430 + }, + { + "epoch": 1.783416528008874, + "grad_norm": 0.19541572034358978, + "learning_rate": 7.383919810718537e-07, + "loss": 0.4689, + "step": 6431 + }, + { + "epoch": 1.78369384359401, + "grad_norm": 0.22601917386054993, + "learning_rate": 7.365234739512977e-07, + "loss": 0.4881, + "step": 6432 + }, + { + "epoch": 1.7839711591791458, + "grad_norm": 0.20422977209091187, + "learning_rate": 7.34657262196678e-07, + "loss": 0.489, + "step": 6433 + }, + { + "epoch": 1.7842484747642817, + "grad_norm": 0.21061384677886963, + "learning_rate": 7.327933461721434e-07, + "loss": 0.523, + "step": 6434 + }, + { + "epoch": 1.7845257903494176, + "grad_norm": 0.23255345225334167, + "learning_rate": 7.309317262413948e-07, + "loss": 0.4948, + "step": 6435 + }, + { + "epoch": 1.7848031059345535, + "grad_norm": 0.20327569544315338, + "learning_rate": 7.290724027676791e-07, + "loss": 0.4884, + "step": 6436 + }, + { + "epoch": 1.7850804215196894, + "grad_norm": 0.20791566371917725, + "learning_rate": 7.272153761138031e-07, + "loss": 0.487, + "step": 6437 + }, + { + "epoch": 1.7853577371048253, + "grad_norm": 0.19935283064842224, + "learning_rate": 7.253606466421231e-07, + "loss": 0.4907, + "step": 6438 + }, + { + "epoch": 1.7856350526899611, + "grad_norm": 0.2055501639842987, + "learning_rate": 7.235082147145453e-07, + "loss": 0.4908, + "step": 6439 + }, + { + "epoch": 1.785912368275097, + "grad_norm": 0.20820872485637665, + "learning_rate": 7.216580806925235e-07, + "loss": 0.4897, + "step": 6440 + }, + { + "epoch": 1.786189683860233, + "grad_norm": 0.19953136146068573, + "learning_rate": 7.19810244937072e-07, + "loss": 0.4806, + "step": 6441 + }, + { + "epoch": 1.7864669994453688, + "grad_norm": 0.20614445209503174, + "learning_rate": 7.179647078087548e-07, + "loss": 0.4992, + "step": 6442 + }, + { + "epoch": 1.7867443150305047, + "grad_norm": 0.20100446045398712, + "learning_rate": 7.161214696676813e-07, + "loss": 0.4811, + "step": 6443 + }, + { + "epoch": 1.7870216306156406, + "grad_norm": 0.21023845672607422, + "learning_rate": 7.142805308735151e-07, + "loss": 0.4809, + "step": 6444 + }, + { + "epoch": 1.7872989462007765, + "grad_norm": 0.21367646753787994, + "learning_rate": 7.124418917854745e-07, + "loss": 0.4902, + "step": 6445 + }, + { + "epoch": 1.7875762617859123, + "grad_norm": 0.20011699199676514, + "learning_rate": 7.106055527623279e-07, + "loss": 0.4909, + "step": 6446 + }, + { + "epoch": 1.7878535773710482, + "grad_norm": 0.2133239507675171, + "learning_rate": 7.087715141623916e-07, + "loss": 0.4953, + "step": 6447 + }, + { + "epoch": 1.788130892956184, + "grad_norm": 0.20663748681545258, + "learning_rate": 7.06939776343532e-07, + "loss": 0.4885, + "step": 6448 + }, + { + "epoch": 1.78840820854132, + "grad_norm": 0.2234223186969757, + "learning_rate": 7.051103396631772e-07, + "loss": 0.468, + "step": 6449 + }, + { + "epoch": 1.7886855241264559, + "grad_norm": 0.2111656367778778, + "learning_rate": 7.032832044782959e-07, + "loss": 0.4909, + "step": 6450 + }, + { + "epoch": 1.7889628397115918, + "grad_norm": 0.2099202424287796, + "learning_rate": 7.014583711454053e-07, + "loss": 0.4802, + "step": 6451 + }, + { + "epoch": 1.7892401552967276, + "grad_norm": 0.19917812943458557, + "learning_rate": 6.996358400205849e-07, + "loss": 0.4913, + "step": 6452 + }, + { + "epoch": 1.7895174708818635, + "grad_norm": 0.20266632735729218, + "learning_rate": 6.978156114594583e-07, + "loss": 0.5082, + "step": 6453 + }, + { + "epoch": 1.7897947864669994, + "grad_norm": 0.21638800203800201, + "learning_rate": 6.959976858171985e-07, + "loss": 0.5163, + "step": 6454 + }, + { + "epoch": 1.7900721020521353, + "grad_norm": 0.20662005245685577, + "learning_rate": 6.941820634485299e-07, + "loss": 0.501, + "step": 6455 + }, + { + "epoch": 1.7903494176372712, + "grad_norm": 0.2111278474330902, + "learning_rate": 6.92368744707729e-07, + "loss": 0.5009, + "step": 6456 + }, + { + "epoch": 1.790626733222407, + "grad_norm": 0.21369148790836334, + "learning_rate": 6.905577299486266e-07, + "loss": 0.4807, + "step": 6457 + }, + { + "epoch": 1.790904048807543, + "grad_norm": 0.2006935179233551, + "learning_rate": 6.887490195245941e-07, + "loss": 0.5229, + "step": 6458 + }, + { + "epoch": 1.7911813643926788, + "grad_norm": 0.20024555921554565, + "learning_rate": 6.869426137885604e-07, + "loss": 0.5033, + "step": 6459 + }, + { + "epoch": 1.7914586799778147, + "grad_norm": 0.21740752458572388, + "learning_rate": 6.85138513093006e-07, + "loss": 0.4924, + "step": 6460 + }, + { + "epoch": 1.7917359955629506, + "grad_norm": 0.21370796859264374, + "learning_rate": 6.833367177899564e-07, + "loss": 0.4914, + "step": 6461 + }, + { + "epoch": 1.7920133111480865, + "grad_norm": 0.203489288687706, + "learning_rate": 6.815372282309884e-07, + "loss": 0.4905, + "step": 6462 + }, + { + "epoch": 1.7922906267332224, + "grad_norm": 0.20067156851291656, + "learning_rate": 6.797400447672311e-07, + "loss": 0.4508, + "step": 6463 + }, + { + "epoch": 1.7925679423183583, + "grad_norm": 0.2093934714794159, + "learning_rate": 6.779451677493636e-07, + "loss": 0.5265, + "step": 6464 + }, + { + "epoch": 1.7928452579034941, + "grad_norm": 0.20720593631267548, + "learning_rate": 6.761525975276129e-07, + "loss": 0.4972, + "step": 6465 + }, + { + "epoch": 1.79312257348863, + "grad_norm": 0.21246635913848877, + "learning_rate": 6.7436233445176e-07, + "loss": 0.5076, + "step": 6466 + }, + { + "epoch": 1.793399889073766, + "grad_norm": 0.2040356993675232, + "learning_rate": 6.72574378871127e-07, + "loss": 0.4878, + "step": 6467 + }, + { + "epoch": 1.7936772046589018, + "grad_norm": 0.20411434769630432, + "learning_rate": 6.707887311345959e-07, + "loss": 0.4903, + "step": 6468 + }, + { + "epoch": 1.7939545202440377, + "grad_norm": 0.21358831226825714, + "learning_rate": 6.690053915905908e-07, + "loss": 0.5239, + "step": 6469 + }, + { + "epoch": 1.7942318358291736, + "grad_norm": 0.20531177520751953, + "learning_rate": 6.672243605870918e-07, + "loss": 0.4836, + "step": 6470 + }, + { + "epoch": 1.7945091514143094, + "grad_norm": 0.203311026096344, + "learning_rate": 6.654456384716221e-07, + "loss": 0.4853, + "step": 6471 + }, + { + "epoch": 1.7947864669994453, + "grad_norm": 0.20684535801410675, + "learning_rate": 6.636692255912583e-07, + "loss": 0.4996, + "step": 6472 + }, + { + "epoch": 1.7950637825845812, + "grad_norm": 0.21285724639892578, + "learning_rate": 6.618951222926287e-07, + "loss": 0.5002, + "step": 6473 + }, + { + "epoch": 1.795341098169717, + "grad_norm": 0.201505646109581, + "learning_rate": 6.601233289219036e-07, + "loss": 0.5027, + "step": 6474 + }, + { + "epoch": 1.795618413754853, + "grad_norm": 0.24583852291107178, + "learning_rate": 6.583538458248106e-07, + "loss": 0.4928, + "step": 6475 + }, + { + "epoch": 1.7958957293399889, + "grad_norm": 0.22444890439510345, + "learning_rate": 6.565866733466181e-07, + "loss": 0.4982, + "step": 6476 + }, + { + "epoch": 1.7961730449251248, + "grad_norm": 0.21000193059444427, + "learning_rate": 6.548218118321542e-07, + "loss": 0.4952, + "step": 6477 + }, + { + "epoch": 1.7964503605102606, + "grad_norm": 0.20615287125110626, + "learning_rate": 6.530592616257839e-07, + "loss": 0.4686, + "step": 6478 + }, + { + "epoch": 1.7967276760953965, + "grad_norm": 0.19822682440280914, + "learning_rate": 6.512990230714306e-07, + "loss": 0.5017, + "step": 6479 + }, + { + "epoch": 1.7970049916805324, + "grad_norm": 0.20360395312309265, + "learning_rate": 6.495410965125653e-07, + "loss": 0.4922, + "step": 6480 + }, + { + "epoch": 1.7972823072656683, + "grad_norm": 0.19985002279281616, + "learning_rate": 6.477854822922042e-07, + "loss": 0.4931, + "step": 6481 + }, + { + "epoch": 1.7975596228508042, + "grad_norm": 0.19747690856456757, + "learning_rate": 6.460321807529118e-07, + "loss": 0.4757, + "step": 6482 + }, + { + "epoch": 1.79783693843594, + "grad_norm": 0.2075447291135788, + "learning_rate": 6.44281192236805e-07, + "loss": 0.5065, + "step": 6483 + }, + { + "epoch": 1.798114254021076, + "grad_norm": 0.2010498195886612, + "learning_rate": 6.42532517085552e-07, + "loss": 0.4978, + "step": 6484 + }, + { + "epoch": 1.7983915696062118, + "grad_norm": 0.2031354308128357, + "learning_rate": 6.407861556403633e-07, + "loss": 0.5098, + "step": 6485 + }, + { + "epoch": 1.7986688851913477, + "grad_norm": 0.21396131813526154, + "learning_rate": 6.390421082419939e-07, + "loss": 0.5098, + "step": 6486 + }, + { + "epoch": 1.7989462007764836, + "grad_norm": 0.20978130400180817, + "learning_rate": 6.373003752307649e-07, + "loss": 0.4845, + "step": 6487 + }, + { + "epoch": 1.7992235163616195, + "grad_norm": 0.20987875759601593, + "learning_rate": 6.355609569465279e-07, + "loss": 0.5111, + "step": 6488 + }, + { + "epoch": 1.7995008319467554, + "grad_norm": 0.20756718516349792, + "learning_rate": 6.338238537286892e-07, + "loss": 0.5202, + "step": 6489 + }, + { + "epoch": 1.7997781475318912, + "grad_norm": 0.2095729559659958, + "learning_rate": 6.320890659162041e-07, + "loss": 0.4798, + "step": 6490 + }, + { + "epoch": 1.8000554631170271, + "grad_norm": 0.21260963380336761, + "learning_rate": 6.303565938475794e-07, + "loss": 0.4852, + "step": 6491 + }, + { + "epoch": 1.800332778702163, + "grad_norm": 0.20586569607257843, + "learning_rate": 6.286264378608631e-07, + "loss": 0.4981, + "step": 6492 + }, + { + "epoch": 1.800610094287299, + "grad_norm": 0.20685313642024994, + "learning_rate": 6.268985982936529e-07, + "loss": 0.4848, + "step": 6493 + }, + { + "epoch": 1.8008874098724348, + "grad_norm": 0.21318146586418152, + "learning_rate": 6.251730754830987e-07, + "loss": 0.4873, + "step": 6494 + }, + { + "epoch": 1.8011647254575707, + "grad_norm": 0.21374082565307617, + "learning_rate": 6.234498697658964e-07, + "loss": 0.4889, + "step": 6495 + }, + { + "epoch": 1.8014420410427066, + "grad_norm": 0.20280690491199493, + "learning_rate": 6.217289814782867e-07, + "loss": 0.5096, + "step": 6496 + }, + { + "epoch": 1.8017193566278424, + "grad_norm": 0.2248666137456894, + "learning_rate": 6.200104109560622e-07, + "loss": 0.5, + "step": 6497 + }, + { + "epoch": 1.8019966722129783, + "grad_norm": 0.25136902928352356, + "learning_rate": 6.182941585345603e-07, + "loss": 0.5054, + "step": 6498 + }, + { + "epoch": 1.8022739877981142, + "grad_norm": 0.2106061577796936, + "learning_rate": 6.165802245486684e-07, + "loss": 0.503, + "step": 6499 + }, + { + "epoch": 1.80255130338325, + "grad_norm": 0.19975021481513977, + "learning_rate": 6.148686093328193e-07, + "loss": 0.4912, + "step": 6500 + }, + { + "epoch": 1.802828618968386, + "grad_norm": 0.2124665230512619, + "learning_rate": 6.131593132209942e-07, + "loss": 0.5096, + "step": 6501 + }, + { + "epoch": 1.8031059345535219, + "grad_norm": 0.20581713318824768, + "learning_rate": 6.11452336546725e-07, + "loss": 0.4901, + "step": 6502 + }, + { + "epoch": 1.8033832501386577, + "grad_norm": 0.20466797053813934, + "learning_rate": 6.097476796430856e-07, + "loss": 0.4827, + "step": 6503 + }, + { + "epoch": 1.8036605657237936, + "grad_norm": 0.20932024717330933, + "learning_rate": 6.080453428427003e-07, + "loss": 0.4881, + "step": 6504 + }, + { + "epoch": 1.8039378813089295, + "grad_norm": 0.20006607472896576, + "learning_rate": 6.063453264777391e-07, + "loss": 0.4892, + "step": 6505 + }, + { + "epoch": 1.8042151968940654, + "grad_norm": 0.21569404006004333, + "learning_rate": 6.046476308799217e-07, + "loss": 0.4704, + "step": 6506 + }, + { + "epoch": 1.8044925124792013, + "grad_norm": 0.22104816138744354, + "learning_rate": 6.029522563805123e-07, + "loss": 0.4961, + "step": 6507 + }, + { + "epoch": 1.8047698280643372, + "grad_norm": 0.21142521500587463, + "learning_rate": 6.012592033103253e-07, + "loss": 0.5161, + "step": 6508 + }, + { + "epoch": 1.805047143649473, + "grad_norm": 0.20408941805362701, + "learning_rate": 5.995684719997174e-07, + "loss": 0.512, + "step": 6509 + }, + { + "epoch": 1.805324459234609, + "grad_norm": 0.20980273187160492, + "learning_rate": 5.978800627785968e-07, + "loss": 0.5006, + "step": 6510 + }, + { + "epoch": 1.8056017748197448, + "grad_norm": 0.21785345673561096, + "learning_rate": 5.961939759764181e-07, + "loss": 0.4957, + "step": 6511 + }, + { + "epoch": 1.8058790904048807, + "grad_norm": 0.20818081498146057, + "learning_rate": 5.945102119221793e-07, + "loss": 0.5021, + "step": 6512 + }, + { + "epoch": 1.8061564059900166, + "grad_norm": 0.21114230155944824, + "learning_rate": 5.928287709444285e-07, + "loss": 0.4915, + "step": 6513 + }, + { + "epoch": 1.8064337215751525, + "grad_norm": 0.20450453460216522, + "learning_rate": 5.911496533712577e-07, + "loss": 0.5232, + "step": 6514 + }, + { + "epoch": 1.8067110371602884, + "grad_norm": 0.20697475969791412, + "learning_rate": 5.894728595303101e-07, + "loss": 0.4895, + "step": 6515 + }, + { + "epoch": 1.8069883527454242, + "grad_norm": 0.20897066593170166, + "learning_rate": 5.877983897487699e-07, + "loss": 0.4692, + "step": 6516 + }, + { + "epoch": 1.8072656683305601, + "grad_norm": 0.2387371063232422, + "learning_rate": 5.861262443533716e-07, + "loss": 0.4805, + "step": 6517 + }, + { + "epoch": 1.807542983915696, + "grad_norm": 0.2114652842283249, + "learning_rate": 5.844564236703972e-07, + "loss": 0.5114, + "step": 6518 + }, + { + "epoch": 1.807820299500832, + "grad_norm": 0.19332511723041534, + "learning_rate": 5.827889280256696e-07, + "loss": 0.4668, + "step": 6519 + }, + { + "epoch": 1.8080976150859678, + "grad_norm": 0.22820791602134705, + "learning_rate": 5.811237577445616e-07, + "loss": 0.4955, + "step": 6520 + }, + { + "epoch": 1.8083749306711037, + "grad_norm": 0.2099064439535141, + "learning_rate": 5.79460913151994e-07, + "loss": 0.529, + "step": 6521 + }, + { + "epoch": 1.8086522462562395, + "grad_norm": 0.22128312289714813, + "learning_rate": 5.778003945724322e-07, + "loss": 0.4778, + "step": 6522 + }, + { + "epoch": 1.8089295618413754, + "grad_norm": 0.20738999545574188, + "learning_rate": 5.761422023298851e-07, + "loss": 0.4965, + "step": 6523 + }, + { + "epoch": 1.8092068774265113, + "grad_norm": 0.23788759112358093, + "learning_rate": 5.744863367479092e-07, + "loss": 0.5113, + "step": 6524 + }, + { + "epoch": 1.8094841930116472, + "grad_norm": 0.21730081737041473, + "learning_rate": 5.728327981496112e-07, + "loss": 0.4981, + "step": 6525 + }, + { + "epoch": 1.809761508596783, + "grad_norm": 0.21804404258728027, + "learning_rate": 5.711815868576401e-07, + "loss": 0.4849, + "step": 6526 + }, + { + "epoch": 1.810038824181919, + "grad_norm": 0.21189890801906586, + "learning_rate": 5.695327031941866e-07, + "loss": 0.4909, + "step": 6527 + }, + { + "epoch": 1.8103161397670549, + "grad_norm": 0.2018709033727646, + "learning_rate": 5.678861474809949e-07, + "loss": 0.4799, + "step": 6528 + }, + { + "epoch": 1.8105934553521907, + "grad_norm": 0.22132818400859833, + "learning_rate": 5.662419200393537e-07, + "loss": 0.4723, + "step": 6529 + }, + { + "epoch": 1.8108707709373266, + "grad_norm": 0.2106301486492157, + "learning_rate": 5.646000211900925e-07, + "loss": 0.4924, + "step": 6530 + }, + { + "epoch": 1.8111480865224625, + "grad_norm": 0.20907257497310638, + "learning_rate": 5.629604512535871e-07, + "loss": 0.497, + "step": 6531 + }, + { + "epoch": 1.8114254021075984, + "grad_norm": 0.20625951886177063, + "learning_rate": 5.613232105497649e-07, + "loss": 0.504, + "step": 6532 + }, + { + "epoch": 1.8117027176927343, + "grad_norm": 0.2099820077419281, + "learning_rate": 5.596882993980937e-07, + "loss": 0.4957, + "step": 6533 + }, + { + "epoch": 1.8119800332778702, + "grad_norm": 0.21241778135299683, + "learning_rate": 5.580557181175893e-07, + "loss": 0.4908, + "step": 6534 + }, + { + "epoch": 1.812257348863006, + "grad_norm": 0.20039701461791992, + "learning_rate": 5.564254670268068e-07, + "loss": 0.4982, + "step": 6535 + }, + { + "epoch": 1.8125346644481422, + "grad_norm": 0.22225303947925568, + "learning_rate": 5.547975464438568e-07, + "loss": 0.5102, + "step": 6536 + }, + { + "epoch": 1.812811980033278, + "grad_norm": 0.21028605103492737, + "learning_rate": 5.53171956686388e-07, + "loss": 0.5236, + "step": 6537 + }, + { + "epoch": 1.813089295618414, + "grad_norm": 0.2042209506034851, + "learning_rate": 5.51548698071594e-07, + "loss": 0.4895, + "step": 6538 + }, + { + "epoch": 1.8133666112035498, + "grad_norm": 0.21611033380031586, + "learning_rate": 5.499277709162171e-07, + "loss": 0.4968, + "step": 6539 + }, + { + "epoch": 1.8136439267886857, + "grad_norm": 0.20384977757930756, + "learning_rate": 5.483091755365461e-07, + "loss": 0.492, + "step": 6540 + }, + { + "epoch": 1.8139212423738216, + "grad_norm": 0.1995922178030014, + "learning_rate": 5.466929122484075e-07, + "loss": 0.4917, + "step": 6541 + }, + { + "epoch": 1.8141985579589575, + "grad_norm": 0.20135626196861267, + "learning_rate": 5.450789813671781e-07, + "loss": 0.5002, + "step": 6542 + }, + { + "epoch": 1.8144758735440933, + "grad_norm": 0.21459290385246277, + "learning_rate": 5.434673832077783e-07, + "loss": 0.4684, + "step": 6543 + }, + { + "epoch": 1.8147531891292292, + "grad_norm": 0.2142772525548935, + "learning_rate": 5.41858118084676e-07, + "loss": 0.4991, + "step": 6544 + }, + { + "epoch": 1.8150305047143651, + "grad_norm": 0.20316624641418457, + "learning_rate": 5.402511863118798e-07, + "loss": 0.4978, + "step": 6545 + }, + { + "epoch": 1.815307820299501, + "grad_norm": 0.23183675110340118, + "learning_rate": 5.386465882029443e-07, + "loss": 0.4948, + "step": 6546 + }, + { + "epoch": 1.8155851358846369, + "grad_norm": 0.2099410742521286, + "learning_rate": 5.370443240709691e-07, + "loss": 0.5037, + "step": 6547 + }, + { + "epoch": 1.8158624514697728, + "grad_norm": 0.2161746770143509, + "learning_rate": 5.354443942285986e-07, + "loss": 0.5388, + "step": 6548 + }, + { + "epoch": 1.8161397670549086, + "grad_norm": 0.2113848328590393, + "learning_rate": 5.338467989880233e-07, + "loss": 0.5012, + "step": 6549 + }, + { + "epoch": 1.8164170826400445, + "grad_norm": 0.20695488154888153, + "learning_rate": 5.322515386609731e-07, + "loss": 0.5092, + "step": 6550 + }, + { + "epoch": 1.8166943982251804, + "grad_norm": 0.22753605246543884, + "learning_rate": 5.30658613558728e-07, + "loss": 0.4784, + "step": 6551 + }, + { + "epoch": 1.8169717138103163, + "grad_norm": 0.20156748592853546, + "learning_rate": 5.290680239921089e-07, + "loss": 0.4905, + "step": 6552 + }, + { + "epoch": 1.8172490293954522, + "grad_norm": 0.20030784606933594, + "learning_rate": 5.274797702714829e-07, + "loss": 0.4789, + "step": 6553 + }, + { + "epoch": 1.817526344980588, + "grad_norm": 0.2246222347021103, + "learning_rate": 5.258938527067575e-07, + "loss": 0.5038, + "step": 6554 + }, + { + "epoch": 1.817803660565724, + "grad_norm": 0.21242398023605347, + "learning_rate": 5.243102716073908e-07, + "loss": 0.4957, + "step": 6555 + }, + { + "epoch": 1.8180809761508598, + "grad_norm": 0.20436997711658478, + "learning_rate": 5.227290272823801e-07, + "loss": 0.4759, + "step": 6556 + }, + { + "epoch": 1.8183582917359957, + "grad_norm": 0.20299552381038666, + "learning_rate": 5.211501200402688e-07, + "loss": 0.5054, + "step": 6557 + }, + { + "epoch": 1.8186356073211316, + "grad_norm": 0.2033611238002777, + "learning_rate": 5.195735501891424e-07, + "loss": 0.4932, + "step": 6558 + }, + { + "epoch": 1.8189129229062675, + "grad_norm": 0.2847311794757843, + "learning_rate": 5.1799931803663e-07, + "loss": 0.4986, + "step": 6559 + }, + { + "epoch": 1.8191902384914034, + "grad_norm": 0.2158021777868271, + "learning_rate": 5.164274238899091e-07, + "loss": 0.4769, + "step": 6560 + }, + { + "epoch": 1.8194675540765393, + "grad_norm": 0.21182480454444885, + "learning_rate": 5.148578680556987e-07, + "loss": 0.4827, + "step": 6561 + }, + { + "epoch": 1.8197448696616751, + "grad_norm": 0.20851145684719086, + "learning_rate": 5.132906508402535e-07, + "loss": 0.512, + "step": 6562 + }, + { + "epoch": 1.820022185246811, + "grad_norm": 0.20874373614788055, + "learning_rate": 5.117257725493874e-07, + "loss": 0.4721, + "step": 6563 + }, + { + "epoch": 1.820299500831947, + "grad_norm": 0.203582301735878, + "learning_rate": 5.101632334884476e-07, + "loss": 0.467, + "step": 6564 + }, + { + "epoch": 1.8205768164170828, + "grad_norm": 0.21535137295722961, + "learning_rate": 5.086030339623237e-07, + "loss": 0.477, + "step": 6565 + }, + { + "epoch": 1.8208541320022187, + "grad_norm": 0.2023378312587738, + "learning_rate": 5.070451742754528e-07, + "loss": 0.4867, + "step": 6566 + }, + { + "epoch": 1.8211314475873546, + "grad_norm": 0.20345695316791534, + "learning_rate": 5.054896547318181e-07, + "loss": 0.4803, + "step": 6567 + }, + { + "epoch": 1.8214087631724905, + "grad_norm": 0.2040456086397171, + "learning_rate": 5.039364756349405e-07, + "loss": 0.5076, + "step": 6568 + }, + { + "epoch": 1.8216860787576263, + "grad_norm": 0.20643432438373566, + "learning_rate": 5.023856372878846e-07, + "loss": 0.51, + "step": 6569 + }, + { + "epoch": 1.8219633943427622, + "grad_norm": 0.1972525417804718, + "learning_rate": 5.008371399932613e-07, + "loss": 0.4995, + "step": 6570 + }, + { + "epoch": 1.822240709927898, + "grad_norm": 0.27110210061073303, + "learning_rate": 4.992909840532259e-07, + "loss": 0.505, + "step": 6571 + }, + { + "epoch": 1.822518025513034, + "grad_norm": 0.20463772118091583, + "learning_rate": 4.977471697694719e-07, + "loss": 0.5256, + "step": 6572 + }, + { + "epoch": 1.8227953410981699, + "grad_norm": 0.20936624705791473, + "learning_rate": 4.962056974432374e-07, + "loss": 0.4791, + "step": 6573 + }, + { + "epoch": 1.8230726566833058, + "grad_norm": 0.21320702135562897, + "learning_rate": 4.946665673753056e-07, + "loss": 0.5057, + "step": 6574 + }, + { + "epoch": 1.8233499722684416, + "grad_norm": 0.20866161584854126, + "learning_rate": 4.931297798660043e-07, + "loss": 0.4935, + "step": 6575 + }, + { + "epoch": 1.8236272878535775, + "grad_norm": 0.21233926713466644, + "learning_rate": 4.915953352151961e-07, + "loss": 0.5105, + "step": 6576 + }, + { + "epoch": 1.8239046034387134, + "grad_norm": 0.20736192166805267, + "learning_rate": 4.900632337222947e-07, + "loss": 0.5026, + "step": 6577 + }, + { + "epoch": 1.8241819190238493, + "grad_norm": 0.21084974706172943, + "learning_rate": 4.885334756862564e-07, + "loss": 0.4946, + "step": 6578 + }, + { + "epoch": 1.8244592346089852, + "grad_norm": 0.20611052215099335, + "learning_rate": 4.870060614055733e-07, + "loss": 0.5124, + "step": 6579 + }, + { + "epoch": 1.824736550194121, + "grad_norm": 0.19793611764907837, + "learning_rate": 4.85480991178286e-07, + "loss": 0.4918, + "step": 6580 + }, + { + "epoch": 1.825013865779257, + "grad_norm": 0.20998556911945343, + "learning_rate": 4.839582653019745e-07, + "loss": 0.4959, + "step": 6581 + }, + { + "epoch": 1.8252911813643928, + "grad_norm": 0.2145131677389145, + "learning_rate": 4.824378840737664e-07, + "loss": 0.4858, + "step": 6582 + }, + { + "epoch": 1.8255684969495287, + "grad_norm": 0.20194010436534882, + "learning_rate": 4.809198477903259e-07, + "loss": 0.4685, + "step": 6583 + }, + { + "epoch": 1.8258458125346646, + "grad_norm": 0.2031271904706955, + "learning_rate": 4.794041567478632e-07, + "loss": 0.4833, + "step": 6584 + }, + { + "epoch": 1.8261231281198005, + "grad_norm": 0.2092730849981308, + "learning_rate": 4.778908112421279e-07, + "loss": 0.4969, + "step": 6585 + }, + { + "epoch": 1.8264004437049364, + "grad_norm": 0.21413682401180267, + "learning_rate": 4.7637981156841563e-07, + "loss": 0.4964, + "step": 6586 + }, + { + "epoch": 1.8266777592900723, + "grad_norm": 0.2138873189687729, + "learning_rate": 4.7487115802156147e-07, + "loss": 0.484, + "step": 6587 + }, + { + "epoch": 1.8269550748752081, + "grad_norm": 0.23256392776966095, + "learning_rate": 4.733648508959465e-07, + "loss": 0.4868, + "step": 6588 + }, + { + "epoch": 1.827232390460344, + "grad_norm": 0.20839428901672363, + "learning_rate": 4.718608904854857e-07, + "loss": 0.4998, + "step": 6589 + }, + { + "epoch": 1.82750970604548, + "grad_norm": 0.21245619654655457, + "learning_rate": 4.703592770836457e-07, + "loss": 0.4976, + "step": 6590 + }, + { + "epoch": 1.8277870216306158, + "grad_norm": 0.21361759305000305, + "learning_rate": 4.6886001098343094e-07, + "loss": 0.5143, + "step": 6591 + }, + { + "epoch": 1.8280643372157517, + "grad_norm": 0.20406369864940643, + "learning_rate": 4.673630924773853e-07, + "loss": 0.4561, + "step": 6592 + }, + { + "epoch": 1.8283416528008876, + "grad_norm": 0.20464986562728882, + "learning_rate": 4.6586852185760144e-07, + "loss": 0.4884, + "step": 6593 + }, + { + "epoch": 1.8286189683860234, + "grad_norm": 0.21941900253295898, + "learning_rate": 4.643762994157058e-07, + "loss": 0.5124, + "step": 6594 + }, + { + "epoch": 1.8288962839711593, + "grad_norm": 0.21227768063545227, + "learning_rate": 4.628864254428725e-07, + "loss": 0.4913, + "step": 6595 + }, + { + "epoch": 1.8291735995562952, + "grad_norm": 0.20707818865776062, + "learning_rate": 4.613989002298133e-07, + "loss": 0.4637, + "step": 6596 + }, + { + "epoch": 1.829450915141431, + "grad_norm": 0.21716617047786713, + "learning_rate": 4.599137240667864e-07, + "loss": 0.4596, + "step": 6597 + }, + { + "epoch": 1.829728230726567, + "grad_norm": 0.21836552023887634, + "learning_rate": 4.5843089724358913e-07, + "loss": 0.4907, + "step": 6598 + }, + { + "epoch": 1.8300055463117029, + "grad_norm": 0.20570078492164612, + "learning_rate": 4.5695042004955943e-07, + "loss": 0.4854, + "step": 6599 + }, + { + "epoch": 1.8302828618968388, + "grad_norm": 0.21633854508399963, + "learning_rate": 4.554722927735747e-07, + "loss": 0.517, + "step": 6600 + }, + { + "epoch": 1.8305601774819746, + "grad_norm": 0.20889052748680115, + "learning_rate": 4.53996515704061e-07, + "loss": 0.5119, + "step": 6601 + }, + { + "epoch": 1.8308374930671105, + "grad_norm": 0.21190080046653748, + "learning_rate": 4.5252308912897973e-07, + "loss": 0.492, + "step": 6602 + }, + { + "epoch": 1.8311148086522464, + "grad_norm": 0.19486699998378754, + "learning_rate": 4.5105201333583565e-07, + "loss": 0.4603, + "step": 6603 + }, + { + "epoch": 1.8313921242373823, + "grad_norm": 0.20805886387825012, + "learning_rate": 4.495832886116741e-07, + "loss": 0.5107, + "step": 6604 + }, + { + "epoch": 1.8316694398225182, + "grad_norm": 0.2246580719947815, + "learning_rate": 4.481169152430839e-07, + "loss": 0.5245, + "step": 6605 + }, + { + "epoch": 1.831946755407654, + "grad_norm": 0.2717232406139374, + "learning_rate": 4.466528935161918e-07, + "loss": 0.5234, + "step": 6606 + }, + { + "epoch": 1.83222407099279, + "grad_norm": 0.2078229784965515, + "learning_rate": 4.451912237166664e-07, + "loss": 0.4893, + "step": 6607 + }, + { + "epoch": 1.8325013865779258, + "grad_norm": 0.20676632225513458, + "learning_rate": 4.4373190612971986e-07, + "loss": 0.5072, + "step": 6608 + }, + { + "epoch": 1.8327787021630617, + "grad_norm": 0.21076683700084686, + "learning_rate": 4.4227494104010503e-07, + "loss": 0.5058, + "step": 6609 + }, + { + "epoch": 1.8330560177481976, + "grad_norm": 0.20716801285743713, + "learning_rate": 4.408203287321111e-07, + "loss": 0.4752, + "step": 6610 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.20303776860237122, + "learning_rate": 4.3936806948957354e-07, + "loss": 0.4991, + "step": 6611 + }, + { + "epoch": 1.8336106489184694, + "grad_norm": 0.2012963891029358, + "learning_rate": 4.379181635958643e-07, + "loss": 0.486, + "step": 6612 + }, + { + "epoch": 1.8338879645036053, + "grad_norm": 0.20308934152126312, + "learning_rate": 4.3647061133390286e-07, + "loss": 0.506, + "step": 6613 + }, + { + "epoch": 1.8341652800887411, + "grad_norm": 0.20220200717449188, + "learning_rate": 4.3502541298613977e-07, + "loss": 0.4809, + "step": 6614 + }, + { + "epoch": 1.834442595673877, + "grad_norm": 0.20746108889579773, + "learning_rate": 4.335825688345743e-07, + "loss": 0.4813, + "step": 6615 + }, + { + "epoch": 1.834719911259013, + "grad_norm": 0.22002127766609192, + "learning_rate": 4.321420791607453e-07, + "loss": 0.5233, + "step": 6616 + }, + { + "epoch": 1.8349972268441488, + "grad_norm": 0.20879393815994263, + "learning_rate": 4.307039442457278e-07, + "loss": 0.4956, + "step": 6617 + }, + { + "epoch": 1.8352745424292847, + "grad_norm": 0.21325801312923431, + "learning_rate": 4.2926816437014047e-07, + "loss": 0.4913, + "step": 6618 + }, + { + "epoch": 1.8355518580144206, + "grad_norm": 0.19979670643806458, + "learning_rate": 4.278347398141411e-07, + "loss": 0.5124, + "step": 6619 + }, + { + "epoch": 1.8358291735995564, + "grad_norm": 0.21641233563423157, + "learning_rate": 4.264036708574323e-07, + "loss": 0.505, + "step": 6620 + }, + { + "epoch": 1.8361064891846923, + "grad_norm": 0.2334747463464737, + "learning_rate": 4.249749577792492e-07, + "loss": 0.5114, + "step": 6621 + }, + { + "epoch": 1.8363838047698282, + "grad_norm": 0.19688747823238373, + "learning_rate": 4.2354860085837414e-07, + "loss": 0.5072, + "step": 6622 + }, + { + "epoch": 1.836661120354964, + "grad_norm": 0.21270081400871277, + "learning_rate": 4.2212460037312636e-07, + "loss": 0.5024, + "step": 6623 + }, + { + "epoch": 1.8369384359401, + "grad_norm": 0.21791474521160126, + "learning_rate": 4.2070295660136817e-07, + "loss": 0.5123, + "step": 6624 + }, + { + "epoch": 1.8372157515252359, + "grad_norm": 0.1985998898744583, + "learning_rate": 4.192836698204958e-07, + "loss": 0.4741, + "step": 6625 + }, + { + "epoch": 1.8374930671103717, + "grad_norm": 0.2119298279285431, + "learning_rate": 4.1786674030745295e-07, + "loss": 0.4941, + "step": 6626 + }, + { + "epoch": 1.8377703826955076, + "grad_norm": 0.20587483048439026, + "learning_rate": 4.164521683387185e-07, + "loss": 0.5002, + "step": 6627 + }, + { + "epoch": 1.8380476982806435, + "grad_norm": 0.19921058416366577, + "learning_rate": 4.1503995419031325e-07, + "loss": 0.4908, + "step": 6628 + }, + { + "epoch": 1.8383250138657794, + "grad_norm": 0.2109578251838684, + "learning_rate": 4.1363009813780026e-07, + "loss": 0.4987, + "step": 6629 + }, + { + "epoch": 1.8386023294509153, + "grad_norm": 0.21179182827472687, + "learning_rate": 4.122226004562746e-07, + "loss": 0.5024, + "step": 6630 + }, + { + "epoch": 1.8388796450360512, + "grad_norm": 0.20858225226402283, + "learning_rate": 4.108174614203819e-07, + "loss": 0.517, + "step": 6631 + }, + { + "epoch": 1.839156960621187, + "grad_norm": 0.2038223147392273, + "learning_rate": 4.094146813042973e-07, + "loss": 0.448, + "step": 6632 + }, + { + "epoch": 1.839434276206323, + "grad_norm": 0.21669189631938934, + "learning_rate": 4.0801426038174357e-07, + "loss": 0.4757, + "step": 6633 + }, + { + "epoch": 1.8397115917914588, + "grad_norm": 0.204112246632576, + "learning_rate": 4.06616198925977e-07, + "loss": 0.4729, + "step": 6634 + }, + { + "epoch": 1.8399889073765947, + "grad_norm": 0.22220736742019653, + "learning_rate": 4.052204972097989e-07, + "loss": 0.5062, + "step": 6635 + }, + { + "epoch": 1.8402662229617306, + "grad_norm": 0.2096226066350937, + "learning_rate": 4.0382715550554837e-07, + "loss": 0.4841, + "step": 6636 + }, + { + "epoch": 1.8405435385468665, + "grad_norm": 0.20454815030097961, + "learning_rate": 4.024361740851024e-07, + "loss": 0.4883, + "step": 6637 + }, + { + "epoch": 1.8408208541320024, + "grad_norm": 0.20967929065227509, + "learning_rate": 4.010475532198757e-07, + "loss": 0.5155, + "step": 6638 + }, + { + "epoch": 1.8410981697171382, + "grad_norm": 0.22011229395866394, + "learning_rate": 3.996612931808266e-07, + "loss": 0.5232, + "step": 6639 + }, + { + "epoch": 1.8413754853022741, + "grad_norm": 0.22266606986522675, + "learning_rate": 3.9827739423845265e-07, + "loss": 0.5071, + "step": 6640 + }, + { + "epoch": 1.84165280088741, + "grad_norm": 0.20759738981723785, + "learning_rate": 3.9689585666278784e-07, + "loss": 0.4825, + "step": 6641 + }, + { + "epoch": 1.841930116472546, + "grad_norm": 0.2150697559118271, + "learning_rate": 3.9551668072340675e-07, + "loss": 0.511, + "step": 6642 + }, + { + "epoch": 1.8422074320576818, + "grad_norm": 0.21445181965827942, + "learning_rate": 3.9413986668942473e-07, + "loss": 0.4903, + "step": 6643 + }, + { + "epoch": 1.8424847476428177, + "grad_norm": 0.2114182561635971, + "learning_rate": 3.9276541482949347e-07, + "loss": 0.51, + "step": 6644 + }, + { + "epoch": 1.8427620632279536, + "grad_norm": 0.2046855241060257, + "learning_rate": 3.913933254118041e-07, + "loss": 0.5055, + "step": 6645 + }, + { + "epoch": 1.8430393788130894, + "grad_norm": 0.20602397620677948, + "learning_rate": 3.9002359870408817e-07, + "loss": 0.5024, + "step": 6646 + }, + { + "epoch": 1.8433166943982253, + "grad_norm": 0.21433816850185394, + "learning_rate": 3.886562349736167e-07, + "loss": 0.4955, + "step": 6647 + }, + { + "epoch": 1.8435940099833612, + "grad_norm": 0.21116018295288086, + "learning_rate": 3.872912344871985e-07, + "loss": 0.4696, + "step": 6648 + }, + { + "epoch": 1.843871325568497, + "grad_norm": 0.20629142224788666, + "learning_rate": 3.8592859751117873e-07, + "loss": 0.4636, + "step": 6649 + }, + { + "epoch": 1.844148641153633, + "grad_norm": 0.2071436494588852, + "learning_rate": 3.845683243114462e-07, + "loss": 0.5121, + "step": 6650 + }, + { + "epoch": 1.8444259567387689, + "grad_norm": 0.20660892128944397, + "learning_rate": 3.8321041515342744e-07, + "loss": 0.5133, + "step": 6651 + }, + { + "epoch": 1.8447032723239047, + "grad_norm": 0.19560420513153076, + "learning_rate": 3.818548703020841e-07, + "loss": 0.5027, + "step": 6652 + }, + { + "epoch": 1.8449805879090406, + "grad_norm": 0.20409871637821198, + "learning_rate": 3.805016900219172e-07, + "loss": 0.507, + "step": 6653 + }, + { + "epoch": 1.8452579034941765, + "grad_norm": 0.21331243216991425, + "learning_rate": 3.791508745769737e-07, + "loss": 0.5273, + "step": 6654 + }, + { + "epoch": 1.8455352190793124, + "grad_norm": 0.2074180394411087, + "learning_rate": 3.7780242423083036e-07, + "loss": 0.4921, + "step": 6655 + }, + { + "epoch": 1.8458125346644483, + "grad_norm": 0.22154366970062256, + "learning_rate": 3.7645633924660446e-07, + "loss": 0.5068, + "step": 6656 + }, + { + "epoch": 1.8460898502495842, + "grad_norm": 0.20785929262638092, + "learning_rate": 3.7511261988695393e-07, + "loss": 0.4947, + "step": 6657 + }, + { + "epoch": 1.84636716583472, + "grad_norm": 0.22468653321266174, + "learning_rate": 3.737712664140747e-07, + "loss": 0.5096, + "step": 6658 + }, + { + "epoch": 1.846644481419856, + "grad_norm": 0.2090657502412796, + "learning_rate": 3.724322790897003e-07, + "loss": 0.5105, + "step": 6659 + }, + { + "epoch": 1.8469217970049918, + "grad_norm": 0.20280075073242188, + "learning_rate": 3.71095658175101e-07, + "loss": 0.4872, + "step": 6660 + }, + { + "epoch": 1.8471991125901277, + "grad_norm": 0.2233634740114212, + "learning_rate": 3.697614039310876e-07, + "loss": 0.506, + "step": 6661 + }, + { + "epoch": 1.8474764281752636, + "grad_norm": 0.2086162120103836, + "learning_rate": 3.684295166180102e-07, + "loss": 0.5089, + "step": 6662 + }, + { + "epoch": 1.8477537437603995, + "grad_norm": 0.20018382370471954, + "learning_rate": 3.6709999649575386e-07, + "loss": 0.5051, + "step": 6663 + }, + { + "epoch": 1.8480310593455354, + "grad_norm": 0.21848781406879425, + "learning_rate": 3.6577284382374316e-07, + "loss": 0.497, + "step": 6664 + }, + { + "epoch": 1.8483083749306712, + "grad_norm": 0.24338483810424805, + "learning_rate": 3.644480588609403e-07, + "loss": 0.5335, + "step": 6665 + }, + { + "epoch": 1.8485856905158071, + "grad_norm": 0.21215735375881195, + "learning_rate": 3.6312564186584826e-07, + "loss": 0.5265, + "step": 6666 + }, + { + "epoch": 1.848863006100943, + "grad_norm": 0.22368744015693665, + "learning_rate": 3.6180559309650086e-07, + "loss": 0.5033, + "step": 6667 + }, + { + "epoch": 1.849140321686079, + "grad_norm": 0.20563702285289764, + "learning_rate": 3.6048791281047963e-07, + "loss": 0.4938, + "step": 6668 + }, + { + "epoch": 1.8494176372712148, + "grad_norm": 0.21737602353096008, + "learning_rate": 3.5917260126489687e-07, + "loss": 0.5033, + "step": 6669 + }, + { + "epoch": 1.8496949528563507, + "grad_norm": 0.32653746008872986, + "learning_rate": 3.578596587164043e-07, + "loss": 0.517, + "step": 6670 + }, + { + "epoch": 1.8499722684414865, + "grad_norm": 0.45970603823661804, + "learning_rate": 3.565490854211928e-07, + "loss": 0.5046, + "step": 6671 + }, + { + "epoch": 1.8502495840266224, + "grad_norm": 0.20332348346710205, + "learning_rate": 3.552408816349884e-07, + "loss": 0.4842, + "step": 6672 + }, + { + "epoch": 1.8505268996117583, + "grad_norm": 0.20179514586925507, + "learning_rate": 3.5393504761305903e-07, + "loss": 0.4844, + "step": 6673 + }, + { + "epoch": 1.8508042151968942, + "grad_norm": 0.20190241932868958, + "learning_rate": 3.5263158361020373e-07, + "loss": 0.4805, + "step": 6674 + }, + { + "epoch": 1.85108153078203, + "grad_norm": 0.2159728854894638, + "learning_rate": 3.513304898807676e-07, + "loss": 0.5239, + "step": 6675 + }, + { + "epoch": 1.851358846367166, + "grad_norm": 0.20452584326267242, + "learning_rate": 3.5003176667862265e-07, + "loss": 0.4694, + "step": 6676 + }, + { + "epoch": 1.8516361619523019, + "grad_norm": 0.19299055635929108, + "learning_rate": 3.487354142571883e-07, + "loss": 0.5076, + "step": 6677 + }, + { + "epoch": 1.8519134775374377, + "grad_norm": 0.20924656093120575, + "learning_rate": 3.474414328694178e-07, + "loss": 0.502, + "step": 6678 + }, + { + "epoch": 1.8521907931225736, + "grad_norm": 1.066011905670166, + "learning_rate": 3.4614982276779953e-07, + "loss": 0.4964, + "step": 6679 + }, + { + "epoch": 1.8524681087077095, + "grad_norm": 0.21032501757144928, + "learning_rate": 3.448605842043581e-07, + "loss": 0.4718, + "step": 6680 + }, + { + "epoch": 1.8527454242928454, + "grad_norm": 0.216018944978714, + "learning_rate": 3.435737174306633e-07, + "loss": 0.5178, + "step": 6681 + }, + { + "epoch": 1.8530227398779813, + "grad_norm": 0.2162523716688156, + "learning_rate": 3.4228922269781556e-07, + "loss": 0.4962, + "step": 6682 + }, + { + "epoch": 1.8533000554631172, + "grad_norm": 0.20311126112937927, + "learning_rate": 3.410071002564519e-07, + "loss": 0.5093, + "step": 6683 + }, + { + "epoch": 1.853577371048253, + "grad_norm": 0.21295960247516632, + "learning_rate": 3.397273503567486e-07, + "loss": 0.4753, + "step": 6684 + }, + { + "epoch": 1.853854686633389, + "grad_norm": 0.21265694499015808, + "learning_rate": 3.3844997324842113e-07, + "loss": 0.4974, + "step": 6685 + }, + { + "epoch": 1.8541320022185248, + "grad_norm": 0.21514546871185303, + "learning_rate": 3.3717496918071746e-07, + "loss": 0.4879, + "step": 6686 + }, + { + "epoch": 1.8544093178036607, + "grad_norm": 0.23401249945163727, + "learning_rate": 3.3590233840242455e-07, + "loss": 0.5049, + "step": 6687 + }, + { + "epoch": 1.8546866333887966, + "grad_norm": 0.20478412508964539, + "learning_rate": 3.346320811618675e-07, + "loss": 0.5053, + "step": 6688 + }, + { + "epoch": 1.8549639489739325, + "grad_norm": 0.21184539794921875, + "learning_rate": 3.333641977069077e-07, + "loss": 0.528, + "step": 6689 + }, + { + "epoch": 1.8552412645590683, + "grad_norm": 0.1997302621603012, + "learning_rate": 3.320986882849417e-07, + "loss": 0.4981, + "step": 6690 + }, + { + "epoch": 1.8555185801442042, + "grad_norm": 0.21800538897514343, + "learning_rate": 3.308355531429011e-07, + "loss": 0.5069, + "step": 6691 + }, + { + "epoch": 1.8557958957293401, + "grad_norm": 0.2156432867050171, + "learning_rate": 3.295747925272638e-07, + "loss": 0.5039, + "step": 6692 + }, + { + "epoch": 1.856073211314476, + "grad_norm": 0.2117234319448471, + "learning_rate": 3.28316406684033e-07, + "loss": 0.4791, + "step": 6693 + }, + { + "epoch": 1.8563505268996119, + "grad_norm": 0.48360639810562134, + "learning_rate": 3.2706039585875257e-07, + "loss": 0.4949, + "step": 6694 + }, + { + "epoch": 1.8566278424847478, + "grad_norm": 0.20971214771270752, + "learning_rate": 3.2580676029650566e-07, + "loss": 0.5, + "step": 6695 + }, + { + "epoch": 1.8569051580698837, + "grad_norm": 0.20673950016498566, + "learning_rate": 3.24555500241909e-07, + "loss": 0.5199, + "step": 6696 + }, + { + "epoch": 1.8571824736550195, + "grad_norm": 0.20716223120689392, + "learning_rate": 3.233066159391174e-07, + "loss": 0.5073, + "step": 6697 + }, + { + "epoch": 1.8574597892401554, + "grad_norm": 0.2109098881483078, + "learning_rate": 3.2206010763181916e-07, + "loss": 0.5055, + "step": 6698 + }, + { + "epoch": 1.8577371048252913, + "grad_norm": 0.20620295405387878, + "learning_rate": 3.20815975563242e-07, + "loss": 0.5266, + "step": 6699 + }, + { + "epoch": 1.8580144204104272, + "grad_norm": 0.22776873409748077, + "learning_rate": 3.1957421997615004e-07, + "loss": 0.5076, + "step": 6700 + }, + { + "epoch": 1.858291735995563, + "grad_norm": 0.2032913714647293, + "learning_rate": 3.1833484111284115e-07, + "loss": 0.5065, + "step": 6701 + }, + { + "epoch": 1.858569051580699, + "grad_norm": 0.2074134647846222, + "learning_rate": 3.1709783921515383e-07, + "loss": 0.5079, + "step": 6702 + }, + { + "epoch": 1.8588463671658348, + "grad_norm": 0.20687542855739594, + "learning_rate": 3.158632145244561e-07, + "loss": 0.5004, + "step": 6703 + }, + { + "epoch": 1.8591236827509707, + "grad_norm": 0.21574708819389343, + "learning_rate": 3.1463096728165944e-07, + "loss": 0.5162, + "step": 6704 + }, + { + "epoch": 1.8594009983361066, + "grad_norm": 0.2129552662372589, + "learning_rate": 3.134010977272048e-07, + "loss": 0.505, + "step": 6705 + }, + { + "epoch": 1.8596783139212425, + "grad_norm": 0.2198726236820221, + "learning_rate": 3.121736061010738e-07, + "loss": 0.5004, + "step": 6706 + }, + { + "epoch": 1.8599556295063784, + "grad_norm": 0.21903812885284424, + "learning_rate": 3.109484926427847e-07, + "loss": 0.4909, + "step": 6707 + }, + { + "epoch": 1.8602329450915143, + "grad_norm": 0.2174917459487915, + "learning_rate": 3.0972575759138503e-07, + "loss": 0.4957, + "step": 6708 + }, + { + "epoch": 1.8605102606766502, + "grad_norm": 0.2153944969177246, + "learning_rate": 3.085054011854674e-07, + "loss": 0.4964, + "step": 6709 + }, + { + "epoch": 1.860787576261786, + "grad_norm": 0.22760117053985596, + "learning_rate": 3.0728742366315233e-07, + "loss": 0.4961, + "step": 6710 + }, + { + "epoch": 1.861064891846922, + "grad_norm": 0.20543061196804047, + "learning_rate": 3.0607182526210115e-07, + "loss": 0.4923, + "step": 6711 + }, + { + "epoch": 1.8613422074320578, + "grad_norm": 0.2019956409931183, + "learning_rate": 3.0485860621950887e-07, + "loss": 0.5033, + "step": 6712 + }, + { + "epoch": 1.8616195230171937, + "grad_norm": 0.2030191272497177, + "learning_rate": 3.036477667721069e-07, + "loss": 0.4962, + "step": 6713 + }, + { + "epoch": 1.8618968386023296, + "grad_norm": 0.20309413969516754, + "learning_rate": 3.024393071561604e-07, + "loss": 0.5137, + "step": 6714 + }, + { + "epoch": 1.8621741541874655, + "grad_norm": 0.20694345235824585, + "learning_rate": 3.0123322760747394e-07, + "loss": 0.4906, + "step": 6715 + }, + { + "epoch": 1.8624514697726013, + "grad_norm": 0.21089065074920654, + "learning_rate": 3.000295283613869e-07, + "loss": 0.4833, + "step": 6716 + }, + { + "epoch": 1.8627287853577372, + "grad_norm": 0.20231840014457703, + "learning_rate": 2.9882820965276975e-07, + "loss": 0.5002, + "step": 6717 + }, + { + "epoch": 1.8630061009428731, + "grad_norm": 0.20089836418628693, + "learning_rate": 2.9762927171603226e-07, + "loss": 0.4721, + "step": 6718 + }, + { + "epoch": 1.863283416528009, + "grad_norm": 0.25077909231185913, + "learning_rate": 2.9643271478511925e-07, + "loss": 0.5162, + "step": 6719 + }, + { + "epoch": 1.8635607321131449, + "grad_norm": 0.22070035338401794, + "learning_rate": 2.952385390935133e-07, + "loss": 0.482, + "step": 6720 + }, + { + "epoch": 1.8638380476982808, + "grad_norm": 0.21285070478916168, + "learning_rate": 2.94046744874224e-07, + "loss": 0.5105, + "step": 6721 + }, + { + "epoch": 1.8641153632834166, + "grad_norm": 0.20452368259429932, + "learning_rate": 2.928573323598069e-07, + "loss": 0.4942, + "step": 6722 + }, + { + "epoch": 1.8643926788685525, + "grad_norm": 0.21610909700393677, + "learning_rate": 2.9167030178234853e-07, + "loss": 0.5066, + "step": 6723 + }, + { + "epoch": 1.8646699944536884, + "grad_norm": 0.2100083827972412, + "learning_rate": 2.904856533734665e-07, + "loss": 0.486, + "step": 6724 + }, + { + "epoch": 1.8649473100388243, + "grad_norm": 0.211506187915802, + "learning_rate": 2.893033873643175e-07, + "loss": 0.4986, + "step": 6725 + }, + { + "epoch": 1.8652246256239602, + "grad_norm": 0.2029467523097992, + "learning_rate": 2.881235039855934e-07, + "loss": 0.4879, + "step": 6726 + }, + { + "epoch": 1.865501941209096, + "grad_norm": 0.21474742889404297, + "learning_rate": 2.8694600346752255e-07, + "loss": 0.5127, + "step": 6727 + }, + { + "epoch": 1.865779256794232, + "grad_norm": 0.20987072587013245, + "learning_rate": 2.857708860398656e-07, + "loss": 0.5121, + "step": 6728 + }, + { + "epoch": 1.8660565723793678, + "grad_norm": 0.2003088742494583, + "learning_rate": 2.845981519319169e-07, + "loss": 0.4814, + "step": 6729 + }, + { + "epoch": 1.8663338879645037, + "grad_norm": 0.20292720198631287, + "learning_rate": 2.834278013725114e-07, + "loss": 0.4883, + "step": 6730 + }, + { + "epoch": 1.8666112035496396, + "grad_norm": 0.2072405070066452, + "learning_rate": 2.8225983459001374e-07, + "loss": 0.5012, + "step": 6731 + }, + { + "epoch": 1.8668885191347755, + "grad_norm": 0.22093196213245392, + "learning_rate": 2.81094251812325e-07, + "loss": 0.5124, + "step": 6732 + }, + { + "epoch": 1.8671658347199114, + "grad_norm": 0.21116647124290466, + "learning_rate": 2.7993105326687975e-07, + "loss": 0.4803, + "step": 6733 + }, + { + "epoch": 1.8674431503050473, + "grad_norm": 0.20909181237220764, + "learning_rate": 2.7877023918065346e-07, + "loss": 0.5075, + "step": 6734 + }, + { + "epoch": 1.8677204658901831, + "grad_norm": 0.2065388560295105, + "learning_rate": 2.776118097801483e-07, + "loss": 0.4884, + "step": 6735 + }, + { + "epoch": 1.867997781475319, + "grad_norm": 0.2007947862148285, + "learning_rate": 2.764557652914029e-07, + "loss": 0.4811, + "step": 6736 + }, + { + "epoch": 1.868275097060455, + "grad_norm": 0.21440117061138153, + "learning_rate": 2.753021059399952e-07, + "loss": 0.4785, + "step": 6737 + }, + { + "epoch": 1.8685524126455908, + "grad_norm": 0.2125922590494156, + "learning_rate": 2.7415083195103525e-07, + "loss": 0.473, + "step": 6738 + }, + { + "epoch": 1.8688297282307267, + "grad_norm": 0.21997934579849243, + "learning_rate": 2.730019435491657e-07, + "loss": 0.4944, + "step": 6739 + }, + { + "epoch": 1.8691070438158626, + "grad_norm": 0.19915248453617096, + "learning_rate": 2.7185544095856413e-07, + "loss": 0.4901, + "step": 6740 + }, + { + "epoch": 1.8693843594009985, + "grad_norm": 0.21164058148860931, + "learning_rate": 2.7071132440294464e-07, + "loss": 0.4929, + "step": 6741 + }, + { + "epoch": 1.8696616749861343, + "grad_norm": 0.1981211006641388, + "learning_rate": 2.695695941055551e-07, + "loss": 0.4817, + "step": 6742 + }, + { + "epoch": 1.8699389905712702, + "grad_norm": 0.21673041582107544, + "learning_rate": 2.68430250289177e-07, + "loss": 0.5048, + "step": 6743 + }, + { + "epoch": 1.870216306156406, + "grad_norm": 0.19340240955352783, + "learning_rate": 2.6729329317612545e-07, + "loss": 0.5013, + "step": 6744 + }, + { + "epoch": 1.870493621741542, + "grad_norm": 0.20828621089458466, + "learning_rate": 2.661587229882537e-07, + "loss": 0.4879, + "step": 6745 + }, + { + "epoch": 1.8707709373266779, + "grad_norm": 0.2040943205356598, + "learning_rate": 2.650265399469429e-07, + "loss": 0.5113, + "step": 6746 + }, + { + "epoch": 1.8710482529118138, + "grad_norm": 0.2092759907245636, + "learning_rate": 2.6389674427311494e-07, + "loss": 0.5067, + "step": 6747 + }, + { + "epoch": 1.8713255684969496, + "grad_norm": 0.20813149213790894, + "learning_rate": 2.6276933618721995e-07, + "loss": 0.5074, + "step": 6748 + }, + { + "epoch": 1.8716028840820855, + "grad_norm": 0.2012152075767517, + "learning_rate": 2.6164431590924856e-07, + "loss": 0.5058, + "step": 6749 + }, + { + "epoch": 1.8718801996672214, + "grad_norm": 0.2121763676404953, + "learning_rate": 2.605216836587182e-07, + "loss": 0.508, + "step": 6750 + }, + { + "epoch": 1.8721575152523573, + "grad_norm": 0.20884345471858978, + "learning_rate": 2.5940143965468843e-07, + "loss": 0.5022, + "step": 6751 + }, + { + "epoch": 1.8724348308374932, + "grad_norm": 0.21974632143974304, + "learning_rate": 2.582835841157441e-07, + "loss": 0.4794, + "step": 6752 + }, + { + "epoch": 1.872712146422629, + "grad_norm": 0.22024495899677277, + "learning_rate": 2.5716811726001213e-07, + "loss": 0.5149, + "step": 6753 + }, + { + "epoch": 1.872989462007765, + "grad_norm": 0.20395393669605255, + "learning_rate": 2.560550393051475e-07, + "loss": 0.4796, + "step": 6754 + }, + { + "epoch": 1.8732667775929008, + "grad_norm": 0.2120208889245987, + "learning_rate": 2.5494435046834324e-07, + "loss": 0.5186, + "step": 6755 + }, + { + "epoch": 1.8735440931780367, + "grad_norm": 0.20500154793262482, + "learning_rate": 2.538360509663218e-07, + "loss": 0.4926, + "step": 6756 + }, + { + "epoch": 1.8738214087631726, + "grad_norm": 0.20315538346767426, + "learning_rate": 2.527301410153421e-07, + "loss": 0.4664, + "step": 6757 + }, + { + "epoch": 1.8740987243483085, + "grad_norm": 0.19243794679641724, + "learning_rate": 2.516266208311982e-07, + "loss": 0.4876, + "step": 6758 + }, + { + "epoch": 1.8743760399334444, + "grad_norm": 0.2194959968328476, + "learning_rate": 2.505254906292151e-07, + "loss": 0.5032, + "step": 6759 + }, + { + "epoch": 1.8746533555185803, + "grad_norm": 0.21170787513256073, + "learning_rate": 2.4942675062425147e-07, + "loss": 0.4849, + "step": 6760 + }, + { + "epoch": 1.8749306711037161, + "grad_norm": 0.2141827791929245, + "learning_rate": 2.483304010307025e-07, + "loss": 0.5182, + "step": 6761 + }, + { + "epoch": 1.875207986688852, + "grad_norm": 0.21987125277519226, + "learning_rate": 2.4723644206249424e-07, + "loss": 0.4889, + "step": 6762 + }, + { + "epoch": 1.875485302273988, + "grad_norm": 0.2075456976890564, + "learning_rate": 2.4614487393308657e-07, + "loss": 0.5282, + "step": 6763 + }, + { + "epoch": 1.8757626178591238, + "grad_norm": 0.20293202996253967, + "learning_rate": 2.450556968554743e-07, + "loss": 0.4726, + "step": 6764 + }, + { + "epoch": 1.8760399334442597, + "grad_norm": 0.2271379679441452, + "learning_rate": 2.4396891104218335e-07, + "loss": 0.5321, + "step": 6765 + }, + { + "epoch": 1.8763172490293956, + "grad_norm": 0.206784188747406, + "learning_rate": 2.4288451670527736e-07, + "loss": 0.5207, + "step": 6766 + }, + { + "epoch": 1.8765945646145314, + "grad_norm": 0.21522942185401917, + "learning_rate": 2.41802514056344e-07, + "loss": 0.5015, + "step": 6767 + }, + { + "epoch": 1.8768718801996673, + "grad_norm": 0.2123783975839615, + "learning_rate": 2.4072290330651867e-07, + "loss": 0.488, + "step": 6768 + }, + { + "epoch": 1.8771491957848032, + "grad_norm": 0.2079179435968399, + "learning_rate": 2.396456846664577e-07, + "loss": 0.4738, + "step": 6769 + }, + { + "epoch": 1.877426511369939, + "grad_norm": 0.21037521958351135, + "learning_rate": 2.3857085834635557e-07, + "loss": 0.5059, + "step": 6770 + }, + { + "epoch": 1.877703826955075, + "grad_norm": 0.21396882832050323, + "learning_rate": 2.374984245559375e-07, + "loss": 0.5156, + "step": 6771 + }, + { + "epoch": 1.8779811425402109, + "grad_norm": 0.21056626737117767, + "learning_rate": 2.3642838350446812e-07, + "loss": 0.5154, + "step": 6772 + }, + { + "epoch": 1.8782584581253468, + "grad_norm": 0.2141093611717224, + "learning_rate": 2.3536073540073727e-07, + "loss": 0.5053, + "step": 6773 + }, + { + "epoch": 1.8785357737104826, + "grad_norm": 0.2114919126033783, + "learning_rate": 2.342954804530728e-07, + "loss": 0.5151, + "step": 6774 + }, + { + "epoch": 1.8788130892956185, + "grad_norm": 0.2190243899822235, + "learning_rate": 2.3323261886933344e-07, + "loss": 0.5099, + "step": 6775 + }, + { + "epoch": 1.8790904048807544, + "grad_norm": 0.22468531131744385, + "learning_rate": 2.3217215085691164e-07, + "loss": 0.5139, + "step": 6776 + }, + { + "epoch": 1.8793677204658903, + "grad_norm": 0.20503219962120056, + "learning_rate": 2.311140766227349e-07, + "loss": 0.5068, + "step": 6777 + }, + { + "epoch": 1.8796450360510262, + "grad_norm": 0.22132523357868195, + "learning_rate": 2.300583963732575e-07, + "loss": 0.5136, + "step": 6778 + }, + { + "epoch": 1.879922351636162, + "grad_norm": 0.21017663180828094, + "learning_rate": 2.2900511031447303e-07, + "loss": 0.4947, + "step": 6779 + }, + { + "epoch": 1.880199667221298, + "grad_norm": 0.20506809651851654, + "learning_rate": 2.2795421865190737e-07, + "loss": 0.5036, + "step": 6780 + }, + { + "epoch": 1.8804769828064338, + "grad_norm": 0.2145530879497528, + "learning_rate": 2.2690572159061453e-07, + "loss": 0.5016, + "step": 6781 + }, + { + "epoch": 1.8807542983915697, + "grad_norm": 0.20361502468585968, + "learning_rate": 2.2585961933518362e-07, + "loss": 0.4867, + "step": 6782 + }, + { + "epoch": 1.8810316139767056, + "grad_norm": 0.2013719379901886, + "learning_rate": 2.248159120897403e-07, + "loss": 0.4996, + "step": 6783 + }, + { + "epoch": 1.8813089295618415, + "grad_norm": 0.21445105969905853, + "learning_rate": 2.2377460005793688e-07, + "loss": 0.4938, + "step": 6784 + }, + { + "epoch": 1.8815862451469774, + "grad_norm": 0.2540777027606964, + "learning_rate": 2.2273568344295948e-07, + "loss": 0.4668, + "step": 6785 + }, + { + "epoch": 1.8818635607321132, + "grad_norm": 0.21935346722602844, + "learning_rate": 2.2169916244753204e-07, + "loss": 0.4713, + "step": 6786 + }, + { + "epoch": 1.8821408763172491, + "grad_norm": 0.20928525924682617, + "learning_rate": 2.2066503727390526e-07, + "loss": 0.492, + "step": 6787 + }, + { + "epoch": 1.882418191902385, + "grad_norm": 0.21874678134918213, + "learning_rate": 2.1963330812386356e-07, + "loss": 0.4754, + "step": 6788 + }, + { + "epoch": 1.882695507487521, + "grad_norm": 0.22388552129268646, + "learning_rate": 2.186039751987265e-07, + "loss": 0.5055, + "step": 6789 + }, + { + "epoch": 1.8829728230726568, + "grad_norm": 0.20376282930374146, + "learning_rate": 2.175770386993431e-07, + "loss": 0.4799, + "step": 6790 + }, + { + "epoch": 1.8832501386577927, + "grad_norm": 0.22341391444206238, + "learning_rate": 2.1655249882609612e-07, + "loss": 0.5089, + "step": 6791 + }, + { + "epoch": 1.8835274542429286, + "grad_norm": 0.20708690583705902, + "learning_rate": 2.1553035577890068e-07, + "loss": 0.4717, + "step": 6792 + }, + { + "epoch": 1.8838047698280644, + "grad_norm": 0.20336076617240906, + "learning_rate": 2.1451060975720417e-07, + "loss": 0.4888, + "step": 6793 + }, + { + "epoch": 1.8840820854132003, + "grad_norm": 0.23210936784744263, + "learning_rate": 2.134932609599849e-07, + "loss": 0.4947, + "step": 6794 + }, + { + "epoch": 1.8843594009983362, + "grad_norm": 0.2158088982105255, + "learning_rate": 2.1247830958575493e-07, + "loss": 0.5283, + "step": 6795 + }, + { + "epoch": 1.884636716583472, + "grad_norm": 0.21517014503479004, + "learning_rate": 2.1146575583255862e-07, + "loss": 0.4747, + "step": 6796 + }, + { + "epoch": 1.884914032168608, + "grad_norm": 0.20993436872959137, + "learning_rate": 2.1045559989797125e-07, + "loss": 0.4945, + "step": 6797 + }, + { + "epoch": 1.8851913477537439, + "grad_norm": 0.210999995470047, + "learning_rate": 2.0944784197910321e-07, + "loss": 0.4993, + "step": 6798 + }, + { + "epoch": 1.8854686633388797, + "grad_norm": 0.2413627654314041, + "learning_rate": 2.0844248227259305e-07, + "loss": 0.4893, + "step": 6799 + }, + { + "epoch": 1.8857459789240156, + "grad_norm": 0.20545423030853271, + "learning_rate": 2.07439520974613e-07, + "loss": 0.5073, + "step": 6800 + }, + { + "epoch": 1.8860232945091515, + "grad_norm": 0.20195883512496948, + "learning_rate": 2.0643895828086625e-07, + "loss": 0.4633, + "step": 6801 + }, + { + "epoch": 1.8863006100942874, + "grad_norm": 0.21203799545764923, + "learning_rate": 2.0544079438659242e-07, + "loss": 0.5121, + "step": 6802 + }, + { + "epoch": 1.8865779256794233, + "grad_norm": 0.2213236391544342, + "learning_rate": 2.0444502948655658e-07, + "loss": 0.4984, + "step": 6803 + }, + { + "epoch": 1.8868552412645592, + "grad_norm": 0.2082570642232895, + "learning_rate": 2.0345166377506165e-07, + "loss": 0.5068, + "step": 6804 + }, + { + "epoch": 1.887132556849695, + "grad_norm": 0.20740516483783722, + "learning_rate": 2.024606974459359e-07, + "loss": 0.4746, + "step": 6805 + }, + { + "epoch": 1.887409872434831, + "grad_norm": 0.20049674808979034, + "learning_rate": 2.0147213069254683e-07, + "loss": 0.478, + "step": 6806 + }, + { + "epoch": 1.8876871880199668, + "grad_norm": 0.1955636441707611, + "learning_rate": 2.0048596370778744e-07, + "loss": 0.4785, + "step": 6807 + }, + { + "epoch": 1.8879645036051027, + "grad_norm": 0.21246939897537231, + "learning_rate": 1.995021966840871e-07, + "loss": 0.4925, + "step": 6808 + }, + { + "epoch": 1.8882418191902386, + "grad_norm": 0.20627954602241516, + "learning_rate": 1.9852082981340198e-07, + "loss": 0.4882, + "step": 6809 + }, + { + "epoch": 1.8885191347753745, + "grad_norm": 0.20891375839710236, + "learning_rate": 1.9754186328722614e-07, + "loss": 0.5049, + "step": 6810 + }, + { + "epoch": 1.8887964503605104, + "grad_norm": 0.2203957885503769, + "learning_rate": 1.9656529729658036e-07, + "loss": 0.4642, + "step": 6811 + }, + { + "epoch": 1.8890737659456462, + "grad_norm": 0.2100536823272705, + "learning_rate": 1.955911320320164e-07, + "loss": 0.4995, + "step": 6812 + }, + { + "epoch": 1.8893510815307821, + "grad_norm": 0.20311182737350464, + "learning_rate": 1.946193676836225e-07, + "loss": 0.525, + "step": 6813 + }, + { + "epoch": 1.889628397115918, + "grad_norm": 0.2031848132610321, + "learning_rate": 1.936500044410164e-07, + "loss": 0.519, + "step": 6814 + }, + { + "epoch": 1.889905712701054, + "grad_norm": 0.19998584687709808, + "learning_rate": 1.9268304249334402e-07, + "loss": 0.4888, + "step": 6815 + }, + { + "epoch": 1.8901830282861898, + "grad_norm": 0.20453502237796783, + "learning_rate": 1.9171848202928637e-07, + "loss": 0.4936, + "step": 6816 + }, + { + "epoch": 1.8904603438713257, + "grad_norm": 0.1982734203338623, + "learning_rate": 1.9075632323705405e-07, + "loss": 0.4986, + "step": 6817 + }, + { + "epoch": 1.8907376594564616, + "grad_norm": 0.21546244621276855, + "learning_rate": 1.897965663043913e-07, + "loss": 0.5138, + "step": 6818 + }, + { + "epoch": 1.8910149750415974, + "grad_norm": 0.20453289151191711, + "learning_rate": 1.8883921141857053e-07, + "loss": 0.4685, + "step": 6819 + }, + { + "epoch": 1.8912922906267333, + "grad_norm": 0.2022780179977417, + "learning_rate": 1.8788425876639647e-07, + "loss": 0.4743, + "step": 6820 + }, + { + "epoch": 1.8915696062118692, + "grad_norm": 0.20572780072689056, + "learning_rate": 1.8693170853420898e-07, + "loss": 0.4792, + "step": 6821 + }, + { + "epoch": 1.891846921797005, + "grad_norm": 0.2061588168144226, + "learning_rate": 1.8598156090787328e-07, + "loss": 0.491, + "step": 6822 + }, + { + "epoch": 1.892124237382141, + "grad_norm": 0.21832877397537231, + "learning_rate": 1.8503381607278825e-07, + "loss": 0.5126, + "step": 6823 + }, + { + "epoch": 1.8924015529672769, + "grad_norm": 0.21814782917499542, + "learning_rate": 1.8408847421388376e-07, + "loss": 0.4833, + "step": 6824 + }, + { + "epoch": 1.8926788685524127, + "grad_norm": 0.2065282016992569, + "learning_rate": 1.8314553551562336e-07, + "loss": 0.5193, + "step": 6825 + }, + { + "epoch": 1.8929561841375486, + "grad_norm": 0.21190817654132843, + "learning_rate": 1.8220500016199736e-07, + "loss": 0.5011, + "step": 6826 + }, + { + "epoch": 1.8932334997226845, + "grad_norm": 0.3020632266998291, + "learning_rate": 1.8126686833652984e-07, + "loss": 0.4953, + "step": 6827 + }, + { + "epoch": 1.8935108153078204, + "grad_norm": 0.2020588517189026, + "learning_rate": 1.8033114022227432e-07, + "loss": 0.4876, + "step": 6828 + }, + { + "epoch": 1.8937881308929563, + "grad_norm": 0.21000345051288605, + "learning_rate": 1.793978160018181e-07, + "loss": 0.4933, + "step": 6829 + }, + { + "epoch": 1.8940654464780922, + "grad_norm": 0.20526744425296783, + "learning_rate": 1.784668958572752e-07, + "loss": 0.5048, + "step": 6830 + }, + { + "epoch": 1.894342762063228, + "grad_norm": 0.20586876571178436, + "learning_rate": 1.7753837997029343e-07, + "loss": 0.4861, + "step": 6831 + }, + { + "epoch": 1.894620077648364, + "grad_norm": 0.22010831534862518, + "learning_rate": 1.7661226852205143e-07, + "loss": 0.5143, + "step": 6832 + }, + { + "epoch": 1.8948973932334998, + "grad_norm": 0.21001818776130676, + "learning_rate": 1.7568856169325743e-07, + "loss": 0.4987, + "step": 6833 + }, + { + "epoch": 1.8951747088186357, + "grad_norm": 0.1980331391096115, + "learning_rate": 1.7476725966415335e-07, + "loss": 0.4786, + "step": 6834 + }, + { + "epoch": 1.8954520244037716, + "grad_norm": 0.20589183270931244, + "learning_rate": 1.7384836261450655e-07, + "loss": 0.5138, + "step": 6835 + }, + { + "epoch": 1.8957293399889075, + "grad_norm": 0.20626741647720337, + "learning_rate": 1.7293187072361938e-07, + "loss": 0.489, + "step": 6836 + }, + { + "epoch": 1.8960066555740434, + "grad_norm": 0.20527756214141846, + "learning_rate": 1.7201778417032383e-07, + "loss": 0.4767, + "step": 6837 + }, + { + "epoch": 1.8962839711591792, + "grad_norm": 0.21390627324581146, + "learning_rate": 1.7110610313298274e-07, + "loss": 0.4862, + "step": 6838 + }, + { + "epoch": 1.8965612867443151, + "grad_norm": 0.205204576253891, + "learning_rate": 1.701968277894872e-07, + "loss": 0.487, + "step": 6839 + }, + { + "epoch": 1.896838602329451, + "grad_norm": 0.20338958501815796, + "learning_rate": 1.692899583172633e-07, + "loss": 0.492, + "step": 6840 + }, + { + "epoch": 1.897115917914587, + "grad_norm": 0.2076566219329834, + "learning_rate": 1.6838549489326533e-07, + "loss": 0.5127, + "step": 6841 + }, + { + "epoch": 1.8973932334997228, + "grad_norm": 0.21550370752811432, + "learning_rate": 1.6748343769397713e-07, + "loss": 0.5049, + "step": 6842 + }, + { + "epoch": 1.8976705490848587, + "grad_norm": 0.21209217607975006, + "learning_rate": 1.6658378689541343e-07, + "loss": 0.5149, + "step": 6843 + }, + { + "epoch": 1.8979478646699945, + "grad_norm": 0.2269715517759323, + "learning_rate": 1.6568654267312133e-07, + "loss": 0.4871, + "step": 6844 + }, + { + "epoch": 1.8982251802551304, + "grad_norm": 0.2141071856021881, + "learning_rate": 1.6479170520217607e-07, + "loss": 0.4873, + "step": 6845 + }, + { + "epoch": 1.8985024958402663, + "grad_norm": 0.21018719673156738, + "learning_rate": 1.638992746571852e-07, + "loss": 0.5122, + "step": 6846 + }, + { + "epoch": 1.8987798114254022, + "grad_norm": 0.21268904209136963, + "learning_rate": 1.630092512122816e-07, + "loss": 0.5282, + "step": 6847 + }, + { + "epoch": 1.899057127010538, + "grad_norm": 0.21051792800426483, + "learning_rate": 1.6212163504113753e-07, + "loss": 0.4921, + "step": 6848 + }, + { + "epoch": 1.899334442595674, + "grad_norm": 0.20133888721466064, + "learning_rate": 1.6123642631694913e-07, + "loss": 0.4807, + "step": 6849 + }, + { + "epoch": 1.8996117581808099, + "grad_norm": 0.19789256155490875, + "learning_rate": 1.6035362521244213e-07, + "loss": 0.5019, + "step": 6850 + }, + { + "epoch": 1.8998890737659457, + "grad_norm": 0.19800089299678802, + "learning_rate": 1.5947323189987595e-07, + "loss": 0.4618, + "step": 6851 + }, + { + "epoch": 1.9001663893510816, + "grad_norm": 0.20594573020935059, + "learning_rate": 1.585952465510382e-07, + "loss": 0.4941, + "step": 6852 + }, + { + "epoch": 1.9004437049362175, + "grad_norm": 0.18927329778671265, + "learning_rate": 1.5771966933724736e-07, + "loss": 0.4925, + "step": 6853 + }, + { + "epoch": 1.9007210205213534, + "grad_norm": 0.2202519327402115, + "learning_rate": 1.568465004293515e-07, + "loss": 0.5087, + "step": 6854 + }, + { + "epoch": 1.9009983361064893, + "grad_norm": 0.2147960215806961, + "learning_rate": 1.5597573999772823e-07, + "loss": 0.516, + "step": 6855 + }, + { + "epoch": 1.9012756516916252, + "grad_norm": 0.21644139289855957, + "learning_rate": 1.5510738821228887e-07, + "loss": 0.4624, + "step": 6856 + }, + { + "epoch": 1.901552967276761, + "grad_norm": 0.2117416262626648, + "learning_rate": 1.542414452424687e-07, + "loss": 0.4926, + "step": 6857 + }, + { + "epoch": 1.901830282861897, + "grad_norm": 0.2076312005519867, + "learning_rate": 1.533779112572367e-07, + "loss": 0.514, + "step": 6858 + }, + { + "epoch": 1.9021075984470328, + "grad_norm": 0.21172620356082916, + "learning_rate": 1.5251678642509286e-07, + "loss": 0.4708, + "step": 6859 + }, + { + "epoch": 1.9023849140321687, + "grad_norm": 0.2097117304801941, + "learning_rate": 1.5165807091406386e-07, + "loss": 0.5145, + "step": 6860 + }, + { + "epoch": 1.9026622296173046, + "grad_norm": 0.21858976781368256, + "learning_rate": 1.5080176489170734e-07, + "loss": 0.4811, + "step": 6861 + }, + { + "epoch": 1.9029395452024405, + "grad_norm": 0.1994626671075821, + "learning_rate": 1.4994786852511322e-07, + "loss": 0.4825, + "step": 6862 + }, + { + "epoch": 1.9032168607875763, + "grad_norm": 0.20749947428703308, + "learning_rate": 1.4909638198089966e-07, + "loss": 0.5112, + "step": 6863 + }, + { + "epoch": 1.9034941763727122, + "grad_norm": 0.20840351283550262, + "learning_rate": 1.4824730542521148e-07, + "loss": 0.4832, + "step": 6864 + }, + { + "epoch": 1.9037714919578481, + "grad_norm": 0.20934903621673584, + "learning_rate": 1.474006390237273e-07, + "loss": 0.5057, + "step": 6865 + }, + { + "epoch": 1.904048807542984, + "grad_norm": 0.21367427706718445, + "learning_rate": 1.4655638294165525e-07, + "loss": 0.5018, + "step": 6866 + }, + { + "epoch": 1.9043261231281199, + "grad_norm": 0.21489764750003815, + "learning_rate": 1.4571453734373157e-07, + "loss": 0.5019, + "step": 6867 + }, + { + "epoch": 1.9046034387132558, + "grad_norm": 0.20891396701335907, + "learning_rate": 1.4487510239422076e-07, + "loss": 0.4948, + "step": 6868 + }, + { + "epoch": 1.9048807542983917, + "grad_norm": 0.20342187583446503, + "learning_rate": 1.440380782569209e-07, + "loss": 0.4613, + "step": 6869 + }, + { + "epoch": 1.9051580698835275, + "grad_norm": 0.20930233597755432, + "learning_rate": 1.432034650951569e-07, + "loss": 0.5102, + "step": 6870 + }, + { + "epoch": 1.9054353854686634, + "grad_norm": 0.19923752546310425, + "learning_rate": 1.4237126307178467e-07, + "loss": 0.4857, + "step": 6871 + }, + { + "epoch": 1.9057127010537993, + "grad_norm": 0.2359122633934021, + "learning_rate": 1.4154147234918814e-07, + "loss": 0.4877, + "step": 6872 + }, + { + "epoch": 1.9059900166389352, + "grad_norm": 0.19613344967365265, + "learning_rate": 1.4071409308928086e-07, + "loss": 0.4993, + "step": 6873 + }, + { + "epoch": 1.906267332224071, + "grad_norm": 0.22320273518562317, + "learning_rate": 1.398891254535073e-07, + "loss": 0.5018, + "step": 6874 + }, + { + "epoch": 1.906544647809207, + "grad_norm": 0.21227893233299255, + "learning_rate": 1.3906656960284008e-07, + "loss": 0.5012, + "step": 6875 + }, + { + "epoch": 1.9068219633943428, + "grad_norm": 0.2211824506521225, + "learning_rate": 1.382464256977828e-07, + "loss": 0.4604, + "step": 6876 + }, + { + "epoch": 1.9070992789794787, + "grad_norm": 0.20455965399742126, + "learning_rate": 1.3742869389836572e-07, + "loss": 0.4759, + "step": 6877 + }, + { + "epoch": 1.9073765945646146, + "grad_norm": 0.20828378200531006, + "learning_rate": 1.3661337436415012e-07, + "loss": 0.5047, + "step": 6878 + }, + { + "epoch": 1.9076539101497505, + "grad_norm": 0.2089402675628662, + "learning_rate": 1.358004672542282e-07, + "loss": 0.4886, + "step": 6879 + }, + { + "epoch": 1.9079312257348864, + "grad_norm": 0.23213088512420654, + "learning_rate": 1.3498997272721886e-07, + "loss": 0.513, + "step": 6880 + }, + { + "epoch": 1.9082085413200223, + "grad_norm": 0.22040383517742157, + "learning_rate": 1.3418189094126926e-07, + "loss": 0.5072, + "step": 6881 + }, + { + "epoch": 1.9084858569051582, + "grad_norm": 0.22452102601528168, + "learning_rate": 1.333762220540588e-07, + "loss": 0.5167, + "step": 6882 + }, + { + "epoch": 1.908763172490294, + "grad_norm": 0.20027916133403778, + "learning_rate": 1.325729662227951e-07, + "loss": 0.4924, + "step": 6883 + }, + { + "epoch": 1.90904048807543, + "grad_norm": 0.20974531769752502, + "learning_rate": 1.317721236042152e-07, + "loss": 0.4752, + "step": 6884 + }, + { + "epoch": 1.9093178036605658, + "grad_norm": 0.22228215634822845, + "learning_rate": 1.3097369435458167e-07, + "loss": 0.4951, + "step": 6885 + }, + { + "epoch": 1.9095951192457017, + "grad_norm": 0.21499434113502502, + "learning_rate": 1.3017767862969487e-07, + "loss": 0.507, + "step": 6886 + }, + { + "epoch": 1.9098724348308376, + "grad_norm": 0.1968400925397873, + "learning_rate": 1.293840765848736e-07, + "loss": 0.4698, + "step": 6887 + }, + { + "epoch": 1.9101497504159735, + "grad_norm": 0.20211312174797058, + "learning_rate": 1.285928883749718e-07, + "loss": 0.4916, + "step": 6888 + }, + { + "epoch": 1.9104270660011093, + "grad_norm": 0.20148654282093048, + "learning_rate": 1.2780411415437148e-07, + "loss": 0.4794, + "step": 6889 + }, + { + "epoch": 1.9107043815862452, + "grad_norm": 0.20846286416053772, + "learning_rate": 1.2701775407698567e-07, + "loss": 0.4847, + "step": 6890 + }, + { + "epoch": 1.910981697171381, + "grad_norm": 0.2036297619342804, + "learning_rate": 1.2623380829624997e-07, + "loss": 0.4813, + "step": 6891 + }, + { + "epoch": 1.911259012756517, + "grad_norm": 0.21009089052677155, + "learning_rate": 1.2545227696513644e-07, + "loss": 0.5258, + "step": 6892 + }, + { + "epoch": 1.9115363283416529, + "grad_norm": 0.21154698729515076, + "learning_rate": 1.2467316023613978e-07, + "loss": 0.5118, + "step": 6893 + }, + { + "epoch": 1.9118136439267888, + "grad_norm": 0.21194148063659668, + "learning_rate": 1.2389645826128836e-07, + "loss": 0.4956, + "step": 6894 + }, + { + "epoch": 1.9120909595119246, + "grad_norm": 0.21466988325119019, + "learning_rate": 1.2312217119213737e-07, + "loss": 0.5134, + "step": 6895 + }, + { + "epoch": 1.9123682750970605, + "grad_norm": 0.22028987109661102, + "learning_rate": 1.223502991797687e-07, + "loss": 0.5057, + "step": 6896 + }, + { + "epoch": 1.9126455906821964, + "grad_norm": 0.21367661654949188, + "learning_rate": 1.215808423747966e-07, + "loss": 0.5005, + "step": 6897 + }, + { + "epoch": 1.9129229062673323, + "grad_norm": 0.2160566747188568, + "learning_rate": 1.208138009273621e-07, + "loss": 0.5011, + "step": 6898 + }, + { + "epoch": 1.9132002218524682, + "grad_norm": 0.21020996570587158, + "learning_rate": 1.2004917498713576e-07, + "loss": 0.5011, + "step": 6899 + }, + { + "epoch": 1.913477537437604, + "grad_norm": 0.20518991351127625, + "learning_rate": 1.1928696470331486e-07, + "loss": 0.4979, + "step": 6900 + }, + { + "epoch": 1.91375485302274, + "grad_norm": 0.20846955478191376, + "learning_rate": 1.1852717022463045e-07, + "loss": 0.5073, + "step": 6901 + }, + { + "epoch": 1.9140321686078758, + "grad_norm": 0.21253100037574768, + "learning_rate": 1.1776979169933478e-07, + "loss": 0.5105, + "step": 6902 + }, + { + "epoch": 1.9143094841930117, + "grad_norm": 0.22121791541576385, + "learning_rate": 1.1701482927521241e-07, + "loss": 0.4747, + "step": 6903 + }, + { + "epoch": 1.9145867997781476, + "grad_norm": 0.20756427943706512, + "learning_rate": 1.1626228309957881e-07, + "loss": 0.493, + "step": 6904 + }, + { + "epoch": 1.9148641153632835, + "grad_norm": 0.22023040056228638, + "learning_rate": 1.1551215331927489e-07, + "loss": 0.5037, + "step": 6905 + }, + { + "epoch": 1.9151414309484194, + "grad_norm": 0.2466723471879959, + "learning_rate": 1.1476444008067105e-07, + "loss": 0.5177, + "step": 6906 + }, + { + "epoch": 1.9154187465335553, + "grad_norm": 0.21187765896320343, + "learning_rate": 1.1401914352966447e-07, + "loss": 0.4916, + "step": 6907 + }, + { + "epoch": 1.9156960621186911, + "grad_norm": 0.21347801387310028, + "learning_rate": 1.1327626381168466e-07, + "loss": 0.511, + "step": 6908 + }, + { + "epoch": 1.915973377703827, + "grad_norm": 0.20320340991020203, + "learning_rate": 1.125358010716851e-07, + "loss": 0.5196, + "step": 6909 + }, + { + "epoch": 1.916250693288963, + "grad_norm": 0.211409792304039, + "learning_rate": 1.1179775545415022e-07, + "loss": 0.4988, + "step": 6910 + }, + { + "epoch": 1.9165280088740988, + "grad_norm": 0.20658671855926514, + "learning_rate": 1.1106212710309261e-07, + "loss": 0.499, + "step": 6911 + }, + { + "epoch": 1.9168053244592347, + "grad_norm": 0.21737776696681976, + "learning_rate": 1.1032891616205299e-07, + "loss": 0.5227, + "step": 6912 + }, + { + "epoch": 1.9170826400443706, + "grad_norm": 0.20637080073356628, + "learning_rate": 1.0959812277410025e-07, + "loss": 0.4793, + "step": 6913 + }, + { + "epoch": 1.9173599556295065, + "grad_norm": 0.20913554728031158, + "learning_rate": 1.0886974708183007e-07, + "loss": 0.5227, + "step": 6914 + }, + { + "epoch": 1.9176372712146423, + "grad_norm": 0.2132870852947235, + "learning_rate": 1.081437892273704e-07, + "loss": 0.4907, + "step": 6915 + }, + { + "epoch": 1.9179145867997782, + "grad_norm": 0.20474767684936523, + "learning_rate": 1.0742024935237322e-07, + "loss": 0.5073, + "step": 6916 + }, + { + "epoch": 1.918191902384914, + "grad_norm": 0.2082609385251999, + "learning_rate": 1.0669912759802004e-07, + "loss": 0.5381, + "step": 6917 + }, + { + "epoch": 1.91846921797005, + "grad_norm": 0.21400731801986694, + "learning_rate": 1.059804241050219e-07, + "loss": 0.4944, + "step": 6918 + }, + { + "epoch": 1.9187465335551859, + "grad_norm": 0.22101816534996033, + "learning_rate": 1.0526413901361526e-07, + "loss": 0.5161, + "step": 6919 + }, + { + "epoch": 1.9190238491403218, + "grad_norm": 0.21787089109420776, + "learning_rate": 1.0455027246356746e-07, + "loss": 0.5062, + "step": 6920 + }, + { + "epoch": 1.9193011647254576, + "grad_norm": 0.2018042355775833, + "learning_rate": 1.0383882459417404e-07, + "loss": 0.4633, + "step": 6921 + }, + { + "epoch": 1.9195784803105935, + "grad_norm": 0.20412792265415192, + "learning_rate": 1.031297955442559e-07, + "loss": 0.5062, + "step": 6922 + }, + { + "epoch": 1.9198557958957294, + "grad_norm": 0.214492067694664, + "learning_rate": 1.0242318545216207e-07, + "loss": 0.5178, + "step": 6923 + }, + { + "epoch": 1.9201331114808653, + "grad_norm": 0.21588201820850372, + "learning_rate": 1.0171899445577393e-07, + "loss": 0.5249, + "step": 6924 + }, + { + "epoch": 1.9204104270660012, + "grad_norm": 0.19817836582660675, + "learning_rate": 1.0101722269249547e-07, + "loss": 0.4964, + "step": 6925 + }, + { + "epoch": 1.920687742651137, + "grad_norm": 0.20311301946640015, + "learning_rate": 1.003178702992616e-07, + "loss": 0.4804, + "step": 6926 + }, + { + "epoch": 1.920965058236273, + "grad_norm": 0.20916207134723663, + "learning_rate": 9.962093741253537e-08, + "loss": 0.5013, + "step": 6927 + }, + { + "epoch": 1.9212423738214088, + "grad_norm": 0.21295547485351562, + "learning_rate": 9.892642416830522e-08, + "loss": 0.492, + "step": 6928 + }, + { + "epoch": 1.9215196894065447, + "grad_norm": 0.21146497130393982, + "learning_rate": 9.823433070209053e-08, + "loss": 0.5107, + "step": 6929 + }, + { + "epoch": 1.9217970049916806, + "grad_norm": 0.2107129544019699, + "learning_rate": 9.754465714893607e-08, + "loss": 0.5023, + "step": 6930 + }, + { + "epoch": 1.9220743205768165, + "grad_norm": 0.21256358921527863, + "learning_rate": 9.685740364341611e-08, + "loss": 0.4751, + "step": 6931 + }, + { + "epoch": 1.9223516361619524, + "grad_norm": 0.21741509437561035, + "learning_rate": 9.617257031963173e-08, + "loss": 0.5264, + "step": 6932 + }, + { + "epoch": 1.9226289517470883, + "grad_norm": 0.38610130548477173, + "learning_rate": 9.549015731121353e-08, + "loss": 0.5083, + "step": 6933 + }, + { + "epoch": 1.9229062673322241, + "grad_norm": 0.2073136270046234, + "learning_rate": 9.481016475131472e-08, + "loss": 0.4922, + "step": 6934 + }, + { + "epoch": 1.92318358291736, + "grad_norm": 0.21415603160858154, + "learning_rate": 9.41325927726222e-08, + "loss": 0.5014, + "step": 6935 + }, + { + "epoch": 1.923460898502496, + "grad_norm": 0.23321330547332764, + "learning_rate": 9.345744150734969e-08, + "loss": 0.5141, + "step": 6936 + }, + { + "epoch": 1.9237382140876318, + "grad_norm": 0.2120734453201294, + "learning_rate": 9.278471108723347e-08, + "loss": 0.4874, + "step": 6937 + }, + { + "epoch": 1.9240155296727677, + "grad_norm": 0.2397221326828003, + "learning_rate": 9.211440164354351e-08, + "loss": 0.4939, + "step": 6938 + }, + { + "epoch": 1.9242928452579036, + "grad_norm": 0.2101098895072937, + "learning_rate": 9.144651330707659e-08, + "loss": 0.5111, + "step": 6939 + }, + { + "epoch": 1.9245701608430394, + "grad_norm": 0.20636944472789764, + "learning_rate": 9.078104620815209e-08, + "loss": 0.5037, + "step": 6940 + }, + { + "epoch": 1.9248474764281753, + "grad_norm": 0.22542667388916016, + "learning_rate": 9.011800047662028e-08, + "loss": 0.5024, + "step": 6941 + }, + { + "epoch": 1.9251247920133112, + "grad_norm": 0.19407561421394348, + "learning_rate": 8.945737624186101e-08, + "loss": 0.4894, + "step": 6942 + }, + { + "epoch": 1.925402107598447, + "grad_norm": 0.21263688802719116, + "learning_rate": 8.87991736327795e-08, + "loss": 0.513, + "step": 6943 + }, + { + "epoch": 1.925679423183583, + "grad_norm": 0.20905403792858124, + "learning_rate": 8.814339277780636e-08, + "loss": 0.512, + "step": 6944 + }, + { + "epoch": 1.9259567387687189, + "grad_norm": 0.20334313809871674, + "learning_rate": 8.74900338049045e-08, + "loss": 0.5043, + "step": 6945 + }, + { + "epoch": 1.9262340543538548, + "grad_norm": 0.2149227261543274, + "learning_rate": 8.683909684155944e-08, + "loss": 0.4717, + "step": 6946 + }, + { + "epoch": 1.9265113699389906, + "grad_norm": 0.2087196409702301, + "learning_rate": 8.619058201478763e-08, + "loss": 0.5005, + "step": 6947 + }, + { + "epoch": 1.9267886855241265, + "grad_norm": 0.2073797583580017, + "learning_rate": 8.554448945113091e-08, + "loss": 0.4941, + "step": 6948 + }, + { + "epoch": 1.9270660011092624, + "grad_norm": 0.21620948612689972, + "learning_rate": 8.490081927665927e-08, + "loss": 0.4743, + "step": 6949 + }, + { + "epoch": 1.9273433166943983, + "grad_norm": 0.211389422416687, + "learning_rate": 8.425957161696946e-08, + "loss": 0.4886, + "step": 6950 + }, + { + "epoch": 1.9276206322795342, + "grad_norm": 0.20867092907428741, + "learning_rate": 8.3620746597185e-08, + "loss": 0.4856, + "step": 6951 + }, + { + "epoch": 1.92789794786467, + "grad_norm": 0.2138075828552246, + "learning_rate": 8.298434434196034e-08, + "loss": 0.479, + "step": 6952 + }, + { + "epoch": 1.928175263449806, + "grad_norm": 0.21532325446605682, + "learning_rate": 8.235036497547116e-08, + "loss": 0.4989, + "step": 6953 + }, + { + "epoch": 1.9284525790349418, + "grad_norm": 0.19918455183506012, + "learning_rate": 8.171880862142683e-08, + "loss": 0.5055, + "step": 6954 + }, + { + "epoch": 1.9287298946200777, + "grad_norm": 0.21148592233657837, + "learning_rate": 8.108967540305795e-08, + "loss": 0.4639, + "step": 6955 + }, + { + "epoch": 1.9290072102052136, + "grad_norm": 0.20538489520549774, + "learning_rate": 8.046296544312742e-08, + "loss": 0.5217, + "step": 6956 + }, + { + "epoch": 1.9292845257903495, + "grad_norm": 0.19862611591815948, + "learning_rate": 7.983867886391938e-08, + "loss": 0.4928, + "step": 6957 + }, + { + "epoch": 1.9295618413754854, + "grad_norm": 0.2005883902311325, + "learning_rate": 7.921681578725305e-08, + "loss": 0.4719, + "step": 6958 + }, + { + "epoch": 1.9298391569606212, + "grad_norm": 0.21354445815086365, + "learning_rate": 7.859737633446745e-08, + "loss": 0.5073, + "step": 6959 + }, + { + "epoch": 1.9301164725457571, + "grad_norm": 0.20679736137390137, + "learning_rate": 7.798036062643399e-08, + "loss": 0.4767, + "step": 6960 + }, + { + "epoch": 1.930393788130893, + "grad_norm": 0.2107972651720047, + "learning_rate": 7.736576878354523e-08, + "loss": 0.5154, + "step": 6961 + }, + { + "epoch": 1.930671103716029, + "grad_norm": 0.21988734602928162, + "learning_rate": 7.675360092572747e-08, + "loss": 0.4996, + "step": 6962 + }, + { + "epoch": 1.9309484193011648, + "grad_norm": 0.20176535844802856, + "learning_rate": 7.6143857172431e-08, + "loss": 0.4677, + "step": 6963 + }, + { + "epoch": 1.9312257348863007, + "grad_norm": 0.21723569929599762, + "learning_rate": 7.553653764263008e-08, + "loss": 0.5311, + "step": 6964 + }, + { + "epoch": 1.9315030504714366, + "grad_norm": 0.20594346523284912, + "learning_rate": 7.49316424548313e-08, + "loss": 0.4978, + "step": 6965 + }, + { + "epoch": 1.9317803660565724, + "grad_norm": 0.2213924676179886, + "learning_rate": 7.432917172706528e-08, + "loss": 0.5089, + "step": 6966 + }, + { + "epoch": 1.9320576816417083, + "grad_norm": 0.2052609771490097, + "learning_rate": 7.372912557688933e-08, + "loss": 0.4955, + "step": 6967 + }, + { + "epoch": 1.9323349972268442, + "grad_norm": 0.2087954580783844, + "learning_rate": 7.313150412138898e-08, + "loss": 0.4674, + "step": 6968 + }, + { + "epoch": 1.93261231281198, + "grad_norm": 0.20820866525173187, + "learning_rate": 7.253630747717648e-08, + "loss": 0.4871, + "step": 6969 + }, + { + "epoch": 1.932889628397116, + "grad_norm": 0.21151137351989746, + "learning_rate": 7.194353576038953e-08, + "loss": 0.4889, + "step": 6970 + }, + { + "epoch": 1.9331669439822519, + "grad_norm": 0.21804526448249817, + "learning_rate": 7.135318908669392e-08, + "loss": 0.4842, + "step": 6971 + }, + { + "epoch": 1.9334442595673877, + "grad_norm": 0.21454410254955292, + "learning_rate": 7.076526757128083e-08, + "loss": 0.5052, + "step": 6972 + }, + { + "epoch": 1.9337215751525236, + "grad_norm": 0.2476005256175995, + "learning_rate": 7.0179771328871e-08, + "loss": 0.5001, + "step": 6973 + }, + { + "epoch": 1.9339988907376595, + "grad_norm": 0.20526906847953796, + "learning_rate": 6.959670047371053e-08, + "loss": 0.4873, + "step": 6974 + }, + { + "epoch": 1.9342762063227954, + "grad_norm": 0.23170456290245056, + "learning_rate": 6.901605511957093e-08, + "loss": 0.4773, + "step": 6975 + }, + { + "epoch": 1.9345535219079313, + "grad_norm": 0.21698127686977386, + "learning_rate": 6.843783537974907e-08, + "loss": 0.4895, + "step": 6976 + }, + { + "epoch": 1.9348308374930672, + "grad_norm": 0.2072725147008896, + "learning_rate": 6.786204136707691e-08, + "loss": 0.5427, + "step": 6977 + }, + { + "epoch": 1.935108153078203, + "grad_norm": 0.21492376923561096, + "learning_rate": 6.728867319390209e-08, + "loss": 0.5325, + "step": 6978 + }, + { + "epoch": 1.935385468663339, + "grad_norm": 0.2331382930278778, + "learning_rate": 6.671773097210593e-08, + "loss": 0.4782, + "step": 6979 + }, + { + "epoch": 1.9356627842484748, + "grad_norm": 0.2333361804485321, + "learning_rate": 6.614921481309377e-08, + "loss": 0.4814, + "step": 6980 + }, + { + "epoch": 1.9359400998336107, + "grad_norm": 0.20312927663326263, + "learning_rate": 6.55831248277991e-08, + "loss": 0.4894, + "step": 6981 + }, + { + "epoch": 1.9362174154187466, + "grad_norm": 0.21046309173107147, + "learning_rate": 6.501946112668078e-08, + "loss": 0.4759, + "step": 6982 + }, + { + "epoch": 1.9364947310038825, + "grad_norm": 0.21393108367919922, + "learning_rate": 6.445822381972305e-08, + "loss": 0.4939, + "step": 6983 + }, + { + "epoch": 1.9367720465890184, + "grad_norm": 0.20159265398979187, + "learning_rate": 6.38994130164397e-08, + "loss": 0.508, + "step": 6984 + }, + { + "epoch": 1.9370493621741542, + "grad_norm": 0.2000395506620407, + "learning_rate": 6.33430288258699e-08, + "loss": 0.5001, + "step": 6985 + }, + { + "epoch": 1.9373266777592901, + "grad_norm": 0.21411900222301483, + "learning_rate": 6.27890713565768e-08, + "loss": 0.4783, + "step": 6986 + }, + { + "epoch": 1.937603993344426, + "grad_norm": 0.22993165254592896, + "learning_rate": 6.22375407166545e-08, + "loss": 0.491, + "step": 6987 + }, + { + "epoch": 1.937881308929562, + "grad_norm": 0.21177081763744354, + "learning_rate": 6.168843701371968e-08, + "loss": 0.4968, + "step": 6988 + }, + { + "epoch": 1.9381586245146978, + "grad_norm": 0.21067818999290466, + "learning_rate": 6.114176035491859e-08, + "loss": 0.5007, + "step": 6989 + }, + { + "epoch": 1.9384359400998337, + "grad_norm": 0.2169368863105774, + "learning_rate": 6.059751084692006e-08, + "loss": 0.4761, + "step": 6990 + }, + { + "epoch": 1.9387132556849695, + "grad_norm": 0.20408381521701813, + "learning_rate": 6.005568859592386e-08, + "loss": 0.5434, + "step": 6991 + }, + { + "epoch": 1.9389905712701054, + "grad_norm": 0.2044132947921753, + "learning_rate": 5.9516293707652385e-08, + "loss": 0.4814, + "step": 6992 + }, + { + "epoch": 1.9392678868552413, + "grad_norm": 0.2122509479522705, + "learning_rate": 5.897932628735614e-08, + "loss": 0.4915, + "step": 6993 + }, + { + "epoch": 1.9395452024403772, + "grad_norm": 0.21428446471691132, + "learning_rate": 5.844478643981383e-08, + "loss": 0.5069, + "step": 6994 + }, + { + "epoch": 1.939822518025513, + "grad_norm": 0.20293046534061432, + "learning_rate": 5.791267426932395e-08, + "loss": 0.5028, + "step": 6995 + }, + { + "epoch": 1.940099833610649, + "grad_norm": 0.20903684198856354, + "learning_rate": 5.7382989879720126e-08, + "loss": 0.4971, + "step": 6996 + }, + { + "epoch": 1.9403771491957849, + "grad_norm": 0.21450695395469666, + "learning_rate": 5.6855733374354404e-08, + "loss": 0.5173, + "step": 6997 + }, + { + "epoch": 1.9406544647809207, + "grad_norm": 0.21470417082309723, + "learning_rate": 5.633090485611114e-08, + "loss": 0.5008, + "step": 6998 + }, + { + "epoch": 1.9409317803660566, + "grad_norm": 0.20227287709712982, + "learning_rate": 5.580850442739732e-08, + "loss": 0.4925, + "step": 6999 + }, + { + "epoch": 1.9412090959511925, + "grad_norm": 0.20225927233695984, + "learning_rate": 5.5288532190145294e-08, + "loss": 0.4843, + "step": 7000 + }, + { + "epoch": 1.9414864115363284, + "grad_norm": 0.21021947264671326, + "learning_rate": 5.4770988245818336e-08, + "loss": 0.5028, + "step": 7001 + }, + { + "epoch": 1.9417637271214643, + "grad_norm": 0.21199429035186768, + "learning_rate": 5.4255872695400946e-08, + "loss": 0.5071, + "step": 7002 + }, + { + "epoch": 1.9420410427066002, + "grad_norm": 0.20802341401576996, + "learning_rate": 5.374318563940717e-08, + "loss": 0.4943, + "step": 7003 + }, + { + "epoch": 1.942318358291736, + "grad_norm": 0.20316016674041748, + "learning_rate": 5.323292717787504e-08, + "loss": 0.491, + "step": 7004 + }, + { + "epoch": 1.942595673876872, + "grad_norm": 0.20811815559864044, + "learning_rate": 5.272509741037074e-08, + "loss": 0.495, + "step": 7005 + }, + { + "epoch": 1.9428729894620078, + "grad_norm": 0.2118585854768753, + "learning_rate": 5.221969643598307e-08, + "loss": 0.49, + "step": 7006 + }, + { + "epoch": 1.9431503050471437, + "grad_norm": 0.2077036201953888, + "learning_rate": 5.171672435333036e-08, + "loss": 0.4795, + "step": 7007 + }, + { + "epoch": 1.9434276206322796, + "grad_norm": 0.21370601654052734, + "learning_rate": 5.121618126055633e-08, + "loss": 0.4795, + "step": 7008 + }, + { + "epoch": 1.9437049362174155, + "grad_norm": 0.20026057958602905, + "learning_rate": 5.071806725532868e-08, + "loss": 0.4859, + "step": 7009 + }, + { + "epoch": 1.9439822518025514, + "grad_norm": 0.20249269902706146, + "learning_rate": 5.022238243484467e-08, + "loss": 0.5196, + "step": 7010 + }, + { + "epoch": 1.9442595673876872, + "grad_norm": 0.2025049477815628, + "learning_rate": 4.972912689582276e-08, + "loss": 0.4916, + "step": 7011 + }, + { + "epoch": 1.9445368829728231, + "grad_norm": 0.21317322552204132, + "learning_rate": 4.923830073451374e-08, + "loss": 0.4984, + "step": 7012 + }, + { + "epoch": 1.944814198557959, + "grad_norm": 0.21346881985664368, + "learning_rate": 4.8749904046688223e-08, + "loss": 0.4878, + "step": 7013 + }, + { + "epoch": 1.945091514143095, + "grad_norm": 0.21879152953624725, + "learning_rate": 4.826393692764636e-08, + "loss": 0.5174, + "step": 7014 + }, + { + "epoch": 1.9453688297282308, + "grad_norm": 0.22700130939483643, + "learning_rate": 4.77803994722123e-08, + "loss": 0.4503, + "step": 7015 + }, + { + "epoch": 1.9456461453133667, + "grad_norm": 0.20824943482875824, + "learning_rate": 4.729929177473835e-08, + "loss": 0.5011, + "step": 7016 + }, + { + "epoch": 1.9459234608985025, + "grad_norm": 0.1998833268880844, + "learning_rate": 4.682061392910081e-08, + "loss": 0.4994, + "step": 7017 + }, + { + "epoch": 1.9462007764836384, + "grad_norm": 0.2063128501176834, + "learning_rate": 4.6344366028701346e-08, + "loss": 0.5316, + "step": 7018 + }, + { + "epoch": 1.9464780920687743, + "grad_norm": 0.20899119973182678, + "learning_rate": 4.5870548166469796e-08, + "loss": 0.4771, + "step": 7019 + }, + { + "epoch": 1.9467554076539102, + "grad_norm": 0.20648007094860077, + "learning_rate": 4.539916043485998e-08, + "loss": 0.4892, + "step": 7020 + }, + { + "epoch": 1.947032723239046, + "grad_norm": 0.2271704226732254, + "learning_rate": 4.4930202925852484e-08, + "loss": 0.5068, + "step": 7021 + }, + { + "epoch": 1.947310038824182, + "grad_norm": 0.2094396948814392, + "learning_rate": 4.446367573095328e-08, + "loss": 0.5027, + "step": 7022 + }, + { + "epoch": 1.9475873544093179, + "grad_norm": 0.20577369630336761, + "learning_rate": 4.3999578941195107e-08, + "loss": 0.5092, + "step": 7023 + }, + { + "epoch": 1.9478646699944537, + "grad_norm": 0.20665308833122253, + "learning_rate": 4.3537912647133305e-08, + "loss": 0.4884, + "step": 7024 + }, + { + "epoch": 1.9481419855795896, + "grad_norm": 0.21540111303329468, + "learning_rate": 4.3078676938852755e-08, + "loss": 0.5022, + "step": 7025 + }, + { + "epoch": 1.9484193011647255, + "grad_norm": 0.20871970057487488, + "learning_rate": 4.262187190596234e-08, + "loss": 0.5195, + "step": 7026 + }, + { + "epoch": 1.9486966167498614, + "grad_norm": 0.2026074379682541, + "learning_rate": 4.21674976375977e-08, + "loss": 0.5111, + "step": 7027 + }, + { + "epoch": 1.9489739323349973, + "grad_norm": 0.20567013323307037, + "learning_rate": 4.171555422241707e-08, + "loss": 0.4755, + "step": 7028 + }, + { + "epoch": 1.9492512479201332, + "grad_norm": 0.1948745846748352, + "learning_rate": 4.1266041748608265e-08, + "loss": 0.5021, + "step": 7029 + }, + { + "epoch": 1.949528563505269, + "grad_norm": 0.2238130271434784, + "learning_rate": 4.0818960303881656e-08, + "loss": 0.4984, + "step": 7030 + }, + { + "epoch": 1.949805879090405, + "grad_norm": 0.20423227548599243, + "learning_rate": 4.03743099754772e-08, + "loss": 0.495, + "step": 7031 + }, + { + "epoch": 1.9500831946755408, + "grad_norm": 0.2025814950466156, + "learning_rate": 3.9932090850156036e-08, + "loss": 0.5147, + "step": 7032 + }, + { + "epoch": 1.9503605102606767, + "grad_norm": 0.20636986196041107, + "learning_rate": 3.949230301420609e-08, + "loss": 0.496, + "step": 7033 + }, + { + "epoch": 1.9506378258458126, + "grad_norm": 0.1991264522075653, + "learning_rate": 3.905494655344483e-08, + "loss": 0.4856, + "step": 7034 + }, + { + "epoch": 1.9509151414309485, + "grad_norm": 0.21904256939888, + "learning_rate": 3.862002155320815e-08, + "loss": 0.4892, + "step": 7035 + }, + { + "epoch": 1.9511924570160843, + "grad_norm": 0.21712666749954224, + "learning_rate": 3.818752809836429e-08, + "loss": 0.5127, + "step": 7036 + }, + { + "epoch": 1.9514697726012202, + "grad_norm": 0.22889919579029083, + "learning_rate": 3.775746627330268e-08, + "loss": 0.4967, + "step": 7037 + }, + { + "epoch": 1.9517470881863561, + "grad_norm": 0.21116408705711365, + "learning_rate": 3.732983616193952e-08, + "loss": 0.4953, + "step": 7038 + }, + { + "epoch": 1.952024403771492, + "grad_norm": 0.2215505987405777, + "learning_rate": 3.6904637847719195e-08, + "loss": 0.4778, + "step": 7039 + }, + { + "epoch": 1.9523017193566279, + "grad_norm": 0.21022702753543854, + "learning_rate": 3.6481871413605874e-08, + "loss": 0.4975, + "step": 7040 + }, + { + "epoch": 1.9525790349417638, + "grad_norm": 0.20214217901229858, + "learning_rate": 3.606153694209608e-08, + "loss": 0.5121, + "step": 7041 + }, + { + "epoch": 1.9528563505268997, + "grad_norm": 0.20192945003509521, + "learning_rate": 3.5643634515204747e-08, + "loss": 0.4975, + "step": 7042 + }, + { + "epoch": 1.9531336661120355, + "grad_norm": 0.21496793627738953, + "learning_rate": 3.522816421447778e-08, + "loss": 0.5, + "step": 7043 + }, + { + "epoch": 1.9534109816971714, + "grad_norm": 0.20407256484031677, + "learning_rate": 3.4815126120983646e-08, + "loss": 0.5053, + "step": 7044 + }, + { + "epoch": 1.9536882972823073, + "grad_norm": 0.2113526612520218, + "learning_rate": 3.4404520315316216e-08, + "loss": 0.4911, + "step": 7045 + }, + { + "epoch": 1.9539656128674432, + "grad_norm": 0.2122277021408081, + "learning_rate": 3.399634687759751e-08, + "loss": 0.5139, + "step": 7046 + }, + { + "epoch": 1.954242928452579, + "grad_norm": 0.2064533829689026, + "learning_rate": 3.359060588747354e-08, + "loss": 0.5004, + "step": 7047 + }, + { + "epoch": 1.954520244037715, + "grad_norm": 0.22271350026130676, + "learning_rate": 3.318729742411153e-08, + "loss": 0.5052, + "step": 7048 + }, + { + "epoch": 1.9547975596228508, + "grad_norm": 0.20162583887577057, + "learning_rate": 3.278642156620965e-08, + "loss": 0.4936, + "step": 7049 + }, + { + "epoch": 1.9550748752079867, + "grad_norm": 0.20813512802124023, + "learning_rate": 3.238797839199143e-08, + "loss": 0.4976, + "step": 7050 + }, + { + "epoch": 1.9553521907931226, + "grad_norm": 0.20408669114112854, + "learning_rate": 3.1991967979200235e-08, + "loss": 0.4978, + "step": 7051 + }, + { + "epoch": 1.9556295063782585, + "grad_norm": 0.2017277628183365, + "learning_rate": 3.159839040511037e-08, + "loss": 0.4948, + "step": 7052 + }, + { + "epoch": 1.9559068219633944, + "grad_norm": 0.2124013453722, + "learning_rate": 3.120724574651873e-08, + "loss": 0.4853, + "step": 7053 + }, + { + "epoch": 1.9561841375485303, + "grad_norm": 0.20675784349441528, + "learning_rate": 3.0818534079747606e-08, + "loss": 0.4972, + "step": 7054 + }, + { + "epoch": 1.9564614531336662, + "grad_norm": 0.21523889899253845, + "learning_rate": 3.043225548064465e-08, + "loss": 0.5126, + "step": 7055 + }, + { + "epoch": 1.956738768718802, + "grad_norm": 0.20514823496341705, + "learning_rate": 3.004841002458431e-08, + "loss": 0.5061, + "step": 7056 + }, + { + "epoch": 1.957016084303938, + "grad_norm": 0.2089594602584839, + "learning_rate": 2.966699778646359e-08, + "loss": 0.4925, + "step": 7057 + }, + { + "epoch": 1.9572933998890738, + "grad_norm": 0.21636748313903809, + "learning_rate": 2.92880188407077e-08, + "loss": 0.4895, + "step": 7058 + }, + { + "epoch": 1.9575707154742097, + "grad_norm": 0.2242691069841385, + "learning_rate": 2.8911473261264423e-08, + "loss": 0.4819, + "step": 7059 + }, + { + "epoch": 1.9578480310593456, + "grad_norm": 0.19989970326423645, + "learning_rate": 2.853736112160693e-08, + "loss": 0.514, + "step": 7060 + }, + { + "epoch": 1.9581253466444815, + "grad_norm": 0.20352482795715332, + "learning_rate": 2.8165682494736556e-08, + "loss": 0.5165, + "step": 7061 + }, + { + "epoch": 1.9584026622296173, + "grad_norm": 0.21366307139396667, + "learning_rate": 2.7796437453177228e-08, + "loss": 0.5171, + "step": 7062 + }, + { + "epoch": 1.9586799778147532, + "grad_norm": 0.20763346552848816, + "learning_rate": 2.7429626068976865e-08, + "loss": 0.4874, + "step": 7063 + }, + { + "epoch": 1.958957293399889, + "grad_norm": 0.20051513612270355, + "learning_rate": 2.7065248413710166e-08, + "loss": 0.4858, + "step": 7064 + }, + { + "epoch": 1.959234608985025, + "grad_norm": 0.22556112706661224, + "learning_rate": 2.6703304558478583e-08, + "loss": 0.4879, + "step": 7065 + }, + { + "epoch": 1.9595119245701609, + "grad_norm": 0.20919561386108398, + "learning_rate": 2.634379457390618e-08, + "loss": 0.4919, + "step": 7066 + }, + { + "epoch": 1.9597892401552968, + "grad_norm": 0.20339058339595795, + "learning_rate": 2.5986718530142396e-08, + "loss": 0.5165, + "step": 7067 + }, + { + "epoch": 1.9600665557404326, + "grad_norm": 0.19766461849212646, + "learning_rate": 2.5632076496862058e-08, + "loss": 0.4615, + "step": 7068 + }, + { + "epoch": 1.9603438713255685, + "grad_norm": 0.20357239246368408, + "learning_rate": 2.527986854326675e-08, + "loss": 0.4637, + "step": 7069 + }, + { + "epoch": 1.9606211869107044, + "grad_norm": 0.21048687398433685, + "learning_rate": 2.493009473807928e-08, + "loss": 0.485, + "step": 7070 + }, + { + "epoch": 1.9608985024958403, + "grad_norm": 0.21230055391788483, + "learning_rate": 2.4582755149551995e-08, + "loss": 0.5122, + "step": 7071 + }, + { + "epoch": 1.9611758180809762, + "grad_norm": 0.21410781145095825, + "learning_rate": 2.4237849845459848e-08, + "loss": 0.5042, + "step": 7072 + }, + { + "epoch": 1.961453133666112, + "grad_norm": 0.2122282236814499, + "learning_rate": 2.3895378893100394e-08, + "loss": 0.5211, + "step": 7073 + }, + { + "epoch": 1.961730449251248, + "grad_norm": 0.21611341834068298, + "learning_rate": 2.3555342359302123e-08, + "loss": 0.5021, + "step": 7074 + }, + { + "epoch": 1.9620077648363838, + "grad_norm": 0.22394365072250366, + "learning_rate": 2.321774031041335e-08, + "loss": 0.5268, + "step": 7075 + }, + { + "epoch": 1.9622850804215197, + "grad_norm": 0.21854601800441742, + "learning_rate": 2.2882572812309156e-08, + "loss": 0.4732, + "step": 7076 + }, + { + "epoch": 1.9625623960066556, + "grad_norm": 0.21542960405349731, + "learning_rate": 2.2549839930390004e-08, + "loss": 0.5072, + "step": 7077 + }, + { + "epoch": 1.9628397115917915, + "grad_norm": 0.2258591204881668, + "learning_rate": 2.221954172958174e-08, + "loss": 0.517, + "step": 7078 + }, + { + "epoch": 1.9631170271769274, + "grad_norm": 0.22644679248332977, + "learning_rate": 2.1891678274332804e-08, + "loss": 0.4841, + "step": 7079 + }, + { + "epoch": 1.9633943427620633, + "grad_norm": 0.19779692590236664, + "learning_rate": 2.1566249628618417e-08, + "loss": 0.4987, + "step": 7080 + }, + { + "epoch": 1.9636716583471991, + "grad_norm": 0.21196384727954865, + "learning_rate": 2.124325585593917e-08, + "loss": 0.5227, + "step": 7081 + }, + { + "epoch": 1.963948973932335, + "grad_norm": 0.20542475581169128, + "learning_rate": 2.092269701931826e-08, + "loss": 0.4849, + "step": 7082 + }, + { + "epoch": 1.964226289517471, + "grad_norm": 0.1972956657409668, + "learning_rate": 2.060457318130704e-08, + "loss": 0.4605, + "step": 7083 + }, + { + "epoch": 1.9645036051026068, + "grad_norm": 0.2096942663192749, + "learning_rate": 2.028888440397947e-08, + "loss": 0.4854, + "step": 7084 + }, + { + "epoch": 1.9647809206877427, + "grad_norm": 0.20811447501182556, + "learning_rate": 1.9975630748933493e-08, + "loss": 0.4842, + "step": 7085 + }, + { + "epoch": 1.9650582362728786, + "grad_norm": 0.21638718247413635, + "learning_rate": 1.9664812277292442e-08, + "loss": 0.5213, + "step": 7086 + }, + { + "epoch": 1.9653355518580145, + "grad_norm": 0.22020205855369568, + "learning_rate": 1.935642904970919e-08, + "loss": 0.5084, + "step": 7087 + }, + { + "epoch": 1.9656128674431503, + "grad_norm": 0.2070709615945816, + "learning_rate": 1.9050481126353658e-08, + "loss": 0.4763, + "step": 7088 + }, + { + "epoch": 1.9658901830282862, + "grad_norm": 0.20314981043338776, + "learning_rate": 1.8746968566926704e-08, + "loss": 0.5041, + "step": 7089 + }, + { + "epoch": 1.966167498613422, + "grad_norm": 0.21381008625030518, + "learning_rate": 1.844589143064901e-08, + "loss": 0.4862, + "step": 7090 + }, + { + "epoch": 1.966444814198558, + "grad_norm": 0.2065730094909668, + "learning_rate": 1.814724977627219e-08, + "loss": 0.5071, + "step": 7091 + }, + { + "epoch": 1.9667221297836939, + "grad_norm": 0.2038286328315735, + "learning_rate": 1.7851043662066302e-08, + "loss": 0.4935, + "step": 7092 + }, + { + "epoch": 1.9669994453688298, + "grad_norm": 0.2133897840976715, + "learning_rate": 1.7557273145830943e-08, + "loss": 0.5179, + "step": 7093 + }, + { + "epoch": 1.9672767609539656, + "grad_norm": 0.20114904642105103, + "learning_rate": 1.726593828488693e-08, + "loss": 0.5107, + "step": 7094 + }, + { + "epoch": 1.9675540765391015, + "grad_norm": 0.21013131737709045, + "learning_rate": 1.697703913608184e-08, + "loss": 0.5173, + "step": 7095 + }, + { + "epoch": 1.9678313921242374, + "grad_norm": 0.2039504200220108, + "learning_rate": 1.669057575578864e-08, + "loss": 0.4999, + "step": 7096 + }, + { + "epoch": 1.9681087077093733, + "grad_norm": 0.20330029726028442, + "learning_rate": 1.6406548199902893e-08, + "loss": 0.5164, + "step": 7097 + }, + { + "epoch": 1.9683860232945092, + "grad_norm": 0.20391713082790375, + "learning_rate": 1.6124956523846934e-08, + "loss": 0.505, + "step": 7098 + }, + { + "epoch": 1.968663338879645, + "grad_norm": 0.2079223245382309, + "learning_rate": 1.5845800782564314e-08, + "loss": 0.4836, + "step": 7099 + }, + { + "epoch": 1.968940654464781, + "grad_norm": 0.20631448924541473, + "learning_rate": 1.5569081030529507e-08, + "loss": 0.4875, + "step": 7100 + }, + { + "epoch": 1.9692179700499168, + "grad_norm": 0.21116302907466888, + "learning_rate": 1.5294797321734057e-08, + "loss": 0.5115, + "step": 7101 + }, + { + "epoch": 1.9694952856350527, + "grad_norm": 0.21221213042736053, + "learning_rate": 1.5022949709700417e-08, + "loss": 0.495, + "step": 7102 + }, + { + "epoch": 1.9697726012201886, + "grad_norm": 0.20361442863941193, + "learning_rate": 1.4753538247472277e-08, + "loss": 0.4937, + "step": 7103 + }, + { + "epoch": 1.9700499168053245, + "grad_norm": 0.2115209698677063, + "learning_rate": 1.448656298761869e-08, + "loss": 0.4856, + "step": 7104 + }, + { + "epoch": 1.9703272323904604, + "grad_norm": 0.20742273330688477, + "learning_rate": 1.42220239822341e-08, + "loss": 0.4659, + "step": 7105 + }, + { + "epoch": 1.9706045479755963, + "grad_norm": 0.20554955303668976, + "learning_rate": 1.3959921282938327e-08, + "loss": 0.5062, + "step": 7106 + }, + { + "epoch": 1.9708818635607321, + "grad_norm": 0.20452530682086945, + "learning_rate": 1.3700254940872404e-08, + "loss": 0.4663, + "step": 7107 + }, + { + "epoch": 1.971159179145868, + "grad_norm": 0.2129167914390564, + "learning_rate": 1.3443025006705523e-08, + "loss": 0.5172, + "step": 7108 + }, + { + "epoch": 1.971436494731004, + "grad_norm": 0.21334435045719147, + "learning_rate": 1.3188231530628092e-08, + "loss": 0.513, + "step": 7109 + }, + { + "epoch": 1.9717138103161398, + "grad_norm": 0.20458266139030457, + "learning_rate": 1.2935874562360062e-08, + "loss": 0.4769, + "step": 7110 + }, + { + "epoch": 1.9719911259012757, + "grad_norm": 0.20730172097682953, + "learning_rate": 1.2685954151141211e-08, + "loss": 0.5029, + "step": 7111 + }, + { + "epoch": 1.9722684414864116, + "grad_norm": 0.21345804631710052, + "learning_rate": 1.2438470345738085e-08, + "loss": 0.4918, + "step": 7112 + }, + { + "epoch": 1.9725457570715474, + "grad_norm": 0.22697855532169342, + "learning_rate": 1.2193423194439835e-08, + "loss": 0.5038, + "step": 7113 + }, + { + "epoch": 1.9728230726566833, + "grad_norm": 0.20521080493927002, + "learning_rate": 1.1950812745063766e-08, + "loss": 0.4882, + "step": 7114 + }, + { + "epoch": 1.9731003882418192, + "grad_norm": 0.20702117681503296, + "learning_rate": 1.1710639044948401e-08, + "loss": 0.4954, + "step": 7115 + }, + { + "epoch": 1.973377703826955, + "grad_norm": 0.21297289431095123, + "learning_rate": 1.1472902140959029e-08, + "loss": 0.5015, + "step": 7116 + }, + { + "epoch": 1.973655019412091, + "grad_norm": 0.20684416592121124, + "learning_rate": 1.1237602079483545e-08, + "loss": 0.4632, + "step": 7117 + }, + { + "epoch": 1.9739323349972269, + "grad_norm": 0.20604479312896729, + "learning_rate": 1.100473890643522e-08, + "loss": 0.495, + "step": 7118 + }, + { + "epoch": 1.9742096505823628, + "grad_norm": 0.2087058126926422, + "learning_rate": 1.0774312667251319e-08, + "loss": 0.4967, + "step": 7119 + }, + { + "epoch": 1.9744869661674986, + "grad_norm": 0.23602424561977386, + "learning_rate": 1.0546323406895875e-08, + "loss": 0.4913, + "step": 7120 + }, + { + "epoch": 1.9747642817526345, + "grad_norm": 0.20307756960391998, + "learning_rate": 1.0320771169854137e-08, + "loss": 0.4753, + "step": 7121 + }, + { + "epoch": 1.9750415973377704, + "grad_norm": 0.20893272757530212, + "learning_rate": 1.0097656000136735e-08, + "loss": 0.4845, + "step": 7122 + }, + { + "epoch": 1.9753189129229063, + "grad_norm": 0.2109754979610443, + "learning_rate": 9.876977941282451e-09, + "loss": 0.4906, + "step": 7123 + }, + { + "epoch": 1.9755962285080422, + "grad_norm": 0.20519696176052094, + "learning_rate": 9.658737036347121e-09, + "loss": 0.496, + "step": 7124 + }, + { + "epoch": 1.975873544093178, + "grad_norm": 0.21880246698856354, + "learning_rate": 9.442933327918902e-09, + "loss": 0.517, + "step": 7125 + }, + { + "epoch": 1.976150859678314, + "grad_norm": 0.21169376373291016, + "learning_rate": 9.229566858105777e-09, + "loss": 0.4915, + "step": 7126 + }, + { + "epoch": 1.9764281752634498, + "grad_norm": 0.20725256204605103, + "learning_rate": 9.01863766853972e-09, + "loss": 0.4964, + "step": 7127 + }, + { + "epoch": 1.9767054908485857, + "grad_norm": 0.22511859238147736, + "learning_rate": 8.810145800379477e-09, + "loss": 0.5046, + "step": 7128 + }, + { + "epoch": 1.9769828064337216, + "grad_norm": 0.22022607922554016, + "learning_rate": 8.60409129430917e-09, + "loss": 0.4962, + "step": 7129 + }, + { + "epoch": 1.9772601220188575, + "grad_norm": 0.21660543978214264, + "learning_rate": 8.400474190532747e-09, + "loss": 0.524, + "step": 7130 + }, + { + "epoch": 1.9775374376039934, + "grad_norm": 0.21604527533054352, + "learning_rate": 8.199294528783707e-09, + "loss": 0.499, + "step": 7131 + }, + { + "epoch": 1.9778147531891292, + "grad_norm": 0.20662792026996613, + "learning_rate": 8.000552348315371e-09, + "loss": 0.5365, + "step": 7132 + }, + { + "epoch": 1.9780920687742651, + "grad_norm": 0.20979103446006775, + "learning_rate": 7.804247687909216e-09, + "loss": 0.4771, + "step": 7133 + }, + { + "epoch": 1.978369384359401, + "grad_norm": 0.21938779950141907, + "learning_rate": 7.610380585867937e-09, + "loss": 0.498, + "step": 7134 + }, + { + "epoch": 1.978646699944537, + "grad_norm": 0.20905134081840515, + "learning_rate": 7.418951080020997e-09, + "loss": 0.5047, + "step": 7135 + }, + { + "epoch": 1.9789240155296728, + "grad_norm": 0.2120780348777771, + "learning_rate": 7.229959207721848e-09, + "loss": 0.492, + "step": 7136 + }, + { + "epoch": 1.9792013311148087, + "grad_norm": 0.21072250604629517, + "learning_rate": 7.043405005847936e-09, + "loss": 0.503, + "step": 7137 + }, + { + "epoch": 1.9794786466999446, + "grad_norm": 0.2067342847585678, + "learning_rate": 6.859288510799311e-09, + "loss": 0.5092, + "step": 7138 + }, + { + "epoch": 1.9797559622850804, + "grad_norm": 0.21522685885429382, + "learning_rate": 6.67760975850279e-09, + "loss": 0.5161, + "step": 7139 + }, + { + "epoch": 1.9800332778702163, + "grad_norm": 0.20429226756095886, + "learning_rate": 6.498368784409181e-09, + "loss": 0.5381, + "step": 7140 + }, + { + "epoch": 1.9803105934553522, + "grad_norm": 0.2123131901025772, + "learning_rate": 6.321565623494674e-09, + "loss": 0.5146, + "step": 7141 + }, + { + "epoch": 1.980587909040488, + "grad_norm": 0.21125838160514832, + "learning_rate": 6.147200310253898e-09, + "loss": 0.4832, + "step": 7142 + }, + { + "epoch": 1.980865224625624, + "grad_norm": 0.20799988508224487, + "learning_rate": 5.9752728787138005e-09, + "loss": 0.4841, + "step": 7143 + }, + { + "epoch": 1.9811425402107599, + "grad_norm": 0.22431087493896484, + "learning_rate": 5.805783362421158e-09, + "loss": 0.5086, + "step": 7144 + }, + { + "epoch": 1.9814198557958957, + "grad_norm": 0.2249828279018402, + "learning_rate": 5.6387317944481265e-09, + "loss": 0.5089, + "step": 7145 + }, + { + "epoch": 1.9816971713810316, + "grad_norm": 0.22779472172260284, + "learning_rate": 5.474118207389467e-09, + "loss": 0.5253, + "step": 7146 + }, + { + "epoch": 1.9819744869661675, + "grad_norm": 0.21490508317947388, + "learning_rate": 5.311942633366706e-09, + "loss": 0.4918, + "step": 7147 + }, + { + "epoch": 1.9822518025513034, + "grad_norm": 0.21175016462802887, + "learning_rate": 5.152205104023977e-09, + "loss": 0.4858, + "step": 7148 + }, + { + "epoch": 1.9825291181364393, + "grad_norm": 0.22119320929050446, + "learning_rate": 4.99490565053079e-09, + "loss": 0.4984, + "step": 7149 + }, + { + "epoch": 1.9828064337215752, + "grad_norm": 0.22225895524024963, + "learning_rate": 4.840044303582036e-09, + "loss": 0.5394, + "step": 7150 + }, + { + "epoch": 1.983083749306711, + "grad_norm": 0.20062761008739471, + "learning_rate": 4.687621093392436e-09, + "loss": 0.4905, + "step": 7151 + }, + { + "epoch": 1.983361064891847, + "grad_norm": 0.20917369425296783, + "learning_rate": 4.537636049704863e-09, + "loss": 0.5128, + "step": 7152 + }, + { + "epoch": 1.9836383804769828, + "grad_norm": 0.21111933887004852, + "learning_rate": 4.390089201786185e-09, + "loss": 0.4886, + "step": 7153 + }, + { + "epoch": 1.9839156960621187, + "grad_norm": 0.2019733041524887, + "learning_rate": 4.244980578424485e-09, + "loss": 0.4908, + "step": 7154 + }, + { + "epoch": 1.9841930116472546, + "grad_norm": 0.21139049530029297, + "learning_rate": 4.1023102079373875e-09, + "loss": 0.5261, + "step": 7155 + }, + { + "epoch": 1.9844703272323905, + "grad_norm": 0.2183750867843628, + "learning_rate": 3.962078118162349e-09, + "loss": 0.5286, + "step": 7156 + }, + { + "epoch": 1.9847476428175264, + "grad_norm": 0.21051201224327087, + "learning_rate": 3.824284336460815e-09, + "loss": 0.4948, + "step": 7157 + }, + { + "epoch": 1.9850249584026622, + "grad_norm": 0.21117429435253143, + "learning_rate": 3.6889288897223872e-09, + "loss": 0.4867, + "step": 7158 + }, + { + "epoch": 1.9853022739877981, + "grad_norm": 0.20643287897109985, + "learning_rate": 3.556011804356496e-09, + "loss": 0.494, + "step": 7159 + }, + { + "epoch": 1.985579589572934, + "grad_norm": 0.21045584976673126, + "learning_rate": 3.425533106300727e-09, + "loss": 0.5114, + "step": 7160 + }, + { + "epoch": 1.98585690515807, + "grad_norm": 0.20693178474903107, + "learning_rate": 3.297492821013881e-09, + "loss": 0.5092, + "step": 7161 + }, + { + "epoch": 1.9861342207432058, + "grad_norm": 0.19891990721225739, + "learning_rate": 3.1718909734787526e-09, + "loss": 0.4961, + "step": 7162 + }, + { + "epoch": 1.9864115363283417, + "grad_norm": 0.21904566884040833, + "learning_rate": 3.0487275882062906e-09, + "loss": 0.4908, + "step": 7163 + }, + { + "epoch": 1.9866888519134775, + "grad_norm": 0.21359799802303314, + "learning_rate": 2.9280026892272715e-09, + "loss": 0.4848, + "step": 7164 + }, + { + "epoch": 1.9869661674986134, + "grad_norm": 0.21357576549053192, + "learning_rate": 2.809716300097853e-09, + "loss": 0.4776, + "step": 7165 + }, + { + "epoch": 1.9872434830837493, + "grad_norm": 0.2025846391916275, + "learning_rate": 2.6938684439009598e-09, + "loss": 0.477, + "step": 7166 + }, + { + "epoch": 1.9875207986688852, + "grad_norm": 0.20797260105609894, + "learning_rate": 2.5804591432393442e-09, + "loss": 0.4981, + "step": 7167 + }, + { + "epoch": 1.987798114254021, + "grad_norm": 0.21296679973602295, + "learning_rate": 2.469488420242527e-09, + "loss": 0.4845, + "step": 7168 + }, + { + "epoch": 1.988075429839157, + "grad_norm": 0.20188210904598236, + "learning_rate": 2.3609562965654085e-09, + "loss": 0.486, + "step": 7169 + }, + { + "epoch": 1.9883527454242929, + "grad_norm": 0.2065025269985199, + "learning_rate": 2.2548627933841047e-09, + "loss": 0.4778, + "step": 7170 + }, + { + "epoch": 1.9886300610094287, + "grad_norm": 0.21647189557552338, + "learning_rate": 2.151207931400112e-09, + "loss": 0.4972, + "step": 7171 + }, + { + "epoch": 1.9889073765945646, + "grad_norm": 0.21635204553604126, + "learning_rate": 2.0499917308403062e-09, + "loss": 0.5197, + "step": 7172 + }, + { + "epoch": 1.9891846921797005, + "grad_norm": 0.20818372070789337, + "learning_rate": 1.951214211452779e-09, + "loss": 0.4838, + "step": 7173 + }, + { + "epoch": 1.9894620077648364, + "grad_norm": 0.1977744996547699, + "learning_rate": 1.8548753925137773e-09, + "loss": 0.4944, + "step": 7174 + }, + { + "epoch": 1.9897393233499723, + "grad_norm": 0.38426437973976135, + "learning_rate": 1.7609752928207657e-09, + "loss": 0.5029, + "step": 7175 + }, + { + "epoch": 1.9900166389351082, + "grad_norm": 0.22992363572120667, + "learning_rate": 1.6695139306965869e-09, + "loss": 0.5092, + "step": 7176 + }, + { + "epoch": 1.990293954520244, + "grad_norm": 0.20251336693763733, + "learning_rate": 1.580491323986688e-09, + "loss": 0.4982, + "step": 7177 + }, + { + "epoch": 1.99057127010538, + "grad_norm": 0.21631348133087158, + "learning_rate": 1.4939074900618965e-09, + "loss": 0.5016, + "step": 7178 + }, + { + "epoch": 1.9908485856905158, + "grad_norm": 0.21560998260974884, + "learning_rate": 1.4097624458184188e-09, + "loss": 0.5206, + "step": 7179 + }, + { + "epoch": 1.9911259012756517, + "grad_norm": 0.2111794650554657, + "learning_rate": 1.328056207673678e-09, + "loss": 0.5029, + "step": 7180 + }, + { + "epoch": 1.9914032168607876, + "grad_norm": 0.21028463542461395, + "learning_rate": 1.2487887915704766e-09, + "loss": 0.5018, + "step": 7181 + }, + { + "epoch": 1.9916805324459235, + "grad_norm": 0.2057926207780838, + "learning_rate": 1.1719602129769968e-09, + "loss": 0.5104, + "step": 7182 + }, + { + "epoch": 1.9919578480310594, + "grad_norm": 0.2165459394454956, + "learning_rate": 1.097570486885413e-09, + "loss": 0.5189, + "step": 7183 + }, + { + "epoch": 1.9922351636161952, + "grad_norm": 0.21551057696342468, + "learning_rate": 1.0256196278091156e-09, + "loss": 0.4608, + "step": 7184 + }, + { + "epoch": 1.9925124792013311, + "grad_norm": 0.22477814555168152, + "learning_rate": 9.561076497882626e-10, + "loss": 0.4854, + "step": 7185 + }, + { + "epoch": 1.992789794786467, + "grad_norm": 0.21325767040252686, + "learning_rate": 8.890345663870037e-10, + "loss": 0.5055, + "step": 7186 + }, + { + "epoch": 1.993067110371603, + "grad_norm": 0.20826996862888336, + "learning_rate": 8.244003906934806e-10, + "loss": 0.5067, + "step": 7187 + }, + { + "epoch": 1.9933444259567388, + "grad_norm": 0.2099534571170807, + "learning_rate": 7.622051353184389e-10, + "loss": 0.5104, + "step": 7188 + }, + { + "epoch": 1.9936217415418747, + "grad_norm": 0.21604932844638824, + "learning_rate": 7.024488123980044e-10, + "loss": 0.4922, + "step": 7189 + }, + { + "epoch": 1.9938990571270105, + "grad_norm": 0.21847958862781525, + "learning_rate": 6.451314335922942e-10, + "loss": 0.4976, + "step": 7190 + }, + { + "epoch": 1.9941763727121464, + "grad_norm": 0.21057890355587006, + "learning_rate": 5.902530100854175e-10, + "loss": 0.5255, + "step": 7191 + }, + { + "epoch": 1.9944536882972823, + "grad_norm": 0.20802158117294312, + "learning_rate": 5.378135525868633e-10, + "loss": 0.5211, + "step": 7192 + }, + { + "epoch": 1.9947310038824182, + "grad_norm": 0.2087002694606781, + "learning_rate": 4.878130713273365e-10, + "loss": 0.5145, + "step": 7193 + }, + { + "epoch": 1.995008319467554, + "grad_norm": 0.211053267121315, + "learning_rate": 4.402515760629222e-10, + "loss": 0.5037, + "step": 7194 + }, + { + "epoch": 1.99528563505269, + "grad_norm": 0.20845288038253784, + "learning_rate": 3.9512907607647256e-10, + "loss": 0.4992, + "step": 7195 + }, + { + "epoch": 1.9955629506378258, + "grad_norm": 0.2077481597661972, + "learning_rate": 3.524455801706683e-10, + "loss": 0.4787, + "step": 7196 + }, + { + "epoch": 1.9958402662229617, + "grad_norm": 0.21687790751457214, + "learning_rate": 3.1220109667357e-10, + "loss": 0.488, + "step": 7197 + }, + { + "epoch": 1.9961175818080976, + "grad_norm": 0.20425012707710266, + "learning_rate": 2.743956334400055e-10, + "loss": 0.5079, + "step": 7198 + }, + { + "epoch": 1.9963948973932335, + "grad_norm": 0.19561026990413666, + "learning_rate": 2.3902919784601905e-10, + "loss": 0.4914, + "step": 7199 + }, + { + "epoch": 1.9966722129783694, + "grad_norm": 0.20784446597099304, + "learning_rate": 2.0610179679164676e-10, + "loss": 0.4945, + "step": 7200 + }, + { + "epoch": 1.9969495285635053, + "grad_norm": 0.19664216041564941, + "learning_rate": 1.7561343670230436e-10, + "loss": 0.4889, + "step": 7201 + }, + { + "epoch": 1.9972268441486412, + "grad_norm": 0.20693275332450867, + "learning_rate": 1.4756412352878724e-10, + "loss": 0.5138, + "step": 7202 + }, + { + "epoch": 1.997504159733777, + "grad_norm": 0.20792469382286072, + "learning_rate": 1.2195386274171938e-10, + "loss": 0.5086, + "step": 7203 + }, + { + "epoch": 1.997781475318913, + "grad_norm": 0.2034110128879547, + "learning_rate": 9.878265933987995e-11, + "loss": 0.4793, + "step": 7204 + }, + { + "epoch": 1.9980587909040488, + "grad_norm": 0.2156227082014084, + "learning_rate": 7.805051784326445e-11, + "loss": 0.5048, + "step": 7205 + }, + { + "epoch": 1.9983361064891847, + "grad_norm": 0.20901308953762054, + "learning_rate": 5.975744230002356e-11, + "loss": 0.5019, + "step": 7206 + }, + { + "epoch": 1.9986134220743206, + "grad_norm": 0.20007333159446716, + "learning_rate": 4.3903436276748756e-11, + "loss": 0.4837, + "step": 7207 + }, + { + "epoch": 1.9988907376594565, + "grad_norm": 0.21206532418727875, + "learning_rate": 3.048850286679894e-11, + "loss": 0.4823, + "step": 7208 + }, + { + "epoch": 1.9991680532445923, + "grad_norm": 0.2093973159790039, + "learning_rate": 1.9512644690300452e-11, + "loss": 0.4759, + "step": 7209 + }, + { + "epoch": 1.9994453688297282, + "grad_norm": 0.2100788652896881, + "learning_rate": 1.0975863887208171e-11, + "loss": 0.5038, + "step": 7210 + }, + { + "epoch": 1.9997226844148641, + "grad_norm": 0.2026882767677307, + "learning_rate": 4.878162124244412e-12, + "loss": 0.4975, + "step": 7211 + }, + { + "epoch": 2.0, + "grad_norm": 0.23644179105758667, + "learning_rate": 1.2195405907355905e-12, + "loss": 0.5014, + "step": 7212 + }, + { + "epoch": 2.0, + "eval_loss": 0.82587730884552, + "eval_runtime": 439.0989, + "eval_samples_per_second": 93.403, + "eval_steps_per_second": 1.46, + "step": 7212 + } + ], + "logging_steps": 1, + "max_steps": 7212, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.531044471940139e+20, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}