diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,137685 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 19665, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 8.557192575617696, + "learning_rate": 3.3898305084745764e-08, + "loss": 0.8805, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 9.06872712778614, + "learning_rate": 6.779661016949153e-08, + "loss": 0.8498, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 8.14569355426566, + "learning_rate": 1.0169491525423729e-07, + "loss": 0.8802, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.2812101013113, + "learning_rate": 1.3559322033898305e-07, + "loss": 0.8447, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 8.22501739492148, + "learning_rate": 1.6949152542372883e-07, + "loss": 0.8661, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 9.107731386918358, + "learning_rate": 2.0338983050847458e-07, + "loss": 0.9148, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 8.537106942847725, + "learning_rate": 2.3728813559322036e-07, + "loss": 0.7615, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 9.780092872367305, + "learning_rate": 2.711864406779661e-07, + "loss": 0.8196, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 8.790315313064749, + "learning_rate": 3.050847457627119e-07, + "loss": 0.859, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 8.841389331094158, + "learning_rate": 3.3898305084745766e-07, + "loss": 0.8597, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 9.054485774470677, + "learning_rate": 3.7288135593220347e-07, + "loss": 0.8975, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 9.348866446220944, + "learning_rate": 4.0677966101694916e-07, + "loss": 0.8716, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 8.375820124871165, + "learning_rate": 4.4067796610169497e-07, + "loss": 0.8262, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 8.489161661750288, + "learning_rate": 4.745762711864407e-07, + "loss": 0.8317, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 7.647864966680701, + "learning_rate": 5.084745762711865e-07, + "loss": 0.8068, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 11.25567460340579, + "learning_rate": 5.423728813559322e-07, + "loss": 0.8784, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 6.243574981713614, + "learning_rate": 5.76271186440678e-07, + "loss": 0.7368, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 6.459065444090269, + "learning_rate": 6.101694915254238e-07, + "loss": 0.7662, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 7.014917635106802, + "learning_rate": 6.440677966101695e-07, + "loss": 0.7592, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 6.860940141421972, + "learning_rate": 6.779661016949153e-07, + "loss": 0.731, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 6.185099928998005, + "learning_rate": 7.118644067796611e-07, + "loss": 0.6707, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 5.23904808480235, + "learning_rate": 7.457627118644069e-07, + "loss": 0.7006, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 5.327549403384544, + "learning_rate": 7.796610169491527e-07, + "loss": 0.6307, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 4.097537794801515, + "learning_rate": 8.135593220338983e-07, + "loss": 0.6203, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 4.377204730201115, + "learning_rate": 8.474576271186441e-07, + "loss": 0.5821, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 4.358088694702184, + "learning_rate": 8.813559322033899e-07, + "loss": 0.602, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 4.197286363634574, + "learning_rate": 9.152542372881357e-07, + "loss": 0.6424, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 3.799809603049265, + "learning_rate": 9.491525423728814e-07, + "loss": 0.5751, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 3.383671404077906, + "learning_rate": 9.830508474576272e-07, + "loss": 0.5296, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 4.216187266889182, + "learning_rate": 1.016949152542373e-06, + "loss": 0.536, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.3106453196134393, + "learning_rate": 1.0508474576271187e-06, + "loss": 0.5286, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 2.618350813260054, + "learning_rate": 1.0847457627118644e-06, + "loss": 0.5057, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 2.5217817090570738, + "learning_rate": 1.1186440677966102e-06, + "loss": 0.4745, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.3533426940286937, + "learning_rate": 1.152542372881356e-06, + "loss": 0.4211, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 2.61696040904103, + "learning_rate": 1.186440677966102e-06, + "loss": 0.4285, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.456578510501274, + "learning_rate": 1.2203389830508477e-06, + "loss": 0.4165, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.6810354384934776, + "learning_rate": 1.2542372881355932e-06, + "loss": 0.4464, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.387681024180993, + "learning_rate": 1.288135593220339e-06, + "loss": 0.4644, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 2.4467785580460735, + "learning_rate": 1.322033898305085e-06, + "loss": 0.4539, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.4529315649495667, + "learning_rate": 1.3559322033898307e-06, + "loss": 0.4166, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 3.212819968943549, + "learning_rate": 1.3898305084745764e-06, + "loss": 0.3938, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 1.863910204540751, + "learning_rate": 1.4237288135593222e-06, + "loss": 0.3991, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 1.6764048559205706, + "learning_rate": 1.457627118644068e-06, + "loss": 0.372, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 1.799219722876445, + "learning_rate": 1.4915254237288139e-06, + "loss": 0.3803, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 1.9642819860180136, + "learning_rate": 1.5254237288135596e-06, + "loss": 0.39, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 2.102560995531875, + "learning_rate": 1.5593220338983054e-06, + "loss": 0.399, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.2717682014897997, + "learning_rate": 1.593220338983051e-06, + "loss": 0.3334, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 1.8451916643138015, + "learning_rate": 1.6271186440677967e-06, + "loss": 0.3806, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 2.791231995618392, + "learning_rate": 1.6610169491525424e-06, + "loss": 0.3786, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 1.8661867054992236, + "learning_rate": 1.6949152542372882e-06, + "loss": 0.3763, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1.9794021888687487, + "learning_rate": 1.728813559322034e-06, + "loss": 0.3371, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 2.224468293642754, + "learning_rate": 1.7627118644067799e-06, + "loss": 0.3475, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 2.31298307443876, + "learning_rate": 1.7966101694915256e-06, + "loss": 0.3692, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 1.8429593134990803, + "learning_rate": 1.8305084745762714e-06, + "loss": 0.336, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 1.8510671554362415, + "learning_rate": 1.8644067796610171e-06, + "loss": 0.3369, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 2.0884605066914403, + "learning_rate": 1.8983050847457629e-06, + "loss": 0.3531, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 1.7766166145814422, + "learning_rate": 1.932203389830509e-06, + "loss": 0.3224, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 2.00091805426027, + "learning_rate": 1.9661016949152544e-06, + "loss": 0.3207, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 1.671445434483612, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.3473, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 1.8089600378647877, + "learning_rate": 2.033898305084746e-06, + "loss": 0.3158, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 1.857053304370097, + "learning_rate": 2.0677966101694914e-06, + "loss": 0.3353, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 1.8869217170364412, + "learning_rate": 2.1016949152542374e-06, + "loss": 0.2722, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 1.6325695751917972, + "learning_rate": 2.1355932203389833e-06, + "loss": 0.3661, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 1.6938194400308952, + "learning_rate": 2.169491525423729e-06, + "loss": 0.3112, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 3.0006172337912416, + "learning_rate": 2.203389830508475e-06, + "loss": 0.3639, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 2.0199725555296335, + "learning_rate": 2.2372881355932204e-06, + "loss": 0.3086, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 1.8730176329500001, + "learning_rate": 2.2711864406779663e-06, + "loss": 0.323, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 1.7440535132509414, + "learning_rate": 2.305084745762712e-06, + "loss": 0.343, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 2.112486078314485, + "learning_rate": 2.338983050847458e-06, + "loss": 0.3391, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 1.7751924675393387, + "learning_rate": 2.372881355932204e-06, + "loss": 0.3234, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 1.964742351336944, + "learning_rate": 2.4067796610169493e-06, + "loss": 0.2933, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 1.3092146299206104, + "learning_rate": 2.4406779661016953e-06, + "loss": 0.2857, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 1.7904622412420848, + "learning_rate": 2.474576271186441e-06, + "loss": 0.3443, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 1.7863495982308861, + "learning_rate": 2.5084745762711864e-06, + "loss": 0.3031, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 1.5671252584011932, + "learning_rate": 2.5423728813559323e-06, + "loss": 0.2966, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 1.656662139777409, + "learning_rate": 2.576271186440678e-06, + "loss": 0.3134, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 2.041825731687294, + "learning_rate": 2.610169491525424e-06, + "loss": 0.3457, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 1.516375034415401, + "learning_rate": 2.64406779661017e-06, + "loss": 0.2944, + "step": 78 + }, + { + "epoch": 0.0, + "grad_norm": 1.9869017421478328, + "learning_rate": 2.6779661016949153e-06, + "loss": 0.2837, + "step": 79 + }, + { + "epoch": 0.0, + "grad_norm": 1.6175841066321706, + "learning_rate": 2.7118644067796613e-06, + "loss": 0.2864, + "step": 80 + }, + { + "epoch": 0.0, + "grad_norm": 1.7170519798903372, + "learning_rate": 2.745762711864407e-06, + "loss": 0.3284, + "step": 81 + }, + { + "epoch": 0.0, + "grad_norm": 1.7972083881181034, + "learning_rate": 2.779661016949153e-06, + "loss": 0.299, + "step": 82 + }, + { + "epoch": 0.0, + "grad_norm": 2.8307414003842113, + "learning_rate": 2.8135593220338988e-06, + "loss": 0.3023, + "step": 83 + }, + { + "epoch": 0.0, + "grad_norm": 1.613381220629313, + "learning_rate": 2.8474576271186443e-06, + "loss": 0.3124, + "step": 84 + }, + { + "epoch": 0.0, + "grad_norm": 1.7950742462204192, + "learning_rate": 2.8813559322033903e-06, + "loss": 0.3006, + "step": 85 + }, + { + "epoch": 0.0, + "grad_norm": 2.221539092274775, + "learning_rate": 2.915254237288136e-06, + "loss": 0.3048, + "step": 86 + }, + { + "epoch": 0.0, + "grad_norm": 2.259845817762709, + "learning_rate": 2.9491525423728818e-06, + "loss": 0.3044, + "step": 87 + }, + { + "epoch": 0.0, + "grad_norm": 1.541938397353296, + "learning_rate": 2.9830508474576277e-06, + "loss": 0.3434, + "step": 88 + }, + { + "epoch": 0.0, + "grad_norm": 2.1376130907622923, + "learning_rate": 3.0169491525423733e-06, + "loss": 0.3282, + "step": 89 + }, + { + "epoch": 0.0, + "grad_norm": 2.0565278408576604, + "learning_rate": 3.0508474576271192e-06, + "loss": 0.2927, + "step": 90 + }, + { + "epoch": 0.0, + "grad_norm": 1.926909173349644, + "learning_rate": 3.0847457627118648e-06, + "loss": 0.3109, + "step": 91 + }, + { + "epoch": 0.0, + "grad_norm": 2.6185375769384565, + "learning_rate": 3.1186440677966107e-06, + "loss": 0.3094, + "step": 92 + }, + { + "epoch": 0.0, + "grad_norm": 1.8249433332855955, + "learning_rate": 3.1525423728813563e-06, + "loss": 0.3225, + "step": 93 + }, + { + "epoch": 0.0, + "grad_norm": 1.7226923086848356, + "learning_rate": 3.186440677966102e-06, + "loss": 0.293, + "step": 94 + }, + { + "epoch": 0.0, + "grad_norm": 1.8579049970780483, + "learning_rate": 3.2203389830508473e-06, + "loss": 0.297, + "step": 95 + }, + { + "epoch": 0.0, + "grad_norm": 1.8217865526593613, + "learning_rate": 3.2542372881355933e-06, + "loss": 0.299, + "step": 96 + }, + { + "epoch": 0.0, + "grad_norm": 2.9857064112941214, + "learning_rate": 3.288135593220339e-06, + "loss": 0.2983, + "step": 97 + }, + { + "epoch": 0.0, + "grad_norm": 2.0821655536527746, + "learning_rate": 3.322033898305085e-06, + "loss": 0.3261, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 3.1380580706567773, + "learning_rate": 3.3559322033898308e-06, + "loss": 0.2775, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 2.0160814105257923, + "learning_rate": 3.3898305084745763e-06, + "loss": 0.3317, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.5909755454757093, + "learning_rate": 3.4237288135593223e-06, + "loss": 0.303, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 12.191902880650215, + "learning_rate": 3.457627118644068e-06, + "loss": 0.3183, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.9238805550148785, + "learning_rate": 3.4915254237288138e-06, + "loss": 0.3072, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 2.2333502172845163, + "learning_rate": 3.5254237288135597e-06, + "loss": 0.3586, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 8.589718562654825, + "learning_rate": 3.5593220338983053e-06, + "loss": 0.3147, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.6143252679477402, + "learning_rate": 3.5932203389830512e-06, + "loss": 0.2904, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 2.2836235321900076, + "learning_rate": 3.6271186440677968e-06, + "loss": 0.3066, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.6830983780860997, + "learning_rate": 3.6610169491525427e-06, + "loss": 0.3336, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 1.7804936986415487, + "learning_rate": 3.6949152542372883e-06, + "loss": 0.3005, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 3.065589112282441, + "learning_rate": 3.7288135593220342e-06, + "loss": 0.2842, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.350341154005084, + "learning_rate": 3.76271186440678e-06, + "loss": 0.3179, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.4814230337002219, + "learning_rate": 3.7966101694915257e-06, + "loss": 0.2744, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.733113410194878, + "learning_rate": 3.830508474576271e-06, + "loss": 0.3027, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.5967154788237727, + "learning_rate": 3.864406779661018e-06, + "loss": 0.3258, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 3.3370723281642563, + "learning_rate": 3.898305084745763e-06, + "loss": 0.3114, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.6445205745938416, + "learning_rate": 3.932203389830509e-06, + "loss": 0.3116, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 2.0862236046831435, + "learning_rate": 3.966101694915255e-06, + "loss": 0.2998, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 1.5820347545216342, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3014, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 1.9780553225075865, + "learning_rate": 4.033898305084746e-06, + "loss": 0.321, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.0824896309123964, + "learning_rate": 4.067796610169492e-06, + "loss": 0.2436, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 2.235099816002165, + "learning_rate": 4.101694915254237e-06, + "loss": 0.2895, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.9274858752777464, + "learning_rate": 4.135593220338983e-06, + "loss": 0.2712, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 2.0017257398500234, + "learning_rate": 4.169491525423729e-06, + "loss": 0.2674, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 2.3276432711309596, + "learning_rate": 4.203389830508475e-06, + "loss": 0.3218, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 2.1175621752154283, + "learning_rate": 4.23728813559322e-06, + "loss": 0.3057, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.8693993867856755, + "learning_rate": 4.271186440677967e-06, + "loss": 0.2679, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.6694157796882232, + "learning_rate": 4.305084745762712e-06, + "loss": 0.2739, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.249653153315445, + "learning_rate": 4.338983050847458e-06, + "loss": 0.2827, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 2.6424596204092072, + "learning_rate": 4.372881355932203e-06, + "loss": 0.2826, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.7334690566735274, + "learning_rate": 4.40677966101695e-06, + "loss": 0.2932, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 1.9839422437397973, + "learning_rate": 4.440677966101695e-06, + "loss": 0.3081, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.953781353665557, + "learning_rate": 4.474576271186441e-06, + "loss": 0.3027, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.8911646090025422, + "learning_rate": 4.508474576271187e-06, + "loss": 0.3037, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.6962571768979062, + "learning_rate": 4.542372881355933e-06, + "loss": 0.2561, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 1.634664008844036, + "learning_rate": 4.576271186440678e-06, + "loss": 0.2879, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 2.777790158029909, + "learning_rate": 4.610169491525424e-06, + "loss": 0.2721, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.7148761472284506, + "learning_rate": 4.64406779661017e-06, + "loss": 0.2717, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.7208081889064668, + "learning_rate": 4.677966101694916e-06, + "loss": 0.2838, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 1.89025248616027, + "learning_rate": 4.711864406779661e-06, + "loss": 0.3075, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 1.7365633256432669, + "learning_rate": 4.745762711864408e-06, + "loss": 0.2909, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.6009133709225887, + "learning_rate": 4.779661016949153e-06, + "loss": 0.2538, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 2.0332412939427944, + "learning_rate": 4.813559322033899e-06, + "loss": 0.3089, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.775911485217657, + "learning_rate": 4.847457627118645e-06, + "loss": 0.3002, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 1.4574169659822291, + "learning_rate": 4.881355932203391e-06, + "loss": 0.2843, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.7122297188914908, + "learning_rate": 4.915254237288136e-06, + "loss": 0.2923, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 1.8661430535697405, + "learning_rate": 4.949152542372882e-06, + "loss": 0.2933, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 1.5280893595273282, + "learning_rate": 4.983050847457628e-06, + "loss": 0.3097, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.5414815172107994, + "learning_rate": 5.016949152542373e-06, + "loss": 0.2894, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.6979336511591085, + "learning_rate": 5.050847457627119e-06, + "loss": 0.3058, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.399367962381608, + "learning_rate": 5.084745762711865e-06, + "loss": 0.2826, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 1.666309227933981, + "learning_rate": 5.118644067796611e-06, + "loss": 0.2729, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 1.479961432900143, + "learning_rate": 5.152542372881356e-06, + "loss": 0.2805, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 1.7384952021588505, + "learning_rate": 5.186440677966102e-06, + "loss": 0.2759, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.8596650431273607, + "learning_rate": 5.220338983050848e-06, + "loss": 0.2758, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.857375302476961, + "learning_rate": 5.254237288135594e-06, + "loss": 0.3048, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 1.7591182469694164, + "learning_rate": 5.28813559322034e-06, + "loss": 0.2762, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 2.2948961387152282, + "learning_rate": 5.322033898305086e-06, + "loss": 0.2911, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 2.261579664075019, + "learning_rate": 5.355932203389831e-06, + "loss": 0.2845, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 2.0906674064521806, + "learning_rate": 5.389830508474577e-06, + "loss": 0.2798, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 1.689900338950711, + "learning_rate": 5.423728813559323e-06, + "loss": 0.3181, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.5558610571402536, + "learning_rate": 5.457627118644067e-06, + "loss": 0.2835, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 3.231530175438594, + "learning_rate": 5.491525423728814e-06, + "loss": 0.3044, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 2.6429093181423005, + "learning_rate": 5.525423728813559e-06, + "loss": 0.3068, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 1.7844549696996412, + "learning_rate": 5.559322033898306e-06, + "loss": 0.2853, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 2.0082259106645943, + "learning_rate": 5.593220338983051e-06, + "loss": 0.3477, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 1.960115573884396, + "learning_rate": 5.6271186440677975e-06, + "loss": 0.2772, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 2.068249027081921, + "learning_rate": 5.661016949152542e-06, + "loss": 0.2967, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 1.7051358404672956, + "learning_rate": 5.694915254237289e-06, + "loss": 0.2871, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 1.709105980487174, + "learning_rate": 5.728813559322034e-06, + "loss": 0.2613, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 1.950311162930796, + "learning_rate": 5.7627118644067805e-06, + "loss": 0.2923, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 1.651513874408586, + "learning_rate": 5.796610169491525e-06, + "loss": 0.3087, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.710559983293689, + "learning_rate": 5.830508474576272e-06, + "loss": 0.2794, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.7771722763697293, + "learning_rate": 5.864406779661017e-06, + "loss": 0.2783, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 2.160139050072537, + "learning_rate": 5.8983050847457635e-06, + "loss": 0.2966, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 1.5002256496386404, + "learning_rate": 5.932203389830509e-06, + "loss": 0.2879, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 1.8875259490488177, + "learning_rate": 5.9661016949152555e-06, + "loss": 0.2937, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 1.7701374784962995, + "learning_rate": 6e-06, + "loss": 0.2667, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 1.7400487952240289, + "learning_rate": 6.0338983050847465e-06, + "loss": 0.317, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 1.6262926424592759, + "learning_rate": 6.067796610169492e-06, + "loss": 0.2988, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 1.7533113610007194, + "learning_rate": 6.1016949152542385e-06, + "loss": 0.2743, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 2.1487191473952074, + "learning_rate": 6.135593220338983e-06, + "loss": 0.2731, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 1.837373024422903, + "learning_rate": 6.1694915254237295e-06, + "loss": 0.2474, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 3.384431938898038, + "learning_rate": 6.203389830508475e-06, + "loss": 0.2928, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 1.5271044696646137, + "learning_rate": 6.2372881355932215e-06, + "loss": 0.2544, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 1.3710907064781839, + "learning_rate": 6.271186440677966e-06, + "loss": 0.2716, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 1.7978500758050704, + "learning_rate": 6.3050847457627125e-06, + "loss": 0.2629, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 1.266493908137502, + "learning_rate": 6.338983050847458e-06, + "loss": 0.2636, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 1.6865435211762627, + "learning_rate": 6.372881355932204e-06, + "loss": 0.2919, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 1.5177106818253951, + "learning_rate": 6.40677966101695e-06, + "loss": 0.2696, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 1.598312898809967, + "learning_rate": 6.440677966101695e-06, + "loss": 0.2941, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 1.7020330476074053, + "learning_rate": 6.474576271186441e-06, + "loss": 0.2566, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 1.6452188115531354, + "learning_rate": 6.508474576271187e-06, + "loss": 0.2627, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 1.5337855227059187, + "learning_rate": 6.542372881355933e-06, + "loss": 0.2749, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 1.5885009119467683, + "learning_rate": 6.576271186440678e-06, + "loss": 0.2871, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 1.9656764093443815, + "learning_rate": 6.610169491525424e-06, + "loss": 0.3027, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 2.0483602782105494, + "learning_rate": 6.64406779661017e-06, + "loss": 0.3092, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 1.8565023223461687, + "learning_rate": 6.677966101694916e-06, + "loss": 0.3084, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 1.7646513939378161, + "learning_rate": 6.7118644067796615e-06, + "loss": 0.2953, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 1.6957118175929442, + "learning_rate": 6.745762711864408e-06, + "loss": 0.2939, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 1.6271626182864425, + "learning_rate": 6.779661016949153e-06, + "loss": 0.2732, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 2.665492075397036, + "learning_rate": 6.813559322033899e-06, + "loss": 0.2643, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 1.7849521054857391, + "learning_rate": 6.8474576271186445e-06, + "loss": 0.2967, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 1.6961191232197494, + "learning_rate": 6.881355932203391e-06, + "loss": 0.3109, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 1.725277238030002, + "learning_rate": 6.915254237288136e-06, + "loss": 0.2882, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 1.7040601722570679, + "learning_rate": 6.949152542372882e-06, + "loss": 0.261, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 1.7790750902245551, + "learning_rate": 6.9830508474576275e-06, + "loss": 0.2928, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 1.8954542367183922, + "learning_rate": 7.016949152542374e-06, + "loss": 0.2961, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 2.0100427678525317, + "learning_rate": 7.0508474576271195e-06, + "loss": 0.271, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 1.635043641028692, + "learning_rate": 7.084745762711865e-06, + "loss": 0.3017, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 1.5721374879331982, + "learning_rate": 7.1186440677966106e-06, + "loss": 0.2586, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 1.7623509032189595, + "learning_rate": 7.152542372881357e-06, + "loss": 0.2852, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 1.8084559334984929, + "learning_rate": 7.1864406779661025e-06, + "loss": 0.2502, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 1.5700471733368953, + "learning_rate": 7.220338983050849e-06, + "loss": 0.2658, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 1.7884143221055777, + "learning_rate": 7.2542372881355936e-06, + "loss": 0.2813, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 1.8403028285709235, + "learning_rate": 7.288135593220339e-06, + "loss": 0.2509, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 1.6358262298861554, + "learning_rate": 7.3220338983050855e-06, + "loss": 0.2608, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 8.065627300198091, + "learning_rate": 7.355932203389831e-06, + "loss": 0.2775, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 1.5288052157129923, + "learning_rate": 7.3898305084745766e-06, + "loss": 0.2694, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 1.6187358313701743, + "learning_rate": 7.423728813559322e-06, + "loss": 0.2611, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 1.500589738935865, + "learning_rate": 7.4576271186440685e-06, + "loss": 0.2611, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 1.6846488257425314, + "learning_rate": 7.491525423728814e-06, + "loss": 0.2625, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 2.5149283412876415, + "learning_rate": 7.52542372881356e-06, + "loss": 0.2698, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 1.520702365266604, + "learning_rate": 7.559322033898305e-06, + "loss": 0.2756, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 1.6031162343212535, + "learning_rate": 7.5932203389830515e-06, + "loss": 0.2617, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 1.6952860116491963, + "learning_rate": 7.627118644067797e-06, + "loss": 0.2818, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 1.477255037322905, + "learning_rate": 7.661016949152543e-06, + "loss": 0.2496, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 1.6660271734854857, + "learning_rate": 7.694915254237289e-06, + "loss": 0.2768, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 1.9775731672940686, + "learning_rate": 7.728813559322035e-06, + "loss": 0.2535, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 1.7011300558433908, + "learning_rate": 7.76271186440678e-06, + "loss": 0.2851, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 1.646079153476468, + "learning_rate": 7.796610169491526e-06, + "loss": 0.3166, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 1.4393049865391425, + "learning_rate": 7.830508474576271e-06, + "loss": 0.3049, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 1.6556807137319012, + "learning_rate": 7.864406779661017e-06, + "loss": 0.2875, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 1.8731866145038525, + "learning_rate": 7.898305084745764e-06, + "loss": 0.2701, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 1.62571796111418, + "learning_rate": 7.93220338983051e-06, + "loss": 0.2728, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 1.487202975941064, + "learning_rate": 7.966101694915255e-06, + "loss": 0.2719, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 1.535136361003896, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2615, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 1.6670845697053458, + "learning_rate": 8.033898305084746e-06, + "loss": 0.2603, + "step": 237 + }, + { + "epoch": 0.01, + "grad_norm": 1.2513572304096718, + "learning_rate": 8.067796610169492e-06, + "loss": 0.2842, + "step": 238 + }, + { + "epoch": 0.01, + "grad_norm": 1.6010215262108967, + "learning_rate": 8.101694915254237e-06, + "loss": 0.2575, + "step": 239 + }, + { + "epoch": 0.01, + "grad_norm": 2.264897174430261, + "learning_rate": 8.135593220338983e-06, + "loss": 0.28, + "step": 240 + }, + { + "epoch": 0.01, + "grad_norm": 1.5210659548113379, + "learning_rate": 8.16949152542373e-06, + "loss": 0.2957, + "step": 241 + }, + { + "epoch": 0.01, + "grad_norm": 1.5993802470049059, + "learning_rate": 8.203389830508475e-06, + "loss": 0.284, + "step": 242 + }, + { + "epoch": 0.01, + "grad_norm": 1.4987229304363077, + "learning_rate": 8.237288135593221e-06, + "loss": 0.2617, + "step": 243 + }, + { + "epoch": 0.01, + "grad_norm": 1.45585250607327, + "learning_rate": 8.271186440677966e-06, + "loss": 0.2637, + "step": 244 + }, + { + "epoch": 0.01, + "grad_norm": 1.538155435073543, + "learning_rate": 8.305084745762712e-06, + "loss": 0.2741, + "step": 245 + }, + { + "epoch": 0.01, + "grad_norm": 2.1391423571076142, + "learning_rate": 8.338983050847458e-06, + "loss": 0.2631, + "step": 246 + }, + { + "epoch": 0.01, + "grad_norm": 1.7253417394805781, + "learning_rate": 8.372881355932205e-06, + "loss": 0.2565, + "step": 247 + }, + { + "epoch": 0.01, + "grad_norm": 1.6275070709595192, + "learning_rate": 8.40677966101695e-06, + "loss": 0.2644, + "step": 248 + }, + { + "epoch": 0.01, + "grad_norm": 1.6116776111875923, + "learning_rate": 8.440677966101696e-06, + "loss": 0.2802, + "step": 249 + }, + { + "epoch": 0.01, + "grad_norm": 1.570689718867318, + "learning_rate": 8.47457627118644e-06, + "loss": 0.2606, + "step": 250 + }, + { + "epoch": 0.01, + "grad_norm": 1.5257535084444283, + "learning_rate": 8.508474576271187e-06, + "loss": 0.2729, + "step": 251 + }, + { + "epoch": 0.01, + "grad_norm": 1.4065841492399755, + "learning_rate": 8.542372881355933e-06, + "loss": 0.2552, + "step": 252 + }, + { + "epoch": 0.01, + "grad_norm": 1.580942735174244, + "learning_rate": 8.57627118644068e-06, + "loss": 0.2869, + "step": 253 + }, + { + "epoch": 0.01, + "grad_norm": 1.7823741192008924, + "learning_rate": 8.610169491525424e-06, + "loss": 0.2458, + "step": 254 + }, + { + "epoch": 0.01, + "grad_norm": 1.6882930628464303, + "learning_rate": 8.64406779661017e-06, + "loss": 0.2797, + "step": 255 + }, + { + "epoch": 0.01, + "grad_norm": 1.5710000159998525, + "learning_rate": 8.677966101694915e-06, + "loss": 0.2596, + "step": 256 + }, + { + "epoch": 0.01, + "grad_norm": 1.5117969977132588, + "learning_rate": 8.711864406779662e-06, + "loss": 0.2915, + "step": 257 + }, + { + "epoch": 0.01, + "grad_norm": 2.0879634715923228, + "learning_rate": 8.745762711864407e-06, + "loss": 0.2928, + "step": 258 + }, + { + "epoch": 0.01, + "grad_norm": 1.4763672894599624, + "learning_rate": 8.779661016949153e-06, + "loss": 0.2661, + "step": 259 + }, + { + "epoch": 0.01, + "grad_norm": 1.5887813347318072, + "learning_rate": 8.8135593220339e-06, + "loss": 0.2644, + "step": 260 + }, + { + "epoch": 0.01, + "grad_norm": 2.1284805411680634, + "learning_rate": 8.847457627118646e-06, + "loss": 0.2896, + "step": 261 + }, + { + "epoch": 0.01, + "grad_norm": 1.6764740798105202, + "learning_rate": 8.88135593220339e-06, + "loss": 0.2679, + "step": 262 + }, + { + "epoch": 0.01, + "grad_norm": 1.5530926451627123, + "learning_rate": 8.915254237288137e-06, + "loss": 0.2618, + "step": 263 + }, + { + "epoch": 0.01, + "grad_norm": 1.5762836341877522, + "learning_rate": 8.949152542372881e-06, + "loss": 0.278, + "step": 264 + }, + { + "epoch": 0.01, + "grad_norm": 2.229849002174887, + "learning_rate": 8.983050847457628e-06, + "loss": 0.2623, + "step": 265 + }, + { + "epoch": 0.01, + "grad_norm": 1.360225880617487, + "learning_rate": 9.016949152542374e-06, + "loss": 0.2769, + "step": 266 + }, + { + "epoch": 0.01, + "grad_norm": 1.578613608691057, + "learning_rate": 9.05084745762712e-06, + "loss": 0.2935, + "step": 267 + }, + { + "epoch": 0.01, + "grad_norm": 1.5343613642802507, + "learning_rate": 9.084745762711865e-06, + "loss": 0.2827, + "step": 268 + }, + { + "epoch": 0.01, + "grad_norm": 1.9651340500582921, + "learning_rate": 9.11864406779661e-06, + "loss": 0.3065, + "step": 269 + }, + { + "epoch": 0.01, + "grad_norm": 1.189584326247112, + "learning_rate": 9.152542372881356e-06, + "loss": 0.2452, + "step": 270 + }, + { + "epoch": 0.01, + "grad_norm": 1.568110390610819, + "learning_rate": 9.186440677966101e-06, + "loss": 0.2642, + "step": 271 + }, + { + "epoch": 0.01, + "grad_norm": 1.490918665724689, + "learning_rate": 9.220338983050847e-06, + "loss": 0.2484, + "step": 272 + }, + { + "epoch": 0.01, + "grad_norm": 1.5178780940378127, + "learning_rate": 9.254237288135594e-06, + "loss": 0.2615, + "step": 273 + }, + { + "epoch": 0.01, + "grad_norm": 1.4243185626593395, + "learning_rate": 9.28813559322034e-06, + "loss": 0.2706, + "step": 274 + }, + { + "epoch": 0.01, + "grad_norm": 1.3943699378544865, + "learning_rate": 9.322033898305085e-06, + "loss": 0.2846, + "step": 275 + }, + { + "epoch": 0.01, + "grad_norm": 1.39958325547155, + "learning_rate": 9.355932203389831e-06, + "loss": 0.2644, + "step": 276 + }, + { + "epoch": 0.01, + "grad_norm": 1.5566516750657318, + "learning_rate": 9.389830508474576e-06, + "loss": 0.2709, + "step": 277 + }, + { + "epoch": 0.01, + "grad_norm": 1.5807818263236182, + "learning_rate": 9.423728813559322e-06, + "loss": 0.2904, + "step": 278 + }, + { + "epoch": 0.01, + "grad_norm": 1.6657073622633627, + "learning_rate": 9.457627118644069e-06, + "loss": 0.2831, + "step": 279 + }, + { + "epoch": 0.01, + "grad_norm": 1.5508860676326361, + "learning_rate": 9.491525423728815e-06, + "loss": 0.3093, + "step": 280 + }, + { + "epoch": 0.01, + "grad_norm": 1.4241612321960757, + "learning_rate": 9.52542372881356e-06, + "loss": 0.2769, + "step": 281 + }, + { + "epoch": 0.01, + "grad_norm": 1.5006924170004527, + "learning_rate": 9.559322033898306e-06, + "loss": 0.2415, + "step": 282 + }, + { + "epoch": 0.01, + "grad_norm": 1.5294543822090856, + "learning_rate": 9.593220338983051e-06, + "loss": 0.2515, + "step": 283 + }, + { + "epoch": 0.01, + "grad_norm": 1.533886353920441, + "learning_rate": 9.627118644067797e-06, + "loss": 0.2829, + "step": 284 + }, + { + "epoch": 0.01, + "grad_norm": 2.069418788425172, + "learning_rate": 9.661016949152544e-06, + "loss": 0.2887, + "step": 285 + }, + { + "epoch": 0.01, + "grad_norm": 1.5889620658480574, + "learning_rate": 9.69491525423729e-06, + "loss": 0.2991, + "step": 286 + }, + { + "epoch": 0.01, + "grad_norm": 1.4597563573302703, + "learning_rate": 9.728813559322035e-06, + "loss": 0.2624, + "step": 287 + }, + { + "epoch": 0.01, + "grad_norm": 1.740665680466997, + "learning_rate": 9.762711864406781e-06, + "loss": 0.2881, + "step": 288 + }, + { + "epoch": 0.01, + "grad_norm": 1.3885281952656445, + "learning_rate": 9.796610169491526e-06, + "loss": 0.2958, + "step": 289 + }, + { + "epoch": 0.01, + "grad_norm": 1.4980986665212326, + "learning_rate": 9.830508474576272e-06, + "loss": 0.2894, + "step": 290 + }, + { + "epoch": 0.01, + "grad_norm": 1.6970976017778991, + "learning_rate": 9.864406779661017e-06, + "loss": 0.277, + "step": 291 + }, + { + "epoch": 0.01, + "grad_norm": 1.5465766596610715, + "learning_rate": 9.898305084745763e-06, + "loss": 0.2723, + "step": 292 + }, + { + "epoch": 0.01, + "grad_norm": 1.7023966032743651, + "learning_rate": 9.93220338983051e-06, + "loss": 0.2977, + "step": 293 + }, + { + "epoch": 0.01, + "grad_norm": 1.3390528257847034, + "learning_rate": 9.966101694915256e-06, + "loss": 0.2537, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 1.1821890114626603, + "learning_rate": 1e-05, + "loss": 0.2668, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.7286845032215452, + "learning_rate": 1.0033898305084746e-05, + "loss": 0.2744, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 1.4950792801793105, + "learning_rate": 1.0067796610169492e-05, + "loss": 0.272, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 1.4700432391190763, + "learning_rate": 1.0101694915254238e-05, + "loss": 0.2662, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 1.3967794895014582, + "learning_rate": 1.0135593220338985e-05, + "loss": 0.2802, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 2.391964862237865, + "learning_rate": 1.016949152542373e-05, + "loss": 0.273, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.4563931631110092, + "learning_rate": 1.0203389830508474e-05, + "loss": 0.2905, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.284413744656201, + "learning_rate": 1.0237288135593222e-05, + "loss": 0.2749, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 1.7292121695347487, + "learning_rate": 1.0271186440677967e-05, + "loss": 0.243, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 1.8232969594205204, + "learning_rate": 1.0305084745762712e-05, + "loss": 0.2722, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 1.7932246344435712, + "learning_rate": 1.0338983050847458e-05, + "loss": 0.2684, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 1.533073576390442, + "learning_rate": 1.0372881355932204e-05, + "loss": 0.2745, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 1.8333910248684946, + "learning_rate": 1.040677966101695e-05, + "loss": 0.293, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 3.1995055633474583, + "learning_rate": 1.0440677966101695e-05, + "loss": 0.2629, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 1.4645465210433868, + "learning_rate": 1.047457627118644e-05, + "loss": 0.2698, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 1.346835487540612, + "learning_rate": 1.0508474576271188e-05, + "loss": 0.2704, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 1.4321145393072345, + "learning_rate": 1.0542372881355933e-05, + "loss": 0.2677, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 1.5265056300536548, + "learning_rate": 1.057627118644068e-05, + "loss": 0.2803, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 1.3311183401050715, + "learning_rate": 1.0610169491525424e-05, + "loss": 0.2662, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 1.4977642596654024, + "learning_rate": 1.0644067796610172e-05, + "loss": 0.2729, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 1.5866201311518897, + "learning_rate": 1.0677966101694917e-05, + "loss": 0.255, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 1.5568209460381872, + "learning_rate": 1.0711864406779661e-05, + "loss": 0.2771, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 1.3484579322691495, + "learning_rate": 1.0745762711864408e-05, + "loss": 0.254, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 1.4060019872392868, + "learning_rate": 1.0779661016949154e-05, + "loss": 0.2847, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 1.6215123876307729, + "learning_rate": 1.08135593220339e-05, + "loss": 0.2582, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 1.1960240824180124, + "learning_rate": 1.0847457627118645e-05, + "loss": 0.2757, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 2.6496098507964483, + "learning_rate": 1.088135593220339e-05, + "loss": 0.2946, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 1.3862470565731742, + "learning_rate": 1.0915254237288135e-05, + "loss": 0.2768, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 1.4544121582161837, + "learning_rate": 1.0949152542372883e-05, + "loss": 0.3032, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 1.4175535736163114, + "learning_rate": 1.0983050847457627e-05, + "loss": 0.2686, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 1.3191834379151826, + "learning_rate": 1.1016949152542374e-05, + "loss": 0.252, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 1.33272323985383, + "learning_rate": 1.1050847457627118e-05, + "loss": 0.2779, + "step": 326 + }, + { + "epoch": 0.02, + "grad_norm": 1.660681621213467, + "learning_rate": 1.1084745762711867e-05, + "loss": 0.2794, + "step": 327 + }, + { + "epoch": 0.02, + "grad_norm": 1.431182692042468, + "learning_rate": 1.1118644067796611e-05, + "loss": 0.2699, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 1.4489972397114494, + "learning_rate": 1.1152542372881356e-05, + "loss": 0.2682, + "step": 329 + }, + { + "epoch": 0.02, + "grad_norm": 1.6541146528927058, + "learning_rate": 1.1186440677966102e-05, + "loss": 0.3242, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 1.331208350127372, + "learning_rate": 1.1220338983050849e-05, + "loss": 0.266, + "step": 331 + }, + { + "epoch": 0.02, + "grad_norm": 1.4591943463955406, + "learning_rate": 1.1254237288135595e-05, + "loss": 0.2462, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 1.4940367024477517, + "learning_rate": 1.128813559322034e-05, + "loss": 0.2634, + "step": 333 + }, + { + "epoch": 0.02, + "grad_norm": 1.6020801701154066, + "learning_rate": 1.1322033898305084e-05, + "loss": 0.258, + "step": 334 + }, + { + "epoch": 0.02, + "grad_norm": 1.567603002600629, + "learning_rate": 1.1355932203389833e-05, + "loss": 0.2893, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 1.4512891656223712, + "learning_rate": 1.1389830508474577e-05, + "loss": 0.2592, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 1.9013614792745004, + "learning_rate": 1.1423728813559322e-05, + "loss": 0.3183, + "step": 337 + }, + { + "epoch": 0.02, + "grad_norm": 1.4816696067269588, + "learning_rate": 1.1457627118644068e-05, + "loss": 0.2521, + "step": 338 + }, + { + "epoch": 0.02, + "grad_norm": 1.4699744125682022, + "learning_rate": 1.1491525423728815e-05, + "loss": 0.2435, + "step": 339 + }, + { + "epoch": 0.02, + "grad_norm": 1.5292539331602504, + "learning_rate": 1.1525423728813561e-05, + "loss": 0.2773, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 1.493805068160698, + "learning_rate": 1.1559322033898306e-05, + "loss": 0.2776, + "step": 341 + }, + { + "epoch": 0.02, + "grad_norm": 2.3320441267535226, + "learning_rate": 1.159322033898305e-05, + "loss": 0.3148, + "step": 342 + }, + { + "epoch": 0.02, + "grad_norm": 1.5768942640637522, + "learning_rate": 1.1627118644067799e-05, + "loss": 0.2882, + "step": 343 + }, + { + "epoch": 0.02, + "grad_norm": 2.6327174925876067, + "learning_rate": 1.1661016949152543e-05, + "loss": 0.2643, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 1.5979901584661973, + "learning_rate": 1.169491525423729e-05, + "loss": 0.2793, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 1.490949422523808, + "learning_rate": 1.1728813559322034e-05, + "loss": 0.2797, + "step": 346 + }, + { + "epoch": 0.02, + "grad_norm": 1.6373314185665537, + "learning_rate": 1.1762711864406782e-05, + "loss": 0.2702, + "step": 347 + }, + { + "epoch": 0.02, + "grad_norm": 1.494597348638564, + "learning_rate": 1.1796610169491527e-05, + "loss": 0.256, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 1.268989549770058, + "learning_rate": 1.1830508474576272e-05, + "loss": 0.275, + "step": 349 + }, + { + "epoch": 0.02, + "grad_norm": 1.5728102611683135, + "learning_rate": 1.1864406779661018e-05, + "loss": 0.2747, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 1.4554919008312315, + "learning_rate": 1.1898305084745763e-05, + "loss": 0.2656, + "step": 351 + }, + { + "epoch": 0.02, + "grad_norm": 1.6882217452125865, + "learning_rate": 1.1932203389830511e-05, + "loss": 0.2724, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 1.5458553639090058, + "learning_rate": 1.1966101694915256e-05, + "loss": 0.2574, + "step": 353 + }, + { + "epoch": 0.02, + "grad_norm": 2.9734651946977806, + "learning_rate": 1.2e-05, + "loss": 0.2682, + "step": 354 + }, + { + "epoch": 0.02, + "grad_norm": 1.4674881583289108, + "learning_rate": 1.2033898305084745e-05, + "loss": 0.2624, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 1.6725844123567122, + "learning_rate": 1.2067796610169493e-05, + "loss": 0.2693, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 1.3657325069194421, + "learning_rate": 1.2101694915254238e-05, + "loss": 0.2422, + "step": 357 + }, + { + "epoch": 0.02, + "grad_norm": 1.4560212112782298, + "learning_rate": 1.2135593220338984e-05, + "loss": 0.2519, + "step": 358 + }, + { + "epoch": 0.02, + "grad_norm": 1.3112148572383935, + "learning_rate": 1.2169491525423729e-05, + "loss": 0.2769, + "step": 359 + }, + { + "epoch": 0.02, + "grad_norm": 1.491608195320627, + "learning_rate": 1.2203389830508477e-05, + "loss": 0.2526, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 2.3012264572970316, + "learning_rate": 1.2237288135593222e-05, + "loss": 0.2841, + "step": 361 + }, + { + "epoch": 0.02, + "grad_norm": 1.42262697227907, + "learning_rate": 1.2271186440677966e-05, + "loss": 0.264, + "step": 362 + }, + { + "epoch": 0.02, + "grad_norm": 1.376381527131282, + "learning_rate": 1.2305084745762713e-05, + "loss": 0.2895, + "step": 363 + }, + { + "epoch": 0.02, + "grad_norm": 1.3621092285439624, + "learning_rate": 1.2338983050847459e-05, + "loss": 0.2818, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 1.6416902061672696, + "learning_rate": 1.2372881355932205e-05, + "loss": 0.242, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 1.8070792328678054, + "learning_rate": 1.240677966101695e-05, + "loss": 0.2513, + "step": 366 + }, + { + "epoch": 0.02, + "grad_norm": 1.4230650127004605, + "learning_rate": 1.2440677966101695e-05, + "loss": 0.2663, + "step": 367 + }, + { + "epoch": 0.02, + "grad_norm": 1.5876663584225577, + "learning_rate": 1.2474576271186443e-05, + "loss": 0.2619, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 2.4967545863007254, + "learning_rate": 1.2508474576271188e-05, + "loss": 0.2822, + "step": 369 + }, + { + "epoch": 0.02, + "grad_norm": 1.9039535303784283, + "learning_rate": 1.2542372881355932e-05, + "loss": 0.284, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 1.428940565663837, + "learning_rate": 1.2576271186440679e-05, + "loss": 0.2601, + "step": 371 + }, + { + "epoch": 0.02, + "grad_norm": 1.5662155563814324, + "learning_rate": 1.2610169491525425e-05, + "loss": 0.2873, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 1.2971094059613077, + "learning_rate": 1.2644067796610171e-05, + "loss": 0.2626, + "step": 373 + }, + { + "epoch": 0.02, + "grad_norm": 1.0351725300274326, + "learning_rate": 1.2677966101694916e-05, + "loss": 0.2743, + "step": 374 + }, + { + "epoch": 0.02, + "grad_norm": 1.3553284978125044, + "learning_rate": 1.2711864406779661e-05, + "loss": 0.2709, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 1.5882208330471586, + "learning_rate": 1.2745762711864407e-05, + "loss": 0.2708, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 1.8462534147511642, + "learning_rate": 1.2779661016949154e-05, + "loss": 0.2699, + "step": 377 + }, + { + "epoch": 0.02, + "grad_norm": 1.426504473136271, + "learning_rate": 1.28135593220339e-05, + "loss": 0.281, + "step": 378 + }, + { + "epoch": 0.02, + "grad_norm": 1.2340057175968697, + "learning_rate": 1.2847457627118645e-05, + "loss": 0.2483, + "step": 379 + }, + { + "epoch": 0.02, + "grad_norm": 1.835330825736432, + "learning_rate": 1.288135593220339e-05, + "loss": 0.308, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 1.3981944760568665, + "learning_rate": 1.2915254237288137e-05, + "loss": 0.2507, + "step": 381 + }, + { + "epoch": 0.02, + "grad_norm": 1.3749165361391003, + "learning_rate": 1.2949152542372882e-05, + "loss": 0.2647, + "step": 382 + }, + { + "epoch": 0.02, + "grad_norm": 2.0292717133598073, + "learning_rate": 1.2983050847457629e-05, + "loss": 0.2718, + "step": 383 + }, + { + "epoch": 0.02, + "grad_norm": 1.4857167490857963, + "learning_rate": 1.3016949152542373e-05, + "loss": 0.2635, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 1.747307923320907, + "learning_rate": 1.305084745762712e-05, + "loss": 0.2706, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 1.3277925058690871, + "learning_rate": 1.3084745762711866e-05, + "loss": 0.2432, + "step": 386 + }, + { + "epoch": 0.02, + "grad_norm": 1.5142700170780168, + "learning_rate": 1.311864406779661e-05, + "loss": 0.2811, + "step": 387 + }, + { + "epoch": 0.02, + "grad_norm": 1.3779037473413898, + "learning_rate": 1.3152542372881355e-05, + "loss": 0.2839, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 2.5766348414501605, + "learning_rate": 1.3186440677966103e-05, + "loss": 0.2622, + "step": 389 + }, + { + "epoch": 0.02, + "grad_norm": 1.2036696780959797, + "learning_rate": 1.3220338983050848e-05, + "loss": 0.2799, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 1.3338172319722734, + "learning_rate": 1.3254237288135595e-05, + "loss": 0.2516, + "step": 391 + }, + { + "epoch": 0.02, + "grad_norm": 1.5466043637609173, + "learning_rate": 1.328813559322034e-05, + "loss": 0.2892, + "step": 392 + }, + { + "epoch": 0.02, + "grad_norm": 1.375902595553232, + "learning_rate": 1.3322033898305087e-05, + "loss": 0.3, + "step": 393 + }, + { + "epoch": 0.02, + "grad_norm": 1.223183507661743, + "learning_rate": 1.3355932203389832e-05, + "loss": 0.2864, + "step": 394 + }, + { + "epoch": 0.02, + "grad_norm": 1.4529473301965807, + "learning_rate": 1.3389830508474577e-05, + "loss": 0.2643, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 1.3608254167875427, + "learning_rate": 1.3423728813559323e-05, + "loss": 0.2635, + "step": 396 + }, + { + "epoch": 0.02, + "grad_norm": 1.2513147886137037, + "learning_rate": 1.345762711864407e-05, + "loss": 0.2513, + "step": 397 + }, + { + "epoch": 0.02, + "grad_norm": 1.5112671514145013, + "learning_rate": 1.3491525423728816e-05, + "loss": 0.2861, + "step": 398 + }, + { + "epoch": 0.02, + "grad_norm": 1.6150957465634022, + "learning_rate": 1.352542372881356e-05, + "loss": 0.2701, + "step": 399 + }, + { + "epoch": 0.02, + "grad_norm": 1.7792619826658456, + "learning_rate": 1.3559322033898305e-05, + "loss": 0.2726, + "step": 400 + }, + { + "epoch": 0.02, + "grad_norm": 1.3186392092611758, + "learning_rate": 1.3593220338983053e-05, + "loss": 0.2749, + "step": 401 + }, + { + "epoch": 0.02, + "grad_norm": 1.255587345530338, + "learning_rate": 1.3627118644067798e-05, + "loss": 0.2524, + "step": 402 + }, + { + "epoch": 0.02, + "grad_norm": 1.2400872274349326, + "learning_rate": 1.3661016949152543e-05, + "loss": 0.2778, + "step": 403 + }, + { + "epoch": 0.02, + "grad_norm": 1.482742495732322, + "learning_rate": 1.3694915254237289e-05, + "loss": 0.2698, + "step": 404 + }, + { + "epoch": 0.02, + "grad_norm": 1.9013554292200026, + "learning_rate": 1.3728813559322034e-05, + "loss": 0.2835, + "step": 405 + }, + { + "epoch": 0.02, + "grad_norm": 1.401766043993974, + "learning_rate": 1.3762711864406782e-05, + "loss": 0.2991, + "step": 406 + }, + { + "epoch": 0.02, + "grad_norm": 1.881122545071624, + "learning_rate": 1.3796610169491527e-05, + "loss": 0.3155, + "step": 407 + }, + { + "epoch": 0.02, + "grad_norm": 1.644242035858314, + "learning_rate": 1.3830508474576271e-05, + "loss": 0.2766, + "step": 408 + }, + { + "epoch": 0.02, + "grad_norm": 1.4160651170704013, + "learning_rate": 1.3864406779661018e-05, + "loss": 0.2864, + "step": 409 + }, + { + "epoch": 0.02, + "grad_norm": 1.1874236164670124, + "learning_rate": 1.3898305084745764e-05, + "loss": 0.275, + "step": 410 + }, + { + "epoch": 0.02, + "grad_norm": 1.2883313224686657, + "learning_rate": 1.393220338983051e-05, + "loss": 0.2647, + "step": 411 + }, + { + "epoch": 0.02, + "grad_norm": 1.2956543891841616, + "learning_rate": 1.3966101694915255e-05, + "loss": 0.2598, + "step": 412 + }, + { + "epoch": 0.02, + "grad_norm": 1.4091672402968367, + "learning_rate": 1.4e-05, + "loss": 0.2744, + "step": 413 + }, + { + "epoch": 0.02, + "grad_norm": 1.3024750495831694, + "learning_rate": 1.4033898305084748e-05, + "loss": 0.2525, + "step": 414 + }, + { + "epoch": 0.02, + "grad_norm": 1.7724406208457262, + "learning_rate": 1.4067796610169493e-05, + "loss": 0.2916, + "step": 415 + }, + { + "epoch": 0.02, + "grad_norm": 1.306539549408704, + "learning_rate": 1.4101694915254239e-05, + "loss": 0.2697, + "step": 416 + }, + { + "epoch": 0.02, + "grad_norm": 1.4086019817090085, + "learning_rate": 1.4135593220338984e-05, + "loss": 0.2597, + "step": 417 + }, + { + "epoch": 0.02, + "grad_norm": 1.5154347103816932, + "learning_rate": 1.416949152542373e-05, + "loss": 0.2622, + "step": 418 + }, + { + "epoch": 0.02, + "grad_norm": 1.2824331064750205, + "learning_rate": 1.4203389830508476e-05, + "loss": 0.295, + "step": 419 + }, + { + "epoch": 0.02, + "grad_norm": 1.1915074345746457, + "learning_rate": 1.4237288135593221e-05, + "loss": 0.2642, + "step": 420 + }, + { + "epoch": 0.02, + "grad_norm": 1.1801999544960764, + "learning_rate": 1.4271186440677966e-05, + "loss": 0.2695, + "step": 421 + }, + { + "epoch": 0.02, + "grad_norm": 1.2427987470169837, + "learning_rate": 1.4305084745762714e-05, + "loss": 0.2693, + "step": 422 + }, + { + "epoch": 0.02, + "grad_norm": 1.416370988934776, + "learning_rate": 1.4338983050847459e-05, + "loss": 0.2566, + "step": 423 + }, + { + "epoch": 0.02, + "grad_norm": 1.4784492398159457, + "learning_rate": 1.4372881355932205e-05, + "loss": 0.3128, + "step": 424 + }, + { + "epoch": 0.02, + "grad_norm": 1.7103590658489138, + "learning_rate": 1.440677966101695e-05, + "loss": 0.2916, + "step": 425 + }, + { + "epoch": 0.02, + "grad_norm": 1.177582810375724, + "learning_rate": 1.4440677966101698e-05, + "loss": 0.2787, + "step": 426 + }, + { + "epoch": 0.02, + "grad_norm": 1.2187682328759377, + "learning_rate": 1.4474576271186442e-05, + "loss": 0.2766, + "step": 427 + }, + { + "epoch": 0.02, + "grad_norm": 1.1786396709929452, + "learning_rate": 1.4508474576271187e-05, + "loss": 0.2958, + "step": 428 + }, + { + "epoch": 0.02, + "grad_norm": 1.1586043727922222, + "learning_rate": 1.4542372881355933e-05, + "loss": 0.2629, + "step": 429 + }, + { + "epoch": 0.02, + "grad_norm": 1.374274393866389, + "learning_rate": 1.4576271186440678e-05, + "loss": 0.2725, + "step": 430 + }, + { + "epoch": 0.02, + "grad_norm": 1.1422580383010243, + "learning_rate": 1.4610169491525426e-05, + "loss": 0.2608, + "step": 431 + }, + { + "epoch": 0.02, + "grad_norm": 1.362018091483847, + "learning_rate": 1.4644067796610171e-05, + "loss": 0.2686, + "step": 432 + }, + { + "epoch": 0.02, + "grad_norm": 1.3772004572389647, + "learning_rate": 1.4677966101694916e-05, + "loss": 0.2848, + "step": 433 + }, + { + "epoch": 0.02, + "grad_norm": 1.3697625449391713, + "learning_rate": 1.4711864406779662e-05, + "loss": 0.2618, + "step": 434 + }, + { + "epoch": 0.02, + "grad_norm": 1.2817523722761615, + "learning_rate": 1.4745762711864408e-05, + "loss": 0.281, + "step": 435 + }, + { + "epoch": 0.02, + "grad_norm": 1.5667550162037722, + "learning_rate": 1.4779661016949153e-05, + "loss": 0.3107, + "step": 436 + }, + { + "epoch": 0.02, + "grad_norm": 1.2154704540265984, + "learning_rate": 1.48135593220339e-05, + "loss": 0.2641, + "step": 437 + }, + { + "epoch": 0.02, + "grad_norm": 1.601548252501335, + "learning_rate": 1.4847457627118644e-05, + "loss": 0.2749, + "step": 438 + }, + { + "epoch": 0.02, + "grad_norm": 1.3412392808454243, + "learning_rate": 1.4881355932203392e-05, + "loss": 0.2591, + "step": 439 + }, + { + "epoch": 0.02, + "grad_norm": 1.8745181492381693, + "learning_rate": 1.4915254237288137e-05, + "loss": 0.293, + "step": 440 + }, + { + "epoch": 0.02, + "grad_norm": 1.2032286439232478, + "learning_rate": 1.4949152542372882e-05, + "loss": 0.2488, + "step": 441 + }, + { + "epoch": 0.02, + "grad_norm": 1.8027795794557806, + "learning_rate": 1.4983050847457628e-05, + "loss": 0.2828, + "step": 442 + }, + { + "epoch": 0.02, + "grad_norm": 1.1544444706346058, + "learning_rate": 1.5016949152542374e-05, + "loss": 0.2894, + "step": 443 + }, + { + "epoch": 0.02, + "grad_norm": 1.28630079163346, + "learning_rate": 1.505084745762712e-05, + "loss": 0.2826, + "step": 444 + }, + { + "epoch": 0.02, + "grad_norm": 1.3571363572134902, + "learning_rate": 1.5084745762711865e-05, + "loss": 0.2838, + "step": 445 + }, + { + "epoch": 0.02, + "grad_norm": 1.4725896993155925, + "learning_rate": 1.511864406779661e-05, + "loss": 0.2623, + "step": 446 + }, + { + "epoch": 0.02, + "grad_norm": 1.1861507973687793, + "learning_rate": 1.5152542372881358e-05, + "loss": 0.2835, + "step": 447 + }, + { + "epoch": 0.02, + "grad_norm": 1.8819167655356706, + "learning_rate": 1.5186440677966103e-05, + "loss": 0.2677, + "step": 448 + }, + { + "epoch": 0.02, + "grad_norm": 1.1347425370954585, + "learning_rate": 1.522033898305085e-05, + "loss": 0.2794, + "step": 449 + }, + { + "epoch": 0.02, + "grad_norm": 1.1153160184489386, + "learning_rate": 1.5254237288135594e-05, + "loss": 0.2658, + "step": 450 + }, + { + "epoch": 0.02, + "grad_norm": 1.1449747260362184, + "learning_rate": 1.528813559322034e-05, + "loss": 0.2524, + "step": 451 + }, + { + "epoch": 0.02, + "grad_norm": 1.441613690575214, + "learning_rate": 1.5322033898305085e-05, + "loss": 0.2901, + "step": 452 + }, + { + "epoch": 0.02, + "grad_norm": 1.1305687396821262, + "learning_rate": 1.5355932203389833e-05, + "loss": 0.2702, + "step": 453 + }, + { + "epoch": 0.02, + "grad_norm": 1.627737800353934, + "learning_rate": 1.5389830508474578e-05, + "loss": 0.2708, + "step": 454 + }, + { + "epoch": 0.02, + "grad_norm": 1.1744672296713026, + "learning_rate": 1.5423728813559326e-05, + "loss": 0.2516, + "step": 455 + }, + { + "epoch": 0.02, + "grad_norm": 1.2999579170148163, + "learning_rate": 1.545762711864407e-05, + "loss": 0.2595, + "step": 456 + }, + { + "epoch": 0.02, + "grad_norm": 1.407224076389098, + "learning_rate": 1.5491525423728815e-05, + "loss": 0.2781, + "step": 457 + }, + { + "epoch": 0.02, + "grad_norm": 1.7356557602560885, + "learning_rate": 1.552542372881356e-05, + "loss": 0.274, + "step": 458 + }, + { + "epoch": 0.02, + "grad_norm": 1.7821775541345888, + "learning_rate": 1.5559322033898305e-05, + "loss": 0.2563, + "step": 459 + }, + { + "epoch": 0.02, + "grad_norm": 1.3388894848421589, + "learning_rate": 1.5593220338983053e-05, + "loss": 0.2848, + "step": 460 + }, + { + "epoch": 0.02, + "grad_norm": 1.42437970488922, + "learning_rate": 1.5627118644067798e-05, + "loss": 0.261, + "step": 461 + }, + { + "epoch": 0.02, + "grad_norm": 1.2321061249146887, + "learning_rate": 1.5661016949152542e-05, + "loss": 0.2685, + "step": 462 + }, + { + "epoch": 0.02, + "grad_norm": 1.6009091932685735, + "learning_rate": 1.5694915254237287e-05, + "loss": 0.2772, + "step": 463 + }, + { + "epoch": 0.02, + "grad_norm": 1.2297059855573542, + "learning_rate": 1.5728813559322035e-05, + "loss": 0.2537, + "step": 464 + }, + { + "epoch": 0.02, + "grad_norm": 1.3453190050509531, + "learning_rate": 1.576271186440678e-05, + "loss": 0.2881, + "step": 465 + }, + { + "epoch": 0.02, + "grad_norm": 1.9238560454482667, + "learning_rate": 1.5796610169491528e-05, + "loss": 0.2704, + "step": 466 + }, + { + "epoch": 0.02, + "grad_norm": 1.5999053962529295, + "learning_rate": 1.5830508474576272e-05, + "loss": 0.2626, + "step": 467 + }, + { + "epoch": 0.02, + "grad_norm": 1.3191503928216304, + "learning_rate": 1.586440677966102e-05, + "loss": 0.277, + "step": 468 + }, + { + "epoch": 0.02, + "grad_norm": 1.5589440578069296, + "learning_rate": 1.5898305084745765e-05, + "loss": 0.2668, + "step": 469 + }, + { + "epoch": 0.02, + "grad_norm": 1.0608630567816568, + "learning_rate": 1.593220338983051e-05, + "loss": 0.2587, + "step": 470 + }, + { + "epoch": 0.02, + "grad_norm": 1.278828909895488, + "learning_rate": 1.5966101694915255e-05, + "loss": 0.2757, + "step": 471 + }, + { + "epoch": 0.02, + "grad_norm": 1.2073777948507576, + "learning_rate": 1.6000000000000003e-05, + "loss": 0.3151, + "step": 472 + }, + { + "epoch": 0.02, + "grad_norm": 1.3296212003545202, + "learning_rate": 1.6033898305084747e-05, + "loss": 0.289, + "step": 473 + }, + { + "epoch": 0.02, + "grad_norm": 1.1975562289235413, + "learning_rate": 1.6067796610169492e-05, + "loss": 0.269, + "step": 474 + }, + { + "epoch": 0.02, + "grad_norm": 1.890410123709992, + "learning_rate": 1.6101694915254237e-05, + "loss": 0.248, + "step": 475 + }, + { + "epoch": 0.02, + "grad_norm": 1.2316296356173326, + "learning_rate": 1.6135593220338985e-05, + "loss": 0.2764, + "step": 476 + }, + { + "epoch": 0.02, + "grad_norm": 1.158451224340742, + "learning_rate": 1.616949152542373e-05, + "loss": 0.2515, + "step": 477 + }, + { + "epoch": 0.02, + "grad_norm": 1.386768235161273, + "learning_rate": 1.6203389830508474e-05, + "loss": 0.2766, + "step": 478 + }, + { + "epoch": 0.02, + "grad_norm": 0.9916586427606384, + "learning_rate": 1.6237288135593222e-05, + "loss": 0.2425, + "step": 479 + }, + { + "epoch": 0.02, + "grad_norm": 1.0663820855084396, + "learning_rate": 1.6271186440677967e-05, + "loss": 0.2849, + "step": 480 + }, + { + "epoch": 0.02, + "grad_norm": 1.2016145341855438, + "learning_rate": 1.6305084745762715e-05, + "loss": 0.2686, + "step": 481 + }, + { + "epoch": 0.02, + "grad_norm": 1.1723921344826131, + "learning_rate": 1.633898305084746e-05, + "loss": 0.2573, + "step": 482 + }, + { + "epoch": 0.02, + "grad_norm": 1.2111557499904375, + "learning_rate": 1.6372881355932204e-05, + "loss": 0.2532, + "step": 483 + }, + { + "epoch": 0.02, + "grad_norm": 1.699199017647184, + "learning_rate": 1.640677966101695e-05, + "loss": 0.2902, + "step": 484 + }, + { + "epoch": 0.02, + "grad_norm": 1.2909577938024068, + "learning_rate": 1.6440677966101697e-05, + "loss": 0.2663, + "step": 485 + }, + { + "epoch": 0.02, + "grad_norm": 1.5571826032846792, + "learning_rate": 1.6474576271186442e-05, + "loss": 0.2645, + "step": 486 + }, + { + "epoch": 0.02, + "grad_norm": 1.1279016844343919, + "learning_rate": 1.6508474576271187e-05, + "loss": 0.2576, + "step": 487 + }, + { + "epoch": 0.02, + "grad_norm": 1.5591821887853188, + "learning_rate": 1.654237288135593e-05, + "loss": 0.2851, + "step": 488 + }, + { + "epoch": 0.02, + "grad_norm": 1.2092586739280375, + "learning_rate": 1.657627118644068e-05, + "loss": 0.2801, + "step": 489 + }, + { + "epoch": 0.02, + "grad_norm": 1.1463301267011572, + "learning_rate": 1.6610169491525424e-05, + "loss": 0.2563, + "step": 490 + }, + { + "epoch": 0.02, + "grad_norm": 1.2444534543779873, + "learning_rate": 1.6644067796610172e-05, + "loss": 0.2642, + "step": 491 + }, + { + "epoch": 0.03, + "grad_norm": 1.3314286719081205, + "learning_rate": 1.6677966101694917e-05, + "loss": 0.2806, + "step": 492 + }, + { + "epoch": 0.03, + "grad_norm": 1.4953543966574252, + "learning_rate": 1.671186440677966e-05, + "loss": 0.2812, + "step": 493 + }, + { + "epoch": 0.03, + "grad_norm": 1.229405126380989, + "learning_rate": 1.674576271186441e-05, + "loss": 0.2702, + "step": 494 + }, + { + "epoch": 0.03, + "grad_norm": 1.226387305787883, + "learning_rate": 1.6779661016949154e-05, + "loss": 0.2672, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 1.2279350727222604, + "learning_rate": 1.68135593220339e-05, + "loss": 0.2746, + "step": 496 + }, + { + "epoch": 0.03, + "grad_norm": 1.4752201164031455, + "learning_rate": 1.6847457627118647e-05, + "loss": 0.2615, + "step": 497 + }, + { + "epoch": 0.03, + "grad_norm": 1.5171752111549475, + "learning_rate": 1.6881355932203392e-05, + "loss": 0.2511, + "step": 498 + }, + { + "epoch": 0.03, + "grad_norm": 1.4085635220195363, + "learning_rate": 1.6915254237288136e-05, + "loss": 0.2503, + "step": 499 + }, + { + "epoch": 0.03, + "grad_norm": 1.5619747875254615, + "learning_rate": 1.694915254237288e-05, + "loss": 0.279, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 1.6055369708361142, + "learning_rate": 1.698305084745763e-05, + "loss": 0.2639, + "step": 501 + }, + { + "epoch": 0.03, + "grad_norm": 1.6065530819560707, + "learning_rate": 1.7016949152542374e-05, + "loss": 0.2877, + "step": 502 + }, + { + "epoch": 0.03, + "grad_norm": 1.3686957461363003, + "learning_rate": 1.705084745762712e-05, + "loss": 0.2679, + "step": 503 + }, + { + "epoch": 0.03, + "grad_norm": 1.3246185897098746, + "learning_rate": 1.7084745762711867e-05, + "loss": 0.2803, + "step": 504 + }, + { + "epoch": 0.03, + "grad_norm": 1.601972430115074, + "learning_rate": 1.711864406779661e-05, + "loss": 0.2557, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 1.4960766298227883, + "learning_rate": 1.715254237288136e-05, + "loss": 0.2722, + "step": 506 + }, + { + "epoch": 0.03, + "grad_norm": 1.7165264426107494, + "learning_rate": 1.7186440677966104e-05, + "loss": 0.2747, + "step": 507 + }, + { + "epoch": 0.03, + "grad_norm": 1.300412910903492, + "learning_rate": 1.722033898305085e-05, + "loss": 0.2704, + "step": 508 + }, + { + "epoch": 0.03, + "grad_norm": 1.4490955968513795, + "learning_rate": 1.7254237288135597e-05, + "loss": 0.2591, + "step": 509 + }, + { + "epoch": 0.03, + "grad_norm": 1.3759244398932398, + "learning_rate": 1.728813559322034e-05, + "loss": 0.2969, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 1.6898284753441082, + "learning_rate": 1.7322033898305086e-05, + "loss": 0.2443, + "step": 511 + }, + { + "epoch": 0.03, + "grad_norm": 1.4318962229777292, + "learning_rate": 1.735593220338983e-05, + "loss": 0.2513, + "step": 512 + }, + { + "epoch": 0.03, + "grad_norm": 1.743666701146938, + "learning_rate": 1.7389830508474576e-05, + "loss": 0.2814, + "step": 513 + }, + { + "epoch": 0.03, + "grad_norm": 1.7193073886473862, + "learning_rate": 1.7423728813559324e-05, + "loss": 0.2888, + "step": 514 + }, + { + "epoch": 0.03, + "grad_norm": 1.557692647375557, + "learning_rate": 1.745762711864407e-05, + "loss": 0.2702, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 1.3850957268156736, + "learning_rate": 1.7491525423728813e-05, + "loss": 0.2806, + "step": 516 + }, + { + "epoch": 0.03, + "grad_norm": 1.687921295616476, + "learning_rate": 1.752542372881356e-05, + "loss": 0.246, + "step": 517 + }, + { + "epoch": 0.03, + "grad_norm": 1.1706695625942483, + "learning_rate": 1.7559322033898306e-05, + "loss": 0.251, + "step": 518 + }, + { + "epoch": 0.03, + "grad_norm": 4.037585934141602, + "learning_rate": 1.7593220338983054e-05, + "loss": 0.27, + "step": 519 + }, + { + "epoch": 0.03, + "grad_norm": 1.3341641611216237, + "learning_rate": 1.76271186440678e-05, + "loss": 0.2909, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 1.1083063405567553, + "learning_rate": 1.7661016949152543e-05, + "loss": 0.273, + "step": 521 + }, + { + "epoch": 0.03, + "grad_norm": 1.1166108132403247, + "learning_rate": 1.769491525423729e-05, + "loss": 0.2647, + "step": 522 + }, + { + "epoch": 0.03, + "grad_norm": 1.4730148417394753, + "learning_rate": 1.7728813559322036e-05, + "loss": 0.2574, + "step": 523 + }, + { + "epoch": 0.03, + "grad_norm": 1.3796044596292965, + "learning_rate": 1.776271186440678e-05, + "loss": 0.2809, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 6.668912240239683, + "learning_rate": 1.7796610169491526e-05, + "loss": 0.2896, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 1.6221075546474033, + "learning_rate": 1.7830508474576274e-05, + "loss": 0.2429, + "step": 526 + }, + { + "epoch": 0.03, + "grad_norm": 1.314326489119452, + "learning_rate": 1.7864406779661018e-05, + "loss": 0.2974, + "step": 527 + }, + { + "epoch": 0.03, + "grad_norm": 1.4011654180586948, + "learning_rate": 1.7898305084745763e-05, + "loss": 0.2805, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 1.456593465147238, + "learning_rate": 1.7932203389830508e-05, + "loss": 0.271, + "step": 529 + }, + { + "epoch": 0.03, + "grad_norm": 1.4153772547557295, + "learning_rate": 1.7966101694915256e-05, + "loss": 0.2993, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 1.460962611843914, + "learning_rate": 1.8e-05, + "loss": 0.288, + "step": 531 + }, + { + "epoch": 0.03, + "grad_norm": 1.3671451429644492, + "learning_rate": 1.803389830508475e-05, + "loss": 0.2591, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 1.558548773828584, + "learning_rate": 1.8067796610169493e-05, + "loss": 0.2689, + "step": 533 + }, + { + "epoch": 0.03, + "grad_norm": 1.5127491344620718, + "learning_rate": 1.810169491525424e-05, + "loss": 0.3259, + "step": 534 + }, + { + "epoch": 0.03, + "grad_norm": 1.5486731950661254, + "learning_rate": 1.8135593220338986e-05, + "loss": 0.2712, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 1.3193510633104664, + "learning_rate": 1.816949152542373e-05, + "loss": 0.2838, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 1.0997689513888096, + "learning_rate": 1.8203389830508475e-05, + "loss": 0.2525, + "step": 537 + }, + { + "epoch": 0.03, + "grad_norm": 1.2756030235865132, + "learning_rate": 1.823728813559322e-05, + "loss": 0.307, + "step": 538 + }, + { + "epoch": 0.03, + "grad_norm": 1.4687241798166086, + "learning_rate": 1.8271186440677968e-05, + "loss": 0.2428, + "step": 539 + }, + { + "epoch": 0.03, + "grad_norm": 1.3399067424295357, + "learning_rate": 1.8305084745762713e-05, + "loss": 0.2732, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 1.6084487910737866, + "learning_rate": 1.8338983050847458e-05, + "loss": 0.2552, + "step": 541 + }, + { + "epoch": 0.03, + "grad_norm": 1.2944041185222261, + "learning_rate": 1.8372881355932202e-05, + "loss": 0.2804, + "step": 542 + }, + { + "epoch": 0.03, + "grad_norm": 1.2566959055725666, + "learning_rate": 1.840677966101695e-05, + "loss": 0.259, + "step": 543 + }, + { + "epoch": 0.03, + "grad_norm": 1.2933124442569186, + "learning_rate": 1.8440677966101695e-05, + "loss": 0.2676, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 1.3273078884878304, + "learning_rate": 1.8474576271186443e-05, + "loss": 0.2719, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 1.263916893620895, + "learning_rate": 1.8508474576271188e-05, + "loss": 0.286, + "step": 546 + }, + { + "epoch": 0.03, + "grad_norm": 1.1352925308401638, + "learning_rate": 1.8542372881355936e-05, + "loss": 0.2449, + "step": 547 + }, + { + "epoch": 0.03, + "grad_norm": 1.4287019035022674, + "learning_rate": 1.857627118644068e-05, + "loss": 0.2854, + "step": 548 + }, + { + "epoch": 0.03, + "grad_norm": 1.536114169133046, + "learning_rate": 1.8610169491525425e-05, + "loss": 0.2749, + "step": 549 + }, + { + "epoch": 0.03, + "grad_norm": 1.3636384713719114, + "learning_rate": 1.864406779661017e-05, + "loss": 0.2798, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 1.1282443226800742, + "learning_rate": 1.8677966101694918e-05, + "loss": 0.2545, + "step": 551 + }, + { + "epoch": 0.03, + "grad_norm": 1.4372325985492267, + "learning_rate": 1.8711864406779663e-05, + "loss": 0.2764, + "step": 552 + }, + { + "epoch": 0.03, + "grad_norm": 1.431962567950389, + "learning_rate": 1.8745762711864407e-05, + "loss": 0.2535, + "step": 553 + }, + { + "epoch": 0.03, + "grad_norm": 1.2145997199986456, + "learning_rate": 1.8779661016949152e-05, + "loss": 0.2566, + "step": 554 + }, + { + "epoch": 0.03, + "grad_norm": 1.1352608835127773, + "learning_rate": 1.88135593220339e-05, + "loss": 0.2535, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 1.2210496354415843, + "learning_rate": 1.8847457627118645e-05, + "loss": 0.2643, + "step": 556 + }, + { + "epoch": 0.03, + "grad_norm": 1.1232924683009815, + "learning_rate": 1.8881355932203393e-05, + "loss": 0.2627, + "step": 557 + }, + { + "epoch": 0.03, + "grad_norm": 3.4024455416720127, + "learning_rate": 1.8915254237288138e-05, + "loss": 0.2525, + "step": 558 + }, + { + "epoch": 0.03, + "grad_norm": 1.3887487502150935, + "learning_rate": 1.8949152542372882e-05, + "loss": 0.2758, + "step": 559 + }, + { + "epoch": 0.03, + "grad_norm": 1.232504365515857, + "learning_rate": 1.898305084745763e-05, + "loss": 0.2406, + "step": 560 + }, + { + "epoch": 0.03, + "grad_norm": 1.3940857376689135, + "learning_rate": 1.9016949152542375e-05, + "loss": 0.2558, + "step": 561 + }, + { + "epoch": 0.03, + "grad_norm": 1.345932352677997, + "learning_rate": 1.905084745762712e-05, + "loss": 0.2626, + "step": 562 + }, + { + "epoch": 0.03, + "grad_norm": 1.1157309455332585, + "learning_rate": 1.9084745762711868e-05, + "loss": 0.2632, + "step": 563 + }, + { + "epoch": 0.03, + "grad_norm": 1.156636964733872, + "learning_rate": 1.9118644067796613e-05, + "loss": 0.2781, + "step": 564 + }, + { + "epoch": 0.03, + "grad_norm": 1.0729683625503257, + "learning_rate": 1.9152542372881357e-05, + "loss": 0.2457, + "step": 565 + }, + { + "epoch": 0.03, + "grad_norm": 1.1647734275933048, + "learning_rate": 1.9186440677966102e-05, + "loss": 0.2393, + "step": 566 + }, + { + "epoch": 0.03, + "grad_norm": 1.5138294469581877, + "learning_rate": 1.9220338983050847e-05, + "loss": 0.2876, + "step": 567 + }, + { + "epoch": 0.03, + "grad_norm": 1.323351272486395, + "learning_rate": 1.9254237288135595e-05, + "loss": 0.2396, + "step": 568 + }, + { + "epoch": 0.03, + "grad_norm": 1.9451745243471656, + "learning_rate": 1.928813559322034e-05, + "loss": 0.2514, + "step": 569 + }, + { + "epoch": 0.03, + "grad_norm": 1.2208860898726068, + "learning_rate": 1.9322033898305087e-05, + "loss": 0.2516, + "step": 570 + }, + { + "epoch": 0.03, + "grad_norm": 1.3855491705683636, + "learning_rate": 1.9355932203389832e-05, + "loss": 0.2758, + "step": 571 + }, + { + "epoch": 0.03, + "grad_norm": 1.429772998796651, + "learning_rate": 1.938983050847458e-05, + "loss": 0.2356, + "step": 572 + }, + { + "epoch": 0.03, + "grad_norm": 1.7057138660762692, + "learning_rate": 1.9423728813559325e-05, + "loss": 0.2283, + "step": 573 + }, + { + "epoch": 0.03, + "grad_norm": 1.51201577923191, + "learning_rate": 1.945762711864407e-05, + "loss": 0.2671, + "step": 574 + }, + { + "epoch": 0.03, + "grad_norm": 1.2651595136689437, + "learning_rate": 1.9491525423728814e-05, + "loss": 0.2801, + "step": 575 + }, + { + "epoch": 0.03, + "grad_norm": 1.3249553121989968, + "learning_rate": 1.9525423728813562e-05, + "loss": 0.2787, + "step": 576 + }, + { + "epoch": 0.03, + "grad_norm": 2.74385495359413, + "learning_rate": 1.9559322033898307e-05, + "loss": 0.2608, + "step": 577 + }, + { + "epoch": 0.03, + "grad_norm": 1.6741267385582472, + "learning_rate": 1.9593220338983052e-05, + "loss": 0.258, + "step": 578 + }, + { + "epoch": 0.03, + "grad_norm": 1.4888420560673257, + "learning_rate": 1.9627118644067796e-05, + "loss": 0.281, + "step": 579 + }, + { + "epoch": 0.03, + "grad_norm": 1.4190781041419613, + "learning_rate": 1.9661016949152545e-05, + "loss": 0.2438, + "step": 580 + }, + { + "epoch": 0.03, + "grad_norm": 2.12210293134904, + "learning_rate": 1.969491525423729e-05, + "loss": 0.2877, + "step": 581 + }, + { + "epoch": 0.03, + "grad_norm": 2.441475622095831, + "learning_rate": 1.9728813559322034e-05, + "loss": 0.253, + "step": 582 + }, + { + "epoch": 0.03, + "grad_norm": 3.745675132840984, + "learning_rate": 1.9762711864406782e-05, + "loss": 0.2806, + "step": 583 + }, + { + "epoch": 0.03, + "grad_norm": 2.3836639201458314, + "learning_rate": 1.9796610169491527e-05, + "loss": 0.3177, + "step": 584 + }, + { + "epoch": 0.03, + "grad_norm": 3.30532147466716, + "learning_rate": 1.9830508474576275e-05, + "loss": 0.2846, + "step": 585 + }, + { + "epoch": 0.03, + "grad_norm": 1.5409660828860403, + "learning_rate": 1.986440677966102e-05, + "loss": 0.2681, + "step": 586 + }, + { + "epoch": 0.03, + "grad_norm": 4.731352615733341, + "learning_rate": 1.9898305084745764e-05, + "loss": 0.2567, + "step": 587 + }, + { + "epoch": 0.03, + "grad_norm": 1.475042781560892, + "learning_rate": 1.9932203389830512e-05, + "loss": 0.2628, + "step": 588 + }, + { + "epoch": 0.03, + "grad_norm": 6.0834959806057185, + "learning_rate": 1.9966101694915257e-05, + "loss": 0.24, + "step": 589 + }, + { + "epoch": 0.03, + "grad_norm": 1.3873201495539649, + "learning_rate": 2e-05, + "loss": 0.26, + "step": 590 + }, + { + "epoch": 0.03, + "grad_norm": 1.3568449606917727, + "learning_rate": 1.999999986437472e-05, + "loss": 0.279, + "step": 591 + }, + { + "epoch": 0.03, + "grad_norm": 1.4736919604577718, + "learning_rate": 1.9999999457498875e-05, + "loss": 0.2684, + "step": 592 + }, + { + "epoch": 0.03, + "grad_norm": 1.2388398729176897, + "learning_rate": 1.9999998779372483e-05, + "loss": 0.296, + "step": 593 + }, + { + "epoch": 0.03, + "grad_norm": 1.266582495935962, + "learning_rate": 1.9999997829995557e-05, + "loss": 0.2539, + "step": 594 + }, + { + "epoch": 0.03, + "grad_norm": 1.563096295904235, + "learning_rate": 1.9999996609368124e-05, + "loss": 0.2957, + "step": 595 + }, + { + "epoch": 0.03, + "grad_norm": 1.5591054305818515, + "learning_rate": 1.9999995117490224e-05, + "loss": 0.2662, + "step": 596 + }, + { + "epoch": 0.03, + "grad_norm": 1.65621574378536, + "learning_rate": 1.9999993354361887e-05, + "loss": 0.2403, + "step": 597 + }, + { + "epoch": 0.03, + "grad_norm": 1.2548699939118981, + "learning_rate": 1.9999991319983162e-05, + "loss": 0.2813, + "step": 598 + }, + { + "epoch": 0.03, + "grad_norm": 1.8410217890854863, + "learning_rate": 1.9999989014354117e-05, + "loss": 0.2371, + "step": 599 + }, + { + "epoch": 0.03, + "grad_norm": 1.4153342484952254, + "learning_rate": 1.9999986437474797e-05, + "loss": 0.2565, + "step": 600 + }, + { + "epoch": 0.03, + "grad_norm": 1.10289019805283, + "learning_rate": 1.9999983589345282e-05, + "loss": 0.2698, + "step": 601 + }, + { + "epoch": 0.03, + "grad_norm": 4.453681957989118, + "learning_rate": 1.9999980469965646e-05, + "loss": 0.2506, + "step": 602 + }, + { + "epoch": 0.03, + "grad_norm": 1.948473664806546, + "learning_rate": 1.999997707933598e-05, + "loss": 0.2494, + "step": 603 + }, + { + "epoch": 0.03, + "grad_norm": 1.3618436212411482, + "learning_rate": 1.9999973417456367e-05, + "loss": 0.2644, + "step": 604 + }, + { + "epoch": 0.03, + "grad_norm": 1.3724384206589046, + "learning_rate": 1.999996948432691e-05, + "loss": 0.295, + "step": 605 + }, + { + "epoch": 0.03, + "grad_norm": 1.4885167672678044, + "learning_rate": 1.999996527994772e-05, + "loss": 0.2799, + "step": 606 + }, + { + "epoch": 0.03, + "grad_norm": 1.177622041217393, + "learning_rate": 1.9999960804318904e-05, + "loss": 0.2878, + "step": 607 + }, + { + "epoch": 0.03, + "grad_norm": 2.0149772969197515, + "learning_rate": 1.999995605744059e-05, + "loss": 0.257, + "step": 608 + }, + { + "epoch": 0.03, + "grad_norm": 1.691709339113414, + "learning_rate": 1.9999951039312897e-05, + "loss": 0.2524, + "step": 609 + }, + { + "epoch": 0.03, + "grad_norm": 1.0999445528030725, + "learning_rate": 1.999994574993597e-05, + "loss": 0.2612, + "step": 610 + }, + { + "epoch": 0.03, + "grad_norm": 1.5337654360480533, + "learning_rate": 1.999994018930995e-05, + "loss": 0.2948, + "step": 611 + }, + { + "epoch": 0.03, + "grad_norm": 1.1245129713817184, + "learning_rate": 1.9999934357434986e-05, + "loss": 0.2783, + "step": 612 + }, + { + "epoch": 0.03, + "grad_norm": 1.051000583585932, + "learning_rate": 1.9999928254311242e-05, + "loss": 0.2663, + "step": 613 + }, + { + "epoch": 0.03, + "grad_norm": 1.6065272631126895, + "learning_rate": 1.9999921879938875e-05, + "loss": 0.2648, + "step": 614 + }, + { + "epoch": 0.03, + "grad_norm": 1.040871656632654, + "learning_rate": 1.9999915234318064e-05, + "loss": 0.258, + "step": 615 + }, + { + "epoch": 0.03, + "grad_norm": 1.0489036653596298, + "learning_rate": 1.9999908317448985e-05, + "loss": 0.2619, + "step": 616 + }, + { + "epoch": 0.03, + "grad_norm": 1.2406900738256312, + "learning_rate": 1.9999901129331832e-05, + "loss": 0.2688, + "step": 617 + }, + { + "epoch": 0.03, + "grad_norm": 1.3093334914999668, + "learning_rate": 1.9999893669966794e-05, + "loss": 0.2856, + "step": 618 + }, + { + "epoch": 0.03, + "grad_norm": 1.0150981610834626, + "learning_rate": 1.9999885939354077e-05, + "loss": 0.267, + "step": 619 + }, + { + "epoch": 0.03, + "grad_norm": 1.1050649685427374, + "learning_rate": 1.9999877937493886e-05, + "loss": 0.2558, + "step": 620 + }, + { + "epoch": 0.03, + "grad_norm": 1.0775846893782826, + "learning_rate": 1.9999869664386443e-05, + "loss": 0.2675, + "step": 621 + }, + { + "epoch": 0.03, + "grad_norm": 1.0419452826452065, + "learning_rate": 1.999986112003197e-05, + "loss": 0.2645, + "step": 622 + }, + { + "epoch": 0.03, + "grad_norm": 1.3183338349176241, + "learning_rate": 1.99998523044307e-05, + "loss": 0.3004, + "step": 623 + }, + { + "epoch": 0.03, + "grad_norm": 0.9943967830586373, + "learning_rate": 1.999984321758287e-05, + "loss": 0.3, + "step": 624 + }, + { + "epoch": 0.03, + "grad_norm": 1.498860798184339, + "learning_rate": 1.999983385948873e-05, + "loss": 0.2547, + "step": 625 + }, + { + "epoch": 0.03, + "grad_norm": 4.537978887587886, + "learning_rate": 1.9999824230148532e-05, + "loss": 0.2938, + "step": 626 + }, + { + "epoch": 0.03, + "grad_norm": 0.9265795045788969, + "learning_rate": 1.999981432956254e-05, + "loss": 0.2698, + "step": 627 + }, + { + "epoch": 0.03, + "grad_norm": 1.92643007386415, + "learning_rate": 1.999980415773101e-05, + "loss": 0.3034, + "step": 628 + }, + { + "epoch": 0.03, + "grad_norm": 1.0860148365443336, + "learning_rate": 1.9999793714654236e-05, + "loss": 0.2617, + "step": 629 + }, + { + "epoch": 0.03, + "grad_norm": 3.232951628312408, + "learning_rate": 1.9999783000332486e-05, + "loss": 0.2436, + "step": 630 + }, + { + "epoch": 0.03, + "grad_norm": 1.6804316633229621, + "learning_rate": 1.9999772014766062e-05, + "loss": 0.305, + "step": 631 + }, + { + "epoch": 0.03, + "grad_norm": 2.0772468232195704, + "learning_rate": 1.9999760757955258e-05, + "loss": 0.284, + "step": 632 + }, + { + "epoch": 0.03, + "grad_norm": 1.4342896278895008, + "learning_rate": 1.9999749229900376e-05, + "loss": 0.2677, + "step": 633 + }, + { + "epoch": 0.03, + "grad_norm": 1.8471599254915816, + "learning_rate": 1.9999737430601734e-05, + "loss": 0.2557, + "step": 634 + }, + { + "epoch": 0.03, + "grad_norm": 1.281316376095995, + "learning_rate": 1.9999725360059648e-05, + "loss": 0.2817, + "step": 635 + }, + { + "epoch": 0.03, + "grad_norm": 1.3402740898159577, + "learning_rate": 1.9999713018274444e-05, + "loss": 0.265, + "step": 636 + }, + { + "epoch": 0.03, + "grad_norm": 1.3958741995969437, + "learning_rate": 1.9999700405246462e-05, + "loss": 0.3047, + "step": 637 + }, + { + "epoch": 0.03, + "grad_norm": 1.4108555514675845, + "learning_rate": 1.9999687520976043e-05, + "loss": 0.2721, + "step": 638 + }, + { + "epoch": 0.03, + "grad_norm": 1.2241788885846367, + "learning_rate": 1.9999674365463532e-05, + "loss": 0.2645, + "step": 639 + }, + { + "epoch": 0.03, + "grad_norm": 1.211714854940445, + "learning_rate": 1.999966093870929e-05, + "loss": 0.2947, + "step": 640 + }, + { + "epoch": 0.03, + "grad_norm": 1.2759128182319426, + "learning_rate": 1.999964724071368e-05, + "loss": 0.2615, + "step": 641 + }, + { + "epoch": 0.03, + "grad_norm": 1.437640733114652, + "learning_rate": 1.999963327147708e-05, + "loss": 0.2614, + "step": 642 + }, + { + "epoch": 0.03, + "grad_norm": 1.4274344790032485, + "learning_rate": 1.9999619030999853e-05, + "loss": 0.2999, + "step": 643 + }, + { + "epoch": 0.03, + "grad_norm": 2.1171499634893127, + "learning_rate": 1.9999604519282403e-05, + "loss": 0.2506, + "step": 644 + }, + { + "epoch": 0.03, + "grad_norm": 1.3503945628596226, + "learning_rate": 1.999958973632511e-05, + "loss": 0.2618, + "step": 645 + }, + { + "epoch": 0.03, + "grad_norm": 1.259518347435971, + "learning_rate": 1.9999574682128385e-05, + "loss": 0.2696, + "step": 646 + }, + { + "epoch": 0.03, + "grad_norm": 1.2025649744782203, + "learning_rate": 1.999955935669263e-05, + "loss": 0.2773, + "step": 647 + }, + { + "epoch": 0.03, + "grad_norm": 1.3748457996542727, + "learning_rate": 1.9999543760018264e-05, + "loss": 0.2731, + "step": 648 + }, + { + "epoch": 0.03, + "grad_norm": 1.286447108152475, + "learning_rate": 1.999952789210571e-05, + "loss": 0.2513, + "step": 649 + }, + { + "epoch": 0.03, + "grad_norm": 1.1625739396436945, + "learning_rate": 1.999951175295539e-05, + "loss": 0.2805, + "step": 650 + }, + { + "epoch": 0.03, + "grad_norm": 0.8934326598683626, + "learning_rate": 1.9999495342567754e-05, + "loss": 0.2703, + "step": 651 + }, + { + "epoch": 0.03, + "grad_norm": 1.2766979998578325, + "learning_rate": 1.999947866094324e-05, + "loss": 0.2709, + "step": 652 + }, + { + "epoch": 0.03, + "grad_norm": 1.1893920363930803, + "learning_rate": 1.999946170808231e-05, + "loss": 0.27, + "step": 653 + }, + { + "epoch": 0.03, + "grad_norm": 1.4471066279469218, + "learning_rate": 1.9999444483985408e-05, + "loss": 0.2706, + "step": 654 + }, + { + "epoch": 0.03, + "grad_norm": 1.0597340037872298, + "learning_rate": 1.9999426988653012e-05, + "loss": 0.2862, + "step": 655 + }, + { + "epoch": 0.03, + "grad_norm": 1.1600664080975713, + "learning_rate": 1.9999409222085596e-05, + "loss": 0.2725, + "step": 656 + }, + { + "epoch": 0.03, + "grad_norm": 1.2152493757257978, + "learning_rate": 1.9999391184283638e-05, + "loss": 0.2869, + "step": 657 + }, + { + "epoch": 0.03, + "grad_norm": 1.2957871045516876, + "learning_rate": 1.999937287524763e-05, + "loss": 0.2851, + "step": 658 + }, + { + "epoch": 0.03, + "grad_norm": 1.1059643575300484, + "learning_rate": 1.9999354294978066e-05, + "loss": 0.2534, + "step": 659 + }, + { + "epoch": 0.03, + "grad_norm": 1.0210468864347002, + "learning_rate": 1.9999335443475452e-05, + "loss": 0.2608, + "step": 660 + }, + { + "epoch": 0.03, + "grad_norm": 1.2949778276813804, + "learning_rate": 1.9999316320740302e-05, + "loss": 0.2689, + "step": 661 + }, + { + "epoch": 0.03, + "grad_norm": 1.1647843952730665, + "learning_rate": 1.9999296926773133e-05, + "loss": 0.2436, + "step": 662 + }, + { + "epoch": 0.03, + "grad_norm": 1.3682879718976364, + "learning_rate": 1.9999277261574468e-05, + "loss": 0.2564, + "step": 663 + }, + { + "epoch": 0.03, + "grad_norm": 1.2468871282706768, + "learning_rate": 1.999925732514484e-05, + "loss": 0.2714, + "step": 664 + }, + { + "epoch": 0.03, + "grad_norm": 1.7519741264204172, + "learning_rate": 1.99992371174848e-05, + "loss": 0.2612, + "step": 665 + }, + { + "epoch": 0.03, + "grad_norm": 1.161839164666825, + "learning_rate": 1.999921663859488e-05, + "loss": 0.2479, + "step": 666 + }, + { + "epoch": 0.03, + "grad_norm": 1.2751282318849002, + "learning_rate": 1.999919588847565e-05, + "loss": 0.2568, + "step": 667 + }, + { + "epoch": 0.03, + "grad_norm": 1.0965692887004719, + "learning_rate": 1.999917486712766e-05, + "loss": 0.269, + "step": 668 + }, + { + "epoch": 0.03, + "grad_norm": 1.0701031723437957, + "learning_rate": 1.9999153574551492e-05, + "loss": 0.2679, + "step": 669 + }, + { + "epoch": 0.03, + "grad_norm": 1.2142348695150131, + "learning_rate": 1.999913201074772e-05, + "loss": 0.265, + "step": 670 + }, + { + "epoch": 0.03, + "grad_norm": 1.06668955891612, + "learning_rate": 1.9999110175716924e-05, + "loss": 0.2521, + "step": 671 + }, + { + "epoch": 0.03, + "grad_norm": 1.0533089187255256, + "learning_rate": 1.99990880694597e-05, + "loss": 0.256, + "step": 672 + }, + { + "epoch": 0.03, + "grad_norm": 1.0623846929343919, + "learning_rate": 1.9999065691976648e-05, + "loss": 0.2956, + "step": 673 + }, + { + "epoch": 0.03, + "grad_norm": 0.9917831191421542, + "learning_rate": 1.9999043043268375e-05, + "loss": 0.2521, + "step": 674 + }, + { + "epoch": 0.03, + "grad_norm": 1.0909337781250295, + "learning_rate": 1.9999020123335496e-05, + "loss": 0.2541, + "step": 675 + }, + { + "epoch": 0.03, + "grad_norm": 1.2088864473371073, + "learning_rate": 1.9998996932178625e-05, + "loss": 0.2353, + "step": 676 + }, + { + "epoch": 0.03, + "grad_norm": 1.267002024540383, + "learning_rate": 1.9998973469798404e-05, + "loss": 0.2871, + "step": 677 + }, + { + "epoch": 0.03, + "grad_norm": 1.26664688395502, + "learning_rate": 1.9998949736195464e-05, + "loss": 0.2697, + "step": 678 + }, + { + "epoch": 0.03, + "grad_norm": 1.1258218909222184, + "learning_rate": 1.9998925731370443e-05, + "loss": 0.2767, + "step": 679 + }, + { + "epoch": 0.03, + "grad_norm": 1.4333128018170362, + "learning_rate": 1.9998901455324e-05, + "loss": 0.272, + "step": 680 + }, + { + "epoch": 0.03, + "grad_norm": 1.1539615363566418, + "learning_rate": 1.999887690805679e-05, + "loss": 0.2703, + "step": 681 + }, + { + "epoch": 0.03, + "grad_norm": 1.3055896441511934, + "learning_rate": 1.9998852089569473e-05, + "loss": 0.2805, + "step": 682 + }, + { + "epoch": 0.03, + "grad_norm": 0.9049653729869355, + "learning_rate": 1.9998826999862736e-05, + "loss": 0.2588, + "step": 683 + }, + { + "epoch": 0.03, + "grad_norm": 1.638495294480143, + "learning_rate": 1.9998801638937245e-05, + "loss": 0.2485, + "step": 684 + }, + { + "epoch": 0.03, + "grad_norm": 1.2575064722605738, + "learning_rate": 1.9998776006793702e-05, + "loss": 0.2661, + "step": 685 + }, + { + "epoch": 0.03, + "grad_norm": 1.1924520993769014, + "learning_rate": 1.999875010343279e-05, + "loss": 0.2907, + "step": 686 + }, + { + "epoch": 0.03, + "grad_norm": 1.0528956283908704, + "learning_rate": 1.999872392885522e-05, + "loss": 0.2597, + "step": 687 + }, + { + "epoch": 0.03, + "grad_norm": 1.0761853664702166, + "learning_rate": 1.9998697483061698e-05, + "loss": 0.2267, + "step": 688 + }, + { + "epoch": 0.04, + "grad_norm": 1.7045786645036725, + "learning_rate": 1.9998670766052942e-05, + "loss": 0.2515, + "step": 689 + }, + { + "epoch": 0.04, + "grad_norm": 1.0632182594277078, + "learning_rate": 1.9998643777829674e-05, + "loss": 0.2673, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 1.3976256225702135, + "learning_rate": 1.9998616518392633e-05, + "loss": 0.2639, + "step": 691 + }, + { + "epoch": 0.04, + "grad_norm": 1.0113588487491112, + "learning_rate": 1.999858898774255e-05, + "loss": 0.2794, + "step": 692 + }, + { + "epoch": 0.04, + "grad_norm": 1.0553084109359292, + "learning_rate": 1.999856118588018e-05, + "loss": 0.2411, + "step": 693 + }, + { + "epoch": 0.04, + "grad_norm": 1.0623825667352056, + "learning_rate": 1.999853311280627e-05, + "loss": 0.2567, + "step": 694 + }, + { + "epoch": 0.04, + "grad_norm": 1.6341578895660698, + "learning_rate": 1.9998504768521588e-05, + "loss": 0.2903, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 1.5184615805888484, + "learning_rate": 1.99984761530269e-05, + "loss": 0.2781, + "step": 696 + }, + { + "epoch": 0.04, + "grad_norm": 1.3640734092403373, + "learning_rate": 1.9998447266322974e-05, + "loss": 0.2571, + "step": 697 + }, + { + "epoch": 0.04, + "grad_norm": 1.0545124494588674, + "learning_rate": 1.9998418108410606e-05, + "loss": 0.2442, + "step": 698 + }, + { + "epoch": 0.04, + "grad_norm": 1.2493399526273568, + "learning_rate": 1.9998388679290583e-05, + "loss": 0.2984, + "step": 699 + }, + { + "epoch": 0.04, + "grad_norm": 1.1295762074044364, + "learning_rate": 1.9998358978963702e-05, + "loss": 0.2512, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 1.1603953126986037, + "learning_rate": 1.9998329007430767e-05, + "loss": 0.2696, + "step": 701 + }, + { + "epoch": 0.04, + "grad_norm": 1.2509835069608402, + "learning_rate": 1.9998298764692596e-05, + "loss": 0.2651, + "step": 702 + }, + { + "epoch": 0.04, + "grad_norm": 1.3348261112019737, + "learning_rate": 1.9998268250750006e-05, + "loss": 0.2583, + "step": 703 + }, + { + "epoch": 0.04, + "grad_norm": 1.3375545121799972, + "learning_rate": 1.9998237465603822e-05, + "loss": 0.28, + "step": 704 + }, + { + "epoch": 0.04, + "grad_norm": 1.363579686038191, + "learning_rate": 1.9998206409254886e-05, + "loss": 0.2937, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 1.6025082506089567, + "learning_rate": 1.9998175081704035e-05, + "loss": 0.2634, + "step": 706 + }, + { + "epoch": 0.04, + "grad_norm": 1.2722761240387512, + "learning_rate": 1.9998143482952117e-05, + "loss": 0.2599, + "step": 707 + }, + { + "epoch": 0.04, + "grad_norm": 5.599998265070209, + "learning_rate": 1.9998111612999995e-05, + "loss": 0.2505, + "step": 708 + }, + { + "epoch": 0.04, + "grad_norm": 1.317103804306265, + "learning_rate": 1.999807947184853e-05, + "loss": 0.2807, + "step": 709 + }, + { + "epoch": 0.04, + "grad_norm": 1.3710100936702176, + "learning_rate": 1.99980470594986e-05, + "loss": 0.2641, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 1.108376334270745, + "learning_rate": 1.9998014375951073e-05, + "loss": 0.2403, + "step": 711 + }, + { + "epoch": 0.04, + "grad_norm": 1.252535265060163, + "learning_rate": 1.999798142120684e-05, + "loss": 0.2769, + "step": 712 + }, + { + "epoch": 0.04, + "grad_norm": 1.3742660328462404, + "learning_rate": 1.99979481952668e-05, + "loss": 0.2906, + "step": 713 + }, + { + "epoch": 0.04, + "grad_norm": 3.644541022537266, + "learning_rate": 1.999791469813185e-05, + "loss": 0.257, + "step": 714 + }, + { + "epoch": 0.04, + "grad_norm": 1.0749151196722126, + "learning_rate": 1.9997880929802895e-05, + "loss": 0.262, + "step": 715 + }, + { + "epoch": 0.04, + "grad_norm": 1.1730231560972142, + "learning_rate": 1.999784689028086e-05, + "loss": 0.2644, + "step": 716 + }, + { + "epoch": 0.04, + "grad_norm": 1.6294158398432395, + "learning_rate": 1.999781257956666e-05, + "loss": 0.2706, + "step": 717 + }, + { + "epoch": 0.04, + "grad_norm": 1.0705353040911285, + "learning_rate": 1.999777799766123e-05, + "loss": 0.265, + "step": 718 + }, + { + "epoch": 0.04, + "grad_norm": 1.1518326861280468, + "learning_rate": 1.9997743144565513e-05, + "loss": 0.2823, + "step": 719 + }, + { + "epoch": 0.04, + "grad_norm": 1.17251836953955, + "learning_rate": 1.999770802028044e-05, + "loss": 0.2623, + "step": 720 + }, + { + "epoch": 0.04, + "grad_norm": 1.1856563147370942, + "learning_rate": 1.9997672624806976e-05, + "loss": 0.2777, + "step": 721 + }, + { + "epoch": 0.04, + "grad_norm": 1.4003250521694823, + "learning_rate": 1.999763695814608e-05, + "loss": 0.2464, + "step": 722 + }, + { + "epoch": 0.04, + "grad_norm": 2.5113897733883497, + "learning_rate": 1.9997601020298713e-05, + "loss": 0.2613, + "step": 723 + }, + { + "epoch": 0.04, + "grad_norm": 1.2219147153367094, + "learning_rate": 1.9997564811265854e-05, + "loss": 0.2616, + "step": 724 + }, + { + "epoch": 0.04, + "grad_norm": 1.4067252099576408, + "learning_rate": 1.999752833104849e-05, + "loss": 0.2842, + "step": 725 + }, + { + "epoch": 0.04, + "grad_norm": 1.0208594719205228, + "learning_rate": 1.9997491579647595e-05, + "loss": 0.2505, + "step": 726 + }, + { + "epoch": 0.04, + "grad_norm": 1.1036370771367006, + "learning_rate": 1.9997454557064185e-05, + "loss": 0.2498, + "step": 727 + }, + { + "epoch": 0.04, + "grad_norm": 1.7207498654087374, + "learning_rate": 1.9997417263299256e-05, + "loss": 0.2713, + "step": 728 + }, + { + "epoch": 0.04, + "grad_norm": 1.3992568424381269, + "learning_rate": 1.999737969835381e-05, + "loss": 0.2955, + "step": 729 + }, + { + "epoch": 0.04, + "grad_norm": 1.9982495451376947, + "learning_rate": 1.9997341862228886e-05, + "loss": 0.2583, + "step": 730 + }, + { + "epoch": 0.04, + "grad_norm": 1.3960847560767105, + "learning_rate": 1.999730375492549e-05, + "loss": 0.2939, + "step": 731 + }, + { + "epoch": 0.04, + "grad_norm": 1.214687864500847, + "learning_rate": 1.999726537644467e-05, + "loss": 0.254, + "step": 732 + }, + { + "epoch": 0.04, + "grad_norm": 1.4135176505419116, + "learning_rate": 1.9997226726787462e-05, + "loss": 0.3036, + "step": 733 + }, + { + "epoch": 0.04, + "grad_norm": 1.3404019047194708, + "learning_rate": 1.9997187805954916e-05, + "loss": 0.2803, + "step": 734 + }, + { + "epoch": 0.04, + "grad_norm": 1.0201205942546145, + "learning_rate": 1.999714861394808e-05, + "loss": 0.273, + "step": 735 + }, + { + "epoch": 0.04, + "grad_norm": 1.0396421943283112, + "learning_rate": 1.999710915076803e-05, + "loss": 0.2595, + "step": 736 + }, + { + "epoch": 0.04, + "grad_norm": 1.0398657233769981, + "learning_rate": 1.9997069416415824e-05, + "loss": 0.2853, + "step": 737 + }, + { + "epoch": 0.04, + "grad_norm": 1.0920534147148957, + "learning_rate": 1.9997029410892546e-05, + "loss": 0.241, + "step": 738 + }, + { + "epoch": 0.04, + "grad_norm": 1.0928042063839294, + "learning_rate": 1.9996989134199287e-05, + "loss": 0.2144, + "step": 739 + }, + { + "epoch": 0.04, + "grad_norm": 1.0198803572927333, + "learning_rate": 1.9996948586337127e-05, + "loss": 0.2572, + "step": 740 + }, + { + "epoch": 0.04, + "grad_norm": 1.4319185641684506, + "learning_rate": 1.9996907767307175e-05, + "loss": 0.2694, + "step": 741 + }, + { + "epoch": 0.04, + "grad_norm": 1.0858520979144273, + "learning_rate": 1.9996866677110534e-05, + "loss": 0.2594, + "step": 742 + }, + { + "epoch": 0.04, + "grad_norm": 1.3255623969997914, + "learning_rate": 1.999682531574832e-05, + "loss": 0.2538, + "step": 743 + }, + { + "epoch": 0.04, + "grad_norm": 1.0108943423155112, + "learning_rate": 1.9996783683221652e-05, + "loss": 0.2516, + "step": 744 + }, + { + "epoch": 0.04, + "grad_norm": 1.4094537154588183, + "learning_rate": 1.9996741779531665e-05, + "loss": 0.2707, + "step": 745 + }, + { + "epoch": 0.04, + "grad_norm": 1.1793015053224885, + "learning_rate": 1.9996699604679493e-05, + "loss": 0.2646, + "step": 746 + }, + { + "epoch": 0.04, + "grad_norm": 1.0792637757557646, + "learning_rate": 1.9996657158666276e-05, + "loss": 0.2478, + "step": 747 + }, + { + "epoch": 0.04, + "grad_norm": 1.039527097370698, + "learning_rate": 1.999661444149317e-05, + "loss": 0.2534, + "step": 748 + }, + { + "epoch": 0.04, + "grad_norm": 1.2559400175483508, + "learning_rate": 1.9996571453161338e-05, + "loss": 0.2882, + "step": 749 + }, + { + "epoch": 0.04, + "grad_norm": 1.1366971909735748, + "learning_rate": 1.9996528193671936e-05, + "loss": 0.2505, + "step": 750 + }, + { + "epoch": 0.04, + "grad_norm": 1.3078857802825534, + "learning_rate": 1.9996484663026143e-05, + "loss": 0.2835, + "step": 751 + }, + { + "epoch": 0.04, + "grad_norm": 1.0829375306862181, + "learning_rate": 1.999644086122514e-05, + "loss": 0.2479, + "step": 752 + }, + { + "epoch": 0.04, + "grad_norm": 1.4273810049424516, + "learning_rate": 1.999639678827011e-05, + "loss": 0.2669, + "step": 753 + }, + { + "epoch": 0.04, + "grad_norm": 1.3117200257674768, + "learning_rate": 1.9996352444162257e-05, + "loss": 0.2834, + "step": 754 + }, + { + "epoch": 0.04, + "grad_norm": 1.3033840703007538, + "learning_rate": 1.999630782890278e-05, + "loss": 0.24, + "step": 755 + }, + { + "epoch": 0.04, + "grad_norm": 1.4419353437645621, + "learning_rate": 1.9996262942492882e-05, + "loss": 0.2608, + "step": 756 + }, + { + "epoch": 0.04, + "grad_norm": 2.0632655417967007, + "learning_rate": 1.9996217784933794e-05, + "loss": 0.293, + "step": 757 + }, + { + "epoch": 0.04, + "grad_norm": 1.138628289340763, + "learning_rate": 1.999617235622673e-05, + "loss": 0.2835, + "step": 758 + }, + { + "epoch": 0.04, + "grad_norm": 1.8702915385259948, + "learning_rate": 1.999612665637293e-05, + "loss": 0.2692, + "step": 759 + }, + { + "epoch": 0.04, + "grad_norm": 1.130927538844785, + "learning_rate": 1.9996080685373628e-05, + "loss": 0.2563, + "step": 760 + }, + { + "epoch": 0.04, + "grad_norm": 1.1131329974735835, + "learning_rate": 1.999603444323007e-05, + "loss": 0.2744, + "step": 761 + }, + { + "epoch": 0.04, + "grad_norm": 1.5508343576824948, + "learning_rate": 1.9995987929943518e-05, + "loss": 0.2657, + "step": 762 + }, + { + "epoch": 0.04, + "grad_norm": 1.3218831953876735, + "learning_rate": 1.9995941145515224e-05, + "loss": 0.2584, + "step": 763 + }, + { + "epoch": 0.04, + "grad_norm": 1.1460227432051737, + "learning_rate": 1.9995894089946466e-05, + "loss": 0.2883, + "step": 764 + }, + { + "epoch": 0.04, + "grad_norm": 1.0196084487669637, + "learning_rate": 1.9995846763238514e-05, + "loss": 0.2501, + "step": 765 + }, + { + "epoch": 0.04, + "grad_norm": 1.0954833712086312, + "learning_rate": 1.9995799165392653e-05, + "loss": 0.2882, + "step": 766 + }, + { + "epoch": 0.04, + "grad_norm": 1.2774372509323548, + "learning_rate": 1.9995751296410176e-05, + "loss": 0.2535, + "step": 767 + }, + { + "epoch": 0.04, + "grad_norm": 1.103024655690764, + "learning_rate": 1.9995703156292382e-05, + "loss": 0.2699, + "step": 768 + }, + { + "epoch": 0.04, + "grad_norm": 1.126010661023456, + "learning_rate": 1.9995654745040575e-05, + "loss": 0.2746, + "step": 769 + }, + { + "epoch": 0.04, + "grad_norm": 1.2036940841199122, + "learning_rate": 1.999560606265607e-05, + "loss": 0.2846, + "step": 770 + }, + { + "epoch": 0.04, + "grad_norm": 1.053928772845716, + "learning_rate": 1.999555710914018e-05, + "loss": 0.2536, + "step": 771 + }, + { + "epoch": 0.04, + "grad_norm": 1.1459541009748426, + "learning_rate": 1.9995507884494244e-05, + "loss": 0.2503, + "step": 772 + }, + { + "epoch": 0.04, + "grad_norm": 1.2796107407868114, + "learning_rate": 1.999545838871959e-05, + "loss": 0.2575, + "step": 773 + }, + { + "epoch": 0.04, + "grad_norm": 1.313315822502975, + "learning_rate": 1.9995408621817566e-05, + "loss": 0.2616, + "step": 774 + }, + { + "epoch": 0.04, + "grad_norm": 1.6089597975605014, + "learning_rate": 1.9995358583789514e-05, + "loss": 0.245, + "step": 775 + }, + { + "epoch": 0.04, + "grad_norm": 1.1189952245805992, + "learning_rate": 1.99953082746368e-05, + "loss": 0.2719, + "step": 776 + }, + { + "epoch": 0.04, + "grad_norm": 1.1676561428296084, + "learning_rate": 1.9995257694360778e-05, + "loss": 0.2515, + "step": 777 + }, + { + "epoch": 0.04, + "grad_norm": 1.2390945203221548, + "learning_rate": 1.9995206842962833e-05, + "loss": 0.2815, + "step": 778 + }, + { + "epoch": 0.04, + "grad_norm": 1.284354717320916, + "learning_rate": 1.9995155720444336e-05, + "loss": 0.2474, + "step": 779 + }, + { + "epoch": 0.04, + "grad_norm": 1.2223633934856006, + "learning_rate": 1.9995104326806675e-05, + "loss": 0.2695, + "step": 780 + }, + { + "epoch": 0.04, + "grad_norm": 1.1957041076695898, + "learning_rate": 1.9995052662051244e-05, + "loss": 0.2401, + "step": 781 + }, + { + "epoch": 0.04, + "grad_norm": 1.6931627034823373, + "learning_rate": 1.999500072617945e-05, + "loss": 0.2469, + "step": 782 + }, + { + "epoch": 0.04, + "grad_norm": 1.1597265079302603, + "learning_rate": 1.999494851919269e-05, + "loss": 0.2868, + "step": 783 + }, + { + "epoch": 0.04, + "grad_norm": 1.8762144776870082, + "learning_rate": 1.999489604109239e-05, + "loss": 0.2919, + "step": 784 + }, + { + "epoch": 0.04, + "grad_norm": 1.1508642549126946, + "learning_rate": 1.999484329187997e-05, + "loss": 0.2577, + "step": 785 + }, + { + "epoch": 0.04, + "grad_norm": 1.1939915989038365, + "learning_rate": 1.9994790271556862e-05, + "loss": 0.2597, + "step": 786 + }, + { + "epoch": 0.04, + "grad_norm": 1.1376437711036782, + "learning_rate": 1.9994736980124502e-05, + "loss": 0.2614, + "step": 787 + }, + { + "epoch": 0.04, + "grad_norm": 1.1683273366283589, + "learning_rate": 1.9994683417584336e-05, + "loss": 0.2736, + "step": 788 + }, + { + "epoch": 0.04, + "grad_norm": 1.6975838851394127, + "learning_rate": 1.999462958393782e-05, + "loss": 0.2642, + "step": 789 + }, + { + "epoch": 0.04, + "grad_norm": 1.3563670815198583, + "learning_rate": 1.999457547918641e-05, + "loss": 0.2493, + "step": 790 + }, + { + "epoch": 0.04, + "grad_norm": 1.1854453853936364, + "learning_rate": 1.999452110333158e-05, + "loss": 0.2397, + "step": 791 + }, + { + "epoch": 0.04, + "grad_norm": 1.5034728936630222, + "learning_rate": 1.9994466456374796e-05, + "loss": 0.2834, + "step": 792 + }, + { + "epoch": 0.04, + "grad_norm": 0.9981359156760405, + "learning_rate": 1.9994411538317546e-05, + "loss": 0.2322, + "step": 793 + }, + { + "epoch": 0.04, + "grad_norm": 1.2749037646530144, + "learning_rate": 1.999435634916132e-05, + "loss": 0.2485, + "step": 794 + }, + { + "epoch": 0.04, + "grad_norm": 1.1573639241050016, + "learning_rate": 1.9994300888907613e-05, + "loss": 0.2847, + "step": 795 + }, + { + "epoch": 0.04, + "grad_norm": 0.9231922613268816, + "learning_rate": 1.999424515755793e-05, + "loss": 0.2389, + "step": 796 + }, + { + "epoch": 0.04, + "grad_norm": 1.092813811557751, + "learning_rate": 1.9994189155113778e-05, + "loss": 0.2567, + "step": 797 + }, + { + "epoch": 0.04, + "grad_norm": 1.097683225891256, + "learning_rate": 1.9994132881576685e-05, + "loss": 0.2702, + "step": 798 + }, + { + "epoch": 0.04, + "grad_norm": 1.1363590369891523, + "learning_rate": 1.9994076336948175e-05, + "loss": 0.2784, + "step": 799 + }, + { + "epoch": 0.04, + "grad_norm": 0.8879204954807518, + "learning_rate": 1.999401952122978e-05, + "loss": 0.2435, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 1.033548685282584, + "learning_rate": 1.9993962434423037e-05, + "loss": 0.2656, + "step": 801 + }, + { + "epoch": 0.04, + "grad_norm": 1.1875030010672916, + "learning_rate": 1.99939050765295e-05, + "loss": 0.2726, + "step": 802 + }, + { + "epoch": 0.04, + "grad_norm": 1.079155108315839, + "learning_rate": 1.9993847447550722e-05, + "loss": 0.2373, + "step": 803 + }, + { + "epoch": 0.04, + "grad_norm": 1.1246899751217057, + "learning_rate": 1.9993789547488268e-05, + "loss": 0.2584, + "step": 804 + }, + { + "epoch": 0.04, + "grad_norm": 1.0258829824673186, + "learning_rate": 1.999373137634371e-05, + "loss": 0.2619, + "step": 805 + }, + { + "epoch": 0.04, + "grad_norm": 1.105454260644435, + "learning_rate": 1.9993672934118625e-05, + "loss": 0.3153, + "step": 806 + }, + { + "epoch": 0.04, + "grad_norm": 0.971938599700853, + "learning_rate": 1.9993614220814594e-05, + "loss": 0.2531, + "step": 807 + }, + { + "epoch": 0.04, + "grad_norm": 1.1308239034584113, + "learning_rate": 1.9993555236433216e-05, + "loss": 0.2378, + "step": 808 + }, + { + "epoch": 0.04, + "grad_norm": 1.1821286909927475, + "learning_rate": 1.9993495980976084e-05, + "loss": 0.2461, + "step": 809 + }, + { + "epoch": 0.04, + "grad_norm": 1.0491796907842892, + "learning_rate": 1.9993436454444814e-05, + "loss": 0.2863, + "step": 810 + }, + { + "epoch": 0.04, + "grad_norm": 1.1794335903141098, + "learning_rate": 1.999337665684101e-05, + "loss": 0.2662, + "step": 811 + }, + { + "epoch": 0.04, + "grad_norm": 1.1534916181430674, + "learning_rate": 1.9993316588166307e-05, + "loss": 0.2768, + "step": 812 + }, + { + "epoch": 0.04, + "grad_norm": 1.127511520512681, + "learning_rate": 1.999325624842232e-05, + "loss": 0.2873, + "step": 813 + }, + { + "epoch": 0.04, + "grad_norm": 1.0337648173317513, + "learning_rate": 1.9993195637610695e-05, + "loss": 0.2319, + "step": 814 + }, + { + "epoch": 0.04, + "grad_norm": 1.1132394984821739, + "learning_rate": 1.9993134755733075e-05, + "loss": 0.2471, + "step": 815 + }, + { + "epoch": 0.04, + "grad_norm": 1.1475184237917153, + "learning_rate": 1.9993073602791108e-05, + "loss": 0.2765, + "step": 816 + }, + { + "epoch": 0.04, + "grad_norm": 1.333803344892296, + "learning_rate": 1.999301217878646e-05, + "loss": 0.2513, + "step": 817 + }, + { + "epoch": 0.04, + "grad_norm": 1.0499954721840008, + "learning_rate": 1.9992950483720787e-05, + "loss": 0.2605, + "step": 818 + }, + { + "epoch": 0.04, + "grad_norm": 1.3012802506530357, + "learning_rate": 1.999288851759577e-05, + "loss": 0.2676, + "step": 819 + }, + { + "epoch": 0.04, + "grad_norm": 1.4136693582266662, + "learning_rate": 1.9992826280413087e-05, + "loss": 0.2397, + "step": 820 + }, + { + "epoch": 0.04, + "grad_norm": 1.3517183530845074, + "learning_rate": 1.9992763772174427e-05, + "loss": 0.2772, + "step": 821 + }, + { + "epoch": 0.04, + "grad_norm": 0.9780065045060403, + "learning_rate": 1.9992700992881486e-05, + "loss": 0.2579, + "step": 822 + }, + { + "epoch": 0.04, + "grad_norm": 1.107034582008244, + "learning_rate": 1.9992637942535963e-05, + "loss": 0.3015, + "step": 823 + }, + { + "epoch": 0.04, + "grad_norm": 1.208106797361871, + "learning_rate": 1.9992574621139575e-05, + "loss": 0.2684, + "step": 824 + }, + { + "epoch": 0.04, + "grad_norm": 1.0879601895663045, + "learning_rate": 1.9992511028694036e-05, + "loss": 0.288, + "step": 825 + }, + { + "epoch": 0.04, + "grad_norm": 2.116435309323238, + "learning_rate": 1.999244716520107e-05, + "loss": 0.2718, + "step": 826 + }, + { + "epoch": 0.04, + "grad_norm": 1.3059275763713378, + "learning_rate": 1.9992383030662412e-05, + "loss": 0.307, + "step": 827 + }, + { + "epoch": 0.04, + "grad_norm": 1.6808967976744835, + "learning_rate": 1.9992318625079796e-05, + "loss": 0.2571, + "step": 828 + }, + { + "epoch": 0.04, + "grad_norm": 1.2453853874792142, + "learning_rate": 1.9992253948454975e-05, + "loss": 0.265, + "step": 829 + }, + { + "epoch": 0.04, + "grad_norm": 1.2037147547935059, + "learning_rate": 1.99921890007897e-05, + "loss": 0.2994, + "step": 830 + }, + { + "epoch": 0.04, + "grad_norm": 1.420063191800624, + "learning_rate": 1.9992123782085738e-05, + "loss": 0.2597, + "step": 831 + }, + { + "epoch": 0.04, + "grad_norm": 1.2273296556423356, + "learning_rate": 1.9992058292344853e-05, + "loss": 0.2513, + "step": 832 + }, + { + "epoch": 0.04, + "grad_norm": 1.1062136182815394, + "learning_rate": 1.9991992531568817e-05, + "loss": 0.2663, + "step": 833 + }, + { + "epoch": 0.04, + "grad_norm": 1.08435612624478, + "learning_rate": 1.9991926499759426e-05, + "loss": 0.2818, + "step": 834 + }, + { + "epoch": 0.04, + "grad_norm": 1.159958172965142, + "learning_rate": 1.999186019691846e-05, + "loss": 0.2732, + "step": 835 + }, + { + "epoch": 0.04, + "grad_norm": 1.0939438567851123, + "learning_rate": 1.9991793623047724e-05, + "loss": 0.2321, + "step": 836 + }, + { + "epoch": 0.04, + "grad_norm": 1.8205766434289725, + "learning_rate": 1.999172677814902e-05, + "loss": 0.2246, + "step": 837 + }, + { + "epoch": 0.04, + "grad_norm": 1.1672477240865653, + "learning_rate": 1.9991659662224166e-05, + "loss": 0.2701, + "step": 838 + }, + { + "epoch": 0.04, + "grad_norm": 1.129230834652707, + "learning_rate": 1.9991592275274976e-05, + "loss": 0.2614, + "step": 839 + }, + { + "epoch": 0.04, + "grad_norm": 1.7214179515732553, + "learning_rate": 1.9991524617303282e-05, + "loss": 0.2762, + "step": 840 + }, + { + "epoch": 0.04, + "grad_norm": 1.2950757995564135, + "learning_rate": 1.999145668831092e-05, + "loss": 0.2626, + "step": 841 + }, + { + "epoch": 0.04, + "grad_norm": 1.5470842349719043, + "learning_rate": 1.999138848829973e-05, + "loss": 0.2667, + "step": 842 + }, + { + "epoch": 0.04, + "grad_norm": 1.5064490380389803, + "learning_rate": 1.9991320017271562e-05, + "loss": 0.2616, + "step": 843 + }, + { + "epoch": 0.04, + "grad_norm": 1.1662089984074246, + "learning_rate": 1.9991251275228274e-05, + "loss": 0.2637, + "step": 844 + }, + { + "epoch": 0.04, + "grad_norm": 1.2511580698565579, + "learning_rate": 1.9991182262171734e-05, + "loss": 0.2811, + "step": 845 + }, + { + "epoch": 0.04, + "grad_norm": 1.1962141316679449, + "learning_rate": 1.9991112978103807e-05, + "loss": 0.2535, + "step": 846 + }, + { + "epoch": 0.04, + "grad_norm": 1.2435377890556183, + "learning_rate": 1.9991043423026377e-05, + "loss": 0.2707, + "step": 847 + }, + { + "epoch": 0.04, + "grad_norm": 1.125868593032254, + "learning_rate": 1.999097359694133e-05, + "loss": 0.2737, + "step": 848 + }, + { + "epoch": 0.04, + "grad_norm": 1.171550604371576, + "learning_rate": 1.999090349985056e-05, + "loss": 0.2519, + "step": 849 + }, + { + "epoch": 0.04, + "grad_norm": 1.3676752292310235, + "learning_rate": 1.999083313175597e-05, + "loss": 0.2485, + "step": 850 + }, + { + "epoch": 0.04, + "grad_norm": 1.2978859424907145, + "learning_rate": 1.9990762492659466e-05, + "loss": 0.2642, + "step": 851 + }, + { + "epoch": 0.04, + "grad_norm": 1.1216709783383152, + "learning_rate": 1.9990691582562963e-05, + "loss": 0.2571, + "step": 852 + }, + { + "epoch": 0.04, + "grad_norm": 1.4066273485739214, + "learning_rate": 1.9990620401468392e-05, + "loss": 0.2656, + "step": 853 + }, + { + "epoch": 0.04, + "grad_norm": 1.0970791764561405, + "learning_rate": 1.9990548949377674e-05, + "loss": 0.2225, + "step": 854 + }, + { + "epoch": 0.04, + "grad_norm": 1.1687899119933483, + "learning_rate": 1.999047722629275e-05, + "loss": 0.2735, + "step": 855 + }, + { + "epoch": 0.04, + "grad_norm": 1.5732209820343088, + "learning_rate": 1.999040523221557e-05, + "loss": 0.2543, + "step": 856 + }, + { + "epoch": 0.04, + "grad_norm": 1.725733160854452, + "learning_rate": 1.9990332967148082e-05, + "loss": 0.2354, + "step": 857 + }, + { + "epoch": 0.04, + "grad_norm": 2.28006853250872, + "learning_rate": 1.999026043109225e-05, + "loss": 0.2678, + "step": 858 + }, + { + "epoch": 0.04, + "grad_norm": 1.172545431709117, + "learning_rate": 1.9990187624050038e-05, + "loss": 0.2394, + "step": 859 + }, + { + "epoch": 0.04, + "grad_norm": 1.2752673779385229, + "learning_rate": 1.9990114546023423e-05, + "loss": 0.2663, + "step": 860 + }, + { + "epoch": 0.04, + "grad_norm": 1.0882818240971561, + "learning_rate": 1.999004119701439e-05, + "loss": 0.2499, + "step": 861 + }, + { + "epoch": 0.04, + "grad_norm": 0.9920608283590259, + "learning_rate": 1.9989967577024922e-05, + "loss": 0.2615, + "step": 862 + }, + { + "epoch": 0.04, + "grad_norm": 1.0741793303028535, + "learning_rate": 1.9989893686057016e-05, + "loss": 0.2781, + "step": 863 + }, + { + "epoch": 0.04, + "grad_norm": 1.1532640931043636, + "learning_rate": 1.9989819524112683e-05, + "loss": 0.2701, + "step": 864 + }, + { + "epoch": 0.04, + "grad_norm": 1.0592006955548727, + "learning_rate": 1.998974509119393e-05, + "loss": 0.2384, + "step": 865 + }, + { + "epoch": 0.04, + "grad_norm": 1.0794821739805955, + "learning_rate": 1.9989670387302783e-05, + "loss": 0.2575, + "step": 866 + }, + { + "epoch": 0.04, + "grad_norm": 0.9108946007980193, + "learning_rate": 1.9989595412441252e-05, + "loss": 0.2738, + "step": 867 + }, + { + "epoch": 0.04, + "grad_norm": 1.2301240916180438, + "learning_rate": 1.9989520166611388e-05, + "loss": 0.285, + "step": 868 + }, + { + "epoch": 0.04, + "grad_norm": 1.1232165575207511, + "learning_rate": 1.9989444649815226e-05, + "loss": 0.2408, + "step": 869 + }, + { + "epoch": 0.04, + "grad_norm": 1.3153763910607554, + "learning_rate": 1.9989368862054814e-05, + "loss": 0.2648, + "step": 870 + }, + { + "epoch": 0.04, + "grad_norm": 1.067960049988388, + "learning_rate": 1.9989292803332203e-05, + "loss": 0.2372, + "step": 871 + }, + { + "epoch": 0.04, + "grad_norm": 1.1159094593203644, + "learning_rate": 1.9989216473649466e-05, + "loss": 0.2354, + "step": 872 + }, + { + "epoch": 0.04, + "grad_norm": 1.0141369128982671, + "learning_rate": 1.998913987300866e-05, + "loss": 0.2533, + "step": 873 + }, + { + "epoch": 0.04, + "grad_norm": 1.0006771174963607, + "learning_rate": 1.998906300141188e-05, + "loss": 0.2808, + "step": 874 + }, + { + "epoch": 0.04, + "grad_norm": 1.116211648499011, + "learning_rate": 1.9988985858861193e-05, + "loss": 0.2637, + "step": 875 + }, + { + "epoch": 0.04, + "grad_norm": 1.2578617963336698, + "learning_rate": 1.9988908445358705e-05, + "loss": 0.2743, + "step": 876 + }, + { + "epoch": 0.04, + "grad_norm": 1.5131417020303695, + "learning_rate": 1.998883076090651e-05, + "loss": 0.2753, + "step": 877 + }, + { + "epoch": 0.04, + "grad_norm": 1.2599095826842825, + "learning_rate": 1.9988752805506723e-05, + "loss": 0.2678, + "step": 878 + }, + { + "epoch": 0.04, + "grad_norm": 0.9360444508399847, + "learning_rate": 1.9988674579161444e-05, + "loss": 0.2284, + "step": 879 + }, + { + "epoch": 0.04, + "grad_norm": 1.4269838584838916, + "learning_rate": 1.9988596081872805e-05, + "loss": 0.2552, + "step": 880 + }, + { + "epoch": 0.04, + "grad_norm": 1.2384542201113082, + "learning_rate": 1.9988517313642934e-05, + "loss": 0.2617, + "step": 881 + }, + { + "epoch": 0.04, + "grad_norm": 1.4157479977396126, + "learning_rate": 1.9988438274473966e-05, + "loss": 0.2937, + "step": 882 + }, + { + "epoch": 0.04, + "grad_norm": 1.481445241615998, + "learning_rate": 1.9988358964368046e-05, + "loss": 0.2472, + "step": 883 + }, + { + "epoch": 0.04, + "grad_norm": 1.4095283492098256, + "learning_rate": 1.9988279383327324e-05, + "loss": 0.279, + "step": 884 + }, + { + "epoch": 0.05, + "grad_norm": 1.1291656446775273, + "learning_rate": 1.9988199531353963e-05, + "loss": 0.2332, + "step": 885 + }, + { + "epoch": 0.05, + "grad_norm": 1.556219150900177, + "learning_rate": 1.998811940845012e-05, + "loss": 0.2166, + "step": 886 + }, + { + "epoch": 0.05, + "grad_norm": 1.1722666149602972, + "learning_rate": 1.998803901461798e-05, + "loss": 0.2168, + "step": 887 + }, + { + "epoch": 0.05, + "grad_norm": 1.362960636009129, + "learning_rate": 1.998795834985971e-05, + "loss": 0.2438, + "step": 888 + }, + { + "epoch": 0.05, + "grad_norm": 1.2382505267852677, + "learning_rate": 1.998787741417751e-05, + "loss": 0.2446, + "step": 889 + }, + { + "epoch": 0.05, + "grad_norm": 1.2519924974220127, + "learning_rate": 1.9987796207573573e-05, + "loss": 0.2247, + "step": 890 + }, + { + "epoch": 0.05, + "grad_norm": 1.138520059051484, + "learning_rate": 1.9987714730050098e-05, + "loss": 0.2394, + "step": 891 + }, + { + "epoch": 0.05, + "grad_norm": 1.184081525001222, + "learning_rate": 1.9987632981609297e-05, + "loss": 0.2342, + "step": 892 + }, + { + "epoch": 0.05, + "grad_norm": 1.4228525464925417, + "learning_rate": 1.9987550962253387e-05, + "loss": 0.2505, + "step": 893 + }, + { + "epoch": 0.05, + "grad_norm": 1.1732583727474242, + "learning_rate": 1.998746867198459e-05, + "loss": 0.2408, + "step": 894 + }, + { + "epoch": 0.05, + "grad_norm": 1.2929452831438806, + "learning_rate": 1.9987386110805146e-05, + "loss": 0.2636, + "step": 895 + }, + { + "epoch": 0.05, + "grad_norm": 1.1957505225992544, + "learning_rate": 1.9987303278717288e-05, + "loss": 0.2597, + "step": 896 + }, + { + "epoch": 0.05, + "grad_norm": 1.3406390837672997, + "learning_rate": 1.9987220175723265e-05, + "loss": 0.2643, + "step": 897 + }, + { + "epoch": 0.05, + "grad_norm": 1.1369635625540935, + "learning_rate": 1.998713680182533e-05, + "loss": 0.2523, + "step": 898 + }, + { + "epoch": 0.05, + "grad_norm": 1.5937134878395935, + "learning_rate": 1.9987053157025748e-05, + "loss": 0.2677, + "step": 899 + }, + { + "epoch": 0.05, + "grad_norm": 1.305439396439621, + "learning_rate": 1.998696924132678e-05, + "loss": 0.2856, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 1.3427530047357976, + "learning_rate": 1.9986885054730708e-05, + "loss": 0.2232, + "step": 901 + }, + { + "epoch": 0.05, + "grad_norm": 1.3349578827424304, + "learning_rate": 1.9986800597239817e-05, + "loss": 0.2687, + "step": 902 + }, + { + "epoch": 0.05, + "grad_norm": 1.2393032091995557, + "learning_rate": 1.9986715868856396e-05, + "loss": 0.2342, + "step": 903 + }, + { + "epoch": 0.05, + "grad_norm": 1.349656127097357, + "learning_rate": 1.998663086958274e-05, + "loss": 0.2683, + "step": 904 + }, + { + "epoch": 0.05, + "grad_norm": 1.4183409271315817, + "learning_rate": 1.998654559942116e-05, + "loss": 0.2843, + "step": 905 + }, + { + "epoch": 0.05, + "grad_norm": 1.8087602257457172, + "learning_rate": 1.998646005837397e-05, + "loss": 0.2616, + "step": 906 + }, + { + "epoch": 0.05, + "grad_norm": 1.5093979621304114, + "learning_rate": 1.998637424644348e-05, + "loss": 0.2367, + "step": 907 + }, + { + "epoch": 0.05, + "grad_norm": 1.2972565928670108, + "learning_rate": 1.998628816363203e-05, + "loss": 0.2765, + "step": 908 + }, + { + "epoch": 0.05, + "grad_norm": 1.1342135617558307, + "learning_rate": 1.9986201809941945e-05, + "loss": 0.2605, + "step": 909 + }, + { + "epoch": 0.05, + "grad_norm": 1.622288471832347, + "learning_rate": 1.9986115185375576e-05, + "loss": 0.2756, + "step": 910 + }, + { + "epoch": 0.05, + "grad_norm": 8.712971571944573, + "learning_rate": 1.9986028289935263e-05, + "loss": 0.3367, + "step": 911 + }, + { + "epoch": 0.05, + "grad_norm": 1.831004903324416, + "learning_rate": 1.9985941123623374e-05, + "loss": 0.2635, + "step": 912 + }, + { + "epoch": 0.05, + "grad_norm": 37.253091376437865, + "learning_rate": 1.9985853686442266e-05, + "loss": 0.425, + "step": 913 + }, + { + "epoch": 0.05, + "grad_norm": 1.6139419406539022, + "learning_rate": 1.9985765978394315e-05, + "loss": 0.2935, + "step": 914 + }, + { + "epoch": 0.05, + "grad_norm": 1.6099763905589324, + "learning_rate": 1.9985677999481898e-05, + "loss": 0.2702, + "step": 915 + }, + { + "epoch": 0.05, + "grad_norm": 1.6413119822026072, + "learning_rate": 1.9985589749707395e-05, + "loss": 0.2629, + "step": 916 + }, + { + "epoch": 0.05, + "grad_norm": 1.3524571716304779, + "learning_rate": 1.9985501229073213e-05, + "loss": 0.2703, + "step": 917 + }, + { + "epoch": 0.05, + "grad_norm": 1.0938158950381753, + "learning_rate": 1.998541243758174e-05, + "loss": 0.2704, + "step": 918 + }, + { + "epoch": 0.05, + "grad_norm": 1.1489715020729094, + "learning_rate": 1.9985323375235395e-05, + "loss": 0.2543, + "step": 919 + }, + { + "epoch": 0.05, + "grad_norm": 1.1731682133705816, + "learning_rate": 1.9985234042036588e-05, + "loss": 0.2608, + "step": 920 + }, + { + "epoch": 0.05, + "grad_norm": 0.9738308855528613, + "learning_rate": 1.9985144437987743e-05, + "loss": 0.2424, + "step": 921 + }, + { + "epoch": 0.05, + "grad_norm": 1.0626982152800977, + "learning_rate": 1.9985054563091295e-05, + "loss": 0.2758, + "step": 922 + }, + { + "epoch": 0.05, + "grad_norm": 1.426479209067473, + "learning_rate": 1.9984964417349675e-05, + "loss": 0.2733, + "step": 923 + }, + { + "epoch": 0.05, + "grad_norm": 1.1284519447109926, + "learning_rate": 1.998487400076533e-05, + "loss": 0.2457, + "step": 924 + }, + { + "epoch": 0.05, + "grad_norm": 1.0912469894763157, + "learning_rate": 1.9984783313340715e-05, + "loss": 0.2408, + "step": 925 + }, + { + "epoch": 0.05, + "grad_norm": 1.1812135886717172, + "learning_rate": 1.998469235507829e-05, + "loss": 0.2503, + "step": 926 + }, + { + "epoch": 0.05, + "grad_norm": 1.0073430784559474, + "learning_rate": 1.998460112598052e-05, + "loss": 0.244, + "step": 927 + }, + { + "epoch": 0.05, + "grad_norm": 1.3404204748861077, + "learning_rate": 1.998450962604988e-05, + "loss": 0.2885, + "step": 928 + }, + { + "epoch": 0.05, + "grad_norm": 1.7811568491496874, + "learning_rate": 1.9984417855288853e-05, + "loss": 0.2516, + "step": 929 + }, + { + "epoch": 0.05, + "grad_norm": 1.5443231025515465, + "learning_rate": 1.998432581369993e-05, + "loss": 0.2572, + "step": 930 + }, + { + "epoch": 0.05, + "grad_norm": 1.4293337544325062, + "learning_rate": 1.9984233501285602e-05, + "loss": 0.2992, + "step": 931 + }, + { + "epoch": 0.05, + "grad_norm": 1.284982874168832, + "learning_rate": 1.9984140918048376e-05, + "loss": 0.2399, + "step": 932 + }, + { + "epoch": 0.05, + "grad_norm": 1.3961467293148901, + "learning_rate": 1.9984048063990766e-05, + "loss": 0.2564, + "step": 933 + }, + { + "epoch": 0.05, + "grad_norm": 1.1794505830153215, + "learning_rate": 1.9983954939115286e-05, + "loss": 0.2417, + "step": 934 + }, + { + "epoch": 0.05, + "grad_norm": 1.2760274907553164, + "learning_rate": 1.9983861543424467e-05, + "loss": 0.2662, + "step": 935 + }, + { + "epoch": 0.05, + "grad_norm": 1.1984319455046042, + "learning_rate": 1.998376787692084e-05, + "loss": 0.2402, + "step": 936 + }, + { + "epoch": 0.05, + "grad_norm": 1.5134177656969858, + "learning_rate": 1.9983673939606946e-05, + "loss": 0.2641, + "step": 937 + }, + { + "epoch": 0.05, + "grad_norm": 1.08231174372219, + "learning_rate": 1.9983579731485326e-05, + "loss": 0.2398, + "step": 938 + }, + { + "epoch": 0.05, + "grad_norm": 1.3307559472577821, + "learning_rate": 1.998348525255855e-05, + "loss": 0.2773, + "step": 939 + }, + { + "epoch": 0.05, + "grad_norm": 1.256364695732902, + "learning_rate": 1.9983390502829168e-05, + "loss": 0.2815, + "step": 940 + }, + { + "epoch": 0.05, + "grad_norm": 1.2766424443497033, + "learning_rate": 1.9983295482299752e-05, + "loss": 0.2555, + "step": 941 + }, + { + "epoch": 0.05, + "grad_norm": 1.3076871853427046, + "learning_rate": 1.998320019097289e-05, + "loss": 0.234, + "step": 942 + }, + { + "epoch": 0.05, + "grad_norm": 1.7653987555897812, + "learning_rate": 1.9983104628851154e-05, + "loss": 0.2476, + "step": 943 + }, + { + "epoch": 0.05, + "grad_norm": 4.147047712588286, + "learning_rate": 1.9983008795937142e-05, + "loss": 0.291, + "step": 944 + }, + { + "epoch": 0.05, + "grad_norm": 1.9005551250112118, + "learning_rate": 1.9982912692233455e-05, + "loss": 0.2655, + "step": 945 + }, + { + "epoch": 0.05, + "grad_norm": 1.2830320142702902, + "learning_rate": 1.9982816317742694e-05, + "loss": 0.2607, + "step": 946 + }, + { + "epoch": 0.05, + "grad_norm": 1.6188483702984462, + "learning_rate": 1.9982719672467476e-05, + "loss": 0.2535, + "step": 947 + }, + { + "epoch": 0.05, + "grad_norm": 2.494189922904832, + "learning_rate": 1.9982622756410425e-05, + "loss": 0.263, + "step": 948 + }, + { + "epoch": 0.05, + "grad_norm": 1.6891101259528638, + "learning_rate": 1.998252556957417e-05, + "loss": 0.2674, + "step": 949 + }, + { + "epoch": 0.05, + "grad_norm": 1.4203377893373739, + "learning_rate": 1.998242811196134e-05, + "loss": 0.2332, + "step": 950 + }, + { + "epoch": 0.05, + "grad_norm": 1.5932351561865459, + "learning_rate": 1.9982330383574586e-05, + "loss": 0.2421, + "step": 951 + }, + { + "epoch": 0.05, + "grad_norm": 2.03124317920117, + "learning_rate": 1.998223238441656e-05, + "loss": 0.2498, + "step": 952 + }, + { + "epoch": 0.05, + "grad_norm": 1.8931422127100161, + "learning_rate": 1.9982134114489912e-05, + "loss": 0.2673, + "step": 953 + }, + { + "epoch": 0.05, + "grad_norm": 1.5731660080894065, + "learning_rate": 1.9982035573797315e-05, + "loss": 0.2766, + "step": 954 + }, + { + "epoch": 0.05, + "grad_norm": 1.2214546421076942, + "learning_rate": 1.9981936762341438e-05, + "loss": 0.2317, + "step": 955 + }, + { + "epoch": 0.05, + "grad_norm": 1.3338466940653078, + "learning_rate": 1.9981837680124963e-05, + "loss": 0.2238, + "step": 956 + }, + { + "epoch": 0.05, + "grad_norm": 1.4219834080515363, + "learning_rate": 1.9981738327150575e-05, + "loss": 0.2452, + "step": 957 + }, + { + "epoch": 0.05, + "grad_norm": 1.281365091349734, + "learning_rate": 1.9981638703420977e-05, + "loss": 0.2621, + "step": 958 + }, + { + "epoch": 0.05, + "grad_norm": 1.2303691416352147, + "learning_rate": 1.998153880893886e-05, + "loss": 0.2442, + "step": 959 + }, + { + "epoch": 0.05, + "grad_norm": 2.7164286811059375, + "learning_rate": 1.998143864370694e-05, + "loss": 0.25, + "step": 960 + }, + { + "epoch": 0.05, + "grad_norm": 1.1619798707542857, + "learning_rate": 1.998133820772793e-05, + "loss": 0.2448, + "step": 961 + }, + { + "epoch": 0.05, + "grad_norm": 1.4654543872298886, + "learning_rate": 1.998123750100456e-05, + "loss": 0.2592, + "step": 962 + }, + { + "epoch": 0.05, + "grad_norm": 1.0862977637195583, + "learning_rate": 1.9981136523539565e-05, + "loss": 0.2426, + "step": 963 + }, + { + "epoch": 0.05, + "grad_norm": 1.2390086218034442, + "learning_rate": 1.9981035275335672e-05, + "loss": 0.2557, + "step": 964 + }, + { + "epoch": 0.05, + "grad_norm": 1.504056753106432, + "learning_rate": 1.9980933756395635e-05, + "loss": 0.2697, + "step": 965 + }, + { + "epoch": 0.05, + "grad_norm": 1.398764234923714, + "learning_rate": 1.9980831966722204e-05, + "loss": 0.2398, + "step": 966 + }, + { + "epoch": 0.05, + "grad_norm": 1.2339532264497461, + "learning_rate": 1.9980729906318145e-05, + "loss": 0.264, + "step": 967 + }, + { + "epoch": 0.05, + "grad_norm": 1.4824979512173875, + "learning_rate": 1.998062757518622e-05, + "loss": 0.2453, + "step": 968 + }, + { + "epoch": 0.05, + "grad_norm": 1.7206314812947987, + "learning_rate": 1.998052497332921e-05, + "loss": 0.2569, + "step": 969 + }, + { + "epoch": 0.05, + "grad_norm": 1.6681319774857464, + "learning_rate": 1.99804221007499e-05, + "loss": 0.2418, + "step": 970 + }, + { + "epoch": 0.05, + "grad_norm": 1.2922792928709421, + "learning_rate": 1.9980318957451073e-05, + "loss": 0.233, + "step": 971 + }, + { + "epoch": 0.05, + "grad_norm": 1.3283707199306902, + "learning_rate": 1.9980215543435532e-05, + "loss": 0.2715, + "step": 972 + }, + { + "epoch": 0.05, + "grad_norm": 1.7199472670960634, + "learning_rate": 1.998011185870608e-05, + "loss": 0.2525, + "step": 973 + }, + { + "epoch": 0.05, + "grad_norm": 1.2668493722493168, + "learning_rate": 1.998000790326553e-05, + "loss": 0.25, + "step": 974 + }, + { + "epoch": 0.05, + "grad_norm": 3.4536780933019773, + "learning_rate": 1.9979903677116705e-05, + "loss": 0.2754, + "step": 975 + }, + { + "epoch": 0.05, + "grad_norm": 1.395271992192874, + "learning_rate": 1.9979799180262423e-05, + "loss": 0.2559, + "step": 976 + }, + { + "epoch": 0.05, + "grad_norm": 2.691859147708012, + "learning_rate": 1.997969441270553e-05, + "loss": 0.2739, + "step": 977 + }, + { + "epoch": 0.05, + "grad_norm": 1.3249583444644388, + "learning_rate": 1.997958937444886e-05, + "loss": 0.2406, + "step": 978 + }, + { + "epoch": 0.05, + "grad_norm": 1.1293700203760684, + "learning_rate": 1.9979484065495264e-05, + "loss": 0.2522, + "step": 979 + }, + { + "epoch": 0.05, + "grad_norm": 1.6352020696036085, + "learning_rate": 1.99793784858476e-05, + "loss": 0.2365, + "step": 980 + }, + { + "epoch": 0.05, + "grad_norm": 1.4766571166239415, + "learning_rate": 1.997927263550873e-05, + "loss": 0.2377, + "step": 981 + }, + { + "epoch": 0.05, + "grad_norm": 1.5825322786834481, + "learning_rate": 1.997916651448153e-05, + "loss": 0.283, + "step": 982 + }, + { + "epoch": 0.05, + "grad_norm": 1.5960016847073644, + "learning_rate": 1.997906012276887e-05, + "loss": 0.2336, + "step": 983 + }, + { + "epoch": 0.05, + "grad_norm": 1.7464179676633318, + "learning_rate": 1.9978953460373643e-05, + "loss": 0.2491, + "step": 984 + }, + { + "epoch": 0.05, + "grad_norm": 1.332022414231344, + "learning_rate": 1.997884652729874e-05, + "loss": 0.2708, + "step": 985 + }, + { + "epoch": 0.05, + "grad_norm": 1.4284193259813256, + "learning_rate": 1.997873932354706e-05, + "loss": 0.2558, + "step": 986 + }, + { + "epoch": 0.05, + "grad_norm": 1.2548510619479858, + "learning_rate": 1.9978631849121514e-05, + "loss": 0.23, + "step": 987 + }, + { + "epoch": 0.05, + "grad_norm": 2.5253198374407417, + "learning_rate": 1.997852410402501e-05, + "loss": 0.2598, + "step": 988 + }, + { + "epoch": 0.05, + "grad_norm": 1.6856588529543604, + "learning_rate": 1.9978416088260483e-05, + "loss": 0.2466, + "step": 989 + }, + { + "epoch": 0.05, + "grad_norm": 1.204933102528033, + "learning_rate": 1.9978307801830855e-05, + "loss": 0.2608, + "step": 990 + }, + { + "epoch": 0.05, + "grad_norm": 1.403160012398721, + "learning_rate": 1.997819924473906e-05, + "loss": 0.2457, + "step": 991 + }, + { + "epoch": 0.05, + "grad_norm": 1.3417670872825422, + "learning_rate": 1.997809041698805e-05, + "loss": 0.2481, + "step": 992 + }, + { + "epoch": 0.05, + "grad_norm": 1.9679789941414283, + "learning_rate": 1.9977981318580773e-05, + "loss": 0.2481, + "step": 993 + }, + { + "epoch": 0.05, + "grad_norm": 1.802566030293649, + "learning_rate": 1.9977871949520188e-05, + "loss": 0.2454, + "step": 994 + }, + { + "epoch": 0.05, + "grad_norm": 1.1290121014796652, + "learning_rate": 1.9977762309809266e-05, + "loss": 0.2532, + "step": 995 + }, + { + "epoch": 0.05, + "grad_norm": 1.4914383201086971, + "learning_rate": 1.9977652399450976e-05, + "loss": 0.273, + "step": 996 + }, + { + "epoch": 0.05, + "grad_norm": 1.4738780508956397, + "learning_rate": 1.99775422184483e-05, + "loss": 0.2562, + "step": 997 + }, + { + "epoch": 0.05, + "grad_norm": 1.431594373320625, + "learning_rate": 1.997743176680423e-05, + "loss": 0.2681, + "step": 998 + }, + { + "epoch": 0.05, + "grad_norm": 1.2916825659721045, + "learning_rate": 1.997732104452176e-05, + "loss": 0.2434, + "step": 999 + }, + { + "epoch": 0.05, + "grad_norm": 1.887959469677641, + "learning_rate": 1.997721005160389e-05, + "loss": 0.2451, + "step": 1000 + }, + { + "epoch": 0.05, + "grad_norm": 1.4206349978995914, + "learning_rate": 1.9977098788053637e-05, + "loss": 0.262, + "step": 1001 + }, + { + "epoch": 0.05, + "grad_norm": 1.146152194745477, + "learning_rate": 1.9976987253874016e-05, + "loss": 0.2638, + "step": 1002 + }, + { + "epoch": 0.05, + "grad_norm": 1.318586277766884, + "learning_rate": 1.997687544906805e-05, + "loss": 0.2635, + "step": 1003 + }, + { + "epoch": 0.05, + "grad_norm": 1.3449964495777744, + "learning_rate": 1.9976763373638773e-05, + "loss": 0.2805, + "step": 1004 + }, + { + "epoch": 0.05, + "grad_norm": 1.3295176549041057, + "learning_rate": 1.997665102758923e-05, + "loss": 0.2925, + "step": 1005 + }, + { + "epoch": 0.05, + "grad_norm": 1.356327669023187, + "learning_rate": 1.997653841092246e-05, + "loss": 0.2487, + "step": 1006 + }, + { + "epoch": 0.05, + "grad_norm": 1.4765934626224548, + "learning_rate": 1.9976425523641527e-05, + "loss": 0.2929, + "step": 1007 + }, + { + "epoch": 0.05, + "grad_norm": 1.497001144648849, + "learning_rate": 1.9976312365749484e-05, + "loss": 0.2477, + "step": 1008 + }, + { + "epoch": 0.05, + "grad_norm": 1.5166775118877334, + "learning_rate": 1.9976198937249408e-05, + "loss": 0.2361, + "step": 1009 + }, + { + "epoch": 0.05, + "grad_norm": 1.4083767570633934, + "learning_rate": 1.997608523814437e-05, + "loss": 0.254, + "step": 1010 + }, + { + "epoch": 0.05, + "grad_norm": 1.540794321329871, + "learning_rate": 1.9975971268437457e-05, + "loss": 0.2738, + "step": 1011 + }, + { + "epoch": 0.05, + "grad_norm": 1.668700196997005, + "learning_rate": 1.997585702813176e-05, + "loss": 0.2793, + "step": 1012 + }, + { + "epoch": 0.05, + "grad_norm": 1.5112483099308371, + "learning_rate": 1.9975742517230377e-05, + "loss": 0.2443, + "step": 1013 + }, + { + "epoch": 0.05, + "grad_norm": 1.5205434194057652, + "learning_rate": 1.9975627735736416e-05, + "loss": 0.2532, + "step": 1014 + }, + { + "epoch": 0.05, + "grad_norm": 1.40062000800819, + "learning_rate": 1.9975512683652985e-05, + "loss": 0.2509, + "step": 1015 + }, + { + "epoch": 0.05, + "grad_norm": 1.4963993851155675, + "learning_rate": 1.9975397360983216e-05, + "loss": 0.24, + "step": 1016 + }, + { + "epoch": 0.05, + "grad_norm": 2.1069124215856547, + "learning_rate": 1.9975281767730226e-05, + "loss": 0.2256, + "step": 1017 + }, + { + "epoch": 0.05, + "grad_norm": 1.4490239252320818, + "learning_rate": 1.9975165903897155e-05, + "loss": 0.2318, + "step": 1018 + }, + { + "epoch": 0.05, + "grad_norm": 1.594997764001681, + "learning_rate": 1.9975049769487147e-05, + "loss": 0.2416, + "step": 1019 + }, + { + "epoch": 0.05, + "grad_norm": 1.7645257393696694, + "learning_rate": 1.9974933364503347e-05, + "loss": 0.2589, + "step": 1020 + }, + { + "epoch": 0.05, + "grad_norm": 1.288053085705425, + "learning_rate": 1.9974816688948923e-05, + "loss": 0.2536, + "step": 1021 + }, + { + "epoch": 0.05, + "grad_norm": 1.2860181373293385, + "learning_rate": 1.9974699742827028e-05, + "loss": 0.2414, + "step": 1022 + }, + { + "epoch": 0.05, + "grad_norm": 1.4077289076246466, + "learning_rate": 1.997458252614084e-05, + "loss": 0.2416, + "step": 1023 + }, + { + "epoch": 0.05, + "grad_norm": 2.378778097881167, + "learning_rate": 1.9974465038893535e-05, + "loss": 0.2535, + "step": 1024 + }, + { + "epoch": 0.05, + "grad_norm": 1.3503965136445129, + "learning_rate": 1.9974347281088305e-05, + "loss": 0.2676, + "step": 1025 + }, + { + "epoch": 0.05, + "grad_norm": 1.236924180487533, + "learning_rate": 1.9974229252728345e-05, + "loss": 0.2545, + "step": 1026 + }, + { + "epoch": 0.05, + "grad_norm": 1.0346377489547494, + "learning_rate": 1.9974110953816846e-05, + "loss": 0.239, + "step": 1027 + }, + { + "epoch": 0.05, + "grad_norm": 1.3754934009638555, + "learning_rate": 1.997399238435703e-05, + "loss": 0.288, + "step": 1028 + }, + { + "epoch": 0.05, + "grad_norm": 1.2183415405104483, + "learning_rate": 1.99738735443521e-05, + "loss": 0.2624, + "step": 1029 + }, + { + "epoch": 0.05, + "grad_norm": 0.9613397656490309, + "learning_rate": 1.9973754433805294e-05, + "loss": 0.2424, + "step": 1030 + }, + { + "epoch": 0.05, + "grad_norm": 1.6289514132511271, + "learning_rate": 1.9973635052719836e-05, + "loss": 0.2523, + "step": 1031 + }, + { + "epoch": 0.05, + "grad_norm": 1.0487668403001444, + "learning_rate": 1.997351540109896e-05, + "loss": 0.2854, + "step": 1032 + }, + { + "epoch": 0.05, + "grad_norm": 1.3730747343031353, + "learning_rate": 1.9973395478945917e-05, + "loss": 0.251, + "step": 1033 + }, + { + "epoch": 0.05, + "grad_norm": 1.247569885594585, + "learning_rate": 1.9973275286263955e-05, + "loss": 0.2465, + "step": 1034 + }, + { + "epoch": 0.05, + "grad_norm": 1.200220973084389, + "learning_rate": 1.9973154823056343e-05, + "loss": 0.2733, + "step": 1035 + }, + { + "epoch": 0.05, + "grad_norm": 1.1170295827926, + "learning_rate": 1.997303408932634e-05, + "loss": 0.2553, + "step": 1036 + }, + { + "epoch": 0.05, + "grad_norm": 1.234375683527027, + "learning_rate": 1.9972913085077225e-05, + "loss": 0.2682, + "step": 1037 + }, + { + "epoch": 0.05, + "grad_norm": 1.2679487419761017, + "learning_rate": 1.997279181031228e-05, + "loss": 0.272, + "step": 1038 + }, + { + "epoch": 0.05, + "grad_norm": 1.6447639613922878, + "learning_rate": 1.997267026503479e-05, + "loss": 0.3011, + "step": 1039 + }, + { + "epoch": 0.05, + "grad_norm": 1.3634475769344572, + "learning_rate": 1.997254844924806e-05, + "loss": 0.2631, + "step": 1040 + }, + { + "epoch": 0.05, + "grad_norm": 1.2718450470082385, + "learning_rate": 1.997242636295539e-05, + "loss": 0.2412, + "step": 1041 + }, + { + "epoch": 0.05, + "grad_norm": 1.0108447929482707, + "learning_rate": 1.997230400616009e-05, + "loss": 0.237, + "step": 1042 + }, + { + "epoch": 0.05, + "grad_norm": 1.1645213079118888, + "learning_rate": 1.997218137886548e-05, + "loss": 0.2645, + "step": 1043 + }, + { + "epoch": 0.05, + "grad_norm": 1.0614791909352574, + "learning_rate": 1.997205848107489e-05, + "loss": 0.2693, + "step": 1044 + }, + { + "epoch": 0.05, + "grad_norm": 1.4634827744750394, + "learning_rate": 1.9971935312791646e-05, + "loss": 0.2357, + "step": 1045 + }, + { + "epoch": 0.05, + "grad_norm": 0.9872275898199967, + "learning_rate": 1.9971811874019096e-05, + "loss": 0.2554, + "step": 1046 + }, + { + "epoch": 0.05, + "grad_norm": 1.2016246852199257, + "learning_rate": 1.9971688164760588e-05, + "loss": 0.2769, + "step": 1047 + }, + { + "epoch": 0.05, + "grad_norm": 3.9982730088115166, + "learning_rate": 1.997156418501947e-05, + "loss": 0.2465, + "step": 1048 + }, + { + "epoch": 0.05, + "grad_norm": 1.6992277962147386, + "learning_rate": 1.9971439934799113e-05, + "loss": 0.2723, + "step": 1049 + }, + { + "epoch": 0.05, + "grad_norm": 1.0004795906439163, + "learning_rate": 1.9971315414102886e-05, + "loss": 0.2699, + "step": 1050 + }, + { + "epoch": 0.05, + "grad_norm": 1.2129500121770678, + "learning_rate": 1.9971190622934164e-05, + "loss": 0.2474, + "step": 1051 + }, + { + "epoch": 0.05, + "grad_norm": 2.131678439889529, + "learning_rate": 1.9971065561296334e-05, + "loss": 0.2638, + "step": 1052 + }, + { + "epoch": 0.05, + "grad_norm": 1.1363756749163911, + "learning_rate": 1.9970940229192785e-05, + "loss": 0.2406, + "step": 1053 + }, + { + "epoch": 0.05, + "grad_norm": 1.2498200840780742, + "learning_rate": 1.997081462662692e-05, + "loss": 0.2639, + "step": 1054 + }, + { + "epoch": 0.05, + "grad_norm": 1.1762334890050876, + "learning_rate": 1.997068875360215e-05, + "loss": 0.2543, + "step": 1055 + }, + { + "epoch": 0.05, + "grad_norm": 1.0024363148955997, + "learning_rate": 1.9970562610121878e-05, + "loss": 0.2289, + "step": 1056 + }, + { + "epoch": 0.05, + "grad_norm": 1.1811960528971635, + "learning_rate": 1.9970436196189534e-05, + "loss": 0.2478, + "step": 1057 + }, + { + "epoch": 0.05, + "grad_norm": 1.1637800873859996, + "learning_rate": 1.9970309511808544e-05, + "loss": 0.2465, + "step": 1058 + }, + { + "epoch": 0.05, + "grad_norm": 0.979550169262176, + "learning_rate": 1.997018255698235e-05, + "loss": 0.2214, + "step": 1059 + }, + { + "epoch": 0.05, + "grad_norm": 1.16589763971035, + "learning_rate": 1.9970055331714383e-05, + "loss": 0.2711, + "step": 1060 + }, + { + "epoch": 0.05, + "grad_norm": 1.1246384724575211, + "learning_rate": 1.9969927836008106e-05, + "loss": 0.2775, + "step": 1061 + }, + { + "epoch": 0.05, + "grad_norm": 1.7036162064431464, + "learning_rate": 1.9969800069866977e-05, + "loss": 0.2495, + "step": 1062 + }, + { + "epoch": 0.05, + "grad_norm": 1.1887124846196262, + "learning_rate": 1.9969672033294456e-05, + "loss": 0.2403, + "step": 1063 + }, + { + "epoch": 0.05, + "grad_norm": 1.1111538706770012, + "learning_rate": 1.9969543726294015e-05, + "loss": 0.2787, + "step": 1064 + }, + { + "epoch": 0.05, + "grad_norm": 1.344197692512503, + "learning_rate": 1.996941514886914e-05, + "loss": 0.2406, + "step": 1065 + }, + { + "epoch": 0.05, + "grad_norm": 0.9370858572857556, + "learning_rate": 1.9969286301023313e-05, + "loss": 0.2453, + "step": 1066 + }, + { + "epoch": 0.05, + "grad_norm": 1.317987916437125, + "learning_rate": 1.9969157182760038e-05, + "loss": 0.2469, + "step": 1067 + }, + { + "epoch": 0.05, + "grad_norm": 1.4308374640331052, + "learning_rate": 1.9969027794082805e-05, + "loss": 0.2618, + "step": 1068 + }, + { + "epoch": 0.05, + "grad_norm": 1.0128360475469447, + "learning_rate": 1.9968898134995133e-05, + "loss": 0.2589, + "step": 1069 + }, + { + "epoch": 0.05, + "grad_norm": 1.2206987095575548, + "learning_rate": 1.9968768205500537e-05, + "loss": 0.2539, + "step": 1070 + }, + { + "epoch": 0.05, + "grad_norm": 1.2616967144566469, + "learning_rate": 1.996863800560254e-05, + "loss": 0.2561, + "step": 1071 + }, + { + "epoch": 0.05, + "grad_norm": 1.2420509612739192, + "learning_rate": 1.9968507535304673e-05, + "loss": 0.2488, + "step": 1072 + }, + { + "epoch": 0.05, + "grad_norm": 1.6531805252472223, + "learning_rate": 1.9968376794610476e-05, + "loss": 0.2473, + "step": 1073 + }, + { + "epoch": 0.05, + "grad_norm": 1.512339883342327, + "learning_rate": 1.9968245783523494e-05, + "loss": 0.2761, + "step": 1074 + }, + { + "epoch": 0.05, + "grad_norm": 1.1497848143449239, + "learning_rate": 1.9968114502047285e-05, + "loss": 0.2543, + "step": 1075 + }, + { + "epoch": 0.05, + "grad_norm": 1.3298769541275925, + "learning_rate": 1.9967982950185406e-05, + "loss": 0.2466, + "step": 1076 + }, + { + "epoch": 0.05, + "grad_norm": 1.1633256939356942, + "learning_rate": 1.9967851127941428e-05, + "loss": 0.2452, + "step": 1077 + }, + { + "epoch": 0.05, + "grad_norm": 1.4220440751364178, + "learning_rate": 1.9967719035318923e-05, + "loss": 0.2764, + "step": 1078 + }, + { + "epoch": 0.05, + "grad_norm": 1.1132936198571883, + "learning_rate": 1.996758667232148e-05, + "loss": 0.26, + "step": 1079 + }, + { + "epoch": 0.05, + "grad_norm": 1.6312747105898855, + "learning_rate": 1.996745403895268e-05, + "loss": 0.2669, + "step": 1080 + }, + { + "epoch": 0.05, + "grad_norm": 1.6402142909338209, + "learning_rate": 1.996732113521613e-05, + "loss": 0.2457, + "step": 1081 + }, + { + "epoch": 0.06, + "grad_norm": 2.0667893373200155, + "learning_rate": 1.996718796111543e-05, + "loss": 0.2608, + "step": 1082 + }, + { + "epoch": 0.06, + "grad_norm": 1.852776127345349, + "learning_rate": 1.9967054516654192e-05, + "loss": 0.2296, + "step": 1083 + }, + { + "epoch": 0.06, + "grad_norm": 1.409690469886677, + "learning_rate": 1.996692080183604e-05, + "loss": 0.2808, + "step": 1084 + }, + { + "epoch": 0.06, + "grad_norm": 1.3043579795586937, + "learning_rate": 1.9966786816664595e-05, + "loss": 0.2389, + "step": 1085 + }, + { + "epoch": 0.06, + "grad_norm": 1.5235254463247614, + "learning_rate": 1.9966652561143497e-05, + "loss": 0.2704, + "step": 1086 + }, + { + "epoch": 0.06, + "grad_norm": 1.1451019290362825, + "learning_rate": 1.9966518035276386e-05, + "loss": 0.2436, + "step": 1087 + }, + { + "epoch": 0.06, + "grad_norm": 1.1315370371336715, + "learning_rate": 1.996638323906691e-05, + "loss": 0.2551, + "step": 1088 + }, + { + "epoch": 0.06, + "grad_norm": 1.2459189550689043, + "learning_rate": 1.9966248172518724e-05, + "loss": 0.2388, + "step": 1089 + }, + { + "epoch": 0.06, + "grad_norm": 1.0925475069915203, + "learning_rate": 1.9966112835635493e-05, + "loss": 0.2755, + "step": 1090 + }, + { + "epoch": 0.06, + "grad_norm": 1.5310245971490808, + "learning_rate": 1.996597722842089e-05, + "loss": 0.2449, + "step": 1091 + }, + { + "epoch": 0.06, + "grad_norm": 1.1293064532018613, + "learning_rate": 1.9965841350878594e-05, + "loss": 0.2457, + "step": 1092 + }, + { + "epoch": 0.06, + "grad_norm": 1.270511290540653, + "learning_rate": 1.9965705203012288e-05, + "loss": 0.2584, + "step": 1093 + }, + { + "epoch": 0.06, + "grad_norm": 1.3994165582848725, + "learning_rate": 1.9965568784825665e-05, + "loss": 0.2679, + "step": 1094 + }, + { + "epoch": 0.06, + "grad_norm": 1.2426772281808531, + "learning_rate": 1.9965432096322423e-05, + "loss": 0.2421, + "step": 1095 + }, + { + "epoch": 0.06, + "grad_norm": 1.3840821487760482, + "learning_rate": 1.9965295137506275e-05, + "loss": 0.2817, + "step": 1096 + }, + { + "epoch": 0.06, + "grad_norm": 1.5904477416527278, + "learning_rate": 1.9965157908380934e-05, + "loss": 0.2625, + "step": 1097 + }, + { + "epoch": 0.06, + "grad_norm": 1.0619560169492481, + "learning_rate": 1.996502040895012e-05, + "loss": 0.2489, + "step": 1098 + }, + { + "epoch": 0.06, + "grad_norm": 1.1477414335171585, + "learning_rate": 1.9964882639217564e-05, + "loss": 0.2561, + "step": 1099 + }, + { + "epoch": 0.06, + "grad_norm": 1.2290065958182672, + "learning_rate": 1.9964744599187006e-05, + "loss": 0.2423, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 1.0360763227358927, + "learning_rate": 1.9964606288862187e-05, + "loss": 0.2513, + "step": 1101 + }, + { + "epoch": 0.06, + "grad_norm": 1.0421375048836479, + "learning_rate": 1.9964467708246858e-05, + "loss": 0.2321, + "step": 1102 + }, + { + "epoch": 0.06, + "grad_norm": 1.1176614561117166, + "learning_rate": 1.9964328857344782e-05, + "loss": 0.2346, + "step": 1103 + }, + { + "epoch": 0.06, + "grad_norm": 1.1864842864031386, + "learning_rate": 1.9964189736159724e-05, + "loss": 0.2379, + "step": 1104 + }, + { + "epoch": 0.06, + "grad_norm": 0.9928267143128622, + "learning_rate": 1.9964050344695454e-05, + "loss": 0.2471, + "step": 1105 + }, + { + "epoch": 0.06, + "grad_norm": 1.2547197785017616, + "learning_rate": 1.9963910682955755e-05, + "loss": 0.2796, + "step": 1106 + }, + { + "epoch": 0.06, + "grad_norm": 1.108318882422375, + "learning_rate": 1.9963770750944416e-05, + "loss": 0.2731, + "step": 1107 + }, + { + "epoch": 0.06, + "grad_norm": 1.3881418626610218, + "learning_rate": 1.9963630548665234e-05, + "loss": 0.2565, + "step": 1108 + }, + { + "epoch": 0.06, + "grad_norm": 1.0514992110398136, + "learning_rate": 1.9963490076122013e-05, + "loss": 0.2517, + "step": 1109 + }, + { + "epoch": 0.06, + "grad_norm": 1.0593218515735172, + "learning_rate": 1.9963349333318557e-05, + "loss": 0.2534, + "step": 1110 + }, + { + "epoch": 0.06, + "grad_norm": 1.118821356451065, + "learning_rate": 1.996320832025869e-05, + "loss": 0.2423, + "step": 1111 + }, + { + "epoch": 0.06, + "grad_norm": 1.303669999694777, + "learning_rate": 1.9963067036946234e-05, + "loss": 0.2487, + "step": 1112 + }, + { + "epoch": 0.06, + "grad_norm": 0.9298685030587246, + "learning_rate": 1.996292548338502e-05, + "loss": 0.255, + "step": 1113 + }, + { + "epoch": 0.06, + "grad_norm": 1.1029318356890085, + "learning_rate": 1.9962783659578893e-05, + "loss": 0.2728, + "step": 1114 + }, + { + "epoch": 0.06, + "grad_norm": 1.0197604969095162, + "learning_rate": 1.9962641565531694e-05, + "loss": 0.2539, + "step": 1115 + }, + { + "epoch": 0.06, + "grad_norm": 1.4415415643758744, + "learning_rate": 1.9962499201247278e-05, + "loss": 0.2628, + "step": 1116 + }, + { + "epoch": 0.06, + "grad_norm": 1.083338942951384, + "learning_rate": 1.996235656672951e-05, + "loss": 0.2233, + "step": 1117 + }, + { + "epoch": 0.06, + "grad_norm": 2.1160686080415427, + "learning_rate": 1.9962213661982258e-05, + "loss": 0.2383, + "step": 1118 + }, + { + "epoch": 0.06, + "grad_norm": 1.3011374875117159, + "learning_rate": 1.99620704870094e-05, + "loss": 0.2546, + "step": 1119 + }, + { + "epoch": 0.06, + "grad_norm": 1.2663679967062615, + "learning_rate": 1.9961927041814818e-05, + "loss": 0.2764, + "step": 1120 + }, + { + "epoch": 0.06, + "grad_norm": 4.08364336655249, + "learning_rate": 1.99617833264024e-05, + "loss": 0.2728, + "step": 1121 + }, + { + "epoch": 0.06, + "grad_norm": 1.1913191519966877, + "learning_rate": 1.9961639340776044e-05, + "loss": 0.2662, + "step": 1122 + }, + { + "epoch": 0.06, + "grad_norm": 1.337006850164463, + "learning_rate": 1.9961495084939663e-05, + "loss": 0.2617, + "step": 1123 + }, + { + "epoch": 0.06, + "grad_norm": 0.9984066446142431, + "learning_rate": 1.9961350558897165e-05, + "loss": 0.2304, + "step": 1124 + }, + { + "epoch": 0.06, + "grad_norm": 1.1472444721091424, + "learning_rate": 1.996120576265247e-05, + "loss": 0.2582, + "step": 1125 + }, + { + "epoch": 0.06, + "grad_norm": 1.0510611344781502, + "learning_rate": 1.99610606962095e-05, + "loss": 0.2411, + "step": 1126 + }, + { + "epoch": 0.06, + "grad_norm": 1.1413980782504782, + "learning_rate": 1.99609153595722e-05, + "loss": 0.2501, + "step": 1127 + }, + { + "epoch": 0.06, + "grad_norm": 0.9585331468733518, + "learning_rate": 1.9960769752744508e-05, + "loss": 0.2266, + "step": 1128 + }, + { + "epoch": 0.06, + "grad_norm": 1.0384063965422534, + "learning_rate": 1.9960623875730376e-05, + "loss": 0.2433, + "step": 1129 + }, + { + "epoch": 0.06, + "grad_norm": 0.9887923427738782, + "learning_rate": 1.9960477728533756e-05, + "loss": 0.242, + "step": 1130 + }, + { + "epoch": 0.06, + "grad_norm": 1.2281455484801602, + "learning_rate": 1.9960331311158618e-05, + "loss": 0.2478, + "step": 1131 + }, + { + "epoch": 0.06, + "grad_norm": 1.1420820808451997, + "learning_rate": 1.9960184623608927e-05, + "loss": 0.2313, + "step": 1132 + }, + { + "epoch": 0.06, + "grad_norm": 1.1291272243771713, + "learning_rate": 1.996003766588867e-05, + "loss": 0.2435, + "step": 1133 + }, + { + "epoch": 0.06, + "grad_norm": 1.291842843117987, + "learning_rate": 1.9959890438001826e-05, + "loss": 0.2505, + "step": 1134 + }, + { + "epoch": 0.06, + "grad_norm": 1.0968369271525031, + "learning_rate": 1.9959742939952393e-05, + "loss": 0.2585, + "step": 1135 + }, + { + "epoch": 0.06, + "grad_norm": 1.1421824643955416, + "learning_rate": 1.9959595171744367e-05, + "loss": 0.2725, + "step": 1136 + }, + { + "epoch": 0.06, + "grad_norm": 0.9766039383545265, + "learning_rate": 1.9959447133381762e-05, + "loss": 0.2533, + "step": 1137 + }, + { + "epoch": 0.06, + "grad_norm": 0.9345271213518466, + "learning_rate": 1.995929882486859e-05, + "loss": 0.241, + "step": 1138 + }, + { + "epoch": 0.06, + "grad_norm": 1.187322374200805, + "learning_rate": 1.9959150246208876e-05, + "loss": 0.2347, + "step": 1139 + }, + { + "epoch": 0.06, + "grad_norm": 1.0532689372785176, + "learning_rate": 1.995900139740665e-05, + "loss": 0.2603, + "step": 1140 + }, + { + "epoch": 0.06, + "grad_norm": 1.5080362513166135, + "learning_rate": 1.9958852278465946e-05, + "loss": 0.2028, + "step": 1141 + }, + { + "epoch": 0.06, + "grad_norm": 1.1438427953969899, + "learning_rate": 1.9958702889390813e-05, + "loss": 0.2538, + "step": 1142 + }, + { + "epoch": 0.06, + "grad_norm": 1.3178650171897648, + "learning_rate": 1.99585532301853e-05, + "loss": 0.2687, + "step": 1143 + }, + { + "epoch": 0.06, + "grad_norm": 1.0837016465004479, + "learning_rate": 1.9958403300853472e-05, + "loss": 0.2732, + "step": 1144 + }, + { + "epoch": 0.06, + "grad_norm": 1.151385755054102, + "learning_rate": 1.9958253101399388e-05, + "loss": 0.2318, + "step": 1145 + }, + { + "epoch": 0.06, + "grad_norm": 1.0980774803415834, + "learning_rate": 1.9958102631827127e-05, + "loss": 0.2398, + "step": 1146 + }, + { + "epoch": 0.06, + "grad_norm": 1.0518834716128322, + "learning_rate": 1.995795189214077e-05, + "loss": 0.2567, + "step": 1147 + }, + { + "epoch": 0.06, + "grad_norm": 0.9774327625199531, + "learning_rate": 1.9957800882344406e-05, + "loss": 0.2912, + "step": 1148 + }, + { + "epoch": 0.06, + "grad_norm": 0.9752705364199556, + "learning_rate": 1.9957649602442132e-05, + "loss": 0.2637, + "step": 1149 + }, + { + "epoch": 0.06, + "grad_norm": 1.22461622064381, + "learning_rate": 1.9957498052438046e-05, + "loss": 0.2682, + "step": 1150 + }, + { + "epoch": 0.06, + "grad_norm": 0.9363047948353536, + "learning_rate": 1.9957346232336264e-05, + "loss": 0.2608, + "step": 1151 + }, + { + "epoch": 0.06, + "grad_norm": 1.0908039664636027, + "learning_rate": 1.9957194142140907e-05, + "loss": 0.2642, + "step": 1152 + }, + { + "epoch": 0.06, + "grad_norm": 1.298250529492443, + "learning_rate": 1.9957041781856094e-05, + "loss": 0.2428, + "step": 1153 + }, + { + "epoch": 0.06, + "grad_norm": 0.8691045152210589, + "learning_rate": 1.995688915148596e-05, + "loss": 0.252, + "step": 1154 + }, + { + "epoch": 0.06, + "grad_norm": 1.5017543963557427, + "learning_rate": 1.9956736251034643e-05, + "loss": 0.2807, + "step": 1155 + }, + { + "epoch": 0.06, + "grad_norm": 1.425121321973548, + "learning_rate": 1.99565830805063e-05, + "loss": 0.2524, + "step": 1156 + }, + { + "epoch": 0.06, + "grad_norm": 1.3588406022460622, + "learning_rate": 1.995642963990507e-05, + "loss": 0.2631, + "step": 1157 + }, + { + "epoch": 0.06, + "grad_norm": 2.012820438420422, + "learning_rate": 1.995627592923513e-05, + "loss": 0.2583, + "step": 1158 + }, + { + "epoch": 0.06, + "grad_norm": 1.0423757816916552, + "learning_rate": 1.995612194850064e-05, + "loss": 0.231, + "step": 1159 + }, + { + "epoch": 0.06, + "grad_norm": 1.1506603528204253, + "learning_rate": 1.9955967697705782e-05, + "loss": 0.2758, + "step": 1160 + }, + { + "epoch": 0.06, + "grad_norm": 1.120430771480744, + "learning_rate": 1.9955813176854735e-05, + "loss": 0.2574, + "step": 1161 + }, + { + "epoch": 0.06, + "grad_norm": 1.0850296741610588, + "learning_rate": 1.9955658385951695e-05, + "loss": 0.2612, + "step": 1162 + }, + { + "epoch": 0.06, + "grad_norm": 1.1501268535631535, + "learning_rate": 1.9955503325000857e-05, + "loss": 0.26, + "step": 1163 + }, + { + "epoch": 0.06, + "grad_norm": 1.2212111778915191, + "learning_rate": 1.9955347994006432e-05, + "loss": 0.2737, + "step": 1164 + }, + { + "epoch": 0.06, + "grad_norm": 1.4028969597594856, + "learning_rate": 1.9955192392972628e-05, + "loss": 0.2324, + "step": 1165 + }, + { + "epoch": 0.06, + "grad_norm": 1.3102571784603596, + "learning_rate": 1.995503652190367e-05, + "loss": 0.2674, + "step": 1166 + }, + { + "epoch": 0.06, + "grad_norm": 0.9897987458802096, + "learning_rate": 1.9954880380803787e-05, + "loss": 0.2423, + "step": 1167 + }, + { + "epoch": 0.06, + "grad_norm": 0.9207734644130189, + "learning_rate": 1.9954723969677206e-05, + "loss": 0.2306, + "step": 1168 + }, + { + "epoch": 0.06, + "grad_norm": 1.1295488992326463, + "learning_rate": 1.9954567288528174e-05, + "loss": 0.2418, + "step": 1169 + }, + { + "epoch": 0.06, + "grad_norm": 1.2685761887350202, + "learning_rate": 1.9954410337360945e-05, + "loss": 0.2357, + "step": 1170 + }, + { + "epoch": 0.06, + "grad_norm": 0.9861608199592001, + "learning_rate": 1.9954253116179772e-05, + "loss": 0.2426, + "step": 1171 + }, + { + "epoch": 0.06, + "grad_norm": 1.0280451362848153, + "learning_rate": 1.9954095624988924e-05, + "loss": 0.2339, + "step": 1172 + }, + { + "epoch": 0.06, + "grad_norm": 1.0937539428083092, + "learning_rate": 1.9953937863792666e-05, + "loss": 0.2407, + "step": 1173 + }, + { + "epoch": 0.06, + "grad_norm": 1.5164023930909465, + "learning_rate": 1.9953779832595285e-05, + "loss": 0.271, + "step": 1174 + }, + { + "epoch": 0.06, + "grad_norm": 1.0085274941712419, + "learning_rate": 1.995362153140106e-05, + "loss": 0.2501, + "step": 1175 + }, + { + "epoch": 0.06, + "grad_norm": 1.0757387120719115, + "learning_rate": 1.9953462960214293e-05, + "loss": 0.2299, + "step": 1176 + }, + { + "epoch": 0.06, + "grad_norm": 1.562846048479879, + "learning_rate": 1.995330411903928e-05, + "loss": 0.2913, + "step": 1177 + }, + { + "epoch": 0.06, + "grad_norm": 1.670780598619956, + "learning_rate": 1.995314500788033e-05, + "loss": 0.2559, + "step": 1178 + }, + { + "epoch": 0.06, + "grad_norm": 1.0999296700893124, + "learning_rate": 1.9952985626741757e-05, + "loss": 0.2413, + "step": 1179 + }, + { + "epoch": 0.06, + "grad_norm": 1.2266840303048263, + "learning_rate": 1.995282597562789e-05, + "loss": 0.2299, + "step": 1180 + }, + { + "epoch": 0.06, + "grad_norm": 1.12382611130755, + "learning_rate": 1.9952666054543053e-05, + "loss": 0.2557, + "step": 1181 + }, + { + "epoch": 0.06, + "grad_norm": 0.8852569009332808, + "learning_rate": 1.995250586349159e-05, + "loss": 0.2543, + "step": 1182 + }, + { + "epoch": 0.06, + "grad_norm": 0.9698648745098383, + "learning_rate": 1.9952345402477844e-05, + "loss": 0.2397, + "step": 1183 + }, + { + "epoch": 0.06, + "grad_norm": 1.0757537964497887, + "learning_rate": 1.9952184671506167e-05, + "loss": 0.2867, + "step": 1184 + }, + { + "epoch": 0.06, + "grad_norm": 2.047570354258211, + "learning_rate": 1.9952023670580915e-05, + "loss": 0.2373, + "step": 1185 + }, + { + "epoch": 0.06, + "grad_norm": 1.3164713777096313, + "learning_rate": 1.9951862399706463e-05, + "loss": 0.2745, + "step": 1186 + }, + { + "epoch": 0.06, + "grad_norm": 1.254144229233726, + "learning_rate": 1.995170085888718e-05, + "loss": 0.2355, + "step": 1187 + }, + { + "epoch": 0.06, + "grad_norm": 1.0187453637220607, + "learning_rate": 1.9951539048127447e-05, + "loss": 0.2368, + "step": 1188 + }, + { + "epoch": 0.06, + "grad_norm": 0.985069297112168, + "learning_rate": 1.9951376967431658e-05, + "loss": 0.2376, + "step": 1189 + }, + { + "epoch": 0.06, + "grad_norm": 1.1871515603562464, + "learning_rate": 1.9951214616804203e-05, + "loss": 0.2481, + "step": 1190 + }, + { + "epoch": 0.06, + "grad_norm": 1.1725021607449277, + "learning_rate": 1.9951051996249492e-05, + "loss": 0.2618, + "step": 1191 + }, + { + "epoch": 0.06, + "grad_norm": 1.0558607145471346, + "learning_rate": 1.9950889105771937e-05, + "loss": 0.2689, + "step": 1192 + }, + { + "epoch": 0.06, + "grad_norm": 0.8759610787740514, + "learning_rate": 1.995072594537595e-05, + "loss": 0.258, + "step": 1193 + }, + { + "epoch": 0.06, + "grad_norm": 1.0177805167346698, + "learning_rate": 1.9950562515065957e-05, + "loss": 0.2662, + "step": 1194 + }, + { + "epoch": 0.06, + "grad_norm": 3.21473439251119, + "learning_rate": 1.9950398814846396e-05, + "loss": 0.2414, + "step": 1195 + }, + { + "epoch": 0.06, + "grad_norm": 1.0701077252548996, + "learning_rate": 1.9950234844721707e-05, + "loss": 0.2294, + "step": 1196 + }, + { + "epoch": 0.06, + "grad_norm": 1.304305485860522, + "learning_rate": 1.9950070604696332e-05, + "loss": 0.2575, + "step": 1197 + }, + { + "epoch": 0.06, + "grad_norm": 1.4979470689692984, + "learning_rate": 1.994990609477473e-05, + "loss": 0.2103, + "step": 1198 + }, + { + "epoch": 0.06, + "grad_norm": 1.881741820714266, + "learning_rate": 1.994974131496137e-05, + "loss": 0.2499, + "step": 1199 + }, + { + "epoch": 0.06, + "grad_norm": 1.4797426522385466, + "learning_rate": 1.9949576265260708e-05, + "loss": 0.2396, + "step": 1200 + }, + { + "epoch": 0.06, + "grad_norm": 1.3031654108455368, + "learning_rate": 1.994941094567723e-05, + "loss": 0.2342, + "step": 1201 + }, + { + "epoch": 0.06, + "grad_norm": 1.5757734679789772, + "learning_rate": 1.9949245356215415e-05, + "loss": 0.2518, + "step": 1202 + }, + { + "epoch": 0.06, + "grad_norm": 1.2205859526102094, + "learning_rate": 1.9949079496879763e-05, + "loss": 0.2384, + "step": 1203 + }, + { + "epoch": 0.06, + "grad_norm": 1.3153888292338682, + "learning_rate": 1.9948913367674766e-05, + "loss": 0.2674, + "step": 1204 + }, + { + "epoch": 0.06, + "grad_norm": 1.118783886908271, + "learning_rate": 1.994874696860493e-05, + "loss": 0.2526, + "step": 1205 + }, + { + "epoch": 0.06, + "grad_norm": 1.51742792693216, + "learning_rate": 1.9948580299674774e-05, + "loss": 0.2485, + "step": 1206 + }, + { + "epoch": 0.06, + "grad_norm": 1.0380364345723683, + "learning_rate": 1.994841336088881e-05, + "loss": 0.2567, + "step": 1207 + }, + { + "epoch": 0.06, + "grad_norm": 1.1678784841168433, + "learning_rate": 1.9948246152251576e-05, + "loss": 0.2431, + "step": 1208 + }, + { + "epoch": 0.06, + "grad_norm": 1.3366080159215235, + "learning_rate": 1.9948078673767604e-05, + "loss": 0.2875, + "step": 1209 + }, + { + "epoch": 0.06, + "grad_norm": 1.63814443418371, + "learning_rate": 1.9947910925441435e-05, + "loss": 0.2752, + "step": 1210 + }, + { + "epoch": 0.06, + "grad_norm": 1.2934593570974755, + "learning_rate": 1.9947742907277617e-05, + "loss": 0.2762, + "step": 1211 + }, + { + "epoch": 0.06, + "grad_norm": 1.348419593558008, + "learning_rate": 1.9947574619280713e-05, + "loss": 0.2296, + "step": 1212 + }, + { + "epoch": 0.06, + "grad_norm": 0.9139456450488086, + "learning_rate": 1.9947406061455287e-05, + "loss": 0.2519, + "step": 1213 + }, + { + "epoch": 0.06, + "grad_norm": 0.9703076073163888, + "learning_rate": 1.994723723380591e-05, + "loss": 0.2621, + "step": 1214 + }, + { + "epoch": 0.06, + "grad_norm": 1.3465532732447363, + "learning_rate": 1.994706813633716e-05, + "loss": 0.2558, + "step": 1215 + }, + { + "epoch": 0.06, + "grad_norm": 1.284682342016494, + "learning_rate": 1.9946898769053625e-05, + "loss": 0.2363, + "step": 1216 + }, + { + "epoch": 0.06, + "grad_norm": 1.1236941900852206, + "learning_rate": 1.9946729131959902e-05, + "loss": 0.2692, + "step": 1217 + }, + { + "epoch": 0.06, + "grad_norm": 1.3346742349950482, + "learning_rate": 1.9946559225060585e-05, + "loss": 0.2413, + "step": 1218 + }, + { + "epoch": 0.06, + "grad_norm": 1.2810878909267052, + "learning_rate": 1.9946389048360288e-05, + "loss": 0.2243, + "step": 1219 + }, + { + "epoch": 0.06, + "grad_norm": 1.2926565224995998, + "learning_rate": 1.9946218601863626e-05, + "loss": 0.2635, + "step": 1220 + }, + { + "epoch": 0.06, + "grad_norm": 1.3114698308075319, + "learning_rate": 1.9946047885575224e-05, + "loss": 0.2714, + "step": 1221 + }, + { + "epoch": 0.06, + "grad_norm": 3.1178514386861154, + "learning_rate": 1.9945876899499712e-05, + "loss": 0.2486, + "step": 1222 + }, + { + "epoch": 0.06, + "grad_norm": 1.114371429943034, + "learning_rate": 1.9945705643641727e-05, + "loss": 0.2325, + "step": 1223 + }, + { + "epoch": 0.06, + "grad_norm": 1.4891443815170118, + "learning_rate": 1.9945534118005913e-05, + "loss": 0.2446, + "step": 1224 + }, + { + "epoch": 0.06, + "grad_norm": 1.2381114684264194, + "learning_rate": 1.9945362322596926e-05, + "loss": 0.2606, + "step": 1225 + }, + { + "epoch": 0.06, + "grad_norm": 1.1075696539896358, + "learning_rate": 1.9945190257419424e-05, + "loss": 0.2496, + "step": 1226 + }, + { + "epoch": 0.06, + "grad_norm": 1.2353450742088958, + "learning_rate": 1.9945017922478076e-05, + "loss": 0.2721, + "step": 1227 + }, + { + "epoch": 0.06, + "grad_norm": 1.225499079059848, + "learning_rate": 1.994484531777755e-05, + "loss": 0.2375, + "step": 1228 + }, + { + "epoch": 0.06, + "grad_norm": 1.3675113150352771, + "learning_rate": 1.994467244332254e-05, + "loss": 0.2543, + "step": 1229 + }, + { + "epoch": 0.06, + "grad_norm": 1.483126597664391, + "learning_rate": 1.9944499299117724e-05, + "loss": 0.2659, + "step": 1230 + }, + { + "epoch": 0.06, + "grad_norm": 1.4329247239553178, + "learning_rate": 1.9944325885167807e-05, + "loss": 0.2541, + "step": 1231 + }, + { + "epoch": 0.06, + "grad_norm": 1.1956395859727627, + "learning_rate": 1.9944152201477483e-05, + "loss": 0.2331, + "step": 1232 + }, + { + "epoch": 0.06, + "grad_norm": 1.0789307326922348, + "learning_rate": 1.994397824805147e-05, + "loss": 0.2247, + "step": 1233 + }, + { + "epoch": 0.06, + "grad_norm": 1.2295779281449082, + "learning_rate": 1.9943804024894486e-05, + "loss": 0.2609, + "step": 1234 + }, + { + "epoch": 0.06, + "grad_norm": 1.1590034791235422, + "learning_rate": 1.994362953201126e-05, + "loss": 0.2666, + "step": 1235 + }, + { + "epoch": 0.06, + "grad_norm": 1.0564212568535971, + "learning_rate": 1.9943454769406515e-05, + "loss": 0.2412, + "step": 1236 + }, + { + "epoch": 0.06, + "grad_norm": 1.5078051441569378, + "learning_rate": 1.9943279737085003e-05, + "loss": 0.2494, + "step": 1237 + }, + { + "epoch": 0.06, + "grad_norm": 1.216524313899798, + "learning_rate": 1.9943104435051466e-05, + "loss": 0.2622, + "step": 1238 + }, + { + "epoch": 0.06, + "grad_norm": 1.5140052363285328, + "learning_rate": 1.994292886331066e-05, + "loss": 0.2611, + "step": 1239 + }, + { + "epoch": 0.06, + "grad_norm": 1.2042848747214754, + "learning_rate": 1.994275302186734e-05, + "loss": 0.2446, + "step": 1240 + }, + { + "epoch": 0.06, + "grad_norm": 1.2721633003954307, + "learning_rate": 1.994257691072629e-05, + "loss": 0.252, + "step": 1241 + }, + { + "epoch": 0.06, + "grad_norm": 1.1749578657632214, + "learning_rate": 1.994240052989228e-05, + "loss": 0.2317, + "step": 1242 + }, + { + "epoch": 0.06, + "grad_norm": 1.2140607390698037, + "learning_rate": 1.994222387937009e-05, + "loss": 0.2465, + "step": 1243 + }, + { + "epoch": 0.06, + "grad_norm": 1.2318297370878173, + "learning_rate": 1.9942046959164516e-05, + "loss": 0.2477, + "step": 1244 + }, + { + "epoch": 0.06, + "grad_norm": 1.0822095702256438, + "learning_rate": 1.994186976928036e-05, + "loss": 0.2563, + "step": 1245 + }, + { + "epoch": 0.06, + "grad_norm": 1.2863539137070017, + "learning_rate": 1.9941692309722422e-05, + "loss": 0.2482, + "step": 1246 + }, + { + "epoch": 0.06, + "grad_norm": 0.9198636083932812, + "learning_rate": 1.994151458049552e-05, + "loss": 0.2487, + "step": 1247 + }, + { + "epoch": 0.06, + "grad_norm": 1.0390473686422546, + "learning_rate": 1.9941336581604474e-05, + "loss": 0.2725, + "step": 1248 + }, + { + "epoch": 0.06, + "grad_norm": 0.9622996627966769, + "learning_rate": 1.994115831305411e-05, + "loss": 0.2508, + "step": 1249 + }, + { + "epoch": 0.06, + "grad_norm": 1.1572359650318538, + "learning_rate": 1.9940979774849264e-05, + "loss": 0.2259, + "step": 1250 + }, + { + "epoch": 0.06, + "grad_norm": 1.0814302644289238, + "learning_rate": 1.9940800966994785e-05, + "loss": 0.2188, + "step": 1251 + }, + { + "epoch": 0.06, + "grad_norm": 1.1509504124606063, + "learning_rate": 1.9940621889495516e-05, + "loss": 0.2391, + "step": 1252 + }, + { + "epoch": 0.06, + "grad_norm": 1.1232777139387475, + "learning_rate": 1.9940442542356315e-05, + "loss": 0.2296, + "step": 1253 + }, + { + "epoch": 0.06, + "grad_norm": 1.1470671685669127, + "learning_rate": 1.9940262925582052e-05, + "loss": 0.2782, + "step": 1254 + }, + { + "epoch": 0.06, + "grad_norm": 1.4878598163784487, + "learning_rate": 1.9940083039177594e-05, + "loss": 0.2556, + "step": 1255 + }, + { + "epoch": 0.06, + "grad_norm": 2.419684202169054, + "learning_rate": 1.993990288314782e-05, + "loss": 0.2509, + "step": 1256 + }, + { + "epoch": 0.06, + "grad_norm": 1.0335157524606504, + "learning_rate": 1.9939722457497625e-05, + "loss": 0.2177, + "step": 1257 + }, + { + "epoch": 0.06, + "grad_norm": 1.1965718148733537, + "learning_rate": 1.993954176223189e-05, + "loss": 0.268, + "step": 1258 + }, + { + "epoch": 0.06, + "grad_norm": 1.0653255609824037, + "learning_rate": 1.9939360797355527e-05, + "loss": 0.2258, + "step": 1259 + }, + { + "epoch": 0.06, + "grad_norm": 1.0147997894620475, + "learning_rate": 1.9939179562873437e-05, + "loss": 0.2402, + "step": 1260 + }, + { + "epoch": 0.06, + "grad_norm": 1.6590099622321888, + "learning_rate": 1.9938998058790546e-05, + "loss": 0.241, + "step": 1261 + }, + { + "epoch": 0.06, + "grad_norm": 1.9272334105915023, + "learning_rate": 1.9938816285111768e-05, + "loss": 0.2633, + "step": 1262 + }, + { + "epoch": 0.06, + "grad_norm": 0.9696781916820303, + "learning_rate": 1.9938634241842037e-05, + "loss": 0.2746, + "step": 1263 + }, + { + "epoch": 0.06, + "grad_norm": 1.075879149837418, + "learning_rate": 1.993845192898629e-05, + "loss": 0.2481, + "step": 1264 + }, + { + "epoch": 0.06, + "grad_norm": 1.050216008594986, + "learning_rate": 1.9938269346549473e-05, + "loss": 0.2562, + "step": 1265 + }, + { + "epoch": 0.06, + "grad_norm": 1.0603398741984067, + "learning_rate": 1.993808649453654e-05, + "loss": 0.2201, + "step": 1266 + }, + { + "epoch": 0.06, + "grad_norm": 1.1759766212394545, + "learning_rate": 1.993790337295245e-05, + "loss": 0.2338, + "step": 1267 + }, + { + "epoch": 0.06, + "grad_norm": 1.3895706761656446, + "learning_rate": 1.993771998180217e-05, + "loss": 0.2334, + "step": 1268 + }, + { + "epoch": 0.06, + "grad_norm": 1.007624866161374, + "learning_rate": 1.9937536321090673e-05, + "loss": 0.2513, + "step": 1269 + }, + { + "epoch": 0.06, + "grad_norm": 2.468659081059867, + "learning_rate": 1.9937352390822945e-05, + "loss": 0.2439, + "step": 1270 + }, + { + "epoch": 0.06, + "grad_norm": 1.126146886783081, + "learning_rate": 1.993716819100397e-05, + "loss": 0.2822, + "step": 1271 + }, + { + "epoch": 0.06, + "grad_norm": 0.9947140946371513, + "learning_rate": 1.9936983721638745e-05, + "loss": 0.2452, + "step": 1272 + }, + { + "epoch": 0.06, + "grad_norm": 1.022459747505446, + "learning_rate": 1.9936798982732274e-05, + "loss": 0.2747, + "step": 1273 + }, + { + "epoch": 0.06, + "grad_norm": 1.0124242493363043, + "learning_rate": 1.9936613974289575e-05, + "loss": 0.2578, + "step": 1274 + }, + { + "epoch": 0.06, + "grad_norm": 1.7048341660404465, + "learning_rate": 1.9936428696315656e-05, + "loss": 0.2352, + "step": 1275 + }, + { + "epoch": 0.06, + "grad_norm": 1.0907511105393048, + "learning_rate": 1.993624314881555e-05, + "loss": 0.265, + "step": 1276 + }, + { + "epoch": 0.06, + "grad_norm": 1.08612995999494, + "learning_rate": 1.9936057331794284e-05, + "loss": 0.2474, + "step": 1277 + }, + { + "epoch": 0.06, + "grad_norm": 1.0279733992904583, + "learning_rate": 1.9935871245256907e-05, + "loss": 0.2478, + "step": 1278 + }, + { + "epoch": 0.07, + "grad_norm": 1.3072664522397148, + "learning_rate": 1.9935684889208455e-05, + "loss": 0.244, + "step": 1279 + }, + { + "epoch": 0.07, + "grad_norm": 0.9953628542258297, + "learning_rate": 1.9935498263653994e-05, + "loss": 0.2457, + "step": 1280 + }, + { + "epoch": 0.07, + "grad_norm": 0.8782714215203581, + "learning_rate": 1.993531136859858e-05, + "loss": 0.2319, + "step": 1281 + }, + { + "epoch": 0.07, + "grad_norm": 1.0964019610858486, + "learning_rate": 1.9935124204047283e-05, + "loss": 0.2362, + "step": 1282 + }, + { + "epoch": 0.07, + "grad_norm": 1.1545713070308408, + "learning_rate": 1.9934936770005184e-05, + "loss": 0.2196, + "step": 1283 + }, + { + "epoch": 0.07, + "grad_norm": 2.7314670261178784, + "learning_rate": 1.993474906647736e-05, + "loss": 0.2384, + "step": 1284 + }, + { + "epoch": 0.07, + "grad_norm": 1.0590502228754426, + "learning_rate": 1.993456109346891e-05, + "loss": 0.2324, + "step": 1285 + }, + { + "epoch": 0.07, + "grad_norm": 1.0498547537114502, + "learning_rate": 1.9934372850984925e-05, + "loss": 0.2585, + "step": 1286 + }, + { + "epoch": 0.07, + "grad_norm": 1.1848581217189293, + "learning_rate": 1.9934184339030517e-05, + "loss": 0.2664, + "step": 1287 + }, + { + "epoch": 0.07, + "grad_norm": 2.072168224985916, + "learning_rate": 1.99339955576108e-05, + "loss": 0.2768, + "step": 1288 + }, + { + "epoch": 0.07, + "grad_norm": 1.3175290108699542, + "learning_rate": 1.993380650673089e-05, + "loss": 0.2889, + "step": 1289 + }, + { + "epoch": 0.07, + "grad_norm": 0.9720547354553507, + "learning_rate": 1.9933617186395917e-05, + "loss": 0.2231, + "step": 1290 + }, + { + "epoch": 0.07, + "grad_norm": 1.373607960688602, + "learning_rate": 1.993342759661102e-05, + "loss": 0.2704, + "step": 1291 + }, + { + "epoch": 0.07, + "grad_norm": 1.3201599152256596, + "learning_rate": 1.9933237737381336e-05, + "loss": 0.2411, + "step": 1292 + }, + { + "epoch": 0.07, + "grad_norm": 1.090537077664811, + "learning_rate": 1.993304760871202e-05, + "loss": 0.2439, + "step": 1293 + }, + { + "epoch": 0.07, + "grad_norm": 1.2988085961178586, + "learning_rate": 1.993285721060822e-05, + "loss": 0.2552, + "step": 1294 + }, + { + "epoch": 0.07, + "grad_norm": 1.12247096396022, + "learning_rate": 1.9932666543075113e-05, + "loss": 0.2555, + "step": 1295 + }, + { + "epoch": 0.07, + "grad_norm": 1.474840116009373, + "learning_rate": 1.9932475606117865e-05, + "loss": 0.2577, + "step": 1296 + }, + { + "epoch": 0.07, + "grad_norm": 1.5661222125417564, + "learning_rate": 1.9932284399741653e-05, + "loss": 0.2386, + "step": 1297 + }, + { + "epoch": 0.07, + "grad_norm": 1.3560953139033376, + "learning_rate": 1.9932092923951667e-05, + "loss": 0.2418, + "step": 1298 + }, + { + "epoch": 0.07, + "grad_norm": 1.1428106762278663, + "learning_rate": 1.99319011787531e-05, + "loss": 0.2286, + "step": 1299 + }, + { + "epoch": 0.07, + "grad_norm": 1.6241541244210773, + "learning_rate": 1.993170916415115e-05, + "loss": 0.2679, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 1.1611966906057736, + "learning_rate": 1.993151688015103e-05, + "loss": 0.2799, + "step": 1301 + }, + { + "epoch": 0.07, + "grad_norm": 1.06454836864526, + "learning_rate": 1.993132432675795e-05, + "loss": 0.224, + "step": 1302 + }, + { + "epoch": 0.07, + "grad_norm": 1.4358338142793268, + "learning_rate": 1.993113150397714e-05, + "loss": 0.2535, + "step": 1303 + }, + { + "epoch": 0.07, + "grad_norm": 1.1185775618889204, + "learning_rate": 1.993093841181383e-05, + "loss": 0.2704, + "step": 1304 + }, + { + "epoch": 0.07, + "grad_norm": 0.8895921291367405, + "learning_rate": 1.993074505027325e-05, + "loss": 0.2508, + "step": 1305 + }, + { + "epoch": 0.07, + "grad_norm": 0.9817015060406692, + "learning_rate": 1.9930551419360653e-05, + "loss": 0.2346, + "step": 1306 + }, + { + "epoch": 0.07, + "grad_norm": 2.9722902810607787, + "learning_rate": 1.9930357519081286e-05, + "loss": 0.2384, + "step": 1307 + }, + { + "epoch": 0.07, + "grad_norm": 1.250573129564375, + "learning_rate": 1.993016334944041e-05, + "loss": 0.2486, + "step": 1308 + }, + { + "epoch": 0.07, + "grad_norm": 1.1498427550944075, + "learning_rate": 1.9929968910443294e-05, + "loss": 0.2396, + "step": 1309 + }, + { + "epoch": 0.07, + "grad_norm": 1.0067228344309367, + "learning_rate": 1.992977420209521e-05, + "loss": 0.2576, + "step": 1310 + }, + { + "epoch": 0.07, + "grad_norm": 1.2293727519204154, + "learning_rate": 1.9929579224401436e-05, + "loss": 0.2297, + "step": 1311 + }, + { + "epoch": 0.07, + "grad_norm": 1.5442224539939846, + "learning_rate": 1.992938397736727e-05, + "loss": 0.2366, + "step": 1312 + }, + { + "epoch": 0.07, + "grad_norm": 1.3856656195152042, + "learning_rate": 1.9929188460998e-05, + "loss": 0.2534, + "step": 1313 + }, + { + "epoch": 0.07, + "grad_norm": 1.0568400725170732, + "learning_rate": 1.992899267529893e-05, + "loss": 0.2555, + "step": 1314 + }, + { + "epoch": 0.07, + "grad_norm": 1.3167276569636372, + "learning_rate": 1.9928796620275377e-05, + "loss": 0.2651, + "step": 1315 + }, + { + "epoch": 0.07, + "grad_norm": 1.5992657211036883, + "learning_rate": 1.9928600295932655e-05, + "loss": 0.2644, + "step": 1316 + }, + { + "epoch": 0.07, + "grad_norm": 1.2137062647139218, + "learning_rate": 1.992840370227609e-05, + "loss": 0.2527, + "step": 1317 + }, + { + "epoch": 0.07, + "grad_norm": 1.1268154425617247, + "learning_rate": 1.992820683931101e-05, + "loss": 0.2532, + "step": 1318 + }, + { + "epoch": 0.07, + "grad_norm": 4.83134570568081, + "learning_rate": 1.992800970704276e-05, + "loss": 0.2679, + "step": 1319 + }, + { + "epoch": 0.07, + "grad_norm": 1.2780231232806325, + "learning_rate": 1.9927812305476685e-05, + "loss": 0.2515, + "step": 1320 + }, + { + "epoch": 0.07, + "grad_norm": 0.9825579835071978, + "learning_rate": 1.9927614634618142e-05, + "loss": 0.2491, + "step": 1321 + }, + { + "epoch": 0.07, + "grad_norm": 1.0270718091675821, + "learning_rate": 1.9927416694472493e-05, + "loss": 0.2598, + "step": 1322 + }, + { + "epoch": 0.07, + "grad_norm": 0.8900160649600508, + "learning_rate": 1.9927218485045103e-05, + "loss": 0.2355, + "step": 1323 + }, + { + "epoch": 0.07, + "grad_norm": 1.4166876201090004, + "learning_rate": 1.992702000634135e-05, + "loss": 0.2445, + "step": 1324 + }, + { + "epoch": 0.07, + "grad_norm": 0.9133842813076238, + "learning_rate": 1.9926821258366622e-05, + "loss": 0.2533, + "step": 1325 + }, + { + "epoch": 0.07, + "grad_norm": 1.1537899021882343, + "learning_rate": 1.9926622241126306e-05, + "loss": 0.2521, + "step": 1326 + }, + { + "epoch": 0.07, + "grad_norm": 1.237239241656995, + "learning_rate": 1.99264229546258e-05, + "loss": 0.2769, + "step": 1327 + }, + { + "epoch": 0.07, + "grad_norm": 0.9828066104197971, + "learning_rate": 1.992622339887051e-05, + "loss": 0.2688, + "step": 1328 + }, + { + "epoch": 0.07, + "grad_norm": 0.9653344834028247, + "learning_rate": 1.992602357386585e-05, + "loss": 0.2613, + "step": 1329 + }, + { + "epoch": 0.07, + "grad_norm": 1.1792847900807026, + "learning_rate": 1.9925823479617242e-05, + "loss": 0.2516, + "step": 1330 + }, + { + "epoch": 0.07, + "grad_norm": 0.9584914944637997, + "learning_rate": 1.9925623116130105e-05, + "loss": 0.2474, + "step": 1331 + }, + { + "epoch": 0.07, + "grad_norm": 1.1377392233359997, + "learning_rate": 1.9925422483409886e-05, + "loss": 0.2352, + "step": 1332 + }, + { + "epoch": 0.07, + "grad_norm": 1.2468284883092173, + "learning_rate": 1.992522158146202e-05, + "loss": 0.241, + "step": 1333 + }, + { + "epoch": 0.07, + "grad_norm": 1.3281788584533107, + "learning_rate": 1.9925020410291963e-05, + "loss": 0.2457, + "step": 1334 + }, + { + "epoch": 0.07, + "grad_norm": 1.3139206487167114, + "learning_rate": 1.992481896990516e-05, + "loss": 0.2725, + "step": 1335 + }, + { + "epoch": 0.07, + "grad_norm": 1.1181599807995584, + "learning_rate": 1.9924617260307088e-05, + "loss": 0.2496, + "step": 1336 + }, + { + "epoch": 0.07, + "grad_norm": 1.095318282833729, + "learning_rate": 1.9924415281503204e-05, + "loss": 0.2635, + "step": 1337 + }, + { + "epoch": 0.07, + "grad_norm": 1.3388267621206682, + "learning_rate": 1.9924213033499e-05, + "loss": 0.254, + "step": 1338 + }, + { + "epoch": 0.07, + "grad_norm": 1.2570545418236239, + "learning_rate": 1.9924010516299956e-05, + "loss": 0.2343, + "step": 1339 + }, + { + "epoch": 0.07, + "grad_norm": 1.8375633870522983, + "learning_rate": 1.9923807729911567e-05, + "loss": 0.2485, + "step": 1340 + }, + { + "epoch": 0.07, + "grad_norm": 1.2080640469217854, + "learning_rate": 1.9923604674339336e-05, + "loss": 0.2477, + "step": 1341 + }, + { + "epoch": 0.07, + "grad_norm": 1.3859634411593529, + "learning_rate": 1.9923401349588762e-05, + "loss": 0.2701, + "step": 1342 + }, + { + "epoch": 0.07, + "grad_norm": 1.2651163070331262, + "learning_rate": 1.9923197755665368e-05, + "loss": 0.237, + "step": 1343 + }, + { + "epoch": 0.07, + "grad_norm": 1.212850146390197, + "learning_rate": 1.9922993892574676e-05, + "loss": 0.2636, + "step": 1344 + }, + { + "epoch": 0.07, + "grad_norm": 1.5856155338509108, + "learning_rate": 1.9922789760322213e-05, + "loss": 0.2572, + "step": 1345 + }, + { + "epoch": 0.07, + "grad_norm": 1.3322458710609553, + "learning_rate": 1.9922585358913515e-05, + "loss": 0.2538, + "step": 1346 + }, + { + "epoch": 0.07, + "grad_norm": 1.1453763616701749, + "learning_rate": 1.992238068835413e-05, + "loss": 0.2127, + "step": 1347 + }, + { + "epoch": 0.07, + "grad_norm": 1.4671152149814002, + "learning_rate": 1.9922175748649612e-05, + "loss": 0.2467, + "step": 1348 + }, + { + "epoch": 0.07, + "grad_norm": 0.96344423338482, + "learning_rate": 1.9921970539805513e-05, + "loss": 0.2449, + "step": 1349 + }, + { + "epoch": 0.07, + "grad_norm": 1.3164717303997437, + "learning_rate": 1.9921765061827405e-05, + "loss": 0.2527, + "step": 1350 + }, + { + "epoch": 0.07, + "grad_norm": 1.1273036629419297, + "learning_rate": 1.992155931472086e-05, + "loss": 0.2335, + "step": 1351 + }, + { + "epoch": 0.07, + "grad_norm": 1.3271998637630977, + "learning_rate": 1.9921353298491453e-05, + "loss": 0.233, + "step": 1352 + }, + { + "epoch": 0.07, + "grad_norm": 1.083621975560061, + "learning_rate": 1.9921147013144782e-05, + "loss": 0.26, + "step": 1353 + }, + { + "epoch": 0.07, + "grad_norm": 1.1429372463643215, + "learning_rate": 1.9920940458686434e-05, + "loss": 0.2267, + "step": 1354 + }, + { + "epoch": 0.07, + "grad_norm": 1.3916262889864177, + "learning_rate": 1.992073363512202e-05, + "loss": 0.2518, + "step": 1355 + }, + { + "epoch": 0.07, + "grad_norm": 1.5964270594798335, + "learning_rate": 1.9920526542457143e-05, + "loss": 0.2459, + "step": 1356 + }, + { + "epoch": 0.07, + "grad_norm": 1.1391799157648104, + "learning_rate": 1.9920319180697422e-05, + "loss": 0.2825, + "step": 1357 + }, + { + "epoch": 0.07, + "grad_norm": 1.1659412582972448, + "learning_rate": 1.9920111549848486e-05, + "loss": 0.2372, + "step": 1358 + }, + { + "epoch": 0.07, + "grad_norm": 1.4113772014832484, + "learning_rate": 1.991990364991596e-05, + "loss": 0.2844, + "step": 1359 + }, + { + "epoch": 0.07, + "grad_norm": 1.2647682628121506, + "learning_rate": 1.991969548090549e-05, + "loss": 0.2307, + "step": 1360 + }, + { + "epoch": 0.07, + "grad_norm": 1.2706451265074896, + "learning_rate": 1.9919487042822722e-05, + "loss": 0.2504, + "step": 1361 + }, + { + "epoch": 0.07, + "grad_norm": 1.3043546137160325, + "learning_rate": 1.9919278335673306e-05, + "loss": 0.2653, + "step": 1362 + }, + { + "epoch": 0.07, + "grad_norm": 1.526648857603099, + "learning_rate": 1.9919069359462906e-05, + "loss": 0.2352, + "step": 1363 + }, + { + "epoch": 0.07, + "grad_norm": 0.9163989199581128, + "learning_rate": 1.9918860114197186e-05, + "loss": 0.2623, + "step": 1364 + }, + { + "epoch": 0.07, + "grad_norm": 1.2210780161581907, + "learning_rate": 1.9918650599881828e-05, + "loss": 0.2463, + "step": 1365 + }, + { + "epoch": 0.07, + "grad_norm": 1.2037239438393557, + "learning_rate": 1.9918440816522514e-05, + "loss": 0.2681, + "step": 1366 + }, + { + "epoch": 0.07, + "grad_norm": 1.2841089377890307, + "learning_rate": 1.991823076412493e-05, + "loss": 0.25, + "step": 1367 + }, + { + "epoch": 0.07, + "grad_norm": 1.2099082139270674, + "learning_rate": 1.9918020442694773e-05, + "loss": 0.2374, + "step": 1368 + }, + { + "epoch": 0.07, + "grad_norm": 1.5448627661147787, + "learning_rate": 1.9917809852237754e-05, + "loss": 0.2389, + "step": 1369 + }, + { + "epoch": 0.07, + "grad_norm": 1.3484794901039256, + "learning_rate": 1.9917598992759587e-05, + "loss": 0.2431, + "step": 1370 + }, + { + "epoch": 0.07, + "grad_norm": 1.3696444473216656, + "learning_rate": 1.9917387864265983e-05, + "loss": 0.2469, + "step": 1371 + }, + { + "epoch": 0.07, + "grad_norm": 1.1573883129027844, + "learning_rate": 1.9917176466762673e-05, + "loss": 0.2794, + "step": 1372 + }, + { + "epoch": 0.07, + "grad_norm": 0.9521797753529773, + "learning_rate": 1.991696480025539e-05, + "loss": 0.2609, + "step": 1373 + }, + { + "epoch": 0.07, + "grad_norm": 0.9701217511012938, + "learning_rate": 1.991675286474988e-05, + "loss": 0.2452, + "step": 1374 + }, + { + "epoch": 0.07, + "grad_norm": 1.2255199211334986, + "learning_rate": 1.9916540660251887e-05, + "loss": 0.2657, + "step": 1375 + }, + { + "epoch": 0.07, + "grad_norm": 1.2802572417504723, + "learning_rate": 1.9916328186767168e-05, + "loss": 0.2507, + "step": 1376 + }, + { + "epoch": 0.07, + "grad_norm": 1.2780752266677757, + "learning_rate": 1.9916115444301488e-05, + "loss": 0.2475, + "step": 1377 + }, + { + "epoch": 0.07, + "grad_norm": 1.1917746479811409, + "learning_rate": 1.9915902432860615e-05, + "loss": 0.2778, + "step": 1378 + }, + { + "epoch": 0.07, + "grad_norm": 1.1122713843909182, + "learning_rate": 1.9915689152450328e-05, + "loss": 0.257, + "step": 1379 + }, + { + "epoch": 0.07, + "grad_norm": 1.372238893365632, + "learning_rate": 1.9915475603076414e-05, + "loss": 0.2623, + "step": 1380 + }, + { + "epoch": 0.07, + "grad_norm": 1.3604773532215377, + "learning_rate": 1.9915261784744664e-05, + "loss": 0.2317, + "step": 1381 + }, + { + "epoch": 0.07, + "grad_norm": 1.200145793587868, + "learning_rate": 1.9915047697460878e-05, + "loss": 0.2662, + "step": 1382 + }, + { + "epoch": 0.07, + "grad_norm": 1.0781961411781062, + "learning_rate": 1.9914833341230863e-05, + "loss": 0.2551, + "step": 1383 + }, + { + "epoch": 0.07, + "grad_norm": 0.9837813270598623, + "learning_rate": 1.9914618716060437e-05, + "loss": 0.259, + "step": 1384 + }, + { + "epoch": 0.07, + "grad_norm": 1.1325359219128366, + "learning_rate": 1.9914403821955414e-05, + "loss": 0.2386, + "step": 1385 + }, + { + "epoch": 0.07, + "grad_norm": 1.0389864770057453, + "learning_rate": 1.9914188658921628e-05, + "loss": 0.236, + "step": 1386 + }, + { + "epoch": 0.07, + "grad_norm": 1.3836282909574422, + "learning_rate": 1.9913973226964917e-05, + "loss": 0.2576, + "step": 1387 + }, + { + "epoch": 0.07, + "grad_norm": 1.0113155814154398, + "learning_rate": 1.991375752609112e-05, + "loss": 0.2626, + "step": 1388 + }, + { + "epoch": 0.07, + "grad_norm": 1.0919641731577683, + "learning_rate": 1.991354155630609e-05, + "loss": 0.2562, + "step": 1389 + }, + { + "epoch": 0.07, + "grad_norm": 1.4916606982981637, + "learning_rate": 1.9913325317615684e-05, + "loss": 0.2225, + "step": 1390 + }, + { + "epoch": 0.07, + "grad_norm": 1.269965773751471, + "learning_rate": 1.9913108810025776e-05, + "loss": 0.2719, + "step": 1391 + }, + { + "epoch": 0.07, + "grad_norm": 2.1927198996440365, + "learning_rate": 1.9912892033542225e-05, + "loss": 0.2448, + "step": 1392 + }, + { + "epoch": 0.07, + "grad_norm": 1.0247277433743283, + "learning_rate": 1.991267498817092e-05, + "loss": 0.2456, + "step": 1393 + }, + { + "epoch": 0.07, + "grad_norm": 0.999133874275022, + "learning_rate": 1.9912457673917745e-05, + "loss": 0.2409, + "step": 1394 + }, + { + "epoch": 0.07, + "grad_norm": 1.160387438965137, + "learning_rate": 1.9912240090788595e-05, + "loss": 0.2619, + "step": 1395 + }, + { + "epoch": 0.07, + "grad_norm": 1.208793083153623, + "learning_rate": 1.9912022238789374e-05, + "loss": 0.242, + "step": 1396 + }, + { + "epoch": 0.07, + "grad_norm": 1.8843530643442636, + "learning_rate": 1.991180411792599e-05, + "loss": 0.2707, + "step": 1397 + }, + { + "epoch": 0.07, + "grad_norm": 1.1037363784292973, + "learning_rate": 1.9911585728204362e-05, + "loss": 0.2335, + "step": 1398 + }, + { + "epoch": 0.07, + "grad_norm": 0.954476589084465, + "learning_rate": 1.9911367069630408e-05, + "loss": 0.2311, + "step": 1399 + }, + { + "epoch": 0.07, + "grad_norm": 1.0015353333121517, + "learning_rate": 1.991114814221006e-05, + "loss": 0.2325, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 0.9980972684885128, + "learning_rate": 1.9910928945949264e-05, + "loss": 0.2394, + "step": 1401 + }, + { + "epoch": 0.07, + "grad_norm": 1.7186452306443267, + "learning_rate": 1.9910709480853957e-05, + "loss": 0.2424, + "step": 1402 + }, + { + "epoch": 0.07, + "grad_norm": 0.9696052813958719, + "learning_rate": 1.9910489746930097e-05, + "loss": 0.26, + "step": 1403 + }, + { + "epoch": 0.07, + "grad_norm": 0.7690335657622046, + "learning_rate": 1.9910269744183645e-05, + "loss": 0.2129, + "step": 1404 + }, + { + "epoch": 0.07, + "grad_norm": 1.1975760811845586, + "learning_rate": 1.9910049472620564e-05, + "loss": 0.2611, + "step": 1405 + }, + { + "epoch": 0.07, + "grad_norm": 0.8503953892746059, + "learning_rate": 1.990982893224683e-05, + "loss": 0.2246, + "step": 1406 + }, + { + "epoch": 0.07, + "grad_norm": 0.847893913853286, + "learning_rate": 1.990960812306843e-05, + "loss": 0.2194, + "step": 1407 + }, + { + "epoch": 0.07, + "grad_norm": 0.9348019252601966, + "learning_rate": 1.990938704509135e-05, + "loss": 0.221, + "step": 1408 + }, + { + "epoch": 0.07, + "grad_norm": 0.9761686610221613, + "learning_rate": 1.9909165698321585e-05, + "loss": 0.243, + "step": 1409 + }, + { + "epoch": 0.07, + "grad_norm": 1.1722248378675564, + "learning_rate": 1.990894408276514e-05, + "loss": 0.2547, + "step": 1410 + }, + { + "epoch": 0.07, + "grad_norm": 0.8053092328818088, + "learning_rate": 1.9908722198428027e-05, + "loss": 0.227, + "step": 1411 + }, + { + "epoch": 0.07, + "grad_norm": 0.9665903219728691, + "learning_rate": 1.9908500045316264e-05, + "loss": 0.2451, + "step": 1412 + }, + { + "epoch": 0.07, + "grad_norm": 0.953084935420593, + "learning_rate": 1.9908277623435878e-05, + "loss": 0.2318, + "step": 1413 + }, + { + "epoch": 0.07, + "grad_norm": 1.5767742424384938, + "learning_rate": 1.9908054932792903e-05, + "loss": 0.2865, + "step": 1414 + }, + { + "epoch": 0.07, + "grad_norm": 1.0425077282991027, + "learning_rate": 1.9907831973393377e-05, + "loss": 0.2654, + "step": 1415 + }, + { + "epoch": 0.07, + "grad_norm": 0.9923093106371809, + "learning_rate": 1.9907608745243356e-05, + "loss": 0.2429, + "step": 1416 + }, + { + "epoch": 0.07, + "grad_norm": 1.104082202213175, + "learning_rate": 1.9907385248348882e-05, + "loss": 0.2542, + "step": 1417 + }, + { + "epoch": 0.07, + "grad_norm": 1.1450188139943347, + "learning_rate": 1.990716148271602e-05, + "loss": 0.2285, + "step": 1418 + }, + { + "epoch": 0.07, + "grad_norm": 1.0882818233930411, + "learning_rate": 1.990693744835085e-05, + "loss": 0.2331, + "step": 1419 + }, + { + "epoch": 0.07, + "grad_norm": 1.0449521038261802, + "learning_rate": 1.990671314525944e-05, + "loss": 0.2442, + "step": 1420 + }, + { + "epoch": 0.07, + "grad_norm": 1.1106530955831952, + "learning_rate": 1.9906488573447875e-05, + "loss": 0.2584, + "step": 1421 + }, + { + "epoch": 0.07, + "grad_norm": 1.0370821761153255, + "learning_rate": 1.990626373292225e-05, + "loss": 0.2335, + "step": 1422 + }, + { + "epoch": 0.07, + "grad_norm": 1.13841965133708, + "learning_rate": 1.9906038623688658e-05, + "loss": 0.2388, + "step": 1423 + }, + { + "epoch": 0.07, + "grad_norm": 0.9337248189403204, + "learning_rate": 1.9905813245753214e-05, + "loss": 0.253, + "step": 1424 + }, + { + "epoch": 0.07, + "grad_norm": 0.8239707311358239, + "learning_rate": 1.9905587599122022e-05, + "loss": 0.2417, + "step": 1425 + }, + { + "epoch": 0.07, + "grad_norm": 0.8948997842459009, + "learning_rate": 1.990536168380121e-05, + "loss": 0.2574, + "step": 1426 + }, + { + "epoch": 0.07, + "grad_norm": 2.344858167440341, + "learning_rate": 1.9905135499796903e-05, + "loss": 0.2557, + "step": 1427 + }, + { + "epoch": 0.07, + "grad_norm": 0.8315357711462754, + "learning_rate": 1.9904909047115233e-05, + "loss": 0.267, + "step": 1428 + }, + { + "epoch": 0.07, + "grad_norm": 0.7994238898963966, + "learning_rate": 1.990468232576235e-05, + "loss": 0.2377, + "step": 1429 + }, + { + "epoch": 0.07, + "grad_norm": 0.8653614776519574, + "learning_rate": 1.9904455335744395e-05, + "loss": 0.2432, + "step": 1430 + }, + { + "epoch": 0.07, + "grad_norm": 0.8806812047415478, + "learning_rate": 1.990422807706753e-05, + "loss": 0.2571, + "step": 1431 + }, + { + "epoch": 0.07, + "grad_norm": 0.9214838257317811, + "learning_rate": 1.990400054973792e-05, + "loss": 0.2043, + "step": 1432 + }, + { + "epoch": 0.07, + "grad_norm": 1.8867702484440099, + "learning_rate": 1.9903772753761736e-05, + "loss": 0.2562, + "step": 1433 + }, + { + "epoch": 0.07, + "grad_norm": 0.8546974694213872, + "learning_rate": 1.990354468914516e-05, + "loss": 0.2408, + "step": 1434 + }, + { + "epoch": 0.07, + "grad_norm": 1.1024048985123924, + "learning_rate": 1.990331635589437e-05, + "loss": 0.2714, + "step": 1435 + }, + { + "epoch": 0.07, + "grad_norm": 1.1446317404952169, + "learning_rate": 1.9903087754015567e-05, + "loss": 0.2347, + "step": 1436 + }, + { + "epoch": 0.07, + "grad_norm": 0.919739139874813, + "learning_rate": 1.9902858883514948e-05, + "loss": 0.2527, + "step": 1437 + }, + { + "epoch": 0.07, + "grad_norm": 1.435959400311835, + "learning_rate": 1.990262974439872e-05, + "loss": 0.2464, + "step": 1438 + }, + { + "epoch": 0.07, + "grad_norm": 3.8152490838875965, + "learning_rate": 1.9902400336673107e-05, + "loss": 0.2828, + "step": 1439 + }, + { + "epoch": 0.07, + "grad_norm": 0.9967524541458044, + "learning_rate": 1.9902170660344323e-05, + "loss": 0.2598, + "step": 1440 + }, + { + "epoch": 0.07, + "grad_norm": 1.066795566885315, + "learning_rate": 1.99019407154186e-05, + "loss": 0.2262, + "step": 1441 + }, + { + "epoch": 0.07, + "grad_norm": 1.0092050775090406, + "learning_rate": 1.9901710501902177e-05, + "loss": 0.2339, + "step": 1442 + }, + { + "epoch": 0.07, + "grad_norm": 1.1680892173535489, + "learning_rate": 1.9901480019801297e-05, + "loss": 0.217, + "step": 1443 + }, + { + "epoch": 0.07, + "grad_norm": 0.9901721765525188, + "learning_rate": 1.990124926912221e-05, + "loss": 0.2208, + "step": 1444 + }, + { + "epoch": 0.07, + "grad_norm": 1.3413707759726372, + "learning_rate": 1.990101824987118e-05, + "loss": 0.2542, + "step": 1445 + }, + { + "epoch": 0.07, + "grad_norm": 1.0970427933144882, + "learning_rate": 1.9900786962054468e-05, + "loss": 0.2431, + "step": 1446 + }, + { + "epoch": 0.07, + "grad_norm": 1.1698370073558522, + "learning_rate": 1.9900555405678354e-05, + "loss": 0.264, + "step": 1447 + }, + { + "epoch": 0.07, + "grad_norm": 5.996959606051725, + "learning_rate": 1.990032358074911e-05, + "loss": 0.2658, + "step": 1448 + }, + { + "epoch": 0.07, + "grad_norm": 1.3875556188415172, + "learning_rate": 1.9900091487273035e-05, + "loss": 0.2672, + "step": 1449 + }, + { + "epoch": 0.07, + "grad_norm": 1.1253872775336855, + "learning_rate": 1.9899859125256417e-05, + "loss": 0.2545, + "step": 1450 + }, + { + "epoch": 0.07, + "grad_norm": 1.065186848280434, + "learning_rate": 1.989962649470556e-05, + "loss": 0.2541, + "step": 1451 + }, + { + "epoch": 0.07, + "grad_norm": 1.1268591743661116, + "learning_rate": 1.989939359562678e-05, + "loss": 0.2451, + "step": 1452 + }, + { + "epoch": 0.07, + "grad_norm": 1.2351628782733877, + "learning_rate": 1.9899160428026383e-05, + "loss": 0.2669, + "step": 1453 + }, + { + "epoch": 0.07, + "grad_norm": 1.2936651171469973, + "learning_rate": 1.9898926991910704e-05, + "loss": 0.2772, + "step": 1454 + }, + { + "epoch": 0.07, + "grad_norm": 1.2085038183262553, + "learning_rate": 1.989869328728607e-05, + "loss": 0.2427, + "step": 1455 + }, + { + "epoch": 0.07, + "grad_norm": 1.4147175272036487, + "learning_rate": 1.9898459314158825e-05, + "loss": 0.2632, + "step": 1456 + }, + { + "epoch": 0.07, + "grad_norm": 0.9223842251997182, + "learning_rate": 1.989822507253531e-05, + "loss": 0.221, + "step": 1457 + }, + { + "epoch": 0.07, + "grad_norm": 1.2893374006720397, + "learning_rate": 1.9897990562421882e-05, + "loss": 0.2401, + "step": 1458 + }, + { + "epoch": 0.07, + "grad_norm": 0.9799151191327158, + "learning_rate": 1.9897755783824897e-05, + "loss": 0.2362, + "step": 1459 + }, + { + "epoch": 0.07, + "grad_norm": 1.1257687249716646, + "learning_rate": 1.989752073675073e-05, + "loss": 0.2519, + "step": 1460 + }, + { + "epoch": 0.07, + "grad_norm": 1.0071942580571653, + "learning_rate": 1.9897285421205753e-05, + "loss": 0.2347, + "step": 1461 + }, + { + "epoch": 0.07, + "grad_norm": 1.127580590734185, + "learning_rate": 1.989704983719635e-05, + "loss": 0.2469, + "step": 1462 + }, + { + "epoch": 0.07, + "grad_norm": 7.490512732609794, + "learning_rate": 1.9896813984728915e-05, + "loss": 0.2626, + "step": 1463 + }, + { + "epoch": 0.07, + "grad_norm": 1.5208393177639197, + "learning_rate": 1.9896577863809836e-05, + "loss": 0.2562, + "step": 1464 + }, + { + "epoch": 0.07, + "grad_norm": 1.3154094170613637, + "learning_rate": 1.9896341474445526e-05, + "loss": 0.2369, + "step": 1465 + }, + { + "epoch": 0.07, + "grad_norm": 1.3400389440702367, + "learning_rate": 1.9896104816642393e-05, + "loss": 0.2435, + "step": 1466 + }, + { + "epoch": 0.07, + "grad_norm": 1.245380768826678, + "learning_rate": 1.989586789040686e-05, + "loss": 0.2342, + "step": 1467 + }, + { + "epoch": 0.07, + "grad_norm": 1.0622069292754814, + "learning_rate": 1.9895630695745353e-05, + "loss": 0.2514, + "step": 1468 + }, + { + "epoch": 0.07, + "grad_norm": 1.3739800782133331, + "learning_rate": 1.98953932326643e-05, + "loss": 0.2709, + "step": 1469 + }, + { + "epoch": 0.07, + "grad_norm": 1.2438019019651285, + "learning_rate": 1.9895155501170153e-05, + "loss": 0.224, + "step": 1470 + }, + { + "epoch": 0.07, + "grad_norm": 1.1099411604601386, + "learning_rate": 1.9894917501269346e-05, + "loss": 0.2386, + "step": 1471 + }, + { + "epoch": 0.07, + "grad_norm": 1.0999919296078493, + "learning_rate": 1.989467923296835e-05, + "loss": 0.2301, + "step": 1472 + }, + { + "epoch": 0.07, + "grad_norm": 0.8744198630330524, + "learning_rate": 1.9894440696273615e-05, + "loss": 0.2443, + "step": 1473 + }, + { + "epoch": 0.07, + "grad_norm": 1.462670102238129, + "learning_rate": 1.9894201891191624e-05, + "loss": 0.2574, + "step": 1474 + }, + { + "epoch": 0.08, + "grad_norm": 3.522143438867789, + "learning_rate": 1.9893962817728842e-05, + "loss": 0.2598, + "step": 1475 + }, + { + "epoch": 0.08, + "grad_norm": 2.66521405994223, + "learning_rate": 1.9893723475891762e-05, + "loss": 0.227, + "step": 1476 + }, + { + "epoch": 0.08, + "grad_norm": 1.030802909005997, + "learning_rate": 1.9893483865686875e-05, + "loss": 0.2425, + "step": 1477 + }, + { + "epoch": 0.08, + "grad_norm": 1.619965380448205, + "learning_rate": 1.989324398712068e-05, + "loss": 0.2383, + "step": 1478 + }, + { + "epoch": 0.08, + "grad_norm": 1.5818095937642507, + "learning_rate": 1.9893003840199677e-05, + "loss": 0.2516, + "step": 1479 + }, + { + "epoch": 0.08, + "grad_norm": 1.4259409101129912, + "learning_rate": 1.989276342493039e-05, + "loss": 0.2405, + "step": 1480 + }, + { + "epoch": 0.08, + "grad_norm": 0.9980110737147162, + "learning_rate": 1.989252274131934e-05, + "loss": 0.2323, + "step": 1481 + }, + { + "epoch": 0.08, + "grad_norm": 1.0821845943196076, + "learning_rate": 1.9892281789373047e-05, + "loss": 0.2628, + "step": 1482 + }, + { + "epoch": 0.08, + "grad_norm": 1.0011387530258766, + "learning_rate": 1.9892040569098054e-05, + "loss": 0.2427, + "step": 1483 + }, + { + "epoch": 0.08, + "grad_norm": 1.296208187020809, + "learning_rate": 1.98917990805009e-05, + "loss": 0.2274, + "step": 1484 + }, + { + "epoch": 0.08, + "grad_norm": 1.17330571312088, + "learning_rate": 1.989155732358814e-05, + "loss": 0.2519, + "step": 1485 + }, + { + "epoch": 0.08, + "grad_norm": 1.0333952943167788, + "learning_rate": 1.9891315298366327e-05, + "loss": 0.2308, + "step": 1486 + }, + { + "epoch": 0.08, + "grad_norm": 1.1104592349324152, + "learning_rate": 1.9891073004842026e-05, + "loss": 0.2546, + "step": 1487 + }, + { + "epoch": 0.08, + "grad_norm": 1.2835504367778445, + "learning_rate": 1.9890830443021814e-05, + "loss": 0.2398, + "step": 1488 + }, + { + "epoch": 0.08, + "grad_norm": 1.1519730418068233, + "learning_rate": 1.9890587612912268e-05, + "loss": 0.2599, + "step": 1489 + }, + { + "epoch": 0.08, + "grad_norm": 1.1704596933418903, + "learning_rate": 1.9890344514519974e-05, + "loss": 0.2392, + "step": 1490 + }, + { + "epoch": 0.08, + "grad_norm": 1.349986371882224, + "learning_rate": 1.9890101147851526e-05, + "loss": 0.2604, + "step": 1491 + }, + { + "epoch": 0.08, + "grad_norm": 1.097941104349231, + "learning_rate": 1.9889857512913523e-05, + "loss": 0.2556, + "step": 1492 + }, + { + "epoch": 0.08, + "grad_norm": 1.7466568104138178, + "learning_rate": 1.988961360971258e-05, + "loss": 0.2476, + "step": 1493 + }, + { + "epoch": 0.08, + "grad_norm": 1.1822046880072066, + "learning_rate": 1.988936943825531e-05, + "loss": 0.2595, + "step": 1494 + }, + { + "epoch": 0.08, + "grad_norm": 0.9970149311719365, + "learning_rate": 1.9889124998548332e-05, + "loss": 0.2517, + "step": 1495 + }, + { + "epoch": 0.08, + "grad_norm": 1.0518883109686619, + "learning_rate": 1.9888880290598282e-05, + "loss": 0.2647, + "step": 1496 + }, + { + "epoch": 0.08, + "grad_norm": 1.2107601881178482, + "learning_rate": 1.9888635314411797e-05, + "loss": 0.2647, + "step": 1497 + }, + { + "epoch": 0.08, + "grad_norm": 1.0645260886991201, + "learning_rate": 1.9888390069995516e-05, + "loss": 0.2764, + "step": 1498 + }, + { + "epoch": 0.08, + "grad_norm": 1.1982898262283437, + "learning_rate": 1.98881445573561e-05, + "loss": 0.243, + "step": 1499 + }, + { + "epoch": 0.08, + "grad_norm": 1.2710638079525047, + "learning_rate": 1.9887898776500203e-05, + "loss": 0.2552, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 1.2871447436383632, + "learning_rate": 1.9887652727434492e-05, + "loss": 0.2446, + "step": 1501 + }, + { + "epoch": 0.08, + "grad_norm": 1.1434019027245008, + "learning_rate": 1.9887406410165644e-05, + "loss": 0.2352, + "step": 1502 + }, + { + "epoch": 0.08, + "grad_norm": 1.405194585504932, + "learning_rate": 1.988715982470034e-05, + "loss": 0.2275, + "step": 1503 + }, + { + "epoch": 0.08, + "grad_norm": 1.4582424939786653, + "learning_rate": 1.9886912971045263e-05, + "loss": 0.216, + "step": 1504 + }, + { + "epoch": 0.08, + "grad_norm": 1.7604856830727356, + "learning_rate": 1.9886665849207116e-05, + "loss": 0.268, + "step": 1505 + }, + { + "epoch": 0.08, + "grad_norm": 1.0954405919257795, + "learning_rate": 1.98864184591926e-05, + "loss": 0.2502, + "step": 1506 + }, + { + "epoch": 0.08, + "grad_norm": 1.226051267596389, + "learning_rate": 1.9886170801008423e-05, + "loss": 0.2505, + "step": 1507 + }, + { + "epoch": 0.08, + "grad_norm": 1.2075409268454134, + "learning_rate": 1.9885922874661308e-05, + "loss": 0.2461, + "step": 1508 + }, + { + "epoch": 0.08, + "grad_norm": 1.4044291962244422, + "learning_rate": 1.9885674680157974e-05, + "loss": 0.2331, + "step": 1509 + }, + { + "epoch": 0.08, + "grad_norm": 1.892375728058062, + "learning_rate": 1.9885426217505154e-05, + "loss": 0.2579, + "step": 1510 + }, + { + "epoch": 0.08, + "grad_norm": 1.0109769323742304, + "learning_rate": 1.9885177486709595e-05, + "loss": 0.2407, + "step": 1511 + }, + { + "epoch": 0.08, + "grad_norm": 1.2523333881157754, + "learning_rate": 1.988492848777803e-05, + "loss": 0.2368, + "step": 1512 + }, + { + "epoch": 0.08, + "grad_norm": 0.9684049169883828, + "learning_rate": 1.9884679220717232e-05, + "loss": 0.238, + "step": 1513 + }, + { + "epoch": 0.08, + "grad_norm": 0.985704737514277, + "learning_rate": 1.9884429685533947e-05, + "loss": 0.2292, + "step": 1514 + }, + { + "epoch": 0.08, + "grad_norm": 1.5076127760722757, + "learning_rate": 1.9884179882234946e-05, + "loss": 0.2582, + "step": 1515 + }, + { + "epoch": 0.08, + "grad_norm": 1.1491759101287482, + "learning_rate": 1.988392981082701e-05, + "loss": 0.2619, + "step": 1516 + }, + { + "epoch": 0.08, + "grad_norm": 1.016203850638015, + "learning_rate": 1.9883679471316918e-05, + "loss": 0.2411, + "step": 1517 + }, + { + "epoch": 0.08, + "grad_norm": 1.237079163922459, + "learning_rate": 1.9883428863711463e-05, + "loss": 0.2125, + "step": 1518 + }, + { + "epoch": 0.08, + "grad_norm": 1.1480551559538474, + "learning_rate": 1.9883177988017444e-05, + "loss": 0.2565, + "step": 1519 + }, + { + "epoch": 0.08, + "grad_norm": 1.1236030361228728, + "learning_rate": 1.9882926844241662e-05, + "loss": 0.2568, + "step": 1520 + }, + { + "epoch": 0.08, + "grad_norm": 1.0466955366612953, + "learning_rate": 1.988267543239093e-05, + "loss": 0.2243, + "step": 1521 + }, + { + "epoch": 0.08, + "grad_norm": 1.1433635714952062, + "learning_rate": 1.988242375247207e-05, + "loss": 0.2695, + "step": 1522 + }, + { + "epoch": 0.08, + "grad_norm": 0.9521221469115966, + "learning_rate": 1.9882171804491905e-05, + "loss": 0.2548, + "step": 1523 + }, + { + "epoch": 0.08, + "grad_norm": 1.032790584538378, + "learning_rate": 1.9881919588457274e-05, + "loss": 0.219, + "step": 1524 + }, + { + "epoch": 0.08, + "grad_norm": 1.1202926836453848, + "learning_rate": 1.9881667104375018e-05, + "loss": 0.2256, + "step": 1525 + }, + { + "epoch": 0.08, + "grad_norm": 1.0362677561309366, + "learning_rate": 1.988141435225198e-05, + "loss": 0.2424, + "step": 1526 + }, + { + "epoch": 0.08, + "grad_norm": 1.108121046698207, + "learning_rate": 1.9881161332095024e-05, + "loss": 0.259, + "step": 1527 + }, + { + "epoch": 0.08, + "grad_norm": 1.106705413680249, + "learning_rate": 1.9880908043911004e-05, + "loss": 0.2494, + "step": 1528 + }, + { + "epoch": 0.08, + "grad_norm": 0.8919959929553708, + "learning_rate": 1.9880654487706797e-05, + "loss": 0.2384, + "step": 1529 + }, + { + "epoch": 0.08, + "grad_norm": 1.1403536176285372, + "learning_rate": 1.988040066348928e-05, + "loss": 0.2442, + "step": 1530 + }, + { + "epoch": 0.08, + "grad_norm": 1.0397301835981851, + "learning_rate": 1.9880146571265337e-05, + "loss": 0.2215, + "step": 1531 + }, + { + "epoch": 0.08, + "grad_norm": 1.0570264901588597, + "learning_rate": 1.987989221104186e-05, + "loss": 0.2186, + "step": 1532 + }, + { + "epoch": 0.08, + "grad_norm": 1.5454973258691305, + "learning_rate": 1.987963758282575e-05, + "loss": 0.2435, + "step": 1533 + }, + { + "epoch": 0.08, + "grad_norm": 1.00095769815178, + "learning_rate": 1.987938268662391e-05, + "loss": 0.2398, + "step": 1534 + }, + { + "epoch": 0.08, + "grad_norm": 1.0314401299992415, + "learning_rate": 1.9879127522443255e-05, + "loss": 0.2516, + "step": 1535 + }, + { + "epoch": 0.08, + "grad_norm": 1.0239176880004133, + "learning_rate": 1.9878872090290714e-05, + "loss": 0.2209, + "step": 1536 + }, + { + "epoch": 0.08, + "grad_norm": 2.1516319522863183, + "learning_rate": 1.98786163901732e-05, + "loss": 0.2179, + "step": 1537 + }, + { + "epoch": 0.08, + "grad_norm": 1.0899641412385346, + "learning_rate": 1.9878360422097666e-05, + "loss": 0.2629, + "step": 1538 + }, + { + "epoch": 0.08, + "grad_norm": 1.1161182738196964, + "learning_rate": 1.9878104186071047e-05, + "loss": 0.2669, + "step": 1539 + }, + { + "epoch": 0.08, + "grad_norm": 0.9331629099333119, + "learning_rate": 1.9877847682100294e-05, + "loss": 0.2226, + "step": 1540 + }, + { + "epoch": 0.08, + "grad_norm": 0.8936707439041887, + "learning_rate": 1.9877590910192362e-05, + "loss": 0.2725, + "step": 1541 + }, + { + "epoch": 0.08, + "grad_norm": 1.127775496155315, + "learning_rate": 1.987733387035422e-05, + "loss": 0.2563, + "step": 1542 + }, + { + "epoch": 0.08, + "grad_norm": 1.1260913141490958, + "learning_rate": 1.9877076562592844e-05, + "loss": 0.2322, + "step": 1543 + }, + { + "epoch": 0.08, + "grad_norm": 0.8980360219871005, + "learning_rate": 1.9876818986915202e-05, + "loss": 0.2206, + "step": 1544 + }, + { + "epoch": 0.08, + "grad_norm": 0.9796120458036948, + "learning_rate": 1.9876561143328287e-05, + "loss": 0.2516, + "step": 1545 + }, + { + "epoch": 0.08, + "grad_norm": 1.2629491204181673, + "learning_rate": 1.9876303031839094e-05, + "loss": 0.2083, + "step": 1546 + }, + { + "epoch": 0.08, + "grad_norm": 1.0874082802575349, + "learning_rate": 1.9876044652454627e-05, + "loss": 0.2649, + "step": 1547 + }, + { + "epoch": 0.08, + "grad_norm": 1.1914776664656248, + "learning_rate": 1.987578600518189e-05, + "loss": 0.2606, + "step": 1548 + }, + { + "epoch": 0.08, + "grad_norm": 0.9057171643487041, + "learning_rate": 1.9875527090027897e-05, + "loss": 0.2399, + "step": 1549 + }, + { + "epoch": 0.08, + "grad_norm": 1.0640987292525308, + "learning_rate": 1.9875267906999674e-05, + "loss": 0.2369, + "step": 1550 + }, + { + "epoch": 0.08, + "grad_norm": 1.1909353635895097, + "learning_rate": 1.987500845610425e-05, + "loss": 0.2415, + "step": 1551 + }, + { + "epoch": 0.08, + "grad_norm": 1.1777676002953559, + "learning_rate": 1.987474873734867e-05, + "loss": 0.2381, + "step": 1552 + }, + { + "epoch": 0.08, + "grad_norm": 1.0400464992950833, + "learning_rate": 1.987448875073997e-05, + "loss": 0.2272, + "step": 1553 + }, + { + "epoch": 0.08, + "grad_norm": 0.9159186299023466, + "learning_rate": 1.9874228496285203e-05, + "loss": 0.2326, + "step": 1554 + }, + { + "epoch": 0.08, + "grad_norm": 1.0776077764374352, + "learning_rate": 1.9873967973991432e-05, + "loss": 0.251, + "step": 1555 + }, + { + "epoch": 0.08, + "grad_norm": 1.4205363639530653, + "learning_rate": 1.987370718386572e-05, + "loss": 0.2427, + "step": 1556 + }, + { + "epoch": 0.08, + "grad_norm": 1.1243700994075487, + "learning_rate": 1.9873446125915145e-05, + "loss": 0.2768, + "step": 1557 + }, + { + "epoch": 0.08, + "grad_norm": 1.1028876890940438, + "learning_rate": 1.9873184800146785e-05, + "loss": 0.2489, + "step": 1558 + }, + { + "epoch": 0.08, + "grad_norm": 1.0126681453424553, + "learning_rate": 1.987292320656773e-05, + "loss": 0.2471, + "step": 1559 + }, + { + "epoch": 0.08, + "grad_norm": 0.9566294464744491, + "learning_rate": 1.9872661345185076e-05, + "loss": 0.2393, + "step": 1560 + }, + { + "epoch": 0.08, + "grad_norm": 1.3368115973005088, + "learning_rate": 1.9872399216005928e-05, + "loss": 0.2478, + "step": 1561 + }, + { + "epoch": 0.08, + "grad_norm": 1.592650857695837, + "learning_rate": 1.9872136819037388e-05, + "loss": 0.2437, + "step": 1562 + }, + { + "epoch": 0.08, + "grad_norm": 1.1068509765308028, + "learning_rate": 1.9871874154286585e-05, + "loss": 0.2383, + "step": 1563 + }, + { + "epoch": 0.08, + "grad_norm": 1.0576437487873738, + "learning_rate": 1.987161122176063e-05, + "loss": 0.2348, + "step": 1564 + }, + { + "epoch": 0.08, + "grad_norm": 1.0079938384705214, + "learning_rate": 1.9871348021466673e-05, + "loss": 0.2355, + "step": 1565 + }, + { + "epoch": 0.08, + "grad_norm": 1.4585402407253452, + "learning_rate": 1.9871084553411835e-05, + "loss": 0.2369, + "step": 1566 + }, + { + "epoch": 0.08, + "grad_norm": 1.0320403920202317, + "learning_rate": 1.9870820817603276e-05, + "loss": 0.2191, + "step": 1567 + }, + { + "epoch": 0.08, + "grad_norm": 1.0270957769308535, + "learning_rate": 1.9870556814048145e-05, + "loss": 0.2427, + "step": 1568 + }, + { + "epoch": 0.08, + "grad_norm": 1.2074542042203558, + "learning_rate": 1.9870292542753596e-05, + "loss": 0.2504, + "step": 1569 + }, + { + "epoch": 0.08, + "grad_norm": 0.942970746262455, + "learning_rate": 1.987002800372681e-05, + "loss": 0.2212, + "step": 1570 + }, + { + "epoch": 0.08, + "grad_norm": 0.9104940249554769, + "learning_rate": 1.9869763196974957e-05, + "loss": 0.2596, + "step": 1571 + }, + { + "epoch": 0.08, + "grad_norm": 0.8644721366170993, + "learning_rate": 1.986949812250522e-05, + "loss": 0.2258, + "step": 1572 + }, + { + "epoch": 0.08, + "grad_norm": 1.1098327646726607, + "learning_rate": 1.986923278032479e-05, + "loss": 0.2533, + "step": 1573 + }, + { + "epoch": 0.08, + "grad_norm": 1.4782949178290852, + "learning_rate": 1.986896717044086e-05, + "loss": 0.3009, + "step": 1574 + }, + { + "epoch": 0.08, + "grad_norm": 1.0298685807340697, + "learning_rate": 1.986870129286064e-05, + "loss": 0.2592, + "step": 1575 + }, + { + "epoch": 0.08, + "grad_norm": 1.2470618274705578, + "learning_rate": 1.986843514759134e-05, + "loss": 0.2426, + "step": 1576 + }, + { + "epoch": 0.08, + "grad_norm": 1.266134386190633, + "learning_rate": 1.986816873464018e-05, + "loss": 0.2532, + "step": 1577 + }, + { + "epoch": 0.08, + "grad_norm": 1.4067436040344086, + "learning_rate": 1.9867902054014382e-05, + "loss": 0.279, + "step": 1578 + }, + { + "epoch": 0.08, + "grad_norm": 0.8698043011661225, + "learning_rate": 1.986763510572119e-05, + "loss": 0.2399, + "step": 1579 + }, + { + "epoch": 0.08, + "grad_norm": 1.0193089125859338, + "learning_rate": 1.986736788976783e-05, + "loss": 0.2277, + "step": 1580 + }, + { + "epoch": 0.08, + "grad_norm": 1.17900717751552, + "learning_rate": 1.9867100406161563e-05, + "loss": 0.2609, + "step": 1581 + }, + { + "epoch": 0.08, + "grad_norm": 1.062165241963998, + "learning_rate": 1.986683265490964e-05, + "loss": 0.2747, + "step": 1582 + }, + { + "epoch": 0.08, + "grad_norm": 1.137051073249643, + "learning_rate": 1.9866564636019326e-05, + "loss": 0.2288, + "step": 1583 + }, + { + "epoch": 0.08, + "grad_norm": 1.1032651435013483, + "learning_rate": 1.9866296349497885e-05, + "loss": 0.2267, + "step": 1584 + }, + { + "epoch": 0.08, + "grad_norm": 0.9910774254352182, + "learning_rate": 1.9866027795352604e-05, + "loss": 0.2767, + "step": 1585 + }, + { + "epoch": 0.08, + "grad_norm": 1.0635540904541145, + "learning_rate": 1.986575897359076e-05, + "loss": 0.2722, + "step": 1586 + }, + { + "epoch": 0.08, + "grad_norm": 0.9511029033089041, + "learning_rate": 1.9865489884219643e-05, + "loss": 0.2284, + "step": 1587 + }, + { + "epoch": 0.08, + "grad_norm": 1.139944510002614, + "learning_rate": 1.9865220527246556e-05, + "loss": 0.2355, + "step": 1588 + }, + { + "epoch": 0.08, + "grad_norm": 1.3021053893170482, + "learning_rate": 1.986495090267881e-05, + "loss": 0.2657, + "step": 1589 + }, + { + "epoch": 0.08, + "grad_norm": 1.0402349598351108, + "learning_rate": 1.986468101052371e-05, + "loss": 0.2491, + "step": 1590 + }, + { + "epoch": 0.08, + "grad_norm": 0.9833347233482201, + "learning_rate": 1.986441085078858e-05, + "loss": 0.2408, + "step": 1591 + }, + { + "epoch": 0.08, + "grad_norm": 0.9702896415133163, + "learning_rate": 1.986414042348075e-05, + "loss": 0.2296, + "step": 1592 + }, + { + "epoch": 0.08, + "grad_norm": 1.8403703196326306, + "learning_rate": 1.9863869728607553e-05, + "loss": 0.2327, + "step": 1593 + }, + { + "epoch": 0.08, + "grad_norm": 0.9887059891575193, + "learning_rate": 1.986359876617633e-05, + "loss": 0.2416, + "step": 1594 + }, + { + "epoch": 0.08, + "grad_norm": 1.15694928653404, + "learning_rate": 1.9863327536194438e-05, + "loss": 0.2196, + "step": 1595 + }, + { + "epoch": 0.08, + "grad_norm": 1.0068578045610728, + "learning_rate": 1.9863056038669225e-05, + "loss": 0.2533, + "step": 1596 + }, + { + "epoch": 0.08, + "grad_norm": 0.9916793991803713, + "learning_rate": 1.9862784273608066e-05, + "loss": 0.2549, + "step": 1597 + }, + { + "epoch": 0.08, + "grad_norm": 1.0031710272512935, + "learning_rate": 1.986251224101832e-05, + "loss": 0.231, + "step": 1598 + }, + { + "epoch": 0.08, + "grad_norm": 1.021474671000375, + "learning_rate": 1.9862239940907377e-05, + "loss": 0.2491, + "step": 1599 + }, + { + "epoch": 0.08, + "grad_norm": 0.8310914145791256, + "learning_rate": 1.986196737328262e-05, + "loss": 0.2451, + "step": 1600 + }, + { + "epoch": 0.08, + "grad_norm": 0.8690234167987716, + "learning_rate": 1.9861694538151436e-05, + "loss": 0.2687, + "step": 1601 + }, + { + "epoch": 0.08, + "grad_norm": 1.1227562514141765, + "learning_rate": 1.9861421435521234e-05, + "loss": 0.2742, + "step": 1602 + }, + { + "epoch": 0.08, + "grad_norm": 0.9654518363415745, + "learning_rate": 1.9861148065399414e-05, + "loss": 0.2318, + "step": 1603 + }, + { + "epoch": 0.08, + "grad_norm": 1.1076305396603072, + "learning_rate": 1.98608744277934e-05, + "loss": 0.2421, + "step": 1604 + }, + { + "epoch": 0.08, + "grad_norm": 1.4547345595987675, + "learning_rate": 1.986060052271061e-05, + "loss": 0.2305, + "step": 1605 + }, + { + "epoch": 0.08, + "grad_norm": 0.8491699775291529, + "learning_rate": 1.9860326350158472e-05, + "loss": 0.225, + "step": 1606 + }, + { + "epoch": 0.08, + "grad_norm": 0.8319543451433087, + "learning_rate": 1.9860051910144426e-05, + "loss": 0.2435, + "step": 1607 + }, + { + "epoch": 0.08, + "grad_norm": 0.8100604809756528, + "learning_rate": 1.9859777202675915e-05, + "loss": 0.2111, + "step": 1608 + }, + { + "epoch": 0.08, + "grad_norm": 0.9527615886593352, + "learning_rate": 1.985950222776039e-05, + "loss": 0.2644, + "step": 1609 + }, + { + "epoch": 0.08, + "grad_norm": 1.0894915270026155, + "learning_rate": 1.9859226985405312e-05, + "loss": 0.2577, + "step": 1610 + }, + { + "epoch": 0.08, + "grad_norm": 1.215983338220503, + "learning_rate": 1.985895147561814e-05, + "loss": 0.2239, + "step": 1611 + }, + { + "epoch": 0.08, + "grad_norm": 0.9302355264268377, + "learning_rate": 1.985867569840636e-05, + "loss": 0.2541, + "step": 1612 + }, + { + "epoch": 0.08, + "grad_norm": 4.600810373609161, + "learning_rate": 1.985839965377744e-05, + "loss": 0.24, + "step": 1613 + }, + { + "epoch": 0.08, + "grad_norm": 1.2071590122653046, + "learning_rate": 1.9858123341738877e-05, + "loss": 0.255, + "step": 1614 + }, + { + "epoch": 0.08, + "grad_norm": 1.078164560236254, + "learning_rate": 1.9857846762298157e-05, + "loss": 0.2397, + "step": 1615 + }, + { + "epoch": 0.08, + "grad_norm": 1.2358085343430798, + "learning_rate": 1.9857569915462787e-05, + "loss": 0.2642, + "step": 1616 + }, + { + "epoch": 0.08, + "grad_norm": 1.136443380213396, + "learning_rate": 1.9857292801240278e-05, + "loss": 0.2348, + "step": 1617 + }, + { + "epoch": 0.08, + "grad_norm": 1.256005542958859, + "learning_rate": 1.985701541963814e-05, + "loss": 0.2263, + "step": 1618 + }, + { + "epoch": 0.08, + "grad_norm": 0.9861752701380436, + "learning_rate": 1.9856737770663908e-05, + "loss": 0.2171, + "step": 1619 + }, + { + "epoch": 0.08, + "grad_norm": 1.0699500859875848, + "learning_rate": 1.9856459854325108e-05, + "loss": 0.2312, + "step": 1620 + }, + { + "epoch": 0.08, + "grad_norm": 1.0277239815636015, + "learning_rate": 1.9856181670629272e-05, + "loss": 0.2221, + "step": 1621 + }, + { + "epoch": 0.08, + "grad_norm": 1.1622788998264484, + "learning_rate": 1.985590321958396e-05, + "loss": 0.2658, + "step": 1622 + }, + { + "epoch": 0.08, + "grad_norm": 0.9654394119932761, + "learning_rate": 1.985562450119671e-05, + "loss": 0.2127, + "step": 1623 + }, + { + "epoch": 0.08, + "grad_norm": 0.8818972933724379, + "learning_rate": 1.985534551547509e-05, + "loss": 0.2402, + "step": 1624 + }, + { + "epoch": 0.08, + "grad_norm": 1.0822541435500859, + "learning_rate": 1.9855066262426663e-05, + "loss": 0.245, + "step": 1625 + }, + { + "epoch": 0.08, + "grad_norm": 1.0792182766553684, + "learning_rate": 1.9854786742059012e-05, + "loss": 0.2229, + "step": 1626 + }, + { + "epoch": 0.08, + "grad_norm": 0.9164683009523013, + "learning_rate": 1.9854506954379714e-05, + "loss": 0.2442, + "step": 1627 + }, + { + "epoch": 0.08, + "grad_norm": 1.1155262220942679, + "learning_rate": 1.9854226899396356e-05, + "loss": 0.2657, + "step": 1628 + }, + { + "epoch": 0.08, + "grad_norm": 1.3021145519257342, + "learning_rate": 1.9853946577116536e-05, + "loss": 0.23, + "step": 1629 + }, + { + "epoch": 0.08, + "grad_norm": 1.1384442147020182, + "learning_rate": 1.985366598754786e-05, + "loss": 0.2619, + "step": 1630 + }, + { + "epoch": 0.08, + "grad_norm": 1.5198425729758447, + "learning_rate": 1.985338513069794e-05, + "loss": 0.2424, + "step": 1631 + }, + { + "epoch": 0.08, + "grad_norm": 1.2746960811741264, + "learning_rate": 1.9853104006574387e-05, + "loss": 0.2398, + "step": 1632 + }, + { + "epoch": 0.08, + "grad_norm": 0.9081894042267109, + "learning_rate": 1.9852822615184835e-05, + "loss": 0.2712, + "step": 1633 + }, + { + "epoch": 0.08, + "grad_norm": 0.9654287887774892, + "learning_rate": 1.9852540956536912e-05, + "loss": 0.2482, + "step": 1634 + }, + { + "epoch": 0.08, + "grad_norm": 0.9874173268909807, + "learning_rate": 1.985225903063826e-05, + "loss": 0.2628, + "step": 1635 + }, + { + "epoch": 0.08, + "grad_norm": 0.8587542362193802, + "learning_rate": 1.9851976837496522e-05, + "loss": 0.2567, + "step": 1636 + }, + { + "epoch": 0.08, + "grad_norm": 0.8944630553655413, + "learning_rate": 1.9851694377119358e-05, + "loss": 0.2313, + "step": 1637 + }, + { + "epoch": 0.08, + "grad_norm": 0.9580136632504814, + "learning_rate": 1.985141164951443e-05, + "loss": 0.2637, + "step": 1638 + }, + { + "epoch": 0.08, + "grad_norm": 1.0240173091734972, + "learning_rate": 1.98511286546894e-05, + "loss": 0.2588, + "step": 1639 + }, + { + "epoch": 0.08, + "grad_norm": 1.1632183346348985, + "learning_rate": 1.985084539265195e-05, + "loss": 0.2492, + "step": 1640 + }, + { + "epoch": 0.08, + "grad_norm": 1.2418168208945377, + "learning_rate": 1.9850561863409763e-05, + "loss": 0.245, + "step": 1641 + }, + { + "epoch": 0.08, + "grad_norm": 1.2675161459462643, + "learning_rate": 1.985027806697053e-05, + "loss": 0.2235, + "step": 1642 + }, + { + "epoch": 0.08, + "grad_norm": 1.4699273219751203, + "learning_rate": 1.984999400334195e-05, + "loss": 0.2787, + "step": 1643 + }, + { + "epoch": 0.08, + "grad_norm": 1.3416851354072679, + "learning_rate": 1.9849709672531724e-05, + "loss": 0.2144, + "step": 1644 + }, + { + "epoch": 0.08, + "grad_norm": 1.588177614535934, + "learning_rate": 1.9849425074547565e-05, + "loss": 0.2643, + "step": 1645 + }, + { + "epoch": 0.08, + "grad_norm": 1.4621532798948187, + "learning_rate": 1.98491402093972e-05, + "loss": 0.2434, + "step": 1646 + }, + { + "epoch": 0.08, + "grad_norm": 1.016718930677667, + "learning_rate": 1.9848855077088346e-05, + "loss": 0.245, + "step": 1647 + }, + { + "epoch": 0.08, + "grad_norm": 0.960429641737833, + "learning_rate": 1.984856967762874e-05, + "loss": 0.2353, + "step": 1648 + }, + { + "epoch": 0.08, + "grad_norm": 1.0038547952300165, + "learning_rate": 1.984828401102613e-05, + "loss": 0.2471, + "step": 1649 + }, + { + "epoch": 0.08, + "grad_norm": 1.5716038810095376, + "learning_rate": 1.9847998077288255e-05, + "loss": 0.2624, + "step": 1650 + }, + { + "epoch": 0.08, + "grad_norm": 1.0463153168335042, + "learning_rate": 1.984771187642288e-05, + "loss": 0.2196, + "step": 1651 + }, + { + "epoch": 0.08, + "grad_norm": 1.1341781324491753, + "learning_rate": 1.9847425408437763e-05, + "loss": 0.2548, + "step": 1652 + }, + { + "epoch": 0.08, + "grad_norm": 1.3841618420365462, + "learning_rate": 1.9847138673340675e-05, + "loss": 0.2438, + "step": 1653 + }, + { + "epoch": 0.08, + "grad_norm": 1.0169550684413193, + "learning_rate": 1.9846851671139394e-05, + "loss": 0.2607, + "step": 1654 + }, + { + "epoch": 0.08, + "grad_norm": 1.3519043264751935, + "learning_rate": 1.984656440184171e-05, + "loss": 0.2607, + "step": 1655 + }, + { + "epoch": 0.08, + "grad_norm": 1.2608149031054832, + "learning_rate": 1.9846276865455403e-05, + "loss": 0.2437, + "step": 1656 + }, + { + "epoch": 0.08, + "grad_norm": 1.0108753451264925, + "learning_rate": 1.9845989061988283e-05, + "loss": 0.2487, + "step": 1657 + }, + { + "epoch": 0.08, + "grad_norm": 0.9027978818842535, + "learning_rate": 1.9845700991448154e-05, + "loss": 0.2359, + "step": 1658 + }, + { + "epoch": 0.08, + "grad_norm": 1.250101052431056, + "learning_rate": 1.984541265384283e-05, + "loss": 0.2632, + "step": 1659 + }, + { + "epoch": 0.08, + "grad_norm": 1.3325062156063474, + "learning_rate": 1.9845124049180132e-05, + "loss": 0.2225, + "step": 1660 + }, + { + "epoch": 0.08, + "grad_norm": 0.9988706987608464, + "learning_rate": 1.9844835177467886e-05, + "loss": 0.2321, + "step": 1661 + }, + { + "epoch": 0.08, + "grad_norm": 1.120658383739874, + "learning_rate": 1.984454603871393e-05, + "loss": 0.2576, + "step": 1662 + }, + { + "epoch": 0.08, + "grad_norm": 0.9601587400055499, + "learning_rate": 1.984425663292611e-05, + "loss": 0.2143, + "step": 1663 + }, + { + "epoch": 0.08, + "grad_norm": 0.9261788091835301, + "learning_rate": 1.984396696011227e-05, + "loss": 0.2469, + "step": 1664 + }, + { + "epoch": 0.08, + "grad_norm": 1.5508358929347974, + "learning_rate": 1.984367702028027e-05, + "loss": 0.2642, + "step": 1665 + }, + { + "epoch": 0.08, + "grad_norm": 0.9767778816683551, + "learning_rate": 1.9843386813437976e-05, + "loss": 0.2367, + "step": 1666 + }, + { + "epoch": 0.08, + "grad_norm": 1.0196991564528861, + "learning_rate": 1.984309633959326e-05, + "loss": 0.2413, + "step": 1667 + }, + { + "epoch": 0.08, + "grad_norm": 1.316436967597623, + "learning_rate": 1.9842805598753997e-05, + "loss": 0.2642, + "step": 1668 + }, + { + "epoch": 0.08, + "grad_norm": 1.2167182228798439, + "learning_rate": 1.984251459092808e-05, + "loss": 0.2275, + "step": 1669 + }, + { + "epoch": 0.08, + "grad_norm": 1.1146683861566014, + "learning_rate": 1.9842223316123393e-05, + "loss": 0.2346, + "step": 1670 + }, + { + "epoch": 0.08, + "grad_norm": 1.3334225033325622, + "learning_rate": 1.9841931774347846e-05, + "loss": 0.2545, + "step": 1671 + }, + { + "epoch": 0.09, + "grad_norm": 1.3786897506461797, + "learning_rate": 1.9841639965609344e-05, + "loss": 0.2298, + "step": 1672 + }, + { + "epoch": 0.09, + "grad_norm": 1.612244430590351, + "learning_rate": 1.9841347889915804e-05, + "loss": 0.2449, + "step": 1673 + }, + { + "epoch": 0.09, + "grad_norm": 1.2949843857838457, + "learning_rate": 1.9841055547275142e-05, + "loss": 0.2405, + "step": 1674 + }, + { + "epoch": 0.09, + "grad_norm": 1.229655683707706, + "learning_rate": 1.9840762937695296e-05, + "loss": 0.2706, + "step": 1675 + }, + { + "epoch": 0.09, + "grad_norm": 1.5452909783071291, + "learning_rate": 1.98404700611842e-05, + "loss": 0.2369, + "step": 1676 + }, + { + "epoch": 0.09, + "grad_norm": 0.9981909241193303, + "learning_rate": 1.9840176917749795e-05, + "loss": 0.2494, + "step": 1677 + }, + { + "epoch": 0.09, + "grad_norm": 1.0870733215780806, + "learning_rate": 1.9839883507400037e-05, + "loss": 0.2528, + "step": 1678 + }, + { + "epoch": 0.09, + "grad_norm": 1.0528706352105188, + "learning_rate": 1.9839589830142882e-05, + "loss": 0.257, + "step": 1679 + }, + { + "epoch": 0.09, + "grad_norm": 1.1201032075648276, + "learning_rate": 1.98392958859863e-05, + "loss": 0.2492, + "step": 1680 + }, + { + "epoch": 0.09, + "grad_norm": 1.0474478400549914, + "learning_rate": 1.983900167493826e-05, + "loss": 0.2926, + "step": 1681 + }, + { + "epoch": 0.09, + "grad_norm": 1.0154346670627037, + "learning_rate": 1.983870719700674e-05, + "loss": 0.2386, + "step": 1682 + }, + { + "epoch": 0.09, + "grad_norm": 0.8657370203828151, + "learning_rate": 1.9838412452199732e-05, + "loss": 0.242, + "step": 1683 + }, + { + "epoch": 0.09, + "grad_norm": 0.918886234549711, + "learning_rate": 1.9838117440525236e-05, + "loss": 0.2439, + "step": 1684 + }, + { + "epoch": 0.09, + "grad_norm": 0.9403024089951526, + "learning_rate": 1.9837822161991248e-05, + "loss": 0.2429, + "step": 1685 + }, + { + "epoch": 0.09, + "grad_norm": 1.084076095905887, + "learning_rate": 1.9837526616605774e-05, + "loss": 0.2335, + "step": 1686 + }, + { + "epoch": 0.09, + "grad_norm": 1.7367729442066835, + "learning_rate": 1.9837230804376838e-05, + "loss": 0.2293, + "step": 1687 + }, + { + "epoch": 0.09, + "grad_norm": 0.9329776639522541, + "learning_rate": 1.983693472531246e-05, + "loss": 0.2169, + "step": 1688 + }, + { + "epoch": 0.09, + "grad_norm": 1.2153526164638409, + "learning_rate": 1.983663837942067e-05, + "loss": 0.2542, + "step": 1689 + }, + { + "epoch": 0.09, + "grad_norm": 0.956868994422886, + "learning_rate": 1.983634176670951e-05, + "loss": 0.2591, + "step": 1690 + }, + { + "epoch": 0.09, + "grad_norm": 1.029881962928075, + "learning_rate": 1.9836044887187023e-05, + "loss": 0.2213, + "step": 1691 + }, + { + "epoch": 0.09, + "grad_norm": 1.1669934190529483, + "learning_rate": 1.9835747740861266e-05, + "loss": 0.2626, + "step": 1692 + }, + { + "epoch": 0.09, + "grad_norm": 0.9980708214268654, + "learning_rate": 1.9835450327740293e-05, + "loss": 0.2788, + "step": 1693 + }, + { + "epoch": 0.09, + "grad_norm": 1.158179447044645, + "learning_rate": 1.9835152647832175e-05, + "loss": 0.2486, + "step": 1694 + }, + { + "epoch": 0.09, + "grad_norm": 0.9135665839164294, + "learning_rate": 1.9834854701144986e-05, + "loss": 0.2286, + "step": 1695 + }, + { + "epoch": 0.09, + "grad_norm": 1.0453395818232172, + "learning_rate": 1.983455648768681e-05, + "loss": 0.2274, + "step": 1696 + }, + { + "epoch": 0.09, + "grad_norm": 1.0728487666452273, + "learning_rate": 1.983425800746573e-05, + "loss": 0.2582, + "step": 1697 + }, + { + "epoch": 0.09, + "grad_norm": 0.9397892154114075, + "learning_rate": 1.983395926048985e-05, + "loss": 0.2259, + "step": 1698 + }, + { + "epoch": 0.09, + "grad_norm": 0.9379066761125769, + "learning_rate": 1.9833660246767267e-05, + "loss": 0.2386, + "step": 1699 + }, + { + "epoch": 0.09, + "grad_norm": 1.2392563662188694, + "learning_rate": 1.9833360966306095e-05, + "loss": 0.237, + "step": 1700 + }, + { + "epoch": 0.09, + "grad_norm": 1.2609688920322724, + "learning_rate": 1.9833061419114452e-05, + "loss": 0.2885, + "step": 1701 + }, + { + "epoch": 0.09, + "grad_norm": 1.1060909223384836, + "learning_rate": 1.9832761605200464e-05, + "loss": 0.2539, + "step": 1702 + }, + { + "epoch": 0.09, + "grad_norm": 1.0220662279210742, + "learning_rate": 1.9832461524572258e-05, + "loss": 0.2272, + "step": 1703 + }, + { + "epoch": 0.09, + "grad_norm": 1.071997778470365, + "learning_rate": 1.983216117723798e-05, + "loss": 0.2574, + "step": 1704 + }, + { + "epoch": 0.09, + "grad_norm": 1.0151680433606107, + "learning_rate": 1.9831860563205776e-05, + "loss": 0.2401, + "step": 1705 + }, + { + "epoch": 0.09, + "grad_norm": 0.987979664831528, + "learning_rate": 1.98315596824838e-05, + "loss": 0.2201, + "step": 1706 + }, + { + "epoch": 0.09, + "grad_norm": 0.8212769811622768, + "learning_rate": 1.9831258535080206e-05, + "loss": 0.2338, + "step": 1707 + }, + { + "epoch": 0.09, + "grad_norm": 1.1635767738327358, + "learning_rate": 1.9830957121003176e-05, + "loss": 0.2394, + "step": 1708 + }, + { + "epoch": 0.09, + "grad_norm": 1.0173061683887226, + "learning_rate": 1.983065544026087e-05, + "loss": 0.264, + "step": 1709 + }, + { + "epoch": 0.09, + "grad_norm": 1.833164217466478, + "learning_rate": 1.9830353492861493e-05, + "loss": 0.2578, + "step": 1710 + }, + { + "epoch": 0.09, + "grad_norm": 1.022547453107026, + "learning_rate": 1.983005127881321e-05, + "loss": 0.2651, + "step": 1711 + }, + { + "epoch": 0.09, + "grad_norm": 1.0363103004554934, + "learning_rate": 1.9829748798124237e-05, + "loss": 0.2612, + "step": 1712 + }, + { + "epoch": 0.09, + "grad_norm": 0.9714966214210872, + "learning_rate": 1.9829446050802776e-05, + "loss": 0.2519, + "step": 1713 + }, + { + "epoch": 0.09, + "grad_norm": 0.9434534750808347, + "learning_rate": 1.9829143036857027e-05, + "loss": 0.2293, + "step": 1714 + }, + { + "epoch": 0.09, + "grad_norm": 1.195774260854126, + "learning_rate": 1.9828839756295223e-05, + "loss": 0.2517, + "step": 1715 + }, + { + "epoch": 0.09, + "grad_norm": 1.0262577604343457, + "learning_rate": 1.9828536209125584e-05, + "loss": 0.2463, + "step": 1716 + }, + { + "epoch": 0.09, + "grad_norm": 1.842309395951756, + "learning_rate": 1.9828232395356347e-05, + "loss": 0.2409, + "step": 1717 + }, + { + "epoch": 0.09, + "grad_norm": 1.2461010415205944, + "learning_rate": 1.982792831499575e-05, + "loss": 0.2425, + "step": 1718 + }, + { + "epoch": 0.09, + "grad_norm": 0.9829452005423613, + "learning_rate": 1.982762396805204e-05, + "loss": 0.2484, + "step": 1719 + }, + { + "epoch": 0.09, + "grad_norm": 0.984884649513512, + "learning_rate": 1.982731935453348e-05, + "loss": 0.2389, + "step": 1720 + }, + { + "epoch": 0.09, + "grad_norm": 1.1119583351575009, + "learning_rate": 1.9827014474448324e-05, + "loss": 0.2518, + "step": 1721 + }, + { + "epoch": 0.09, + "grad_norm": 1.002796824835171, + "learning_rate": 1.9826709327804846e-05, + "loss": 0.2284, + "step": 1722 + }, + { + "epoch": 0.09, + "grad_norm": 1.0157183698966548, + "learning_rate": 1.982640391461132e-05, + "loss": 0.2415, + "step": 1723 + }, + { + "epoch": 0.09, + "grad_norm": 1.260041652519267, + "learning_rate": 1.982609823487604e-05, + "loss": 0.2351, + "step": 1724 + }, + { + "epoch": 0.09, + "grad_norm": 1.3519455184908302, + "learning_rate": 1.9825792288607284e-05, + "loss": 0.2451, + "step": 1725 + }, + { + "epoch": 0.09, + "grad_norm": 1.0062626123603624, + "learning_rate": 1.982548607581336e-05, + "loss": 0.2467, + "step": 1726 + }, + { + "epoch": 0.09, + "grad_norm": 1.2439664837325628, + "learning_rate": 1.9825179596502567e-05, + "loss": 0.2509, + "step": 1727 + }, + { + "epoch": 0.09, + "grad_norm": 0.8243535374289274, + "learning_rate": 1.9824872850683226e-05, + "loss": 0.2075, + "step": 1728 + }, + { + "epoch": 0.09, + "grad_norm": 1.1505514359912987, + "learning_rate": 1.9824565838363657e-05, + "loss": 0.2419, + "step": 1729 + }, + { + "epoch": 0.09, + "grad_norm": 1.269435382278997, + "learning_rate": 1.9824258559552182e-05, + "loss": 0.239, + "step": 1730 + }, + { + "epoch": 0.09, + "grad_norm": 2.31208623122475, + "learning_rate": 1.9823951014257138e-05, + "loss": 0.2527, + "step": 1731 + }, + { + "epoch": 0.09, + "grad_norm": 1.206820363011641, + "learning_rate": 1.9823643202486867e-05, + "loss": 0.2221, + "step": 1732 + }, + { + "epoch": 0.09, + "grad_norm": 1.2041683786551478, + "learning_rate": 1.982333512424972e-05, + "loss": 0.2213, + "step": 1733 + }, + { + "epoch": 0.09, + "grad_norm": 1.0255291639405049, + "learning_rate": 1.9823026779554055e-05, + "loss": 0.2405, + "step": 1734 + }, + { + "epoch": 0.09, + "grad_norm": 1.1134042500477659, + "learning_rate": 1.982271816840823e-05, + "loss": 0.2355, + "step": 1735 + }, + { + "epoch": 0.09, + "grad_norm": 1.122621923354994, + "learning_rate": 1.982240929082062e-05, + "loss": 0.2591, + "step": 1736 + }, + { + "epoch": 0.09, + "grad_norm": 1.0109011146022282, + "learning_rate": 1.9822100146799607e-05, + "loss": 0.2268, + "step": 1737 + }, + { + "epoch": 0.09, + "grad_norm": 1.1939519658509656, + "learning_rate": 1.982179073635357e-05, + "loss": 0.2648, + "step": 1738 + }, + { + "epoch": 0.09, + "grad_norm": 1.5178683098536014, + "learning_rate": 1.9821481059490906e-05, + "loss": 0.2232, + "step": 1739 + }, + { + "epoch": 0.09, + "grad_norm": 1.9516149012483963, + "learning_rate": 1.982117111622001e-05, + "loss": 0.2476, + "step": 1740 + }, + { + "epoch": 0.09, + "grad_norm": 1.2321003493682299, + "learning_rate": 1.98208609065493e-05, + "loss": 0.2064, + "step": 1741 + }, + { + "epoch": 0.09, + "grad_norm": 1.1263545813097904, + "learning_rate": 1.982055043048718e-05, + "loss": 0.219, + "step": 1742 + }, + { + "epoch": 0.09, + "grad_norm": 0.9164576815422517, + "learning_rate": 1.982023968804207e-05, + "loss": 0.2325, + "step": 1743 + }, + { + "epoch": 0.09, + "grad_norm": 1.481776837320407, + "learning_rate": 1.981992867922241e-05, + "loss": 0.2732, + "step": 1744 + }, + { + "epoch": 0.09, + "grad_norm": 1.0480431065887095, + "learning_rate": 1.981961740403663e-05, + "loss": 0.225, + "step": 1745 + }, + { + "epoch": 0.09, + "grad_norm": 1.1453713530810714, + "learning_rate": 1.981930586249317e-05, + "loss": 0.2555, + "step": 1746 + }, + { + "epoch": 0.09, + "grad_norm": 1.0440512071308312, + "learning_rate": 1.9818994054600484e-05, + "loss": 0.2443, + "step": 1747 + }, + { + "epoch": 0.09, + "grad_norm": 1.3140536888347236, + "learning_rate": 1.981868198036703e-05, + "loss": 0.2296, + "step": 1748 + }, + { + "epoch": 0.09, + "grad_norm": 0.9589913304538162, + "learning_rate": 1.9818369639801273e-05, + "loss": 0.2269, + "step": 1749 + }, + { + "epoch": 0.09, + "grad_norm": 0.9421444481840565, + "learning_rate": 1.9818057032911687e-05, + "loss": 0.236, + "step": 1750 + }, + { + "epoch": 0.09, + "grad_norm": 1.011810083369825, + "learning_rate": 1.9817744159706746e-05, + "loss": 0.2542, + "step": 1751 + }, + { + "epoch": 0.09, + "grad_norm": 0.9867132876670602, + "learning_rate": 1.9817431020194942e-05, + "loss": 0.2248, + "step": 1752 + }, + { + "epoch": 0.09, + "grad_norm": 1.2165872518452028, + "learning_rate": 1.9817117614384767e-05, + "loss": 0.2315, + "step": 1753 + }, + { + "epoch": 0.09, + "grad_norm": 1.6573462337949036, + "learning_rate": 1.9816803942284724e-05, + "loss": 0.2482, + "step": 1754 + }, + { + "epoch": 0.09, + "grad_norm": 1.0905321049950607, + "learning_rate": 1.9816490003903316e-05, + "loss": 0.2451, + "step": 1755 + }, + { + "epoch": 0.09, + "grad_norm": 1.2974795740116438, + "learning_rate": 1.9816175799249064e-05, + "loss": 0.262, + "step": 1756 + }, + { + "epoch": 0.09, + "grad_norm": 0.9930878435396299, + "learning_rate": 1.981586132833049e-05, + "loss": 0.2834, + "step": 1757 + }, + { + "epoch": 0.09, + "grad_norm": 0.9804621023472282, + "learning_rate": 1.981554659115612e-05, + "loss": 0.249, + "step": 1758 + }, + { + "epoch": 0.09, + "grad_norm": 0.8228160230043197, + "learning_rate": 1.98152315877345e-05, + "loss": 0.2267, + "step": 1759 + }, + { + "epoch": 0.09, + "grad_norm": 0.8087224046227851, + "learning_rate": 1.9814916318074163e-05, + "loss": 0.2372, + "step": 1760 + }, + { + "epoch": 0.09, + "grad_norm": 0.9374525942013675, + "learning_rate": 1.981460078218367e-05, + "loss": 0.2595, + "step": 1761 + }, + { + "epoch": 0.09, + "grad_norm": 0.9126980800579981, + "learning_rate": 1.9814284980071578e-05, + "loss": 0.2429, + "step": 1762 + }, + { + "epoch": 0.09, + "grad_norm": 0.7815278328067415, + "learning_rate": 1.9813968911746447e-05, + "loss": 0.2128, + "step": 1763 + }, + { + "epoch": 0.09, + "grad_norm": 1.1630201006053036, + "learning_rate": 1.9813652577216858e-05, + "loss": 0.2348, + "step": 1764 + }, + { + "epoch": 0.09, + "grad_norm": 1.0067825312667709, + "learning_rate": 1.9813335976491387e-05, + "loss": 0.2426, + "step": 1765 + }, + { + "epoch": 0.09, + "grad_norm": 2.7904482493750873, + "learning_rate": 1.9813019109578623e-05, + "loss": 0.2409, + "step": 1766 + }, + { + "epoch": 0.09, + "grad_norm": 1.0847359740312001, + "learning_rate": 1.981270197648716e-05, + "loss": 0.2616, + "step": 1767 + }, + { + "epoch": 0.09, + "grad_norm": 0.935404199004101, + "learning_rate": 1.9812384577225604e-05, + "loss": 0.2277, + "step": 1768 + }, + { + "epoch": 0.09, + "grad_norm": 1.289585307813352, + "learning_rate": 1.9812066911802565e-05, + "loss": 0.2553, + "step": 1769 + }, + { + "epoch": 0.09, + "grad_norm": 1.4646201642946093, + "learning_rate": 1.981174898022665e-05, + "loss": 0.2478, + "step": 1770 + }, + { + "epoch": 0.09, + "grad_norm": 1.3498844676297264, + "learning_rate": 1.9811430782506496e-05, + "loss": 0.2371, + "step": 1771 + }, + { + "epoch": 0.09, + "grad_norm": 2.195156883689694, + "learning_rate": 1.9811112318650725e-05, + "loss": 0.2526, + "step": 1772 + }, + { + "epoch": 0.09, + "grad_norm": 1.7032110116525865, + "learning_rate": 1.9810793588667977e-05, + "loss": 0.258, + "step": 1773 + }, + { + "epoch": 0.09, + "grad_norm": 0.886767523375247, + "learning_rate": 1.98104745925669e-05, + "loss": 0.2432, + "step": 1774 + }, + { + "epoch": 0.09, + "grad_norm": 1.0485652010268987, + "learning_rate": 1.9810155330356147e-05, + "loss": 0.2278, + "step": 1775 + }, + { + "epoch": 0.09, + "grad_norm": 1.0753335063192004, + "learning_rate": 1.9809835802044378e-05, + "loss": 0.2317, + "step": 1776 + }, + { + "epoch": 0.09, + "grad_norm": 1.2770843961058662, + "learning_rate": 1.9809516007640255e-05, + "loss": 0.229, + "step": 1777 + }, + { + "epoch": 0.09, + "grad_norm": 0.8403924810021453, + "learning_rate": 1.9809195947152458e-05, + "loss": 0.2282, + "step": 1778 + }, + { + "epoch": 0.09, + "grad_norm": 1.0793162103481402, + "learning_rate": 1.9808875620589667e-05, + "loss": 0.2234, + "step": 1779 + }, + { + "epoch": 0.09, + "grad_norm": 1.1238385198046674, + "learning_rate": 1.980855502796057e-05, + "loss": 0.2174, + "step": 1780 + }, + { + "epoch": 0.09, + "grad_norm": 1.2699355390187514, + "learning_rate": 1.9808234169273864e-05, + "loss": 0.2288, + "step": 1781 + }, + { + "epoch": 0.09, + "grad_norm": 1.011158926633347, + "learning_rate": 1.9807913044538252e-05, + "loss": 0.2485, + "step": 1782 + }, + { + "epoch": 0.09, + "grad_norm": 1.4074623352058435, + "learning_rate": 1.9807591653762447e-05, + "loss": 0.2307, + "step": 1783 + }, + { + "epoch": 0.09, + "grad_norm": 0.9928912356239473, + "learning_rate": 1.980726999695516e-05, + "loss": 0.2675, + "step": 1784 + }, + { + "epoch": 0.09, + "grad_norm": 1.0184293111623297, + "learning_rate": 1.9806948074125123e-05, + "loss": 0.2908, + "step": 1785 + }, + { + "epoch": 0.09, + "grad_norm": 1.1122490801754585, + "learning_rate": 1.9806625885281065e-05, + "loss": 0.2621, + "step": 1786 + }, + { + "epoch": 0.09, + "grad_norm": 1.1598479098892676, + "learning_rate": 1.9806303430431727e-05, + "loss": 0.2441, + "step": 1787 + }, + { + "epoch": 0.09, + "grad_norm": 0.979354192254303, + "learning_rate": 1.9805980709585855e-05, + "loss": 0.2375, + "step": 1788 + }, + { + "epoch": 0.09, + "grad_norm": 1.267730797707709, + "learning_rate": 1.9805657722752202e-05, + "loss": 0.2284, + "step": 1789 + }, + { + "epoch": 0.09, + "grad_norm": 0.9952643727050997, + "learning_rate": 1.980533446993953e-05, + "loss": 0.2453, + "step": 1790 + }, + { + "epoch": 0.09, + "grad_norm": 1.0233808075949027, + "learning_rate": 1.9805010951156605e-05, + "loss": 0.2396, + "step": 1791 + }, + { + "epoch": 0.09, + "grad_norm": 1.024484392071274, + "learning_rate": 1.9804687166412204e-05, + "loss": 0.2423, + "step": 1792 + }, + { + "epoch": 0.09, + "grad_norm": 1.1779536450407442, + "learning_rate": 1.980436311571511e-05, + "loss": 0.2506, + "step": 1793 + }, + { + "epoch": 0.09, + "grad_norm": 1.110809751968605, + "learning_rate": 1.9804038799074114e-05, + "loss": 0.2379, + "step": 1794 + }, + { + "epoch": 0.09, + "grad_norm": 0.911356847281592, + "learning_rate": 1.9803714216498013e-05, + "loss": 0.2459, + "step": 1795 + }, + { + "epoch": 0.09, + "grad_norm": 1.2508497514282289, + "learning_rate": 1.9803389367995606e-05, + "loss": 0.2559, + "step": 1796 + }, + { + "epoch": 0.09, + "grad_norm": 1.2547143824197837, + "learning_rate": 1.9803064253575713e-05, + "loss": 0.2678, + "step": 1797 + }, + { + "epoch": 0.09, + "grad_norm": 1.1171374299241046, + "learning_rate": 1.9802738873247146e-05, + "loss": 0.2373, + "step": 1798 + }, + { + "epoch": 0.09, + "grad_norm": 1.3682294514056483, + "learning_rate": 1.9802413227018732e-05, + "loss": 0.2634, + "step": 1799 + }, + { + "epoch": 0.09, + "grad_norm": 1.6593452822995967, + "learning_rate": 1.980208731489931e-05, + "loss": 0.2565, + "step": 1800 + }, + { + "epoch": 0.09, + "grad_norm": 1.33397124612498, + "learning_rate": 1.9801761136897713e-05, + "loss": 0.2465, + "step": 1801 + }, + { + "epoch": 0.09, + "grad_norm": 1.2179198485821112, + "learning_rate": 1.980143469302279e-05, + "loss": 0.251, + "step": 1802 + }, + { + "epoch": 0.09, + "grad_norm": 1.248443571251299, + "learning_rate": 1.9801107983283403e-05, + "loss": 0.2264, + "step": 1803 + }, + { + "epoch": 0.09, + "grad_norm": 1.2624127344026586, + "learning_rate": 1.9800781007688403e-05, + "loss": 0.2419, + "step": 1804 + }, + { + "epoch": 0.09, + "grad_norm": 1.0773293999784925, + "learning_rate": 1.9800453766246668e-05, + "loss": 0.2478, + "step": 1805 + }, + { + "epoch": 0.09, + "grad_norm": 1.2539597583340516, + "learning_rate": 1.980012625896707e-05, + "loss": 0.242, + "step": 1806 + }, + { + "epoch": 0.09, + "grad_norm": 1.0523566435986238, + "learning_rate": 1.979979848585849e-05, + "loss": 0.2438, + "step": 1807 + }, + { + "epoch": 0.09, + "grad_norm": 1.2783462412732363, + "learning_rate": 1.9799470446929827e-05, + "loss": 0.2322, + "step": 1808 + }, + { + "epoch": 0.09, + "grad_norm": 2.501195818234163, + "learning_rate": 1.9799142142189974e-05, + "loss": 0.2366, + "step": 1809 + }, + { + "epoch": 0.09, + "grad_norm": 1.0351472253286227, + "learning_rate": 1.9798813571647835e-05, + "loss": 0.2475, + "step": 1810 + }, + { + "epoch": 0.09, + "grad_norm": 1.141466973878634, + "learning_rate": 1.9798484735312327e-05, + "loss": 0.224, + "step": 1811 + }, + { + "epoch": 0.09, + "grad_norm": 1.3377399022823369, + "learning_rate": 1.9798155633192368e-05, + "loss": 0.2341, + "step": 1812 + }, + { + "epoch": 0.09, + "grad_norm": 1.4789245607733694, + "learning_rate": 1.979782626529688e-05, + "loss": 0.2666, + "step": 1813 + }, + { + "epoch": 0.09, + "grad_norm": 1.7178934408624136, + "learning_rate": 1.9797496631634804e-05, + "loss": 0.2506, + "step": 1814 + }, + { + "epoch": 0.09, + "grad_norm": 0.9579183307072763, + "learning_rate": 1.9797166732215078e-05, + "loss": 0.2334, + "step": 1815 + }, + { + "epoch": 0.09, + "grad_norm": 1.1702378825834747, + "learning_rate": 1.979683656704665e-05, + "loss": 0.2367, + "step": 1816 + }, + { + "epoch": 0.09, + "grad_norm": 1.2623264964220406, + "learning_rate": 1.979650613613848e-05, + "loss": 0.2264, + "step": 1817 + }, + { + "epoch": 0.09, + "grad_norm": 1.1794467571917981, + "learning_rate": 1.979617543949952e-05, + "loss": 0.2339, + "step": 1818 + }, + { + "epoch": 0.09, + "grad_norm": 0.91157451744252, + "learning_rate": 1.9795844477138756e-05, + "loss": 0.225, + "step": 1819 + }, + { + "epoch": 0.09, + "grad_norm": 1.2309863507665357, + "learning_rate": 1.9795513249065155e-05, + "loss": 0.253, + "step": 1820 + }, + { + "epoch": 0.09, + "grad_norm": 1.1783442633169707, + "learning_rate": 1.97951817552877e-05, + "loss": 0.2379, + "step": 1821 + }, + { + "epoch": 0.09, + "grad_norm": 1.094279223019567, + "learning_rate": 1.9794849995815392e-05, + "loss": 0.2457, + "step": 1822 + }, + { + "epoch": 0.09, + "grad_norm": 1.3138667208821027, + "learning_rate": 1.979451797065722e-05, + "loss": 0.259, + "step": 1823 + }, + { + "epoch": 0.09, + "grad_norm": 1.2491422875129075, + "learning_rate": 1.97941856798222e-05, + "loss": 0.2364, + "step": 1824 + }, + { + "epoch": 0.09, + "grad_norm": 1.186450281638106, + "learning_rate": 1.979385312331934e-05, + "loss": 0.2312, + "step": 1825 + }, + { + "epoch": 0.09, + "grad_norm": 1.1194218673747123, + "learning_rate": 1.9793520301157656e-05, + "loss": 0.2473, + "step": 1826 + }, + { + "epoch": 0.09, + "grad_norm": 1.5625455110596913, + "learning_rate": 1.9793187213346183e-05, + "loss": 0.2544, + "step": 1827 + }, + { + "epoch": 0.09, + "grad_norm": 1.1394717462108623, + "learning_rate": 1.9792853859893953e-05, + "loss": 0.2318, + "step": 1828 + }, + { + "epoch": 0.09, + "grad_norm": 1.979714128320049, + "learning_rate": 1.9792520240810012e-05, + "loss": 0.2555, + "step": 1829 + }, + { + "epoch": 0.09, + "grad_norm": 1.0604405691060503, + "learning_rate": 1.9792186356103403e-05, + "loss": 0.2421, + "step": 1830 + }, + { + "epoch": 0.09, + "grad_norm": 0.9962422834651873, + "learning_rate": 1.9791852205783186e-05, + "loss": 0.2274, + "step": 1831 + }, + { + "epoch": 0.09, + "grad_norm": 1.0115858456203548, + "learning_rate": 1.9791517789858428e-05, + "loss": 0.218, + "step": 1832 + }, + { + "epoch": 0.09, + "grad_norm": 1.1584982837673712, + "learning_rate": 1.9791183108338195e-05, + "loss": 0.2291, + "step": 1833 + }, + { + "epoch": 0.09, + "grad_norm": 1.4205913603650355, + "learning_rate": 1.9790848161231568e-05, + "loss": 0.2258, + "step": 1834 + }, + { + "epoch": 0.09, + "grad_norm": 0.9593511399242401, + "learning_rate": 1.9790512948547633e-05, + "loss": 0.2421, + "step": 1835 + }, + { + "epoch": 0.09, + "grad_norm": 1.205982035714139, + "learning_rate": 1.9790177470295474e-05, + "loss": 0.2452, + "step": 1836 + }, + { + "epoch": 0.09, + "grad_norm": 1.0021327684946961, + "learning_rate": 1.9789841726484208e-05, + "loss": 0.2571, + "step": 1837 + }, + { + "epoch": 0.09, + "grad_norm": 1.1537386848613858, + "learning_rate": 1.9789505717122926e-05, + "loss": 0.2639, + "step": 1838 + }, + { + "epoch": 0.09, + "grad_norm": 1.251241669990851, + "learning_rate": 1.978916944222075e-05, + "loss": 0.2597, + "step": 1839 + }, + { + "epoch": 0.09, + "grad_norm": 1.3627173950389766, + "learning_rate": 1.97888329017868e-05, + "loss": 0.2549, + "step": 1840 + }, + { + "epoch": 0.09, + "grad_norm": 1.1280632425903117, + "learning_rate": 1.9788496095830205e-05, + "loss": 0.2427, + "step": 1841 + }, + { + "epoch": 0.09, + "grad_norm": 1.2424096456708325, + "learning_rate": 1.97881590243601e-05, + "loss": 0.2507, + "step": 1842 + }, + { + "epoch": 0.09, + "grad_norm": 1.1268147690233024, + "learning_rate": 1.978782168738563e-05, + "loss": 0.2313, + "step": 1843 + }, + { + "epoch": 0.09, + "grad_norm": 0.9731217683506375, + "learning_rate": 1.9787484084915943e-05, + "loss": 0.1961, + "step": 1844 + }, + { + "epoch": 0.09, + "grad_norm": 0.9932726226384986, + "learning_rate": 1.9787146216960196e-05, + "loss": 0.2704, + "step": 1845 + }, + { + "epoch": 0.09, + "grad_norm": 1.3258885680331116, + "learning_rate": 1.978680808352756e-05, + "loss": 0.2223, + "step": 1846 + }, + { + "epoch": 0.09, + "grad_norm": 1.179125947707762, + "learning_rate": 1.9786469684627193e-05, + "loss": 0.2233, + "step": 1847 + }, + { + "epoch": 0.09, + "grad_norm": 1.181157569743411, + "learning_rate": 1.978613102026829e-05, + "loss": 0.2614, + "step": 1848 + }, + { + "epoch": 0.09, + "grad_norm": 1.2465357791351852, + "learning_rate": 1.9785792090460026e-05, + "loss": 0.1947, + "step": 1849 + }, + { + "epoch": 0.09, + "grad_norm": 1.1762006333045996, + "learning_rate": 1.9785452895211606e-05, + "loss": 0.2488, + "step": 1850 + }, + { + "epoch": 0.09, + "grad_norm": 0.9732313500927374, + "learning_rate": 1.978511343453222e-05, + "loss": 0.2208, + "step": 1851 + }, + { + "epoch": 0.09, + "grad_norm": 2.930795569689893, + "learning_rate": 1.9784773708431076e-05, + "loss": 0.2319, + "step": 1852 + }, + { + "epoch": 0.09, + "grad_norm": 1.1402431579192085, + "learning_rate": 1.9784433716917397e-05, + "loss": 0.2394, + "step": 1853 + }, + { + "epoch": 0.09, + "grad_norm": 1.0338244690058258, + "learning_rate": 1.97840934600004e-05, + "loss": 0.2477, + "step": 1854 + }, + { + "epoch": 0.09, + "grad_norm": 1.6535929415927046, + "learning_rate": 1.9783752937689312e-05, + "loss": 0.2408, + "step": 1855 + }, + { + "epoch": 0.09, + "grad_norm": 1.370824985835338, + "learning_rate": 1.9783412149993374e-05, + "loss": 0.2349, + "step": 1856 + }, + { + "epoch": 0.09, + "grad_norm": 1.3956801985306262, + "learning_rate": 1.978307109692183e-05, + "loss": 0.2534, + "step": 1857 + }, + { + "epoch": 0.09, + "grad_norm": 1.3940946373038978, + "learning_rate": 1.978272977848393e-05, + "loss": 0.286, + "step": 1858 + }, + { + "epoch": 0.09, + "grad_norm": 1.0987035112902765, + "learning_rate": 1.9782388194688933e-05, + "loss": 0.2779, + "step": 1859 + }, + { + "epoch": 0.09, + "grad_norm": 1.1882153135309836, + "learning_rate": 1.9782046345546102e-05, + "loss": 0.2235, + "step": 1860 + }, + { + "epoch": 0.09, + "grad_norm": 1.2008243551349194, + "learning_rate": 1.9781704231064715e-05, + "loss": 0.2693, + "step": 1861 + }, + { + "epoch": 0.09, + "grad_norm": 1.1751556702307224, + "learning_rate": 1.9781361851254044e-05, + "loss": 0.2543, + "step": 1862 + }, + { + "epoch": 0.09, + "grad_norm": 1.1081469900399938, + "learning_rate": 1.9781019206123382e-05, + "loss": 0.2512, + "step": 1863 + }, + { + "epoch": 0.09, + "grad_norm": 0.9908211347171663, + "learning_rate": 1.978067629568202e-05, + "loss": 0.2035, + "step": 1864 + }, + { + "epoch": 0.09, + "grad_norm": 1.2227189853228269, + "learning_rate": 1.9780333119939264e-05, + "loss": 0.235, + "step": 1865 + }, + { + "epoch": 0.09, + "grad_norm": 1.2011150507746902, + "learning_rate": 1.9779989678904416e-05, + "loss": 0.2399, + "step": 1866 + }, + { + "epoch": 0.09, + "grad_norm": 1.3756169840284638, + "learning_rate": 1.97796459725868e-05, + "loss": 0.2533, + "step": 1867 + }, + { + "epoch": 0.09, + "grad_norm": 2.7413831052381767, + "learning_rate": 1.9779302000995732e-05, + "loss": 0.2174, + "step": 1868 + }, + { + "epoch": 0.1, + "grad_norm": 1.2214321925490699, + "learning_rate": 1.9778957764140545e-05, + "loss": 0.2509, + "step": 1869 + }, + { + "epoch": 0.1, + "grad_norm": 2.048946823233343, + "learning_rate": 1.9778613262030577e-05, + "loss": 0.2396, + "step": 1870 + }, + { + "epoch": 0.1, + "grad_norm": 1.0703035941982966, + "learning_rate": 1.9778268494675172e-05, + "loss": 0.2352, + "step": 1871 + }, + { + "epoch": 0.1, + "grad_norm": 2.010644102322746, + "learning_rate": 1.977792346208368e-05, + "loss": 0.2269, + "step": 1872 + }, + { + "epoch": 0.1, + "grad_norm": 2.2422707614304045, + "learning_rate": 1.9777578164265464e-05, + "loss": 0.2476, + "step": 1873 + }, + { + "epoch": 0.1, + "grad_norm": 1.2907222314859268, + "learning_rate": 1.9777232601229887e-05, + "loss": 0.2204, + "step": 1874 + }, + { + "epoch": 0.1, + "grad_norm": 1.3521410331749686, + "learning_rate": 1.9776886772986325e-05, + "loss": 0.2366, + "step": 1875 + }, + { + "epoch": 0.1, + "grad_norm": 1.0863288578749053, + "learning_rate": 1.9776540679544154e-05, + "loss": 0.2297, + "step": 1876 + }, + { + "epoch": 0.1, + "grad_norm": 1.0241804063042204, + "learning_rate": 1.977619432091277e-05, + "loss": 0.2277, + "step": 1877 + }, + { + "epoch": 0.1, + "grad_norm": 1.359207853748398, + "learning_rate": 1.977584769710156e-05, + "loss": 0.2602, + "step": 1878 + }, + { + "epoch": 0.1, + "grad_norm": 1.488727423100929, + "learning_rate": 1.9775500808119926e-05, + "loss": 0.2123, + "step": 1879 + }, + { + "epoch": 0.1, + "grad_norm": 0.9310924451888012, + "learning_rate": 1.9775153653977284e-05, + "loss": 0.2347, + "step": 1880 + }, + { + "epoch": 0.1, + "grad_norm": 1.0882134751609247, + "learning_rate": 1.9774806234683047e-05, + "loss": 0.2168, + "step": 1881 + }, + { + "epoch": 0.1, + "grad_norm": 1.1032916586088626, + "learning_rate": 1.9774458550246636e-05, + "loss": 0.2346, + "step": 1882 + }, + { + "epoch": 0.1, + "grad_norm": 0.8964220837477248, + "learning_rate": 1.977411060067749e-05, + "loss": 0.2192, + "step": 1883 + }, + { + "epoch": 0.1, + "grad_norm": 1.2271518408134103, + "learning_rate": 1.977376238598504e-05, + "loss": 0.2207, + "step": 1884 + }, + { + "epoch": 0.1, + "grad_norm": 1.2657120013958592, + "learning_rate": 1.977341390617873e-05, + "loss": 0.2216, + "step": 1885 + }, + { + "epoch": 0.1, + "grad_norm": 1.1240450447057986, + "learning_rate": 1.9773065161268015e-05, + "loss": 0.2152, + "step": 1886 + }, + { + "epoch": 0.1, + "grad_norm": 1.140021817870832, + "learning_rate": 1.977271615126236e-05, + "loss": 0.2405, + "step": 1887 + }, + { + "epoch": 0.1, + "grad_norm": 0.9646328044306108, + "learning_rate": 1.9772366876171224e-05, + "loss": 0.2214, + "step": 1888 + }, + { + "epoch": 0.1, + "grad_norm": 1.0229657136088213, + "learning_rate": 1.9772017336004085e-05, + "loss": 0.2486, + "step": 1889 + }, + { + "epoch": 0.1, + "grad_norm": 1.0801181204271537, + "learning_rate": 1.9771667530770427e-05, + "loss": 0.2413, + "step": 1890 + }, + { + "epoch": 0.1, + "grad_norm": 1.0024092708899657, + "learning_rate": 1.9771317460479733e-05, + "loss": 0.2554, + "step": 1891 + }, + { + "epoch": 0.1, + "grad_norm": 0.9461916604616549, + "learning_rate": 1.9770967125141502e-05, + "loss": 0.2465, + "step": 1892 + }, + { + "epoch": 0.1, + "grad_norm": 1.4241403509851787, + "learning_rate": 1.9770616524765236e-05, + "loss": 0.2618, + "step": 1893 + }, + { + "epoch": 0.1, + "grad_norm": 1.2192273387499462, + "learning_rate": 1.9770265659360445e-05, + "loss": 0.2477, + "step": 1894 + }, + { + "epoch": 0.1, + "grad_norm": 0.9149517263311271, + "learning_rate": 1.9769914528936646e-05, + "loss": 0.2203, + "step": 1895 + }, + { + "epoch": 0.1, + "grad_norm": 0.9640595665851582, + "learning_rate": 1.976956313350336e-05, + "loss": 0.255, + "step": 1896 + }, + { + "epoch": 0.1, + "grad_norm": 1.2296809453079092, + "learning_rate": 1.9769211473070124e-05, + "loss": 0.2257, + "step": 1897 + }, + { + "epoch": 0.1, + "grad_norm": 1.0503214717220533, + "learning_rate": 1.9768859547646476e-05, + "loss": 0.245, + "step": 1898 + }, + { + "epoch": 0.1, + "grad_norm": 1.0261216658899286, + "learning_rate": 1.976850735724196e-05, + "loss": 0.2228, + "step": 1899 + }, + { + "epoch": 0.1, + "grad_norm": 0.8879896992019953, + "learning_rate": 1.9768154901866136e-05, + "loss": 0.2362, + "step": 1900 + }, + { + "epoch": 0.1, + "grad_norm": 0.944882194134527, + "learning_rate": 1.9767802181528552e-05, + "loss": 0.2428, + "step": 1901 + }, + { + "epoch": 0.1, + "grad_norm": 1.4309989587560117, + "learning_rate": 1.9767449196238785e-05, + "loss": 0.2513, + "step": 1902 + }, + { + "epoch": 0.1, + "grad_norm": 1.1054306441049362, + "learning_rate": 1.9767095946006405e-05, + "loss": 0.2416, + "step": 1903 + }, + { + "epoch": 0.1, + "grad_norm": 0.9790156761267768, + "learning_rate": 1.9766742430840998e-05, + "loss": 0.2196, + "step": 1904 + }, + { + "epoch": 0.1, + "grad_norm": 1.225542879755533, + "learning_rate": 1.9766388650752152e-05, + "loss": 0.2531, + "step": 1905 + }, + { + "epoch": 0.1, + "grad_norm": 1.1451682777421768, + "learning_rate": 1.976603460574946e-05, + "loss": 0.2252, + "step": 1906 + }, + { + "epoch": 0.1, + "grad_norm": 1.1697641021216751, + "learning_rate": 1.9765680295842525e-05, + "loss": 0.2374, + "step": 1907 + }, + { + "epoch": 0.1, + "grad_norm": 1.3119766391360652, + "learning_rate": 1.9765325721040964e-05, + "loss": 0.2328, + "step": 1908 + }, + { + "epoch": 0.1, + "grad_norm": 0.8776972094573054, + "learning_rate": 1.976497088135439e-05, + "loss": 0.2255, + "step": 1909 + }, + { + "epoch": 0.1, + "grad_norm": 2.901221087412436, + "learning_rate": 1.976461577679243e-05, + "loss": 0.2538, + "step": 1910 + }, + { + "epoch": 0.1, + "grad_norm": 0.9364330186554849, + "learning_rate": 1.9764260407364714e-05, + "loss": 0.2266, + "step": 1911 + }, + { + "epoch": 0.1, + "grad_norm": 0.8343122185679239, + "learning_rate": 1.9763904773080886e-05, + "loss": 0.241, + "step": 1912 + }, + { + "epoch": 0.1, + "grad_norm": 1.5044879595592187, + "learning_rate": 1.9763548873950586e-05, + "loss": 0.2721, + "step": 1913 + }, + { + "epoch": 0.1, + "grad_norm": 1.0495765131484909, + "learning_rate": 1.9763192709983473e-05, + "loss": 0.2417, + "step": 1914 + }, + { + "epoch": 0.1, + "grad_norm": 1.05508985163837, + "learning_rate": 1.9762836281189207e-05, + "loss": 0.2227, + "step": 1915 + }, + { + "epoch": 0.1, + "grad_norm": 1.0155022843798927, + "learning_rate": 1.9762479587577457e-05, + "loss": 0.2425, + "step": 1916 + }, + { + "epoch": 0.1, + "grad_norm": 1.0967121155196264, + "learning_rate": 1.976212262915789e-05, + "loss": 0.2335, + "step": 1917 + }, + { + "epoch": 0.1, + "grad_norm": 1.37545605462818, + "learning_rate": 1.9761765405940203e-05, + "loss": 0.2407, + "step": 1918 + }, + { + "epoch": 0.1, + "grad_norm": 0.9818909497523616, + "learning_rate": 1.9761407917934073e-05, + "loss": 0.2493, + "step": 1919 + }, + { + "epoch": 0.1, + "grad_norm": 1.137871337305637, + "learning_rate": 1.9761050165149208e-05, + "loss": 0.246, + "step": 1920 + }, + { + "epoch": 0.1, + "grad_norm": 0.9860186561324186, + "learning_rate": 1.9760692147595298e-05, + "loss": 0.243, + "step": 1921 + }, + { + "epoch": 0.1, + "grad_norm": 1.0181895969287278, + "learning_rate": 1.9760333865282067e-05, + "loss": 0.2473, + "step": 1922 + }, + { + "epoch": 0.1, + "grad_norm": 0.8406082864526969, + "learning_rate": 1.975997531821923e-05, + "loss": 0.2075, + "step": 1923 + }, + { + "epoch": 0.1, + "grad_norm": 1.9227285386867792, + "learning_rate": 1.9759616506416506e-05, + "loss": 0.2367, + "step": 1924 + }, + { + "epoch": 0.1, + "grad_norm": 1.5616621406325664, + "learning_rate": 1.975925742988364e-05, + "loss": 0.2383, + "step": 1925 + }, + { + "epoch": 0.1, + "grad_norm": 1.3935247996994191, + "learning_rate": 1.975889808863036e-05, + "loss": 0.2452, + "step": 1926 + }, + { + "epoch": 0.1, + "grad_norm": 1.2943208678791482, + "learning_rate": 1.975853848266642e-05, + "loss": 0.2407, + "step": 1927 + }, + { + "epoch": 0.1, + "grad_norm": 1.450294411426727, + "learning_rate": 1.975817861200157e-05, + "loss": 0.2277, + "step": 1928 + }, + { + "epoch": 0.1, + "grad_norm": 0.9018073347670972, + "learning_rate": 1.9757818476645573e-05, + "loss": 0.2498, + "step": 1929 + }, + { + "epoch": 0.1, + "grad_norm": 1.0314892631011348, + "learning_rate": 1.9757458076608204e-05, + "loss": 0.2412, + "step": 1930 + }, + { + "epoch": 0.1, + "grad_norm": 1.1697931974945825, + "learning_rate": 1.975709741189923e-05, + "loss": 0.2312, + "step": 1931 + }, + { + "epoch": 0.1, + "grad_norm": 0.9365982325740557, + "learning_rate": 1.975673648252844e-05, + "loss": 0.2147, + "step": 1932 + }, + { + "epoch": 0.1, + "grad_norm": 0.8698438621254598, + "learning_rate": 1.975637528850562e-05, + "loss": 0.2396, + "step": 1933 + }, + { + "epoch": 0.1, + "grad_norm": 1.1560916597632185, + "learning_rate": 1.9756013829840568e-05, + "loss": 0.2024, + "step": 1934 + }, + { + "epoch": 0.1, + "grad_norm": 0.8347486142298968, + "learning_rate": 1.9755652106543094e-05, + "loss": 0.2421, + "step": 1935 + }, + { + "epoch": 0.1, + "grad_norm": 0.920153099028764, + "learning_rate": 1.9755290118623e-05, + "loss": 0.2228, + "step": 1936 + }, + { + "epoch": 0.1, + "grad_norm": 0.9913299797122175, + "learning_rate": 1.9754927866090115e-05, + "loss": 0.251, + "step": 1937 + }, + { + "epoch": 0.1, + "grad_norm": 1.0404221817815027, + "learning_rate": 1.975456534895426e-05, + "loss": 0.2099, + "step": 1938 + }, + { + "epoch": 0.1, + "grad_norm": 1.115116727740608, + "learning_rate": 1.975420256722527e-05, + "loss": 0.248, + "step": 1939 + }, + { + "epoch": 0.1, + "grad_norm": 1.1369950824968853, + "learning_rate": 1.9753839520912984e-05, + "loss": 0.2212, + "step": 1940 + }, + { + "epoch": 0.1, + "grad_norm": 1.4411550000757647, + "learning_rate": 1.9753476210027248e-05, + "loss": 0.204, + "step": 1941 + }, + { + "epoch": 0.1, + "grad_norm": 0.877079500076538, + "learning_rate": 1.975311263457792e-05, + "loss": 0.236, + "step": 1942 + }, + { + "epoch": 0.1, + "grad_norm": 1.2958517471483217, + "learning_rate": 1.9752748794574858e-05, + "loss": 0.2547, + "step": 1943 + }, + { + "epoch": 0.1, + "grad_norm": 1.102209497560389, + "learning_rate": 1.9752384690027937e-05, + "loss": 0.2082, + "step": 1944 + }, + { + "epoch": 0.1, + "grad_norm": 1.0121374445198599, + "learning_rate": 1.975202032094703e-05, + "loss": 0.2849, + "step": 1945 + }, + { + "epoch": 0.1, + "grad_norm": 0.8864239305753996, + "learning_rate": 1.9751655687342022e-05, + "loss": 0.2282, + "step": 1946 + }, + { + "epoch": 0.1, + "grad_norm": 0.9967365811704525, + "learning_rate": 1.9751290789222804e-05, + "loss": 0.2322, + "step": 1947 + }, + { + "epoch": 0.1, + "grad_norm": 1.1234867484412785, + "learning_rate": 1.975092562659927e-05, + "loss": 0.2789, + "step": 1948 + }, + { + "epoch": 0.1, + "grad_norm": 0.8313426797342862, + "learning_rate": 1.9750560199481325e-05, + "loss": 0.218, + "step": 1949 + }, + { + "epoch": 0.1, + "grad_norm": 1.0417507918529505, + "learning_rate": 1.975019450787889e-05, + "loss": 0.2595, + "step": 1950 + }, + { + "epoch": 0.1, + "grad_norm": 1.1414670268025355, + "learning_rate": 1.9749828551801875e-05, + "loss": 0.2483, + "step": 1951 + }, + { + "epoch": 0.1, + "grad_norm": 1.0376239315288212, + "learning_rate": 1.974946233126021e-05, + "loss": 0.231, + "step": 1952 + }, + { + "epoch": 0.1, + "grad_norm": 1.1091441540992661, + "learning_rate": 1.9749095846263828e-05, + "loss": 0.2234, + "step": 1953 + }, + { + "epoch": 0.1, + "grad_norm": 1.087702774894137, + "learning_rate": 1.974872909682267e-05, + "loss": 0.2496, + "step": 1954 + }, + { + "epoch": 0.1, + "grad_norm": 1.0600141655170918, + "learning_rate": 1.974836208294669e-05, + "loss": 0.2432, + "step": 1955 + }, + { + "epoch": 0.1, + "grad_norm": 1.3603611052790359, + "learning_rate": 1.9747994804645835e-05, + "loss": 0.2283, + "step": 1956 + }, + { + "epoch": 0.1, + "grad_norm": 1.0742920942448784, + "learning_rate": 1.9747627261930066e-05, + "loss": 0.2387, + "step": 1957 + }, + { + "epoch": 0.1, + "grad_norm": 1.1454768361187495, + "learning_rate": 1.974725945480936e-05, + "loss": 0.2743, + "step": 1958 + }, + { + "epoch": 0.1, + "grad_norm": 1.033670082913112, + "learning_rate": 1.9746891383293692e-05, + "loss": 0.2661, + "step": 1959 + }, + { + "epoch": 0.1, + "grad_norm": 2.2940527105563957, + "learning_rate": 1.9746523047393046e-05, + "loss": 0.2791, + "step": 1960 + }, + { + "epoch": 0.1, + "grad_norm": 0.8717314285910045, + "learning_rate": 1.974615444711741e-05, + "loss": 0.2261, + "step": 1961 + }, + { + "epoch": 0.1, + "grad_norm": 1.1675295248221975, + "learning_rate": 1.974578558247678e-05, + "loss": 0.2474, + "step": 1962 + }, + { + "epoch": 0.1, + "grad_norm": 1.1768908708954267, + "learning_rate": 1.9745416453481168e-05, + "loss": 0.2453, + "step": 1963 + }, + { + "epoch": 0.1, + "grad_norm": 1.0965930944631577, + "learning_rate": 1.974504706014059e-05, + "loss": 0.2422, + "step": 1964 + }, + { + "epoch": 0.1, + "grad_norm": 1.3209827499231572, + "learning_rate": 1.9744677402465053e-05, + "loss": 0.2617, + "step": 1965 + }, + { + "epoch": 0.1, + "grad_norm": 1.0704828420643713, + "learning_rate": 1.9744307480464595e-05, + "loss": 0.24, + "step": 1966 + }, + { + "epoch": 0.1, + "grad_norm": 1.000209014735969, + "learning_rate": 1.9743937294149244e-05, + "loss": 0.2334, + "step": 1967 + }, + { + "epoch": 0.1, + "grad_norm": 1.2831002301749044, + "learning_rate": 1.9743566843529045e-05, + "loss": 0.2512, + "step": 1968 + }, + { + "epoch": 0.1, + "grad_norm": 1.1344015282589346, + "learning_rate": 1.9743196128614045e-05, + "loss": 0.2488, + "step": 1969 + }, + { + "epoch": 0.1, + "grad_norm": 0.9536869918552521, + "learning_rate": 1.97428251494143e-05, + "loss": 0.2471, + "step": 1970 + }, + { + "epoch": 0.1, + "grad_norm": 1.2110892765554457, + "learning_rate": 1.974245390593987e-05, + "loss": 0.2298, + "step": 1971 + }, + { + "epoch": 0.1, + "grad_norm": 1.1996062868964599, + "learning_rate": 1.974208239820083e-05, + "loss": 0.2441, + "step": 1972 + }, + { + "epoch": 0.1, + "grad_norm": 0.8974209218869812, + "learning_rate": 1.9741710626207255e-05, + "loss": 0.2358, + "step": 1973 + }, + { + "epoch": 0.1, + "grad_norm": 1.4554750921379593, + "learning_rate": 1.9741338589969226e-05, + "loss": 0.263, + "step": 1974 + }, + { + "epoch": 0.1, + "grad_norm": 1.0392813150984546, + "learning_rate": 1.9740966289496844e-05, + "loss": 0.2458, + "step": 1975 + }, + { + "epoch": 0.1, + "grad_norm": 0.9963122217447735, + "learning_rate": 1.9740593724800194e-05, + "loss": 0.26, + "step": 1976 + }, + { + "epoch": 0.1, + "grad_norm": 0.9737554516896297, + "learning_rate": 1.9740220895889393e-05, + "loss": 0.2259, + "step": 1977 + }, + { + "epoch": 0.1, + "grad_norm": 0.8923416935134135, + "learning_rate": 1.973984780277455e-05, + "loss": 0.2476, + "step": 1978 + }, + { + "epoch": 0.1, + "grad_norm": 1.172847400701586, + "learning_rate": 1.9739474445465783e-05, + "loss": 0.2455, + "step": 1979 + }, + { + "epoch": 0.1, + "grad_norm": 0.9256808107004492, + "learning_rate": 1.9739100823973226e-05, + "loss": 0.2285, + "step": 1980 + }, + { + "epoch": 0.1, + "grad_norm": 1.5604533353401182, + "learning_rate": 1.9738726938307e-05, + "loss": 0.2567, + "step": 1981 + }, + { + "epoch": 0.1, + "grad_norm": 0.8144426438895492, + "learning_rate": 1.9738352788477268e-05, + "loss": 0.2263, + "step": 1982 + }, + { + "epoch": 0.1, + "grad_norm": 1.0438242662359962, + "learning_rate": 1.9737978374494157e-05, + "loss": 0.2192, + "step": 1983 + }, + { + "epoch": 0.1, + "grad_norm": 1.3678826201890977, + "learning_rate": 1.9737603696367836e-05, + "loss": 0.2379, + "step": 1984 + }, + { + "epoch": 0.1, + "grad_norm": 0.909867140325504, + "learning_rate": 1.9737228754108467e-05, + "loss": 0.2626, + "step": 1985 + }, + { + "epoch": 0.1, + "grad_norm": 1.0062470083627526, + "learning_rate": 1.9736853547726214e-05, + "loss": 0.2115, + "step": 1986 + }, + { + "epoch": 0.1, + "grad_norm": 0.9377825018993832, + "learning_rate": 1.973647807723126e-05, + "loss": 0.255, + "step": 1987 + }, + { + "epoch": 0.1, + "grad_norm": 1.0494091703851158, + "learning_rate": 1.973610234263379e-05, + "loss": 0.2518, + "step": 1988 + }, + { + "epoch": 0.1, + "grad_norm": 1.1987444586881062, + "learning_rate": 1.9735726343943992e-05, + "loss": 0.2384, + "step": 1989 + }, + { + "epoch": 0.1, + "grad_norm": 1.02642903480859, + "learning_rate": 1.973535008117207e-05, + "loss": 0.267, + "step": 1990 + }, + { + "epoch": 0.1, + "grad_norm": 0.9161340787331664, + "learning_rate": 1.9734973554328223e-05, + "loss": 0.2229, + "step": 1991 + }, + { + "epoch": 0.1, + "grad_norm": 1.8948289353381529, + "learning_rate": 1.9734596763422672e-05, + "loss": 0.2656, + "step": 1992 + }, + { + "epoch": 0.1, + "grad_norm": 0.918415437776545, + "learning_rate": 1.973421970846563e-05, + "loss": 0.2379, + "step": 1993 + }, + { + "epoch": 0.1, + "grad_norm": 1.9143249901978743, + "learning_rate": 1.9733842389467334e-05, + "loss": 0.254, + "step": 1994 + }, + { + "epoch": 0.1, + "grad_norm": 0.8228313878161293, + "learning_rate": 1.9733464806438007e-05, + "loss": 0.2356, + "step": 1995 + }, + { + "epoch": 0.1, + "grad_norm": 0.9175875302693642, + "learning_rate": 1.97330869593879e-05, + "loss": 0.2135, + "step": 1996 + }, + { + "epoch": 0.1, + "grad_norm": 1.1814645469727503, + "learning_rate": 1.973270884832726e-05, + "loss": 0.2161, + "step": 1997 + }, + { + "epoch": 0.1, + "grad_norm": 0.9231461656642026, + "learning_rate": 1.9732330473266347e-05, + "loss": 0.2537, + "step": 1998 + }, + { + "epoch": 0.1, + "grad_norm": 0.9796736831409919, + "learning_rate": 1.9731951834215414e-05, + "loss": 0.2183, + "step": 1999 + }, + { + "epoch": 0.1, + "grad_norm": 0.9496265829684779, + "learning_rate": 1.973157293118474e-05, + "loss": 0.2479, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 0.9231151077653849, + "learning_rate": 1.9731193764184603e-05, + "loss": 0.222, + "step": 2001 + }, + { + "epoch": 0.1, + "grad_norm": 1.0860127244202018, + "learning_rate": 1.9730814333225285e-05, + "loss": 0.2175, + "step": 2002 + }, + { + "epoch": 0.1, + "grad_norm": 1.179552345473745, + "learning_rate": 1.9730434638317076e-05, + "loss": 0.2203, + "step": 2003 + }, + { + "epoch": 0.1, + "grad_norm": 1.1933311940151612, + "learning_rate": 1.9730054679470278e-05, + "loss": 0.2185, + "step": 2004 + }, + { + "epoch": 0.1, + "grad_norm": 0.8347201140855571, + "learning_rate": 1.97296744566952e-05, + "loss": 0.2103, + "step": 2005 + }, + { + "epoch": 0.1, + "grad_norm": 1.4454228072145152, + "learning_rate": 1.9729293970002146e-05, + "loss": 0.2386, + "step": 2006 + }, + { + "epoch": 0.1, + "grad_norm": 1.095418832067566, + "learning_rate": 1.972891321940145e-05, + "loss": 0.236, + "step": 2007 + }, + { + "epoch": 0.1, + "grad_norm": 0.9516989176459635, + "learning_rate": 1.9728532204903433e-05, + "loss": 0.2302, + "step": 2008 + }, + { + "epoch": 0.1, + "grad_norm": 1.0057330914000895, + "learning_rate": 1.972815092651843e-05, + "loss": 0.2372, + "step": 2009 + }, + { + "epoch": 0.1, + "grad_norm": 0.895760637885882, + "learning_rate": 1.9727769384256784e-05, + "loss": 0.2467, + "step": 2010 + }, + { + "epoch": 0.1, + "grad_norm": 0.9447309607983849, + "learning_rate": 1.972738757812884e-05, + "loss": 0.2336, + "step": 2011 + }, + { + "epoch": 0.1, + "grad_norm": 1.8281939870261643, + "learning_rate": 1.972700550814496e-05, + "loss": 0.2278, + "step": 2012 + }, + { + "epoch": 0.1, + "grad_norm": 0.9350882822101689, + "learning_rate": 1.9726623174315513e-05, + "loss": 0.2353, + "step": 2013 + }, + { + "epoch": 0.1, + "grad_norm": 0.9463149137617953, + "learning_rate": 1.9726240576650856e-05, + "loss": 0.2321, + "step": 2014 + }, + { + "epoch": 0.1, + "grad_norm": 0.9948397774746942, + "learning_rate": 1.9725857715161375e-05, + "loss": 0.2411, + "step": 2015 + }, + { + "epoch": 0.1, + "grad_norm": 1.2647259419814574, + "learning_rate": 1.9725474589857456e-05, + "loss": 0.2351, + "step": 2016 + }, + { + "epoch": 0.1, + "grad_norm": 0.9016649755178238, + "learning_rate": 1.972509120074949e-05, + "loss": 0.2386, + "step": 2017 + }, + { + "epoch": 0.1, + "grad_norm": 1.1877608427174775, + "learning_rate": 1.9724707547847873e-05, + "loss": 0.2469, + "step": 2018 + }, + { + "epoch": 0.1, + "grad_norm": 1.7100428965467536, + "learning_rate": 1.9724323631163016e-05, + "loss": 0.2349, + "step": 2019 + }, + { + "epoch": 0.1, + "grad_norm": 1.345463433391392, + "learning_rate": 1.972393945070533e-05, + "loss": 0.2513, + "step": 2020 + }, + { + "epoch": 0.1, + "grad_norm": 0.9091717965875171, + "learning_rate": 1.972355500648524e-05, + "loss": 0.233, + "step": 2021 + }, + { + "epoch": 0.1, + "grad_norm": 1.223819582299652, + "learning_rate": 1.9723170298513166e-05, + "loss": 0.2505, + "step": 2022 + }, + { + "epoch": 0.1, + "grad_norm": 1.2243075830635146, + "learning_rate": 1.9722785326799554e-05, + "loss": 0.2262, + "step": 2023 + }, + { + "epoch": 0.1, + "grad_norm": 1.0350825147530838, + "learning_rate": 1.9722400091354837e-05, + "loss": 0.2277, + "step": 2024 + }, + { + "epoch": 0.1, + "grad_norm": 1.935339865872896, + "learning_rate": 1.9722014592189472e-05, + "loss": 0.22, + "step": 2025 + }, + { + "epoch": 0.1, + "grad_norm": 1.166453007673294, + "learning_rate": 1.972162882931391e-05, + "loss": 0.2548, + "step": 2026 + }, + { + "epoch": 0.1, + "grad_norm": 1.0351107766845962, + "learning_rate": 1.9721242802738615e-05, + "loss": 0.2426, + "step": 2027 + }, + { + "epoch": 0.1, + "grad_norm": 1.084600790162855, + "learning_rate": 1.9720856512474065e-05, + "loss": 0.2137, + "step": 2028 + }, + { + "epoch": 0.1, + "grad_norm": 0.9560170541274214, + "learning_rate": 1.972046995853073e-05, + "loss": 0.2629, + "step": 2029 + }, + { + "epoch": 0.1, + "grad_norm": 1.4103631271603914, + "learning_rate": 1.9720083140919097e-05, + "loss": 0.2184, + "step": 2030 + }, + { + "epoch": 0.1, + "grad_norm": 1.4870866452091123, + "learning_rate": 1.9719696059649665e-05, + "loss": 0.2314, + "step": 2031 + }, + { + "epoch": 0.1, + "grad_norm": 1.0031746692829822, + "learning_rate": 1.9719308714732924e-05, + "loss": 0.2556, + "step": 2032 + }, + { + "epoch": 0.1, + "grad_norm": 1.0804349871082373, + "learning_rate": 1.9718921106179384e-05, + "loss": 0.2393, + "step": 2033 + }, + { + "epoch": 0.1, + "grad_norm": 2.029145379261695, + "learning_rate": 1.9718533233999565e-05, + "loss": 0.2449, + "step": 2034 + }, + { + "epoch": 0.1, + "grad_norm": 1.075889733451575, + "learning_rate": 1.9718145098203977e-05, + "loss": 0.2061, + "step": 2035 + }, + { + "epoch": 0.1, + "grad_norm": 1.5370899898173875, + "learning_rate": 1.971775669880316e-05, + "loss": 0.2207, + "step": 2036 + }, + { + "epoch": 0.1, + "grad_norm": 0.9389617118447757, + "learning_rate": 1.971736803580764e-05, + "loss": 0.2453, + "step": 2037 + }, + { + "epoch": 0.1, + "grad_norm": 0.9431192525993586, + "learning_rate": 1.9716979109227965e-05, + "loss": 0.2128, + "step": 2038 + }, + { + "epoch": 0.1, + "grad_norm": 0.8151755033716819, + "learning_rate": 1.9716589919074682e-05, + "loss": 0.2378, + "step": 2039 + }, + { + "epoch": 0.1, + "grad_norm": 0.9130866798995204, + "learning_rate": 1.9716200465358352e-05, + "loss": 0.2421, + "step": 2040 + }, + { + "epoch": 0.1, + "grad_norm": 1.0000040861444954, + "learning_rate": 1.971581074808953e-05, + "loss": 0.2559, + "step": 2041 + }, + { + "epoch": 0.1, + "grad_norm": 1.0485492882545504, + "learning_rate": 1.9715420767278794e-05, + "loss": 0.2318, + "step": 2042 + }, + { + "epoch": 0.1, + "grad_norm": 1.137323578167587, + "learning_rate": 1.9715030522936724e-05, + "loss": 0.2508, + "step": 2043 + }, + { + "epoch": 0.1, + "grad_norm": 1.2110860738142555, + "learning_rate": 1.9714640015073902e-05, + "loss": 0.234, + "step": 2044 + }, + { + "epoch": 0.1, + "grad_norm": 0.9126273522332989, + "learning_rate": 1.9714249243700916e-05, + "loss": 0.2512, + "step": 2045 + }, + { + "epoch": 0.1, + "grad_norm": 1.105935543821545, + "learning_rate": 1.9713858208828376e-05, + "loss": 0.2222, + "step": 2046 + }, + { + "epoch": 0.1, + "grad_norm": 1.176499937899214, + "learning_rate": 1.971346691046688e-05, + "loss": 0.2465, + "step": 2047 + }, + { + "epoch": 0.1, + "grad_norm": 1.044320635285446, + "learning_rate": 1.971307534862705e-05, + "loss": 0.2773, + "step": 2048 + }, + { + "epoch": 0.1, + "grad_norm": 0.8622995262128048, + "learning_rate": 1.9712683523319498e-05, + "loss": 0.2372, + "step": 2049 + }, + { + "epoch": 0.1, + "grad_norm": 0.9419349078412376, + "learning_rate": 1.9712291434554858e-05, + "loss": 0.2445, + "step": 2050 + }, + { + "epoch": 0.1, + "grad_norm": 1.051102010160456, + "learning_rate": 1.9711899082343763e-05, + "loss": 0.2449, + "step": 2051 + }, + { + "epoch": 0.1, + "grad_norm": 0.9448524300694392, + "learning_rate": 1.971150646669686e-05, + "loss": 0.2452, + "step": 2052 + }, + { + "epoch": 0.1, + "grad_norm": 1.216094189344481, + "learning_rate": 1.9711113587624795e-05, + "loss": 0.2247, + "step": 2053 + }, + { + "epoch": 0.1, + "grad_norm": 1.063800136560177, + "learning_rate": 1.9710720445138225e-05, + "loss": 0.2196, + "step": 2054 + }, + { + "epoch": 0.1, + "grad_norm": 1.6769989193035522, + "learning_rate": 1.9710327039247814e-05, + "loss": 0.2664, + "step": 2055 + }, + { + "epoch": 0.1, + "grad_norm": 1.2891307743333376, + "learning_rate": 1.9709933369964235e-05, + "loss": 0.2428, + "step": 2056 + }, + { + "epoch": 0.1, + "grad_norm": 0.8979671748669272, + "learning_rate": 1.970953943729816e-05, + "loss": 0.2286, + "step": 2057 + }, + { + "epoch": 0.1, + "grad_norm": 1.3718627427597734, + "learning_rate": 1.9709145241260283e-05, + "loss": 0.2354, + "step": 2058 + }, + { + "epoch": 0.1, + "grad_norm": 1.4770658344351117, + "learning_rate": 1.9708750781861294e-05, + "loss": 0.2556, + "step": 2059 + }, + { + "epoch": 0.1, + "grad_norm": 1.5425129152295496, + "learning_rate": 1.970835605911189e-05, + "loss": 0.2405, + "step": 2060 + }, + { + "epoch": 0.1, + "grad_norm": 1.0673330540386734, + "learning_rate": 1.970796107302278e-05, + "loss": 0.2262, + "step": 2061 + }, + { + "epoch": 0.1, + "grad_norm": 1.2216615188163307, + "learning_rate": 1.970756582360468e-05, + "loss": 0.2431, + "step": 2062 + }, + { + "epoch": 0.1, + "grad_norm": 1.370798987347387, + "learning_rate": 1.9707170310868303e-05, + "loss": 0.2266, + "step": 2063 + }, + { + "epoch": 0.1, + "grad_norm": 1.357861573424895, + "learning_rate": 1.9706774534824387e-05, + "loss": 0.2137, + "step": 2064 + }, + { + "epoch": 0.11, + "grad_norm": 0.991296929358314, + "learning_rate": 1.9706378495483664e-05, + "loss": 0.2243, + "step": 2065 + }, + { + "epoch": 0.11, + "grad_norm": 1.1184402264270916, + "learning_rate": 1.9705982192856874e-05, + "loss": 0.2171, + "step": 2066 + }, + { + "epoch": 0.11, + "grad_norm": 1.0273223493964336, + "learning_rate": 1.9705585626954772e-05, + "loss": 0.23, + "step": 2067 + }, + { + "epoch": 0.11, + "grad_norm": 1.0079792072394853, + "learning_rate": 1.9705188797788108e-05, + "loss": 0.2138, + "step": 2068 + }, + { + "epoch": 0.11, + "grad_norm": 0.7765221583486901, + "learning_rate": 1.9704791705367653e-05, + "loss": 0.2036, + "step": 2069 + }, + { + "epoch": 0.11, + "grad_norm": 2.4568975454390984, + "learning_rate": 1.9704394349704174e-05, + "loss": 0.2434, + "step": 2070 + }, + { + "epoch": 0.11, + "grad_norm": 1.221785001763761, + "learning_rate": 1.970399673080845e-05, + "loss": 0.2401, + "step": 2071 + }, + { + "epoch": 0.11, + "grad_norm": 1.0248697227267982, + "learning_rate": 1.970359884869126e-05, + "loss": 0.2156, + "step": 2072 + }, + { + "epoch": 0.11, + "grad_norm": 1.10514229232445, + "learning_rate": 1.9703200703363415e-05, + "loss": 0.2445, + "step": 2073 + }, + { + "epoch": 0.11, + "grad_norm": 1.2233075570824092, + "learning_rate": 1.9702802294835695e-05, + "loss": 0.2647, + "step": 2074 + }, + { + "epoch": 0.11, + "grad_norm": 1.411425527125798, + "learning_rate": 1.9702403623118918e-05, + "loss": 0.2264, + "step": 2075 + }, + { + "epoch": 0.11, + "grad_norm": 1.255491000596999, + "learning_rate": 1.970200468822389e-05, + "loss": 0.2211, + "step": 2076 + }, + { + "epoch": 0.11, + "grad_norm": 1.1531607324724602, + "learning_rate": 1.970160549016144e-05, + "loss": 0.2471, + "step": 2077 + }, + { + "epoch": 0.11, + "grad_norm": 1.1173493303154376, + "learning_rate": 1.9701206028942398e-05, + "loss": 0.2495, + "step": 2078 + }, + { + "epoch": 0.11, + "grad_norm": 1.1526086421139528, + "learning_rate": 1.970080630457759e-05, + "loss": 0.2303, + "step": 2079 + }, + { + "epoch": 0.11, + "grad_norm": 0.9464962515330188, + "learning_rate": 1.970040631707786e-05, + "loss": 0.2603, + "step": 2080 + }, + { + "epoch": 0.11, + "grad_norm": 0.8871094596594084, + "learning_rate": 1.9700006066454066e-05, + "loss": 0.2512, + "step": 2081 + }, + { + "epoch": 0.11, + "grad_norm": 1.0292996944262494, + "learning_rate": 1.9699605552717056e-05, + "loss": 0.2385, + "step": 2082 + }, + { + "epoch": 0.11, + "grad_norm": 1.1987567719919687, + "learning_rate": 1.96992047758777e-05, + "loss": 0.2412, + "step": 2083 + }, + { + "epoch": 0.11, + "grad_norm": 0.8475327484829858, + "learning_rate": 1.9698803735946867e-05, + "loss": 0.2352, + "step": 2084 + }, + { + "epoch": 0.11, + "grad_norm": 0.9053438157379965, + "learning_rate": 1.9698402432935432e-05, + "loss": 0.2431, + "step": 2085 + }, + { + "epoch": 0.11, + "grad_norm": 1.336674943053752, + "learning_rate": 1.9698000866854284e-05, + "loss": 0.2356, + "step": 2086 + }, + { + "epoch": 0.11, + "grad_norm": 1.2977002169496847, + "learning_rate": 1.9697599037714315e-05, + "loss": 0.2374, + "step": 2087 + }, + { + "epoch": 0.11, + "grad_norm": 1.2799090702528029, + "learning_rate": 1.9697196945526427e-05, + "loss": 0.2252, + "step": 2088 + }, + { + "epoch": 0.11, + "grad_norm": 0.9752193191002771, + "learning_rate": 1.969679459030152e-05, + "loss": 0.2678, + "step": 2089 + }, + { + "epoch": 0.11, + "grad_norm": 0.8762395610925323, + "learning_rate": 1.9696391972050516e-05, + "loss": 0.2144, + "step": 2090 + }, + { + "epoch": 0.11, + "grad_norm": 1.1460100146302559, + "learning_rate": 1.969598909078433e-05, + "loss": 0.2508, + "step": 2091 + }, + { + "epoch": 0.11, + "grad_norm": 0.9559822932848991, + "learning_rate": 1.969558594651389e-05, + "loss": 0.2487, + "step": 2092 + }, + { + "epoch": 0.11, + "grad_norm": 0.9839613710855772, + "learning_rate": 1.9695182539250138e-05, + "loss": 0.2392, + "step": 2093 + }, + { + "epoch": 0.11, + "grad_norm": 0.8264913114980849, + "learning_rate": 1.969477886900401e-05, + "loss": 0.2213, + "step": 2094 + }, + { + "epoch": 0.11, + "grad_norm": 1.0003343905023958, + "learning_rate": 1.9694374935786457e-05, + "loss": 0.2235, + "step": 2095 + }, + { + "epoch": 0.11, + "grad_norm": 1.5516384795930434, + "learning_rate": 1.9693970739608437e-05, + "loss": 0.2463, + "step": 2096 + }, + { + "epoch": 0.11, + "grad_norm": 2.0876525839249633, + "learning_rate": 1.9693566280480914e-05, + "loss": 0.226, + "step": 2097 + }, + { + "epoch": 0.11, + "grad_norm": 1.346822529145253, + "learning_rate": 1.9693161558414856e-05, + "loss": 0.2393, + "step": 2098 + }, + { + "epoch": 0.11, + "grad_norm": 1.5812948122941315, + "learning_rate": 1.9692756573421246e-05, + "loss": 0.2685, + "step": 2099 + }, + { + "epoch": 0.11, + "grad_norm": 0.9825131769766344, + "learning_rate": 1.9692351325511066e-05, + "loss": 0.2434, + "step": 2100 + }, + { + "epoch": 0.11, + "grad_norm": 2.517270664330921, + "learning_rate": 1.9691945814695306e-05, + "loss": 0.2442, + "step": 2101 + }, + { + "epoch": 0.11, + "grad_norm": 1.5284733760701465, + "learning_rate": 1.9691540040984972e-05, + "loss": 0.2366, + "step": 2102 + }, + { + "epoch": 0.11, + "grad_norm": 1.7611185683006056, + "learning_rate": 1.9691134004391064e-05, + "loss": 0.2571, + "step": 2103 + }, + { + "epoch": 0.11, + "grad_norm": 1.2502143870501974, + "learning_rate": 1.9690727704924598e-05, + "loss": 0.2439, + "step": 2104 + }, + { + "epoch": 0.11, + "grad_norm": 1.155788006118132, + "learning_rate": 1.9690321142596602e-05, + "loss": 0.2368, + "step": 2105 + }, + { + "epoch": 0.11, + "grad_norm": 1.0104694476849565, + "learning_rate": 1.968991431741809e-05, + "loss": 0.2437, + "step": 2106 + }, + { + "epoch": 0.11, + "grad_norm": 1.3811992101020232, + "learning_rate": 1.968950722940011e-05, + "loss": 0.2269, + "step": 2107 + }, + { + "epoch": 0.11, + "grad_norm": 1.0196354555505611, + "learning_rate": 1.9689099878553698e-05, + "loss": 0.2316, + "step": 2108 + }, + { + "epoch": 0.11, + "grad_norm": 1.701387510634705, + "learning_rate": 1.9688692264889905e-05, + "loss": 0.2166, + "step": 2109 + }, + { + "epoch": 0.11, + "grad_norm": 0.8829947026926955, + "learning_rate": 1.9688284388419784e-05, + "loss": 0.2411, + "step": 2110 + }, + { + "epoch": 0.11, + "grad_norm": 1.1844084066745377, + "learning_rate": 1.9687876249154402e-05, + "loss": 0.2682, + "step": 2111 + }, + { + "epoch": 0.11, + "grad_norm": 1.0692862669633376, + "learning_rate": 1.9687467847104834e-05, + "loss": 0.2247, + "step": 2112 + }, + { + "epoch": 0.11, + "grad_norm": 1.931930597121846, + "learning_rate": 1.9687059182282152e-05, + "loss": 0.2397, + "step": 2113 + }, + { + "epoch": 0.11, + "grad_norm": 0.9933543404165622, + "learning_rate": 1.968665025469744e-05, + "loss": 0.2026, + "step": 2114 + }, + { + "epoch": 0.11, + "grad_norm": 1.2298239063976584, + "learning_rate": 1.9686241064361792e-05, + "loss": 0.2491, + "step": 2115 + }, + { + "epoch": 0.11, + "grad_norm": 1.0308486801387686, + "learning_rate": 1.9685831611286312e-05, + "loss": 0.2167, + "step": 2116 + }, + { + "epoch": 0.11, + "grad_norm": 1.232081656908198, + "learning_rate": 1.96854218954821e-05, + "loss": 0.2442, + "step": 2117 + }, + { + "epoch": 0.11, + "grad_norm": 1.1575287552055014, + "learning_rate": 1.9685011916960276e-05, + "loss": 0.2477, + "step": 2118 + }, + { + "epoch": 0.11, + "grad_norm": 1.1704875982681098, + "learning_rate": 1.9684601675731952e-05, + "loss": 0.2328, + "step": 2119 + }, + { + "epoch": 0.11, + "grad_norm": 1.0464516245169295, + "learning_rate": 1.9684191171808262e-05, + "loss": 0.2445, + "step": 2120 + }, + { + "epoch": 0.11, + "grad_norm": 1.238441161592338, + "learning_rate": 1.968378040520034e-05, + "loss": 0.2454, + "step": 2121 + }, + { + "epoch": 0.11, + "grad_norm": 1.1120056639856326, + "learning_rate": 1.9683369375919325e-05, + "loss": 0.2353, + "step": 2122 + }, + { + "epoch": 0.11, + "grad_norm": 1.4210318912864548, + "learning_rate": 1.9682958083976374e-05, + "loss": 0.2377, + "step": 2123 + }, + { + "epoch": 0.11, + "grad_norm": 0.9675502607652713, + "learning_rate": 1.9682546529382635e-05, + "loss": 0.2382, + "step": 2124 + }, + { + "epoch": 0.11, + "grad_norm": 0.8799019495856499, + "learning_rate": 1.968213471214927e-05, + "loss": 0.2327, + "step": 2125 + }, + { + "epoch": 0.11, + "grad_norm": 1.1715411569228944, + "learning_rate": 1.968172263228746e-05, + "loss": 0.237, + "step": 2126 + }, + { + "epoch": 0.11, + "grad_norm": 0.9671697534047214, + "learning_rate": 1.9681310289808377e-05, + "loss": 0.2416, + "step": 2127 + }, + { + "epoch": 0.11, + "grad_norm": 1.0001367934628327, + "learning_rate": 1.9680897684723205e-05, + "loss": 0.2347, + "step": 2128 + }, + { + "epoch": 0.11, + "grad_norm": 0.8494973958552655, + "learning_rate": 1.9680484817043134e-05, + "loss": 0.2301, + "step": 2129 + }, + { + "epoch": 0.11, + "grad_norm": 0.9240069516003688, + "learning_rate": 1.9680071686779368e-05, + "loss": 0.2379, + "step": 2130 + }, + { + "epoch": 0.11, + "grad_norm": 1.3136456921955122, + "learning_rate": 1.9679658293943112e-05, + "loss": 0.2417, + "step": 2131 + }, + { + "epoch": 0.11, + "grad_norm": 1.0592213424376784, + "learning_rate": 1.9679244638545572e-05, + "loss": 0.2174, + "step": 2132 + }, + { + "epoch": 0.11, + "grad_norm": 0.910805874421009, + "learning_rate": 1.967883072059798e-05, + "loss": 0.237, + "step": 2133 + }, + { + "epoch": 0.11, + "grad_norm": 0.8839617866905263, + "learning_rate": 1.9678416540111557e-05, + "loss": 0.2356, + "step": 2134 + }, + { + "epoch": 0.11, + "grad_norm": 1.2152257087176324, + "learning_rate": 1.9678002097097537e-05, + "loss": 0.2349, + "step": 2135 + }, + { + "epoch": 0.11, + "grad_norm": 1.1044155337772443, + "learning_rate": 1.9677587391567164e-05, + "loss": 0.2365, + "step": 2136 + }, + { + "epoch": 0.11, + "grad_norm": 2.7787980072399576, + "learning_rate": 1.967717242353169e-05, + "loss": 0.2265, + "step": 2137 + }, + { + "epoch": 0.11, + "grad_norm": 1.3616430182996238, + "learning_rate": 1.9676757193002363e-05, + "loss": 0.2746, + "step": 2138 + }, + { + "epoch": 0.11, + "grad_norm": 1.1228586924105723, + "learning_rate": 1.9676341699990452e-05, + "loss": 0.2186, + "step": 2139 + }, + { + "epoch": 0.11, + "grad_norm": 1.2125802259045932, + "learning_rate": 1.9675925944507226e-05, + "loss": 0.2519, + "step": 2140 + }, + { + "epoch": 0.11, + "grad_norm": 0.9444977302475596, + "learning_rate": 1.9675509926563964e-05, + "loss": 0.2566, + "step": 2141 + }, + { + "epoch": 0.11, + "grad_norm": 1.5317024122818927, + "learning_rate": 1.9675093646171947e-05, + "loss": 0.2401, + "step": 2142 + }, + { + "epoch": 0.11, + "grad_norm": 1.7667854821812148, + "learning_rate": 1.967467710334247e-05, + "loss": 0.2389, + "step": 2143 + }, + { + "epoch": 0.11, + "grad_norm": 1.189612959109022, + "learning_rate": 1.9674260298086825e-05, + "loss": 0.2352, + "step": 2144 + }, + { + "epoch": 0.11, + "grad_norm": 0.992641156412239, + "learning_rate": 1.967384323041633e-05, + "loss": 0.225, + "step": 2145 + }, + { + "epoch": 0.11, + "grad_norm": 0.9343197806222634, + "learning_rate": 1.9673425900342286e-05, + "loss": 0.2178, + "step": 2146 + }, + { + "epoch": 0.11, + "grad_norm": 1.1016381303751634, + "learning_rate": 1.9673008307876017e-05, + "loss": 0.2271, + "step": 2147 + }, + { + "epoch": 0.11, + "grad_norm": 0.9690005406542, + "learning_rate": 1.9672590453028855e-05, + "loss": 0.228, + "step": 2148 + }, + { + "epoch": 0.11, + "grad_norm": 1.235254056286498, + "learning_rate": 1.967217233581213e-05, + "loss": 0.2428, + "step": 2149 + }, + { + "epoch": 0.11, + "grad_norm": 1.109454195598941, + "learning_rate": 1.9671753956237187e-05, + "loss": 0.2327, + "step": 2150 + }, + { + "epoch": 0.11, + "grad_norm": 3.5720857648078095, + "learning_rate": 1.9671335314315365e-05, + "loss": 0.2501, + "step": 2151 + }, + { + "epoch": 0.11, + "grad_norm": 1.0473294245674567, + "learning_rate": 1.967091641005803e-05, + "loss": 0.222, + "step": 2152 + }, + { + "epoch": 0.11, + "grad_norm": 1.2244939085245228, + "learning_rate": 1.967049724347654e-05, + "loss": 0.2284, + "step": 2153 + }, + { + "epoch": 0.11, + "grad_norm": 1.3558829537666295, + "learning_rate": 1.967007781458227e-05, + "loss": 0.2353, + "step": 2154 + }, + { + "epoch": 0.11, + "grad_norm": 0.9497548159666054, + "learning_rate": 1.966965812338659e-05, + "loss": 0.2122, + "step": 2155 + }, + { + "epoch": 0.11, + "grad_norm": 0.9943932518361649, + "learning_rate": 1.9669238169900886e-05, + "loss": 0.2103, + "step": 2156 + }, + { + "epoch": 0.11, + "grad_norm": 4.593225948018636, + "learning_rate": 1.966881795413655e-05, + "loss": 0.2542, + "step": 2157 + }, + { + "epoch": 0.11, + "grad_norm": 1.282690775096383, + "learning_rate": 1.9668397476104983e-05, + "loss": 0.2286, + "step": 2158 + }, + { + "epoch": 0.11, + "grad_norm": 1.101149696753262, + "learning_rate": 1.966797673581759e-05, + "loss": 0.2089, + "step": 2159 + }, + { + "epoch": 0.11, + "grad_norm": 2.905210614202573, + "learning_rate": 1.966755573328578e-05, + "loss": 0.23, + "step": 2160 + }, + { + "epoch": 0.11, + "grad_norm": 1.0378262137010232, + "learning_rate": 1.9667134468520974e-05, + "loss": 0.2432, + "step": 2161 + }, + { + "epoch": 0.11, + "grad_norm": 1.0797024746109964, + "learning_rate": 1.96667129415346e-05, + "loss": 0.2282, + "step": 2162 + }, + { + "epoch": 0.11, + "grad_norm": 1.2276995992881126, + "learning_rate": 1.966629115233809e-05, + "loss": 0.211, + "step": 2163 + }, + { + "epoch": 0.11, + "grad_norm": 1.0352266908341288, + "learning_rate": 1.9665869100942888e-05, + "loss": 0.2472, + "step": 2164 + }, + { + "epoch": 0.11, + "grad_norm": 1.417599642847786, + "learning_rate": 1.9665446787360444e-05, + "loss": 0.2273, + "step": 2165 + }, + { + "epoch": 0.11, + "grad_norm": 0.9920449774927103, + "learning_rate": 1.9665024211602208e-05, + "loss": 0.242, + "step": 2166 + }, + { + "epoch": 0.11, + "grad_norm": 1.2296252775505276, + "learning_rate": 1.9664601373679644e-05, + "loss": 0.2491, + "step": 2167 + }, + { + "epoch": 0.11, + "grad_norm": 1.0402566138277567, + "learning_rate": 1.966417827360422e-05, + "loss": 0.2284, + "step": 2168 + }, + { + "epoch": 0.11, + "grad_norm": 1.1053617884173759, + "learning_rate": 1.9663754911387414e-05, + "loss": 0.2385, + "step": 2169 + }, + { + "epoch": 0.11, + "grad_norm": 1.1119095497921, + "learning_rate": 1.9663331287040713e-05, + "loss": 0.2539, + "step": 2170 + }, + { + "epoch": 0.11, + "grad_norm": 1.0983948450880103, + "learning_rate": 1.9662907400575606e-05, + "loss": 0.2137, + "step": 2171 + }, + { + "epoch": 0.11, + "grad_norm": 1.269949230715848, + "learning_rate": 1.9662483252003585e-05, + "loss": 0.2293, + "step": 2172 + }, + { + "epoch": 0.11, + "grad_norm": 1.0564708933309246, + "learning_rate": 1.9662058841336164e-05, + "loss": 0.2217, + "step": 2173 + }, + { + "epoch": 0.11, + "grad_norm": 1.1216048180385836, + "learning_rate": 1.966163416858485e-05, + "loss": 0.2748, + "step": 2174 + }, + { + "epoch": 0.11, + "grad_norm": 6.508310528343341, + "learning_rate": 1.9661209233761167e-05, + "loss": 0.227, + "step": 2175 + }, + { + "epoch": 0.11, + "grad_norm": 1.094002964335264, + "learning_rate": 1.9660784036876636e-05, + "loss": 0.2422, + "step": 2176 + }, + { + "epoch": 0.11, + "grad_norm": 1.2065998099809236, + "learning_rate": 1.9660358577942788e-05, + "loss": 0.2282, + "step": 2177 + }, + { + "epoch": 0.11, + "grad_norm": 1.5059749594559086, + "learning_rate": 1.965993285697117e-05, + "loss": 0.2618, + "step": 2178 + }, + { + "epoch": 0.11, + "grad_norm": 1.0542248630235662, + "learning_rate": 1.965950687397333e-05, + "loss": 0.227, + "step": 2179 + }, + { + "epoch": 0.11, + "grad_norm": 1.1505226952579715, + "learning_rate": 1.965908062896082e-05, + "loss": 0.2321, + "step": 2180 + }, + { + "epoch": 0.11, + "grad_norm": 0.9613747888101222, + "learning_rate": 1.96586541219452e-05, + "loss": 0.2532, + "step": 2181 + }, + { + "epoch": 0.11, + "grad_norm": 1.1209519540120025, + "learning_rate": 1.9658227352938044e-05, + "loss": 0.215, + "step": 2182 + }, + { + "epoch": 0.11, + "grad_norm": 1.1901700934240698, + "learning_rate": 1.9657800321950925e-05, + "loss": 0.233, + "step": 2183 + }, + { + "epoch": 0.11, + "grad_norm": 1.0703733820154298, + "learning_rate": 1.9657373028995427e-05, + "loss": 0.2294, + "step": 2184 + }, + { + "epoch": 0.11, + "grad_norm": 1.0694757472302956, + "learning_rate": 1.965694547408314e-05, + "loss": 0.2351, + "step": 2185 + }, + { + "epoch": 0.11, + "grad_norm": 1.06619999016574, + "learning_rate": 1.9656517657225658e-05, + "loss": 0.2091, + "step": 2186 + }, + { + "epoch": 0.11, + "grad_norm": 1.2948971683345574, + "learning_rate": 1.9656089578434595e-05, + "loss": 0.2643, + "step": 2187 + }, + { + "epoch": 0.11, + "grad_norm": 1.16157269655789, + "learning_rate": 1.9655661237721554e-05, + "loss": 0.2298, + "step": 2188 + }, + { + "epoch": 0.11, + "grad_norm": 1.4539573594884465, + "learning_rate": 1.9655232635098157e-05, + "loss": 0.2289, + "step": 2189 + }, + { + "epoch": 0.11, + "grad_norm": 1.7694359425810158, + "learning_rate": 1.965480377057603e-05, + "loss": 0.1937, + "step": 2190 + }, + { + "epoch": 0.11, + "grad_norm": 1.3928943982351816, + "learning_rate": 1.96543746441668e-05, + "loss": 0.2257, + "step": 2191 + }, + { + "epoch": 0.11, + "grad_norm": 1.2322222758893566, + "learning_rate": 1.965394525588212e-05, + "loss": 0.2518, + "step": 2192 + }, + { + "epoch": 0.11, + "grad_norm": 1.2909786558614489, + "learning_rate": 1.9653515605733625e-05, + "loss": 0.2674, + "step": 2193 + }, + { + "epoch": 0.11, + "grad_norm": 1.610007776399509, + "learning_rate": 1.9653085693732976e-05, + "loss": 0.24, + "step": 2194 + }, + { + "epoch": 0.11, + "grad_norm": 1.224363132147249, + "learning_rate": 1.965265551989183e-05, + "loss": 0.2476, + "step": 2195 + }, + { + "epoch": 0.11, + "grad_norm": 1.105382530444532, + "learning_rate": 1.965222508422186e-05, + "loss": 0.2573, + "step": 2196 + }, + { + "epoch": 0.11, + "grad_norm": 1.146983000583085, + "learning_rate": 1.9651794386734743e-05, + "loss": 0.2428, + "step": 2197 + }, + { + "epoch": 0.11, + "grad_norm": 1.1987218803478044, + "learning_rate": 1.965136342744215e-05, + "loss": 0.2421, + "step": 2198 + }, + { + "epoch": 0.11, + "grad_norm": 1.3590799693199993, + "learning_rate": 1.9650932206355786e-05, + "loss": 0.2656, + "step": 2199 + }, + { + "epoch": 0.11, + "grad_norm": 0.9900346782252395, + "learning_rate": 1.9650500723487335e-05, + "loss": 0.2067, + "step": 2200 + }, + { + "epoch": 0.11, + "grad_norm": 1.077951461086788, + "learning_rate": 1.9650068978848512e-05, + "loss": 0.2341, + "step": 2201 + }, + { + "epoch": 0.11, + "grad_norm": 1.907756752533414, + "learning_rate": 1.964963697245102e-05, + "loss": 0.2265, + "step": 2202 + }, + { + "epoch": 0.11, + "grad_norm": 1.2688186973976037, + "learning_rate": 1.964920470430658e-05, + "loss": 0.2449, + "step": 2203 + }, + { + "epoch": 0.11, + "grad_norm": 1.1829663405600277, + "learning_rate": 1.964877217442692e-05, + "loss": 0.2643, + "step": 2204 + }, + { + "epoch": 0.11, + "grad_norm": 1.8793853900121442, + "learning_rate": 1.964833938282377e-05, + "loss": 0.2251, + "step": 2205 + }, + { + "epoch": 0.11, + "grad_norm": 1.2663396192955865, + "learning_rate": 1.9647906329508866e-05, + "loss": 0.2563, + "step": 2206 + }, + { + "epoch": 0.11, + "grad_norm": 1.086276962256967, + "learning_rate": 1.9647473014493958e-05, + "loss": 0.2063, + "step": 2207 + }, + { + "epoch": 0.11, + "grad_norm": 1.6805919171172707, + "learning_rate": 1.9647039437790802e-05, + "loss": 0.2422, + "step": 2208 + }, + { + "epoch": 0.11, + "grad_norm": 1.3180450409740612, + "learning_rate": 1.9646605599411155e-05, + "loss": 0.2218, + "step": 2209 + }, + { + "epoch": 0.11, + "grad_norm": 1.0908244445799435, + "learning_rate": 1.964617149936679e-05, + "loss": 0.2358, + "step": 2210 + }, + { + "epoch": 0.11, + "grad_norm": 1.3992407176335435, + "learning_rate": 1.9645737137669473e-05, + "loss": 0.2655, + "step": 2211 + }, + { + "epoch": 0.11, + "grad_norm": 1.2630430210609305, + "learning_rate": 1.9645302514330994e-05, + "loss": 0.2403, + "step": 2212 + }, + { + "epoch": 0.11, + "grad_norm": 1.1339372584391034, + "learning_rate": 1.9644867629363137e-05, + "loss": 0.2247, + "step": 2213 + }, + { + "epoch": 0.11, + "grad_norm": 1.1341369372627903, + "learning_rate": 1.9644432482777703e-05, + "loss": 0.2674, + "step": 2214 + }, + { + "epoch": 0.11, + "grad_norm": 1.3731513614446267, + "learning_rate": 1.964399707458649e-05, + "loss": 0.2195, + "step": 2215 + }, + { + "epoch": 0.11, + "grad_norm": 1.3995092356589611, + "learning_rate": 1.9643561404801317e-05, + "loss": 0.2371, + "step": 2216 + }, + { + "epoch": 0.11, + "grad_norm": 1.0985880612220564, + "learning_rate": 1.9643125473433992e-05, + "loss": 0.2136, + "step": 2217 + }, + { + "epoch": 0.11, + "grad_norm": 0.8997296488322348, + "learning_rate": 1.9642689280496347e-05, + "loss": 0.1975, + "step": 2218 + }, + { + "epoch": 0.11, + "grad_norm": 1.7606210335459154, + "learning_rate": 1.9642252826000206e-05, + "loss": 0.2158, + "step": 2219 + }, + { + "epoch": 0.11, + "grad_norm": 0.9434073289543887, + "learning_rate": 1.9641816109957415e-05, + "loss": 0.2276, + "step": 2220 + }, + { + "epoch": 0.11, + "grad_norm": 3.9260696285730123, + "learning_rate": 1.9641379132379822e-05, + "loss": 0.2486, + "step": 2221 + }, + { + "epoch": 0.11, + "grad_norm": 1.058546288904667, + "learning_rate": 1.964094189327927e-05, + "loss": 0.2338, + "step": 2222 + }, + { + "epoch": 0.11, + "grad_norm": 1.394988426693415, + "learning_rate": 1.9640504392667626e-05, + "loss": 0.2476, + "step": 2223 + }, + { + "epoch": 0.11, + "grad_norm": 1.0985601759324406, + "learning_rate": 1.9640066630556756e-05, + "loss": 0.2159, + "step": 2224 + }, + { + "epoch": 0.11, + "grad_norm": 1.2145342696137336, + "learning_rate": 1.9639628606958535e-05, + "loss": 0.2311, + "step": 2225 + }, + { + "epoch": 0.11, + "grad_norm": 1.4288512822935417, + "learning_rate": 1.9639190321884842e-05, + "loss": 0.2285, + "step": 2226 + }, + { + "epoch": 0.11, + "grad_norm": 1.1520142737740142, + "learning_rate": 1.9638751775347568e-05, + "loss": 0.2225, + "step": 2227 + }, + { + "epoch": 0.11, + "grad_norm": 1.0531822751852433, + "learning_rate": 1.963831296735861e-05, + "loss": 0.2425, + "step": 2228 + }, + { + "epoch": 0.11, + "grad_norm": 2.359747733426235, + "learning_rate": 1.9637873897929866e-05, + "loss": 0.2537, + "step": 2229 + }, + { + "epoch": 0.11, + "grad_norm": 1.126644132650638, + "learning_rate": 1.9637434567073246e-05, + "loss": 0.2411, + "step": 2230 + }, + { + "epoch": 0.11, + "grad_norm": 1.5801327871398765, + "learning_rate": 1.9636994974800673e-05, + "loss": 0.2442, + "step": 2231 + }, + { + "epoch": 0.11, + "grad_norm": 1.4688906285868015, + "learning_rate": 1.9636555121124063e-05, + "loss": 0.2321, + "step": 2232 + }, + { + "epoch": 0.11, + "grad_norm": 1.4452794250821794, + "learning_rate": 1.963611500605535e-05, + "loss": 0.2282, + "step": 2233 + }, + { + "epoch": 0.11, + "grad_norm": 1.2114241613636223, + "learning_rate": 1.963567462960648e-05, + "loss": 0.2344, + "step": 2234 + }, + { + "epoch": 0.11, + "grad_norm": 1.213601910804405, + "learning_rate": 1.963523399178939e-05, + "loss": 0.2561, + "step": 2235 + }, + { + "epoch": 0.11, + "grad_norm": 1.1851182812942889, + "learning_rate": 1.963479309261603e-05, + "loss": 0.2364, + "step": 2236 + }, + { + "epoch": 0.11, + "grad_norm": 1.8871984432827353, + "learning_rate": 1.9634351932098364e-05, + "loss": 0.2364, + "step": 2237 + }, + { + "epoch": 0.11, + "grad_norm": 1.2435776107199095, + "learning_rate": 1.9633910510248357e-05, + "loss": 0.2381, + "step": 2238 + }, + { + "epoch": 0.11, + "grad_norm": 1.3475281308085763, + "learning_rate": 1.9633468827077986e-05, + "loss": 0.2418, + "step": 2239 + }, + { + "epoch": 0.11, + "grad_norm": 1.239713466612954, + "learning_rate": 1.9633026882599228e-05, + "loss": 0.2504, + "step": 2240 + }, + { + "epoch": 0.11, + "grad_norm": 1.2328229260264705, + "learning_rate": 1.963258467682407e-05, + "loss": 0.241, + "step": 2241 + }, + { + "epoch": 0.11, + "grad_norm": 1.0341545828230583, + "learning_rate": 1.9632142209764514e-05, + "loss": 0.2174, + "step": 2242 + }, + { + "epoch": 0.11, + "grad_norm": 1.2902668946694982, + "learning_rate": 1.963169948143255e-05, + "loss": 0.2275, + "step": 2243 + }, + { + "epoch": 0.11, + "grad_norm": 1.0121788933691918, + "learning_rate": 1.9631256491840197e-05, + "loss": 0.2229, + "step": 2244 + }, + { + "epoch": 0.11, + "grad_norm": 0.9654980378203284, + "learning_rate": 1.9630813240999468e-05, + "loss": 0.2423, + "step": 2245 + }, + { + "epoch": 0.11, + "grad_norm": 1.3478278298336825, + "learning_rate": 1.963036972892238e-05, + "loss": 0.2736, + "step": 2246 + }, + { + "epoch": 0.11, + "grad_norm": 1.2691420401208928, + "learning_rate": 1.962992595562098e-05, + "loss": 0.2059, + "step": 2247 + }, + { + "epoch": 0.11, + "grad_norm": 1.374703279576238, + "learning_rate": 1.9629481921107287e-05, + "loss": 0.2613, + "step": 2248 + }, + { + "epoch": 0.11, + "grad_norm": 1.0652932569776843, + "learning_rate": 1.9629037625393352e-05, + "loss": 0.2316, + "step": 2249 + }, + { + "epoch": 0.11, + "grad_norm": 1.5137256334450104, + "learning_rate": 1.962859306849123e-05, + "loss": 0.2392, + "step": 2250 + }, + { + "epoch": 0.11, + "grad_norm": 1.107016666508369, + "learning_rate": 1.962814825041298e-05, + "loss": 0.2253, + "step": 2251 + }, + { + "epoch": 0.11, + "grad_norm": 0.9550128240410778, + "learning_rate": 1.962770317117066e-05, + "loss": 0.1973, + "step": 2252 + }, + { + "epoch": 0.11, + "grad_norm": 1.297841502121561, + "learning_rate": 1.9627257830776352e-05, + "loss": 0.2174, + "step": 2253 + }, + { + "epoch": 0.11, + "grad_norm": 1.1093582584200514, + "learning_rate": 1.9626812229242128e-05, + "loss": 0.2306, + "step": 2254 + }, + { + "epoch": 0.11, + "grad_norm": 1.2245798746882979, + "learning_rate": 1.962636636658008e-05, + "loss": 0.2535, + "step": 2255 + }, + { + "epoch": 0.11, + "grad_norm": 1.644044637919812, + "learning_rate": 1.9625920242802302e-05, + "loss": 0.2554, + "step": 2256 + }, + { + "epoch": 0.11, + "grad_norm": 1.1361609839406392, + "learning_rate": 1.962547385792089e-05, + "loss": 0.2492, + "step": 2257 + }, + { + "epoch": 0.11, + "grad_norm": 1.6662903840032885, + "learning_rate": 1.962502721194796e-05, + "loss": 0.224, + "step": 2258 + }, + { + "epoch": 0.11, + "grad_norm": 0.9270518048299475, + "learning_rate": 1.962458030489562e-05, + "loss": 0.2125, + "step": 2259 + }, + { + "epoch": 0.11, + "grad_norm": 0.9755561228878188, + "learning_rate": 1.9624133136775998e-05, + "loss": 0.2293, + "step": 2260 + }, + { + "epoch": 0.11, + "grad_norm": 1.8168211138034966, + "learning_rate": 1.962368570760122e-05, + "loss": 0.2527, + "step": 2261 + }, + { + "epoch": 0.12, + "grad_norm": 6.481777245879265, + "learning_rate": 1.9623238017383426e-05, + "loss": 0.2372, + "step": 2262 + }, + { + "epoch": 0.12, + "grad_norm": 0.9735772662420452, + "learning_rate": 1.9622790066134754e-05, + "loss": 0.2282, + "step": 2263 + }, + { + "epoch": 0.12, + "grad_norm": 1.1837794954838359, + "learning_rate": 1.962234185386736e-05, + "loss": 0.2151, + "step": 2264 + }, + { + "epoch": 0.12, + "grad_norm": 1.0788263704931076, + "learning_rate": 1.9621893380593398e-05, + "loss": 0.2346, + "step": 2265 + }, + { + "epoch": 0.12, + "grad_norm": 1.3895384690518788, + "learning_rate": 1.9621444646325036e-05, + "loss": 0.2514, + "step": 2266 + }, + { + "epoch": 0.12, + "grad_norm": 1.1628169725919841, + "learning_rate": 1.9620995651074443e-05, + "loss": 0.2242, + "step": 2267 + }, + { + "epoch": 0.12, + "grad_norm": 1.168404660899102, + "learning_rate": 1.9620546394853802e-05, + "loss": 0.2227, + "step": 2268 + }, + { + "epoch": 0.12, + "grad_norm": 1.1351391787739238, + "learning_rate": 1.9620096877675294e-05, + "loss": 0.2563, + "step": 2269 + }, + { + "epoch": 0.12, + "grad_norm": 1.0035468703101598, + "learning_rate": 1.9619647099551118e-05, + "loss": 0.2242, + "step": 2270 + }, + { + "epoch": 0.12, + "grad_norm": 1.0863642017242183, + "learning_rate": 1.9619197060493465e-05, + "loss": 0.2245, + "step": 2271 + }, + { + "epoch": 0.12, + "grad_norm": 1.3521259808783213, + "learning_rate": 1.9618746760514554e-05, + "loss": 0.2247, + "step": 2272 + }, + { + "epoch": 0.12, + "grad_norm": 1.1789057952904218, + "learning_rate": 1.9618296199626594e-05, + "loss": 0.2412, + "step": 2273 + }, + { + "epoch": 0.12, + "grad_norm": 1.1695044838914121, + "learning_rate": 1.9617845377841804e-05, + "loss": 0.2295, + "step": 2274 + }, + { + "epoch": 0.12, + "grad_norm": 0.911670320372682, + "learning_rate": 1.9617394295172415e-05, + "loss": 0.2248, + "step": 2275 + }, + { + "epoch": 0.12, + "grad_norm": 1.4974238095285362, + "learning_rate": 1.9616942951630668e-05, + "loss": 0.2576, + "step": 2276 + }, + { + "epoch": 0.12, + "grad_norm": 1.065884469776496, + "learning_rate": 1.9616491347228793e-05, + "loss": 0.2129, + "step": 2277 + }, + { + "epoch": 0.12, + "grad_norm": 1.0689950684727134, + "learning_rate": 1.961603948197905e-05, + "loss": 0.2123, + "step": 2278 + }, + { + "epoch": 0.12, + "grad_norm": 1.122557763242407, + "learning_rate": 1.9615587355893693e-05, + "loss": 0.2124, + "step": 2279 + }, + { + "epoch": 0.12, + "grad_norm": 1.6997662112278131, + "learning_rate": 1.9615134968984984e-05, + "loss": 0.2229, + "step": 2280 + }, + { + "epoch": 0.12, + "grad_norm": 1.427459018774372, + "learning_rate": 1.96146823212652e-05, + "loss": 0.2395, + "step": 2281 + }, + { + "epoch": 0.12, + "grad_norm": 1.174838610488931, + "learning_rate": 1.961422941274661e-05, + "loss": 0.2395, + "step": 2282 + }, + { + "epoch": 0.12, + "grad_norm": 1.0444712650544752, + "learning_rate": 1.9613776243441507e-05, + "loss": 0.2293, + "step": 2283 + }, + { + "epoch": 0.12, + "grad_norm": 0.9928779696890382, + "learning_rate": 1.9613322813362182e-05, + "loss": 0.2452, + "step": 2284 + }, + { + "epoch": 0.12, + "grad_norm": 0.9898668843506018, + "learning_rate": 1.961286912252093e-05, + "loss": 0.236, + "step": 2285 + }, + { + "epoch": 0.12, + "grad_norm": 1.0320698498409984, + "learning_rate": 1.961241517093006e-05, + "loss": 0.2426, + "step": 2286 + }, + { + "epoch": 0.12, + "grad_norm": 0.9456851193220972, + "learning_rate": 1.9611960958601886e-05, + "loss": 0.2317, + "step": 2287 + }, + { + "epoch": 0.12, + "grad_norm": 1.0427893908436772, + "learning_rate": 1.9611506485548728e-05, + "loss": 0.2243, + "step": 2288 + }, + { + "epoch": 0.12, + "grad_norm": 0.990790934611057, + "learning_rate": 1.9611051751782915e-05, + "loss": 0.2276, + "step": 2289 + }, + { + "epoch": 0.12, + "grad_norm": 1.4299898481089397, + "learning_rate": 1.961059675731678e-05, + "loss": 0.2421, + "step": 2290 + }, + { + "epoch": 0.12, + "grad_norm": 1.0567116203220435, + "learning_rate": 1.9610141502162662e-05, + "loss": 0.2549, + "step": 2291 + }, + { + "epoch": 0.12, + "grad_norm": 0.9813611728893015, + "learning_rate": 1.9609685986332918e-05, + "loss": 0.2272, + "step": 2292 + }, + { + "epoch": 0.12, + "grad_norm": 1.2665166548155724, + "learning_rate": 1.9609230209839894e-05, + "loss": 0.2406, + "step": 2293 + }, + { + "epoch": 0.12, + "grad_norm": 1.3882608563003989, + "learning_rate": 1.9608774172695964e-05, + "loss": 0.2168, + "step": 2294 + }, + { + "epoch": 0.12, + "grad_norm": 1.2389235543966157, + "learning_rate": 1.9608317874913484e-05, + "loss": 0.2293, + "step": 2295 + }, + { + "epoch": 0.12, + "grad_norm": 0.867329992139809, + "learning_rate": 1.9607861316504848e-05, + "loss": 0.2149, + "step": 2296 + }, + { + "epoch": 0.12, + "grad_norm": 0.9020334001335076, + "learning_rate": 1.9607404497482422e-05, + "loss": 0.2277, + "step": 2297 + }, + { + "epoch": 0.12, + "grad_norm": 0.845846595246084, + "learning_rate": 1.9606947417858614e-05, + "loss": 0.251, + "step": 2298 + }, + { + "epoch": 0.12, + "grad_norm": 1.1287125708824222, + "learning_rate": 1.960649007764581e-05, + "loss": 0.2455, + "step": 2299 + }, + { + "epoch": 0.12, + "grad_norm": 0.9295412239995064, + "learning_rate": 1.960603247685642e-05, + "loss": 0.2402, + "step": 2300 + }, + { + "epoch": 0.12, + "grad_norm": 0.9073561280730207, + "learning_rate": 1.9605574615502857e-05, + "loss": 0.2625, + "step": 2301 + }, + { + "epoch": 0.12, + "grad_norm": 1.561672595584736, + "learning_rate": 1.9605116493597544e-05, + "loss": 0.2539, + "step": 2302 + }, + { + "epoch": 0.12, + "grad_norm": 0.9185494380688095, + "learning_rate": 1.96046581111529e-05, + "loss": 0.2491, + "step": 2303 + }, + { + "epoch": 0.12, + "grad_norm": 0.9986931205874869, + "learning_rate": 1.9604199468181363e-05, + "loss": 0.2366, + "step": 2304 + }, + { + "epoch": 0.12, + "grad_norm": 0.9453153360183175, + "learning_rate": 1.960374056469537e-05, + "loss": 0.2174, + "step": 2305 + }, + { + "epoch": 0.12, + "grad_norm": 2.427772384150711, + "learning_rate": 1.9603281400707378e-05, + "loss": 0.2388, + "step": 2306 + }, + { + "epoch": 0.12, + "grad_norm": 1.0900731948657933, + "learning_rate": 1.9602821976229835e-05, + "loss": 0.23, + "step": 2307 + }, + { + "epoch": 0.12, + "grad_norm": 0.9054697290691388, + "learning_rate": 1.96023622912752e-05, + "loss": 0.2297, + "step": 2308 + }, + { + "epoch": 0.12, + "grad_norm": 0.9245203886194943, + "learning_rate": 1.9601902345855944e-05, + "loss": 0.2433, + "step": 2309 + }, + { + "epoch": 0.12, + "grad_norm": 1.0519878918731516, + "learning_rate": 1.9601442139984548e-05, + "loss": 0.2548, + "step": 2310 + }, + { + "epoch": 0.12, + "grad_norm": 0.8339937473761487, + "learning_rate": 1.9600981673673488e-05, + "loss": 0.2238, + "step": 2311 + }, + { + "epoch": 0.12, + "grad_norm": 1.178375488144858, + "learning_rate": 1.9600520946935263e-05, + "loss": 0.221, + "step": 2312 + }, + { + "epoch": 0.12, + "grad_norm": 0.8533874274139382, + "learning_rate": 1.9600059959782364e-05, + "loss": 0.2291, + "step": 2313 + }, + { + "epoch": 0.12, + "grad_norm": 0.9109953738205394, + "learning_rate": 1.9599598712227294e-05, + "loss": 0.2276, + "step": 2314 + }, + { + "epoch": 0.12, + "grad_norm": 0.8551206349574578, + "learning_rate": 1.9599137204282566e-05, + "loss": 0.2404, + "step": 2315 + }, + { + "epoch": 0.12, + "grad_norm": 0.9342971710035325, + "learning_rate": 1.95986754359607e-05, + "loss": 0.2176, + "step": 2316 + }, + { + "epoch": 0.12, + "grad_norm": 1.0563711413685022, + "learning_rate": 1.959821340727422e-05, + "loss": 0.2443, + "step": 2317 + }, + { + "epoch": 0.12, + "grad_norm": 0.9026438168465336, + "learning_rate": 1.9597751118235662e-05, + "loss": 0.2257, + "step": 2318 + }, + { + "epoch": 0.12, + "grad_norm": 1.0318289234723361, + "learning_rate": 1.9597288568857563e-05, + "loss": 0.219, + "step": 2319 + }, + { + "epoch": 0.12, + "grad_norm": 0.8967950538630484, + "learning_rate": 1.9596825759152466e-05, + "loss": 0.2617, + "step": 2320 + }, + { + "epoch": 0.12, + "grad_norm": 0.9702418393564891, + "learning_rate": 1.959636268913293e-05, + "loss": 0.2492, + "step": 2321 + }, + { + "epoch": 0.12, + "grad_norm": 1.306990740051603, + "learning_rate": 1.9595899358811515e-05, + "loss": 0.2268, + "step": 2322 + }, + { + "epoch": 0.12, + "grad_norm": 0.8623225432994753, + "learning_rate": 1.9595435768200785e-05, + "loss": 0.2351, + "step": 2323 + }, + { + "epoch": 0.12, + "grad_norm": 0.8927352070901556, + "learning_rate": 1.9594971917313323e-05, + "loss": 0.2814, + "step": 2324 + }, + { + "epoch": 0.12, + "grad_norm": 1.1475126857496665, + "learning_rate": 1.9594507806161703e-05, + "loss": 0.2089, + "step": 2325 + }, + { + "epoch": 0.12, + "grad_norm": 0.9149296536113013, + "learning_rate": 1.9594043434758515e-05, + "loss": 0.2321, + "step": 2326 + }, + { + "epoch": 0.12, + "grad_norm": 1.198714270590255, + "learning_rate": 1.959357880311636e-05, + "loss": 0.2626, + "step": 2327 + }, + { + "epoch": 0.12, + "grad_norm": 1.0938019109764625, + "learning_rate": 1.9593113911247836e-05, + "loss": 0.2646, + "step": 2328 + }, + { + "epoch": 0.12, + "grad_norm": 1.4410644951371105, + "learning_rate": 1.9592648759165555e-05, + "loss": 0.2555, + "step": 2329 + }, + { + "epoch": 0.12, + "grad_norm": 1.1039407282923293, + "learning_rate": 1.9592183346882135e-05, + "loss": 0.2521, + "step": 2330 + }, + { + "epoch": 0.12, + "grad_norm": 1.6247005333228324, + "learning_rate": 1.95917176744102e-05, + "loss": 0.2366, + "step": 2331 + }, + { + "epoch": 0.12, + "grad_norm": 1.4229636128215575, + "learning_rate": 1.9591251741762384e-05, + "loss": 0.2175, + "step": 2332 + }, + { + "epoch": 0.12, + "grad_norm": 0.9608883015159911, + "learning_rate": 1.959078554895132e-05, + "loss": 0.2267, + "step": 2333 + }, + { + "epoch": 0.12, + "grad_norm": 0.8501715708850301, + "learning_rate": 1.959031909598966e-05, + "loss": 0.2308, + "step": 2334 + }, + { + "epoch": 0.12, + "grad_norm": 0.8588300419178304, + "learning_rate": 1.958985238289005e-05, + "loss": 0.218, + "step": 2335 + }, + { + "epoch": 0.12, + "grad_norm": 1.1299837382980906, + "learning_rate": 1.9589385409665152e-05, + "loss": 0.2247, + "step": 2336 + }, + { + "epoch": 0.12, + "grad_norm": 0.8643912599959122, + "learning_rate": 1.9588918176327632e-05, + "loss": 0.2627, + "step": 2337 + }, + { + "epoch": 0.12, + "grad_norm": 1.0214914522161034, + "learning_rate": 1.9588450682890167e-05, + "loss": 0.2192, + "step": 2338 + }, + { + "epoch": 0.12, + "grad_norm": 0.8877732443442983, + "learning_rate": 1.9587982929365434e-05, + "loss": 0.2435, + "step": 2339 + }, + { + "epoch": 0.12, + "grad_norm": 1.2859468661713847, + "learning_rate": 1.9587514915766124e-05, + "loss": 0.2073, + "step": 2340 + }, + { + "epoch": 0.12, + "grad_norm": 1.0567286604363095, + "learning_rate": 1.958704664210493e-05, + "loss": 0.2664, + "step": 2341 + }, + { + "epoch": 0.12, + "grad_norm": 0.9992382427909487, + "learning_rate": 1.9586578108394555e-05, + "loss": 0.2252, + "step": 2342 + }, + { + "epoch": 0.12, + "grad_norm": 0.9196579061142152, + "learning_rate": 1.9586109314647705e-05, + "loss": 0.228, + "step": 2343 + }, + { + "epoch": 0.12, + "grad_norm": 1.166095192504243, + "learning_rate": 1.9585640260877102e-05, + "loss": 0.2525, + "step": 2344 + }, + { + "epoch": 0.12, + "grad_norm": 0.8412404902739583, + "learning_rate": 1.958517094709546e-05, + "loss": 0.2287, + "step": 2345 + }, + { + "epoch": 0.12, + "grad_norm": 0.919152691293656, + "learning_rate": 1.9584701373315523e-05, + "loss": 0.2087, + "step": 2346 + }, + { + "epoch": 0.12, + "grad_norm": 1.3970527229544, + "learning_rate": 1.9584231539550012e-05, + "loss": 0.2386, + "step": 2347 + }, + { + "epoch": 0.12, + "grad_norm": 0.8814552434609396, + "learning_rate": 1.9583761445811686e-05, + "loss": 0.204, + "step": 2348 + }, + { + "epoch": 0.12, + "grad_norm": 1.1366633521260399, + "learning_rate": 1.9583291092113283e-05, + "loss": 0.2125, + "step": 2349 + }, + { + "epoch": 0.12, + "grad_norm": 1.5839696990479637, + "learning_rate": 1.958282047846757e-05, + "loss": 0.2572, + "step": 2350 + }, + { + "epoch": 0.12, + "grad_norm": 0.989785096957875, + "learning_rate": 1.9582349604887313e-05, + "loss": 0.2419, + "step": 2351 + }, + { + "epoch": 0.12, + "grad_norm": 1.551925396859981, + "learning_rate": 1.958187847138528e-05, + "loss": 0.2099, + "step": 2352 + }, + { + "epoch": 0.12, + "grad_norm": 1.0599349357850392, + "learning_rate": 1.958140707797425e-05, + "loss": 0.2289, + "step": 2353 + }, + { + "epoch": 0.12, + "grad_norm": 0.9251893046227032, + "learning_rate": 1.9580935424667015e-05, + "loss": 0.2462, + "step": 2354 + }, + { + "epoch": 0.12, + "grad_norm": 1.0352904888157413, + "learning_rate": 1.9580463511476365e-05, + "loss": 0.2442, + "step": 2355 + }, + { + "epoch": 0.12, + "grad_norm": 0.8960283950787143, + "learning_rate": 1.95799913384151e-05, + "loss": 0.2167, + "step": 2356 + }, + { + "epoch": 0.12, + "grad_norm": 1.0395096607347472, + "learning_rate": 1.9579518905496032e-05, + "loss": 0.2407, + "step": 2357 + }, + { + "epoch": 0.12, + "grad_norm": 0.9843402187850503, + "learning_rate": 1.9579046212731968e-05, + "loss": 0.2396, + "step": 2358 + }, + { + "epoch": 0.12, + "grad_norm": 2.112271692780199, + "learning_rate": 1.957857326013574e-05, + "loss": 0.2292, + "step": 2359 + }, + { + "epoch": 0.12, + "grad_norm": 1.221454329725577, + "learning_rate": 1.9578100047720164e-05, + "loss": 0.2175, + "step": 2360 + }, + { + "epoch": 0.12, + "grad_norm": 0.9484425312273643, + "learning_rate": 1.957762657549809e-05, + "loss": 0.2285, + "step": 2361 + }, + { + "epoch": 0.12, + "grad_norm": 0.9496247993199408, + "learning_rate": 1.957715284348235e-05, + "loss": 0.2407, + "step": 2362 + }, + { + "epoch": 0.12, + "grad_norm": 1.0254007460980028, + "learning_rate": 1.95766788516858e-05, + "loss": 0.2231, + "step": 2363 + }, + { + "epoch": 0.12, + "grad_norm": 0.896010356490778, + "learning_rate": 1.9576204600121293e-05, + "loss": 0.2314, + "step": 2364 + }, + { + "epoch": 0.12, + "grad_norm": 0.9778267316564238, + "learning_rate": 1.9575730088801696e-05, + "loss": 0.2544, + "step": 2365 + }, + { + "epoch": 0.12, + "grad_norm": 1.0418675619291378, + "learning_rate": 1.957525531773988e-05, + "loss": 0.2347, + "step": 2366 + }, + { + "epoch": 0.12, + "grad_norm": 1.34539811818473, + "learning_rate": 1.9574780286948724e-05, + "loss": 0.2461, + "step": 2367 + }, + { + "epoch": 0.12, + "grad_norm": 1.2117159081752875, + "learning_rate": 1.957430499644111e-05, + "loss": 0.2365, + "step": 2368 + }, + { + "epoch": 0.12, + "grad_norm": 0.9916311588570211, + "learning_rate": 1.9573829446229935e-05, + "loss": 0.2423, + "step": 2369 + }, + { + "epoch": 0.12, + "grad_norm": 0.8642744609344306, + "learning_rate": 1.9573353636328094e-05, + "loss": 0.1956, + "step": 2370 + }, + { + "epoch": 0.12, + "grad_norm": 1.2779774017391479, + "learning_rate": 1.9572877566748495e-05, + "loss": 0.2349, + "step": 2371 + }, + { + "epoch": 0.12, + "grad_norm": 0.9530889379498755, + "learning_rate": 1.957240123750405e-05, + "loss": 0.2795, + "step": 2372 + }, + { + "epoch": 0.12, + "grad_norm": 2.63061203732391, + "learning_rate": 1.9571924648607684e-05, + "loss": 0.2606, + "step": 2373 + }, + { + "epoch": 0.12, + "grad_norm": 1.206120338499393, + "learning_rate": 1.9571447800072318e-05, + "loss": 0.2514, + "step": 2374 + }, + { + "epoch": 0.12, + "grad_norm": 0.8251362601351317, + "learning_rate": 1.957097069191089e-05, + "loss": 0.2104, + "step": 2375 + }, + { + "epoch": 0.12, + "grad_norm": 1.0358611536294893, + "learning_rate": 1.9570493324136344e-05, + "loss": 0.2304, + "step": 2376 + }, + { + "epoch": 0.12, + "grad_norm": 0.9166959393585152, + "learning_rate": 1.9570015696761623e-05, + "loss": 0.2572, + "step": 2377 + }, + { + "epoch": 0.12, + "grad_norm": 1.0968226275193573, + "learning_rate": 1.9569537809799687e-05, + "loss": 0.2106, + "step": 2378 + }, + { + "epoch": 0.12, + "grad_norm": 1.284618905986818, + "learning_rate": 1.9569059663263498e-05, + "loss": 0.218, + "step": 2379 + }, + { + "epoch": 0.12, + "grad_norm": 1.07367940810242, + "learning_rate": 1.9568581257166025e-05, + "loss": 0.2238, + "step": 2380 + }, + { + "epoch": 0.12, + "grad_norm": 0.9241633450880841, + "learning_rate": 1.9568102591520246e-05, + "loss": 0.2443, + "step": 2381 + }, + { + "epoch": 0.12, + "grad_norm": 1.8693666074144997, + "learning_rate": 1.956762366633914e-05, + "loss": 0.2408, + "step": 2382 + }, + { + "epoch": 0.12, + "grad_norm": 1.152812884267505, + "learning_rate": 1.956714448163571e-05, + "loss": 0.2606, + "step": 2383 + }, + { + "epoch": 0.12, + "grad_norm": 1.0120887204947118, + "learning_rate": 1.9566665037422937e-05, + "loss": 0.2273, + "step": 2384 + }, + { + "epoch": 0.12, + "grad_norm": 0.8850621950766456, + "learning_rate": 1.9566185333713835e-05, + "loss": 0.2189, + "step": 2385 + }, + { + "epoch": 0.12, + "grad_norm": 1.0453797190650638, + "learning_rate": 1.956570537052142e-05, + "loss": 0.2307, + "step": 2386 + }, + { + "epoch": 0.12, + "grad_norm": 0.966999828439038, + "learning_rate": 1.9565225147858704e-05, + "loss": 0.2481, + "step": 2387 + }, + { + "epoch": 0.12, + "grad_norm": 1.795290828022784, + "learning_rate": 1.9564744665738714e-05, + "loss": 0.2282, + "step": 2388 + }, + { + "epoch": 0.12, + "grad_norm": 0.9236235449239979, + "learning_rate": 1.9564263924174488e-05, + "loss": 0.2359, + "step": 2389 + }, + { + "epoch": 0.12, + "grad_norm": 0.987110655783361, + "learning_rate": 1.9563782923179063e-05, + "loss": 0.2457, + "step": 2390 + }, + { + "epoch": 0.12, + "grad_norm": 0.9355041913405805, + "learning_rate": 1.9563301662765482e-05, + "loss": 0.225, + "step": 2391 + }, + { + "epoch": 0.12, + "grad_norm": 0.8777824597160528, + "learning_rate": 1.9562820142946808e-05, + "loss": 0.2172, + "step": 2392 + }, + { + "epoch": 0.12, + "grad_norm": 0.9367037353854338, + "learning_rate": 1.9562338363736095e-05, + "loss": 0.2321, + "step": 2393 + }, + { + "epoch": 0.12, + "grad_norm": 1.6404557075320965, + "learning_rate": 1.9561856325146414e-05, + "loss": 0.2112, + "step": 2394 + }, + { + "epoch": 0.12, + "grad_norm": 1.0918037055500003, + "learning_rate": 1.956137402719084e-05, + "loss": 0.226, + "step": 2395 + }, + { + "epoch": 0.12, + "grad_norm": 1.0405002814531292, + "learning_rate": 1.9560891469882457e-05, + "loss": 0.2411, + "step": 2396 + }, + { + "epoch": 0.12, + "grad_norm": 1.0119293437402799, + "learning_rate": 1.9560408653234352e-05, + "loss": 0.2338, + "step": 2397 + }, + { + "epoch": 0.12, + "grad_norm": 1.0379855418069062, + "learning_rate": 1.9559925577259622e-05, + "loss": 0.2481, + "step": 2398 + }, + { + "epoch": 0.12, + "grad_norm": 1.1508134685563278, + "learning_rate": 1.9559442241971373e-05, + "loss": 0.2326, + "step": 2399 + }, + { + "epoch": 0.12, + "grad_norm": 0.8364922526982728, + "learning_rate": 1.955895864738271e-05, + "loss": 0.2106, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 1.0375308864937787, + "learning_rate": 1.955847479350675e-05, + "loss": 0.2316, + "step": 2401 + }, + { + "epoch": 0.12, + "grad_norm": 1.0535751080622453, + "learning_rate": 1.955799068035663e-05, + "loss": 0.2389, + "step": 2402 + }, + { + "epoch": 0.12, + "grad_norm": 1.928196115611736, + "learning_rate": 1.955750630794547e-05, + "loss": 0.231, + "step": 2403 + }, + { + "epoch": 0.12, + "grad_norm": 0.9919358376481566, + "learning_rate": 1.955702167628641e-05, + "loss": 0.2104, + "step": 2404 + }, + { + "epoch": 0.12, + "grad_norm": 1.0893302608613895, + "learning_rate": 1.9556536785392598e-05, + "loss": 0.2239, + "step": 2405 + }, + { + "epoch": 0.12, + "grad_norm": 1.5843172922704782, + "learning_rate": 1.9556051635277184e-05, + "loss": 0.2484, + "step": 2406 + }, + { + "epoch": 0.12, + "grad_norm": 1.188746173343023, + "learning_rate": 1.9555566225953333e-05, + "loss": 0.2239, + "step": 2407 + }, + { + "epoch": 0.12, + "grad_norm": 0.8501232211498138, + "learning_rate": 1.9555080557434206e-05, + "loss": 0.2239, + "step": 2408 + }, + { + "epoch": 0.12, + "grad_norm": 1.6514133365172499, + "learning_rate": 1.955459462973298e-05, + "loss": 0.2575, + "step": 2409 + }, + { + "epoch": 0.12, + "grad_norm": 0.8867170258575549, + "learning_rate": 1.9554108442862836e-05, + "loss": 0.2368, + "step": 2410 + }, + { + "epoch": 0.12, + "grad_norm": 1.1563166041868196, + "learning_rate": 1.955362199683696e-05, + "loss": 0.2319, + "step": 2411 + }, + { + "epoch": 0.12, + "grad_norm": 1.0302301616491012, + "learning_rate": 1.9553135291668548e-05, + "loss": 0.2467, + "step": 2412 + }, + { + "epoch": 0.12, + "grad_norm": 1.1775203268994718, + "learning_rate": 1.95526483273708e-05, + "loss": 0.248, + "step": 2413 + }, + { + "epoch": 0.12, + "grad_norm": 1.2472244643540031, + "learning_rate": 1.9552161103956932e-05, + "loss": 0.2336, + "step": 2414 + }, + { + "epoch": 0.12, + "grad_norm": 0.9942684568484321, + "learning_rate": 1.955167362144015e-05, + "loss": 0.2312, + "step": 2415 + }, + { + "epoch": 0.12, + "grad_norm": 1.2827573628017537, + "learning_rate": 1.955118587983368e-05, + "loss": 0.2337, + "step": 2416 + }, + { + "epoch": 0.12, + "grad_norm": 1.0779477429174507, + "learning_rate": 1.9550697879150757e-05, + "loss": 0.2528, + "step": 2417 + }, + { + "epoch": 0.12, + "grad_norm": 0.9287186494476223, + "learning_rate": 1.9550209619404616e-05, + "loss": 0.2146, + "step": 2418 + }, + { + "epoch": 0.12, + "grad_norm": 0.9780419354412737, + "learning_rate": 1.9549721100608494e-05, + "loss": 0.2442, + "step": 2419 + }, + { + "epoch": 0.12, + "grad_norm": 1.0299340666660326, + "learning_rate": 1.954923232277565e-05, + "loss": 0.2229, + "step": 2420 + }, + { + "epoch": 0.12, + "grad_norm": 0.9339979389685626, + "learning_rate": 1.954874328591934e-05, + "loss": 0.2441, + "step": 2421 + }, + { + "epoch": 0.12, + "grad_norm": 0.9663170606947568, + "learning_rate": 1.9548253990052833e-05, + "loss": 0.2303, + "step": 2422 + }, + { + "epoch": 0.12, + "grad_norm": 1.0915466600904569, + "learning_rate": 1.9547764435189395e-05, + "loss": 0.2477, + "step": 2423 + }, + { + "epoch": 0.12, + "grad_norm": 2.575099114186478, + "learning_rate": 1.9547274621342303e-05, + "loss": 0.2472, + "step": 2424 + }, + { + "epoch": 0.12, + "grad_norm": 0.9211825722405153, + "learning_rate": 1.9546784548524852e-05, + "loss": 0.2182, + "step": 2425 + }, + { + "epoch": 0.12, + "grad_norm": 1.6693874263644388, + "learning_rate": 1.954629421675033e-05, + "loss": 0.2339, + "step": 2426 + }, + { + "epoch": 0.12, + "grad_norm": 1.560856670186122, + "learning_rate": 1.954580362603204e-05, + "loss": 0.244, + "step": 2427 + }, + { + "epoch": 0.12, + "grad_norm": 1.3346896255835428, + "learning_rate": 1.954531277638328e-05, + "loss": 0.2254, + "step": 2428 + }, + { + "epoch": 0.12, + "grad_norm": 0.9549915898367675, + "learning_rate": 1.954482166781738e-05, + "loss": 0.2545, + "step": 2429 + }, + { + "epoch": 0.12, + "grad_norm": 1.1193417924601143, + "learning_rate": 1.9544330300347655e-05, + "loss": 0.2506, + "step": 2430 + }, + { + "epoch": 0.12, + "grad_norm": 1.1071622856945116, + "learning_rate": 1.9543838673987424e-05, + "loss": 0.2501, + "step": 2431 + }, + { + "epoch": 0.12, + "grad_norm": 1.0377912267602951, + "learning_rate": 1.9543346788750032e-05, + "loss": 0.2332, + "step": 2432 + }, + { + "epoch": 0.12, + "grad_norm": 0.880835359230946, + "learning_rate": 1.9542854644648824e-05, + "loss": 0.2402, + "step": 2433 + }, + { + "epoch": 0.12, + "grad_norm": 0.934264549762433, + "learning_rate": 1.954236224169714e-05, + "loss": 0.2234, + "step": 2434 + }, + { + "epoch": 0.12, + "grad_norm": 0.946717586900411, + "learning_rate": 1.9541869579908343e-05, + "loss": 0.2546, + "step": 2435 + }, + { + "epoch": 0.12, + "grad_norm": 2.0749355807516685, + "learning_rate": 1.9541376659295796e-05, + "loss": 0.2423, + "step": 2436 + }, + { + "epoch": 0.12, + "grad_norm": 1.0938551679100832, + "learning_rate": 1.9540883479872863e-05, + "loss": 0.2355, + "step": 2437 + }, + { + "epoch": 0.12, + "grad_norm": 0.9996299987531758, + "learning_rate": 1.954039004165293e-05, + "loss": 0.2184, + "step": 2438 + }, + { + "epoch": 0.12, + "grad_norm": 0.9625907051201483, + "learning_rate": 1.953989634464938e-05, + "loss": 0.2111, + "step": 2439 + }, + { + "epoch": 0.12, + "grad_norm": 0.9399174909270136, + "learning_rate": 1.9539402388875598e-05, + "loss": 0.2269, + "step": 2440 + }, + { + "epoch": 0.12, + "grad_norm": 1.119626479040853, + "learning_rate": 1.9538908174344994e-05, + "loss": 0.2429, + "step": 2441 + }, + { + "epoch": 0.12, + "grad_norm": 1.0974321999890257, + "learning_rate": 1.9538413701070964e-05, + "loss": 0.2383, + "step": 2442 + }, + { + "epoch": 0.12, + "grad_norm": 1.2114085536835701, + "learning_rate": 1.9537918969066923e-05, + "loss": 0.2101, + "step": 2443 + }, + { + "epoch": 0.12, + "grad_norm": 1.6206815067868514, + "learning_rate": 1.953742397834629e-05, + "loss": 0.2238, + "step": 2444 + }, + { + "epoch": 0.12, + "grad_norm": 1.1385008062544557, + "learning_rate": 1.9536928728922496e-05, + "loss": 0.1996, + "step": 2445 + }, + { + "epoch": 0.12, + "grad_norm": 0.9357832841400486, + "learning_rate": 1.953643322080897e-05, + "loss": 0.2372, + "step": 2446 + }, + { + "epoch": 0.12, + "grad_norm": 0.9088175364992762, + "learning_rate": 1.9535937454019155e-05, + "loss": 0.2301, + "step": 2447 + }, + { + "epoch": 0.12, + "grad_norm": 1.0977441998849506, + "learning_rate": 1.9535441428566496e-05, + "loss": 0.2296, + "step": 2448 + }, + { + "epoch": 0.12, + "grad_norm": 1.0568730184364237, + "learning_rate": 1.9534945144464452e-05, + "loss": 0.2048, + "step": 2449 + }, + { + "epoch": 0.12, + "grad_norm": 1.7694547827815719, + "learning_rate": 1.953444860172648e-05, + "loss": 0.2341, + "step": 2450 + }, + { + "epoch": 0.12, + "grad_norm": 0.9535818827060624, + "learning_rate": 1.9533951800366052e-05, + "loss": 0.2231, + "step": 2451 + }, + { + "epoch": 0.12, + "grad_norm": 2.0852235238905874, + "learning_rate": 1.9533454740396645e-05, + "loss": 0.2207, + "step": 2452 + }, + { + "epoch": 0.12, + "grad_norm": 1.2273715146657684, + "learning_rate": 1.953295742183174e-05, + "loss": 0.2329, + "step": 2453 + }, + { + "epoch": 0.12, + "grad_norm": 1.0468236296362416, + "learning_rate": 1.9532459844684824e-05, + "loss": 0.2469, + "step": 2454 + }, + { + "epoch": 0.12, + "grad_norm": 0.888539113561735, + "learning_rate": 1.9531962008969396e-05, + "loss": 0.213, + "step": 2455 + }, + { + "epoch": 0.12, + "grad_norm": 0.901360077521206, + "learning_rate": 1.953146391469896e-05, + "loss": 0.2335, + "step": 2456 + }, + { + "epoch": 0.12, + "grad_norm": 1.0327000960606898, + "learning_rate": 1.953096556188703e-05, + "loss": 0.2289, + "step": 2457 + }, + { + "epoch": 0.12, + "grad_norm": 0.9485176494343361, + "learning_rate": 1.9530466950547118e-05, + "loss": 0.2263, + "step": 2458 + }, + { + "epoch": 0.13, + "grad_norm": 1.1105931610889015, + "learning_rate": 1.9529968080692753e-05, + "loss": 0.2316, + "step": 2459 + }, + { + "epoch": 0.13, + "grad_norm": 0.9852744012552714, + "learning_rate": 1.9529468952337468e-05, + "loss": 0.2566, + "step": 2460 + }, + { + "epoch": 0.13, + "grad_norm": 1.037940345459345, + "learning_rate": 1.9528969565494792e-05, + "loss": 0.2414, + "step": 2461 + }, + { + "epoch": 0.13, + "grad_norm": 1.376993818426987, + "learning_rate": 1.9528469920178287e-05, + "loss": 0.2499, + "step": 2462 + }, + { + "epoch": 0.13, + "grad_norm": 0.8866653946404849, + "learning_rate": 1.9527970016401493e-05, + "loss": 0.2225, + "step": 2463 + }, + { + "epoch": 0.13, + "grad_norm": 1.0459832052160578, + "learning_rate": 1.9527469854177973e-05, + "loss": 0.2262, + "step": 2464 + }, + { + "epoch": 0.13, + "grad_norm": 0.9970960002026228, + "learning_rate": 1.9526969433521298e-05, + "loss": 0.2127, + "step": 2465 + }, + { + "epoch": 0.13, + "grad_norm": 1.031139726816588, + "learning_rate": 1.9526468754445035e-05, + "loss": 0.239, + "step": 2466 + }, + { + "epoch": 0.13, + "grad_norm": 0.9082242265666298, + "learning_rate": 1.9525967816962775e-05, + "loss": 0.2135, + "step": 2467 + }, + { + "epoch": 0.13, + "grad_norm": 1.0929122867480856, + "learning_rate": 1.9525466621088093e-05, + "loss": 0.2361, + "step": 2468 + }, + { + "epoch": 0.13, + "grad_norm": 0.843789342979408, + "learning_rate": 1.95249651668346e-05, + "loss": 0.2289, + "step": 2469 + }, + { + "epoch": 0.13, + "grad_norm": 1.3704990673651014, + "learning_rate": 1.952446345421588e-05, + "loss": 0.254, + "step": 2470 + }, + { + "epoch": 0.13, + "grad_norm": 0.861102949292528, + "learning_rate": 1.9523961483245552e-05, + "loss": 0.2082, + "step": 2471 + }, + { + "epoch": 0.13, + "grad_norm": 0.9862142239764253, + "learning_rate": 1.9523459253937233e-05, + "loss": 0.2239, + "step": 2472 + }, + { + "epoch": 0.13, + "grad_norm": 0.9025994750433144, + "learning_rate": 1.9522956766304543e-05, + "loss": 0.2438, + "step": 2473 + }, + { + "epoch": 0.13, + "grad_norm": 1.0347478039910887, + "learning_rate": 1.9522454020361116e-05, + "loss": 0.2403, + "step": 2474 + }, + { + "epoch": 0.13, + "grad_norm": 1.268028229249795, + "learning_rate": 1.9521951016120582e-05, + "loss": 0.2541, + "step": 2475 + }, + { + "epoch": 0.13, + "grad_norm": 0.9898697278724068, + "learning_rate": 1.952144775359659e-05, + "loss": 0.2636, + "step": 2476 + }, + { + "epoch": 0.13, + "grad_norm": 0.9029126837260496, + "learning_rate": 1.9520944232802793e-05, + "loss": 0.2285, + "step": 2477 + }, + { + "epoch": 0.13, + "grad_norm": 3.4451648613384407, + "learning_rate": 1.9520440453752842e-05, + "loss": 0.2512, + "step": 2478 + }, + { + "epoch": 0.13, + "grad_norm": 1.3173582760945324, + "learning_rate": 1.951993641646041e-05, + "loss": 0.2275, + "step": 2479 + }, + { + "epoch": 0.13, + "grad_norm": 1.1085719825276406, + "learning_rate": 1.951943212093916e-05, + "loss": 0.2278, + "step": 2480 + }, + { + "epoch": 0.13, + "grad_norm": 1.0384753106293034, + "learning_rate": 1.951892756720278e-05, + "loss": 0.2648, + "step": 2481 + }, + { + "epoch": 0.13, + "grad_norm": 1.100632928939612, + "learning_rate": 1.9518422755264947e-05, + "loss": 0.2049, + "step": 2482 + }, + { + "epoch": 0.13, + "grad_norm": 1.1089547410194291, + "learning_rate": 1.9517917685139365e-05, + "loss": 0.2381, + "step": 2483 + }, + { + "epoch": 0.13, + "grad_norm": 2.1286418385854793, + "learning_rate": 1.9517412356839727e-05, + "loss": 0.2673, + "step": 2484 + }, + { + "epoch": 0.13, + "grad_norm": 0.7957571895575681, + "learning_rate": 1.951690677037974e-05, + "loss": 0.2002, + "step": 2485 + }, + { + "epoch": 0.13, + "grad_norm": 1.2857267543737927, + "learning_rate": 1.9516400925773118e-05, + "loss": 0.2252, + "step": 2486 + }, + { + "epoch": 0.13, + "grad_norm": 0.9023184796103526, + "learning_rate": 1.9515894823033584e-05, + "loss": 0.2185, + "step": 2487 + }, + { + "epoch": 0.13, + "grad_norm": 1.1830975982529575, + "learning_rate": 1.9515388462174868e-05, + "loss": 0.261, + "step": 2488 + }, + { + "epoch": 0.13, + "grad_norm": 0.9656013811551463, + "learning_rate": 1.95148818432107e-05, + "loss": 0.2556, + "step": 2489 + }, + { + "epoch": 0.13, + "grad_norm": 0.9506680880228655, + "learning_rate": 1.9514374966154826e-05, + "loss": 0.2196, + "step": 2490 + }, + { + "epoch": 0.13, + "grad_norm": 1.0597012016818077, + "learning_rate": 1.951386783102099e-05, + "loss": 0.2247, + "step": 2491 + }, + { + "epoch": 0.13, + "grad_norm": 0.9004544786269002, + "learning_rate": 1.9513360437822957e-05, + "loss": 0.2284, + "step": 2492 + }, + { + "epoch": 0.13, + "grad_norm": 1.0277177890445819, + "learning_rate": 1.9512852786574483e-05, + "loss": 0.2368, + "step": 2493 + }, + { + "epoch": 0.13, + "grad_norm": 0.8764913099954449, + "learning_rate": 1.951234487728934e-05, + "loss": 0.2349, + "step": 2494 + }, + { + "epoch": 0.13, + "grad_norm": 1.0231867249114779, + "learning_rate": 1.9511836709981306e-05, + "loss": 0.2336, + "step": 2495 + }, + { + "epoch": 0.13, + "grad_norm": 1.1317906311257189, + "learning_rate": 1.951132828466416e-05, + "loss": 0.2067, + "step": 2496 + }, + { + "epoch": 0.13, + "grad_norm": 1.0525609819493371, + "learning_rate": 1.95108196013517e-05, + "loss": 0.2466, + "step": 2497 + }, + { + "epoch": 0.13, + "grad_norm": 1.3414958850247156, + "learning_rate": 1.951031066005772e-05, + "loss": 0.2371, + "step": 2498 + }, + { + "epoch": 0.13, + "grad_norm": 0.9807391428552752, + "learning_rate": 1.9509801460796027e-05, + "loss": 0.2522, + "step": 2499 + }, + { + "epoch": 0.13, + "grad_norm": 1.1634163941213589, + "learning_rate": 1.950929200358043e-05, + "loss": 0.2359, + "step": 2500 + }, + { + "epoch": 0.13, + "grad_norm": 0.858070387539989, + "learning_rate": 1.9508782288424754e-05, + "loss": 0.2278, + "step": 2501 + }, + { + "epoch": 0.13, + "grad_norm": 0.968181824391059, + "learning_rate": 1.950827231534282e-05, + "loss": 0.2317, + "step": 2502 + }, + { + "epoch": 0.13, + "grad_norm": 1.1859112241043284, + "learning_rate": 1.950776208434846e-05, + "loss": 0.2141, + "step": 2503 + }, + { + "epoch": 0.13, + "grad_norm": 1.1055241908293456, + "learning_rate": 1.9507251595455524e-05, + "loss": 0.233, + "step": 2504 + }, + { + "epoch": 0.13, + "grad_norm": 1.1720307433593404, + "learning_rate": 1.9506740848677845e-05, + "loss": 0.2295, + "step": 2505 + }, + { + "epoch": 0.13, + "grad_norm": 0.9640362423866915, + "learning_rate": 1.9506229844029283e-05, + "loss": 0.2212, + "step": 2506 + }, + { + "epoch": 0.13, + "grad_norm": 1.1471940629954807, + "learning_rate": 1.95057185815237e-05, + "loss": 0.2176, + "step": 2507 + }, + { + "epoch": 0.13, + "grad_norm": 1.2772531433878958, + "learning_rate": 1.9505207061174966e-05, + "loss": 0.245, + "step": 2508 + }, + { + "epoch": 0.13, + "grad_norm": 1.024101923763926, + "learning_rate": 1.9504695282996953e-05, + "loss": 0.2446, + "step": 2509 + }, + { + "epoch": 0.13, + "grad_norm": 1.1602529558305126, + "learning_rate": 1.9504183247003544e-05, + "loss": 0.2296, + "step": 2510 + }, + { + "epoch": 0.13, + "grad_norm": 1.1835773861670562, + "learning_rate": 1.9503670953208628e-05, + "loss": 0.2329, + "step": 2511 + }, + { + "epoch": 0.13, + "grad_norm": 0.9684880160608303, + "learning_rate": 1.9503158401626098e-05, + "loss": 0.2482, + "step": 2512 + }, + { + "epoch": 0.13, + "grad_norm": 1.3001737299396119, + "learning_rate": 1.950264559226986e-05, + "loss": 0.2391, + "step": 2513 + }, + { + "epoch": 0.13, + "grad_norm": 1.056275760297049, + "learning_rate": 1.9502132525153826e-05, + "loss": 0.2292, + "step": 2514 + }, + { + "epoch": 0.13, + "grad_norm": 0.8285779120395363, + "learning_rate": 1.950161920029191e-05, + "loss": 0.2096, + "step": 2515 + }, + { + "epoch": 0.13, + "grad_norm": 1.2265740847100444, + "learning_rate": 1.9501105617698034e-05, + "loss": 0.2525, + "step": 2516 + }, + { + "epoch": 0.13, + "grad_norm": 1.168129362633706, + "learning_rate": 1.9500591777386134e-05, + "loss": 0.214, + "step": 2517 + }, + { + "epoch": 0.13, + "grad_norm": 1.3091587397759643, + "learning_rate": 1.9500077679370145e-05, + "loss": 0.265, + "step": 2518 + }, + { + "epoch": 0.13, + "grad_norm": 1.1204506664591425, + "learning_rate": 1.949956332366401e-05, + "loss": 0.2612, + "step": 2519 + }, + { + "epoch": 0.13, + "grad_norm": 1.064079571330691, + "learning_rate": 1.9499048710281686e-05, + "loss": 0.2295, + "step": 2520 + }, + { + "epoch": 0.13, + "grad_norm": 1.1114983928013271, + "learning_rate": 1.949853383923713e-05, + "loss": 0.2359, + "step": 2521 + }, + { + "epoch": 0.13, + "grad_norm": 1.1622974627712408, + "learning_rate": 1.9498018710544306e-05, + "loss": 0.2127, + "step": 2522 + }, + { + "epoch": 0.13, + "grad_norm": 1.2721738857153555, + "learning_rate": 1.9497503324217188e-05, + "loss": 0.2444, + "step": 2523 + }, + { + "epoch": 0.13, + "grad_norm": 1.2230204085056002, + "learning_rate": 1.9496987680269755e-05, + "loss": 0.2441, + "step": 2524 + }, + { + "epoch": 0.13, + "grad_norm": 1.1287902286094293, + "learning_rate": 1.9496471778715996e-05, + "loss": 0.2071, + "step": 2525 + }, + { + "epoch": 0.13, + "grad_norm": 1.0371896187150622, + "learning_rate": 1.94959556195699e-05, + "loss": 0.2307, + "step": 2526 + }, + { + "epoch": 0.13, + "grad_norm": 1.019653166099954, + "learning_rate": 1.9495439202845478e-05, + "loss": 0.2605, + "step": 2527 + }, + { + "epoch": 0.13, + "grad_norm": 1.2402110275780256, + "learning_rate": 1.9494922528556727e-05, + "loss": 0.2256, + "step": 2528 + }, + { + "epoch": 0.13, + "grad_norm": 0.9884338951179454, + "learning_rate": 1.9494405596717664e-05, + "loss": 0.2233, + "step": 2529 + }, + { + "epoch": 0.13, + "grad_norm": 1.0533813644665326, + "learning_rate": 1.949388840734232e-05, + "loss": 0.2037, + "step": 2530 + }, + { + "epoch": 0.13, + "grad_norm": 0.967803989425464, + "learning_rate": 1.949337096044471e-05, + "loss": 0.2355, + "step": 2531 + }, + { + "epoch": 0.13, + "grad_norm": 1.1704508502228437, + "learning_rate": 1.949285325603888e-05, + "loss": 0.2442, + "step": 2532 + }, + { + "epoch": 0.13, + "grad_norm": 0.8687769631305765, + "learning_rate": 1.9492335294138868e-05, + "loss": 0.2589, + "step": 2533 + }, + { + "epoch": 0.13, + "grad_norm": 1.0410902336408165, + "learning_rate": 1.9491817074758727e-05, + "loss": 0.2424, + "step": 2534 + }, + { + "epoch": 0.13, + "grad_norm": 1.0019116803639123, + "learning_rate": 1.949129859791251e-05, + "loss": 0.2116, + "step": 2535 + }, + { + "epoch": 0.13, + "grad_norm": 1.0089129827853796, + "learning_rate": 1.9490779863614284e-05, + "loss": 0.2288, + "step": 2536 + }, + { + "epoch": 0.13, + "grad_norm": 0.9476886305903411, + "learning_rate": 1.9490260871878114e-05, + "loss": 0.2283, + "step": 2537 + }, + { + "epoch": 0.13, + "grad_norm": 2.7249136953545823, + "learning_rate": 1.9489741622718087e-05, + "loss": 0.2329, + "step": 2538 + }, + { + "epoch": 0.13, + "grad_norm": 1.6606994488067524, + "learning_rate": 1.9489222116148278e-05, + "loss": 0.245, + "step": 2539 + }, + { + "epoch": 0.13, + "grad_norm": 1.1349112515004496, + "learning_rate": 1.948870235218279e-05, + "loss": 0.217, + "step": 2540 + }, + { + "epoch": 0.13, + "grad_norm": 1.0864289977825945, + "learning_rate": 1.9488182330835706e-05, + "loss": 0.249, + "step": 2541 + }, + { + "epoch": 0.13, + "grad_norm": 1.0162834015559579, + "learning_rate": 1.9487662052121145e-05, + "loss": 0.2613, + "step": 2542 + }, + { + "epoch": 0.13, + "grad_norm": 1.3802411208694967, + "learning_rate": 1.9487141516053214e-05, + "loss": 0.2406, + "step": 2543 + }, + { + "epoch": 0.13, + "grad_norm": 1.3959880188571363, + "learning_rate": 1.9486620722646036e-05, + "loss": 0.2466, + "step": 2544 + }, + { + "epoch": 0.13, + "grad_norm": 0.9696196695677537, + "learning_rate": 1.948609967191373e-05, + "loss": 0.2519, + "step": 2545 + }, + { + "epoch": 0.13, + "grad_norm": 1.1169183926233577, + "learning_rate": 1.9485578363870438e-05, + "loss": 0.2221, + "step": 2546 + }, + { + "epoch": 0.13, + "grad_norm": 0.9685353962725984, + "learning_rate": 1.9485056798530296e-05, + "loss": 0.2147, + "step": 2547 + }, + { + "epoch": 0.13, + "grad_norm": 1.3676253959318463, + "learning_rate": 1.9484534975907454e-05, + "loss": 0.2331, + "step": 2548 + }, + { + "epoch": 0.13, + "grad_norm": 1.072106777885036, + "learning_rate": 1.9484012896016064e-05, + "loss": 0.2405, + "step": 2549 + }, + { + "epoch": 0.13, + "grad_norm": 1.9564555833352242, + "learning_rate": 1.948349055887029e-05, + "loss": 0.2367, + "step": 2550 + }, + { + "epoch": 0.13, + "grad_norm": 1.1139384170444573, + "learning_rate": 1.9482967964484297e-05, + "loss": 0.2386, + "step": 2551 + }, + { + "epoch": 0.13, + "grad_norm": 0.8949825457977273, + "learning_rate": 1.9482445112872265e-05, + "loss": 0.2142, + "step": 2552 + }, + { + "epoch": 0.13, + "grad_norm": 0.9924360109156986, + "learning_rate": 1.948192200404837e-05, + "loss": 0.2308, + "step": 2553 + }, + { + "epoch": 0.13, + "grad_norm": 1.213906686434998, + "learning_rate": 1.948139863802681e-05, + "loss": 0.2162, + "step": 2554 + }, + { + "epoch": 0.13, + "grad_norm": 1.245261071851514, + "learning_rate": 1.9480875014821776e-05, + "loss": 0.2611, + "step": 2555 + }, + { + "epoch": 0.13, + "grad_norm": 0.9617311900940123, + "learning_rate": 1.9480351134447466e-05, + "loss": 0.2092, + "step": 2556 + }, + { + "epoch": 0.13, + "grad_norm": 0.8336642002205943, + "learning_rate": 1.94798269969181e-05, + "loss": 0.2081, + "step": 2557 + }, + { + "epoch": 0.13, + "grad_norm": 1.2352999776362503, + "learning_rate": 1.947930260224789e-05, + "loss": 0.2334, + "step": 2558 + }, + { + "epoch": 0.13, + "grad_norm": 1.2133376413858712, + "learning_rate": 1.9478777950451063e-05, + "loss": 0.2486, + "step": 2559 + }, + { + "epoch": 0.13, + "grad_norm": 0.8660840115714117, + "learning_rate": 1.9478253041541848e-05, + "loss": 0.2115, + "step": 2560 + }, + { + "epoch": 0.13, + "grad_norm": 1.0302407585810627, + "learning_rate": 1.9477727875534483e-05, + "loss": 0.238, + "step": 2561 + }, + { + "epoch": 0.13, + "grad_norm": 0.998569291011515, + "learning_rate": 1.9477202452443217e-05, + "loss": 0.226, + "step": 2562 + }, + { + "epoch": 0.13, + "grad_norm": 1.2341549907489868, + "learning_rate": 1.9476676772282297e-05, + "loss": 0.2366, + "step": 2563 + }, + { + "epoch": 0.13, + "grad_norm": 0.9984474900319265, + "learning_rate": 1.9476150835065983e-05, + "loss": 0.2373, + "step": 2564 + }, + { + "epoch": 0.13, + "grad_norm": 0.8639329130273585, + "learning_rate": 1.9475624640808542e-05, + "loss": 0.2377, + "step": 2565 + }, + { + "epoch": 0.13, + "grad_norm": 0.8980352825774708, + "learning_rate": 1.9475098189524253e-05, + "loss": 0.2344, + "step": 2566 + }, + { + "epoch": 0.13, + "grad_norm": 1.0722782379588431, + "learning_rate": 1.9474571481227385e-05, + "loss": 0.2404, + "step": 2567 + }, + { + "epoch": 0.13, + "grad_norm": 1.0137257435535263, + "learning_rate": 1.947404451593223e-05, + "loss": 0.2399, + "step": 2568 + }, + { + "epoch": 0.13, + "grad_norm": 3.063657778639185, + "learning_rate": 1.9473517293653084e-05, + "loss": 0.2402, + "step": 2569 + }, + { + "epoch": 0.13, + "grad_norm": 0.9111470386340367, + "learning_rate": 1.947298981440425e-05, + "loss": 0.2421, + "step": 2570 + }, + { + "epoch": 0.13, + "grad_norm": 1.0021174540309534, + "learning_rate": 1.947246207820003e-05, + "loss": 0.2228, + "step": 2571 + }, + { + "epoch": 0.13, + "grad_norm": 1.1306991183837498, + "learning_rate": 1.947193408505474e-05, + "loss": 0.2297, + "step": 2572 + }, + { + "epoch": 0.13, + "grad_norm": 0.9252103940645041, + "learning_rate": 1.9471405834982702e-05, + "loss": 0.2478, + "step": 2573 + }, + { + "epoch": 0.13, + "grad_norm": 1.3628088679806831, + "learning_rate": 1.947087732799825e-05, + "loss": 0.243, + "step": 2574 + }, + { + "epoch": 0.13, + "grad_norm": 0.9693549537628366, + "learning_rate": 1.947034856411571e-05, + "loss": 0.2315, + "step": 2575 + }, + { + "epoch": 0.13, + "grad_norm": 0.9421192104463941, + "learning_rate": 1.9469819543349433e-05, + "loss": 0.2135, + "step": 2576 + }, + { + "epoch": 0.13, + "grad_norm": 0.8127772980411968, + "learning_rate": 1.9469290265713767e-05, + "loss": 0.2272, + "step": 2577 + }, + { + "epoch": 0.13, + "grad_norm": 1.8785917986766272, + "learning_rate": 1.9468760731223065e-05, + "loss": 0.2169, + "step": 2578 + }, + { + "epoch": 0.13, + "grad_norm": 0.8235476076278131, + "learning_rate": 1.9468230939891695e-05, + "loss": 0.2154, + "step": 2579 + }, + { + "epoch": 0.13, + "grad_norm": 1.1338475304907736, + "learning_rate": 1.9467700891734027e-05, + "loss": 0.2246, + "step": 2580 + }, + { + "epoch": 0.13, + "grad_norm": 1.088495536191225, + "learning_rate": 1.9467170586764436e-05, + "loss": 0.238, + "step": 2581 + }, + { + "epoch": 0.13, + "grad_norm": 2.508331904771354, + "learning_rate": 1.946664002499731e-05, + "loss": 0.2506, + "step": 2582 + }, + { + "epoch": 0.13, + "grad_norm": 1.3740198237235974, + "learning_rate": 1.9466109206447036e-05, + "loss": 0.2363, + "step": 2583 + }, + { + "epoch": 0.13, + "grad_norm": 2.0443872256988116, + "learning_rate": 1.9465578131128017e-05, + "loss": 0.2265, + "step": 2584 + }, + { + "epoch": 0.13, + "grad_norm": 1.9073842275417658, + "learning_rate": 1.9465046799054657e-05, + "loss": 0.2197, + "step": 2585 + }, + { + "epoch": 0.13, + "grad_norm": 1.6196237127154511, + "learning_rate": 1.9464515210241368e-05, + "loss": 0.2237, + "step": 2586 + }, + { + "epoch": 0.13, + "grad_norm": 1.1295037581852974, + "learning_rate": 1.9463983364702567e-05, + "loss": 0.2357, + "step": 2587 + }, + { + "epoch": 0.13, + "grad_norm": 0.8859703033393711, + "learning_rate": 1.9463451262452685e-05, + "loss": 0.2127, + "step": 2588 + }, + { + "epoch": 0.13, + "grad_norm": 1.0225681338883512, + "learning_rate": 1.946291890350615e-05, + "loss": 0.2223, + "step": 2589 + }, + { + "epoch": 0.13, + "grad_norm": 0.9560884298829112, + "learning_rate": 1.946238628787741e-05, + "loss": 0.1999, + "step": 2590 + }, + { + "epoch": 0.13, + "grad_norm": 1.3680393661657546, + "learning_rate": 1.9461853415580902e-05, + "loss": 0.2471, + "step": 2591 + }, + { + "epoch": 0.13, + "grad_norm": 0.9219507253318284, + "learning_rate": 1.9461320286631088e-05, + "loss": 0.2394, + "step": 2592 + }, + { + "epoch": 0.13, + "grad_norm": 0.9532444945302694, + "learning_rate": 1.946078690104243e-05, + "loss": 0.231, + "step": 2593 + }, + { + "epoch": 0.13, + "grad_norm": 0.8964117763557078, + "learning_rate": 1.946025325882939e-05, + "loss": 0.2198, + "step": 2594 + }, + { + "epoch": 0.13, + "grad_norm": 0.9022765936891728, + "learning_rate": 1.945971936000645e-05, + "loss": 0.2021, + "step": 2595 + }, + { + "epoch": 0.13, + "grad_norm": 1.0377148811164694, + "learning_rate": 1.945918520458808e-05, + "loss": 0.2258, + "step": 2596 + }, + { + "epoch": 0.13, + "grad_norm": 0.8926644487193754, + "learning_rate": 1.9458650792588784e-05, + "loss": 0.2272, + "step": 2597 + }, + { + "epoch": 0.13, + "grad_norm": 1.2339397203295959, + "learning_rate": 1.945811612402305e-05, + "loss": 0.2071, + "step": 2598 + }, + { + "epoch": 0.13, + "grad_norm": 1.1447126510888062, + "learning_rate": 1.945758119890538e-05, + "loss": 0.2384, + "step": 2599 + }, + { + "epoch": 0.13, + "grad_norm": 0.94796043499078, + "learning_rate": 1.9457046017250283e-05, + "loss": 0.2329, + "step": 2600 + }, + { + "epoch": 0.13, + "grad_norm": 1.7726117349573447, + "learning_rate": 1.9456510579072282e-05, + "loss": 0.2293, + "step": 2601 + }, + { + "epoch": 0.13, + "grad_norm": 1.1989169212832826, + "learning_rate": 1.94559748843859e-05, + "loss": 0.2463, + "step": 2602 + }, + { + "epoch": 0.13, + "grad_norm": 1.074481968333032, + "learning_rate": 1.9455438933205662e-05, + "loss": 0.2264, + "step": 2603 + }, + { + "epoch": 0.13, + "grad_norm": 0.9435469677018979, + "learning_rate": 1.945490272554611e-05, + "loss": 0.2463, + "step": 2604 + }, + { + "epoch": 0.13, + "grad_norm": 2.1295581894773257, + "learning_rate": 1.9454366261421786e-05, + "loss": 0.2323, + "step": 2605 + }, + { + "epoch": 0.13, + "grad_norm": 0.8943028767051312, + "learning_rate": 1.9453829540847243e-05, + "loss": 0.2174, + "step": 2606 + }, + { + "epoch": 0.13, + "grad_norm": 1.2457900588078497, + "learning_rate": 1.9453292563837043e-05, + "loss": 0.2574, + "step": 2607 + }, + { + "epoch": 0.13, + "grad_norm": 0.8224583557257457, + "learning_rate": 1.9452755330405745e-05, + "loss": 0.2154, + "step": 2608 + }, + { + "epoch": 0.13, + "grad_norm": 1.0649357865229008, + "learning_rate": 1.9452217840567927e-05, + "loss": 0.2397, + "step": 2609 + }, + { + "epoch": 0.13, + "grad_norm": 0.925172871095016, + "learning_rate": 1.9451680094338163e-05, + "loss": 0.2485, + "step": 2610 + }, + { + "epoch": 0.13, + "grad_norm": 1.3335832249417243, + "learning_rate": 1.9451142091731045e-05, + "loss": 0.2324, + "step": 2611 + }, + { + "epoch": 0.13, + "grad_norm": 0.9363316898690303, + "learning_rate": 1.9450603832761165e-05, + "loss": 0.2601, + "step": 2612 + }, + { + "epoch": 0.13, + "grad_norm": 1.0677907449571924, + "learning_rate": 1.945006531744312e-05, + "loss": 0.2292, + "step": 2613 + }, + { + "epoch": 0.13, + "grad_norm": 1.3032820403922316, + "learning_rate": 1.9449526545791523e-05, + "loss": 0.2523, + "step": 2614 + }, + { + "epoch": 0.13, + "grad_norm": 0.9162782130809316, + "learning_rate": 1.9448987517820982e-05, + "loss": 0.2226, + "step": 2615 + }, + { + "epoch": 0.13, + "grad_norm": 1.025744133538097, + "learning_rate": 1.944844823354612e-05, + "loss": 0.2215, + "step": 2616 + }, + { + "epoch": 0.13, + "grad_norm": 0.9187446041207256, + "learning_rate": 1.944790869298157e-05, + "loss": 0.2469, + "step": 2617 + }, + { + "epoch": 0.13, + "grad_norm": 0.869231477206988, + "learning_rate": 1.9447368896141958e-05, + "loss": 0.2336, + "step": 2618 + }, + { + "epoch": 0.13, + "grad_norm": 1.1103999027549547, + "learning_rate": 1.9446828843041933e-05, + "loss": 0.2437, + "step": 2619 + }, + { + "epoch": 0.13, + "grad_norm": 1.0309658164029158, + "learning_rate": 1.9446288533696145e-05, + "loss": 0.2207, + "step": 2620 + }, + { + "epoch": 0.13, + "grad_norm": 1.0021576454715904, + "learning_rate": 1.9445747968119246e-05, + "loss": 0.2232, + "step": 2621 + }, + { + "epoch": 0.13, + "grad_norm": 1.0926465258868414, + "learning_rate": 1.9445207146325894e-05, + "loss": 0.2556, + "step": 2622 + }, + { + "epoch": 0.13, + "grad_norm": 1.67810028770524, + "learning_rate": 1.9444666068330772e-05, + "loss": 0.208, + "step": 2623 + }, + { + "epoch": 0.13, + "grad_norm": 0.8767727670984775, + "learning_rate": 1.9444124734148543e-05, + "loss": 0.226, + "step": 2624 + }, + { + "epoch": 0.13, + "grad_norm": 1.2002140113970021, + "learning_rate": 1.9443583143793904e-05, + "loss": 0.2337, + "step": 2625 + }, + { + "epoch": 0.13, + "grad_norm": 1.0739431790098095, + "learning_rate": 1.9443041297281536e-05, + "loss": 0.2426, + "step": 2626 + }, + { + "epoch": 0.13, + "grad_norm": 1.2736818974157589, + "learning_rate": 1.9442499194626138e-05, + "loss": 0.2387, + "step": 2627 + }, + { + "epoch": 0.13, + "grad_norm": 1.1784427170299878, + "learning_rate": 1.9441956835842416e-05, + "loss": 0.2219, + "step": 2628 + }, + { + "epoch": 0.13, + "grad_norm": 1.1776402200746279, + "learning_rate": 1.9441414220945083e-05, + "loss": 0.2268, + "step": 2629 + }, + { + "epoch": 0.13, + "grad_norm": 1.0166012698350608, + "learning_rate": 1.9440871349948856e-05, + "loss": 0.2448, + "step": 2630 + }, + { + "epoch": 0.13, + "grad_norm": 1.1629080894535453, + "learning_rate": 1.9440328222868457e-05, + "loss": 0.2375, + "step": 2631 + }, + { + "epoch": 0.13, + "grad_norm": 2.5773238347578813, + "learning_rate": 1.9439784839718627e-05, + "loss": 0.2214, + "step": 2632 + }, + { + "epoch": 0.13, + "grad_norm": 0.8701043666547783, + "learning_rate": 1.94392412005141e-05, + "loss": 0.2403, + "step": 2633 + }, + { + "epoch": 0.13, + "grad_norm": 0.9530883571583211, + "learning_rate": 1.943869730526962e-05, + "loss": 0.2313, + "step": 2634 + }, + { + "epoch": 0.13, + "grad_norm": 1.2905919957477667, + "learning_rate": 1.9438153153999942e-05, + "loss": 0.2338, + "step": 2635 + }, + { + "epoch": 0.13, + "grad_norm": 1.3297130904337844, + "learning_rate": 1.9437608746719828e-05, + "loss": 0.2568, + "step": 2636 + }, + { + "epoch": 0.13, + "grad_norm": 1.1409616790457267, + "learning_rate": 1.943706408344404e-05, + "loss": 0.2696, + "step": 2637 + }, + { + "epoch": 0.13, + "grad_norm": 1.165365917351629, + "learning_rate": 1.9436519164187363e-05, + "loss": 0.2398, + "step": 2638 + }, + { + "epoch": 0.13, + "grad_norm": 1.6898833986979496, + "learning_rate": 1.9435973988964564e-05, + "loss": 0.2401, + "step": 2639 + }, + { + "epoch": 0.13, + "grad_norm": 1.6757623447359549, + "learning_rate": 1.943542855779044e-05, + "loss": 0.2353, + "step": 2640 + }, + { + "epoch": 0.13, + "grad_norm": 1.0909672887562558, + "learning_rate": 1.9434882870679783e-05, + "loss": 0.2225, + "step": 2641 + }, + { + "epoch": 0.13, + "grad_norm": 1.286445863100236, + "learning_rate": 1.9434336927647397e-05, + "loss": 0.216, + "step": 2642 + }, + { + "epoch": 0.13, + "grad_norm": 1.1677363817710942, + "learning_rate": 1.9433790728708085e-05, + "loss": 0.2267, + "step": 2643 + }, + { + "epoch": 0.13, + "grad_norm": 1.0503125887814437, + "learning_rate": 1.943324427387667e-05, + "loss": 0.242, + "step": 2644 + }, + { + "epoch": 0.13, + "grad_norm": 2.007415443963795, + "learning_rate": 1.9432697563167974e-05, + "loss": 0.2323, + "step": 2645 + }, + { + "epoch": 0.13, + "grad_norm": 1.3820066306983543, + "learning_rate": 1.9432150596596818e-05, + "loss": 0.2487, + "step": 2646 + }, + { + "epoch": 0.13, + "grad_norm": 1.0342232451505902, + "learning_rate": 1.9431603374178048e-05, + "loss": 0.2394, + "step": 2647 + }, + { + "epoch": 0.13, + "grad_norm": 1.5428237143556407, + "learning_rate": 1.94310558959265e-05, + "loss": 0.2356, + "step": 2648 + }, + { + "epoch": 0.13, + "grad_norm": 1.67906044044975, + "learning_rate": 1.943050816185703e-05, + "loss": 0.2331, + "step": 2649 + }, + { + "epoch": 0.13, + "grad_norm": 1.0458855765510422, + "learning_rate": 1.9429960171984496e-05, + "loss": 0.2103, + "step": 2650 + }, + { + "epoch": 0.13, + "grad_norm": 0.9721893985186519, + "learning_rate": 1.9429411926323756e-05, + "loss": 0.2038, + "step": 2651 + }, + { + "epoch": 0.13, + "grad_norm": 1.0903666819111881, + "learning_rate": 1.942886342488969e-05, + "loss": 0.2179, + "step": 2652 + }, + { + "epoch": 0.13, + "grad_norm": 1.5080139281275566, + "learning_rate": 1.9428314667697166e-05, + "loss": 0.2317, + "step": 2653 + }, + { + "epoch": 0.13, + "grad_norm": 1.6690664079020918, + "learning_rate": 1.9427765654761078e-05, + "loss": 0.2292, + "step": 2654 + }, + { + "epoch": 0.14, + "grad_norm": 1.3396452269621446, + "learning_rate": 1.9427216386096313e-05, + "loss": 0.2412, + "step": 2655 + }, + { + "epoch": 0.14, + "grad_norm": 1.5807328497654407, + "learning_rate": 1.942666686171777e-05, + "loss": 0.2357, + "step": 2656 + }, + { + "epoch": 0.14, + "grad_norm": 1.190131146360561, + "learning_rate": 1.9426117081640356e-05, + "loss": 0.2241, + "step": 2657 + }, + { + "epoch": 0.14, + "grad_norm": 0.9877783524716975, + "learning_rate": 1.9425567045878983e-05, + "loss": 0.2049, + "step": 2658 + }, + { + "epoch": 0.14, + "grad_norm": 1.6005142246281456, + "learning_rate": 1.942501675444857e-05, + "loss": 0.203, + "step": 2659 + }, + { + "epoch": 0.14, + "grad_norm": 1.7675592499022506, + "learning_rate": 1.942446620736405e-05, + "loss": 0.2501, + "step": 2660 + }, + { + "epoch": 0.14, + "grad_norm": 1.3275111289487056, + "learning_rate": 1.942391540464035e-05, + "loss": 0.2689, + "step": 2661 + }, + { + "epoch": 0.14, + "grad_norm": 1.0635503262207242, + "learning_rate": 1.942336434629241e-05, + "loss": 0.2241, + "step": 2662 + }, + { + "epoch": 0.14, + "grad_norm": 1.5034106101267861, + "learning_rate": 1.9422813032335183e-05, + "loss": 0.2364, + "step": 2663 + }, + { + "epoch": 0.14, + "grad_norm": 1.1863865222873722, + "learning_rate": 1.942226146278362e-05, + "loss": 0.2401, + "step": 2664 + }, + { + "epoch": 0.14, + "grad_norm": 2.2804977196215885, + "learning_rate": 1.9421709637652683e-05, + "loss": 0.2521, + "step": 2665 + }, + { + "epoch": 0.14, + "grad_norm": 1.3243453835764538, + "learning_rate": 1.9421157556957335e-05, + "loss": 0.2466, + "step": 2666 + }, + { + "epoch": 0.14, + "grad_norm": 1.114670908193584, + "learning_rate": 1.9420605220712563e-05, + "loss": 0.2216, + "step": 2667 + }, + { + "epoch": 0.14, + "grad_norm": 0.9544414132223076, + "learning_rate": 1.942005262893334e-05, + "loss": 0.2341, + "step": 2668 + }, + { + "epoch": 0.14, + "grad_norm": 1.0708571897346173, + "learning_rate": 1.9419499781634655e-05, + "loss": 0.2332, + "step": 2669 + }, + { + "epoch": 0.14, + "grad_norm": 0.8900595168867483, + "learning_rate": 1.9418946678831507e-05, + "loss": 0.2251, + "step": 2670 + }, + { + "epoch": 0.14, + "grad_norm": 1.129854489633797, + "learning_rate": 1.9418393320538898e-05, + "loss": 0.2257, + "step": 2671 + }, + { + "epoch": 0.14, + "grad_norm": 0.7338804200785907, + "learning_rate": 1.9417839706771842e-05, + "loss": 0.2239, + "step": 2672 + }, + { + "epoch": 0.14, + "grad_norm": 0.9958041310863106, + "learning_rate": 1.941728583754535e-05, + "loss": 0.2195, + "step": 2673 + }, + { + "epoch": 0.14, + "grad_norm": 1.3733645464339348, + "learning_rate": 1.9416731712874446e-05, + "loss": 0.2589, + "step": 2674 + }, + { + "epoch": 0.14, + "grad_norm": 0.9909583269793304, + "learning_rate": 1.9416177332774162e-05, + "loss": 0.2301, + "step": 2675 + }, + { + "epoch": 0.14, + "grad_norm": 1.0248646828945818, + "learning_rate": 1.941562269725954e-05, + "loss": 0.2144, + "step": 2676 + }, + { + "epoch": 0.14, + "grad_norm": 1.008259417023293, + "learning_rate": 1.9415067806345618e-05, + "loss": 0.2314, + "step": 2677 + }, + { + "epoch": 0.14, + "grad_norm": 1.0543469075480103, + "learning_rate": 1.9414512660047447e-05, + "loss": 0.2312, + "step": 2678 + }, + { + "epoch": 0.14, + "grad_norm": 1.1781005557537256, + "learning_rate": 1.9413957258380096e-05, + "loss": 0.2802, + "step": 2679 + }, + { + "epoch": 0.14, + "grad_norm": 1.4001122745244066, + "learning_rate": 1.9413401601358616e-05, + "loss": 0.2215, + "step": 2680 + }, + { + "epoch": 0.14, + "grad_norm": 1.3713329500791462, + "learning_rate": 1.9412845688998088e-05, + "loss": 0.2685, + "step": 2681 + }, + { + "epoch": 0.14, + "grad_norm": 0.886757575995524, + "learning_rate": 1.941228952131359e-05, + "loss": 0.223, + "step": 2682 + }, + { + "epoch": 0.14, + "grad_norm": 1.22683645359689, + "learning_rate": 1.9411733098320206e-05, + "loss": 0.2672, + "step": 2683 + }, + { + "epoch": 0.14, + "grad_norm": 1.5315058602311509, + "learning_rate": 1.941117642003303e-05, + "loss": 0.2429, + "step": 2684 + }, + { + "epoch": 0.14, + "grad_norm": 1.1110857691169762, + "learning_rate": 1.9410619486467165e-05, + "loss": 0.2305, + "step": 2685 + }, + { + "epoch": 0.14, + "grad_norm": 1.258428468415178, + "learning_rate": 1.941006229763771e-05, + "loss": 0.2534, + "step": 2686 + }, + { + "epoch": 0.14, + "grad_norm": 1.0637285182214002, + "learning_rate": 1.9409504853559785e-05, + "loss": 0.2362, + "step": 2687 + }, + { + "epoch": 0.14, + "grad_norm": 0.9076212147606959, + "learning_rate": 1.9408947154248513e-05, + "loss": 0.2272, + "step": 2688 + }, + { + "epoch": 0.14, + "grad_norm": 1.3770753242493052, + "learning_rate": 1.9408389199719014e-05, + "loss": 0.2113, + "step": 2689 + }, + { + "epoch": 0.14, + "grad_norm": 0.9711446101427936, + "learning_rate": 1.940783098998643e-05, + "loss": 0.1977, + "step": 2690 + }, + { + "epoch": 0.14, + "grad_norm": 1.140062040259186, + "learning_rate": 1.9407272525065898e-05, + "loss": 0.2167, + "step": 2691 + }, + { + "epoch": 0.14, + "grad_norm": 1.1803908714893359, + "learning_rate": 1.9406713804972565e-05, + "loss": 0.2327, + "step": 2692 + }, + { + "epoch": 0.14, + "grad_norm": 0.8373370160514211, + "learning_rate": 1.940615482972159e-05, + "loss": 0.2331, + "step": 2693 + }, + { + "epoch": 0.14, + "grad_norm": 1.7754881930206292, + "learning_rate": 1.9405595599328135e-05, + "loss": 0.2258, + "step": 2694 + }, + { + "epoch": 0.14, + "grad_norm": 1.0043781465210622, + "learning_rate": 1.940503611380737e-05, + "loss": 0.2151, + "step": 2695 + }, + { + "epoch": 0.14, + "grad_norm": 1.8156833311388194, + "learning_rate": 1.9404476373174464e-05, + "loss": 0.2327, + "step": 2696 + }, + { + "epoch": 0.14, + "grad_norm": 1.0110757172867997, + "learning_rate": 1.940391637744461e-05, + "loss": 0.208, + "step": 2697 + }, + { + "epoch": 0.14, + "grad_norm": 1.267672729778902, + "learning_rate": 1.9403356126632992e-05, + "loss": 0.2283, + "step": 2698 + }, + { + "epoch": 0.14, + "grad_norm": 1.0261197599948402, + "learning_rate": 1.9402795620754804e-05, + "loss": 0.2338, + "step": 2699 + }, + { + "epoch": 0.14, + "grad_norm": 1.329068764323991, + "learning_rate": 1.9402234859825257e-05, + "loss": 0.2397, + "step": 2700 + }, + { + "epoch": 0.14, + "grad_norm": 1.1760870358990558, + "learning_rate": 1.940167384385956e-05, + "loss": 0.2614, + "step": 2701 + }, + { + "epoch": 0.14, + "grad_norm": 1.3730668241267234, + "learning_rate": 1.9401112572872925e-05, + "loss": 0.2195, + "step": 2702 + }, + { + "epoch": 0.14, + "grad_norm": 1.2577938917534763, + "learning_rate": 1.9400551046880585e-05, + "loss": 0.2342, + "step": 2703 + }, + { + "epoch": 0.14, + "grad_norm": 0.9975635494089065, + "learning_rate": 1.9399989265897764e-05, + "loss": 0.2163, + "step": 2704 + }, + { + "epoch": 0.14, + "grad_norm": 0.9901243552258541, + "learning_rate": 1.9399427229939704e-05, + "loss": 0.2293, + "step": 2705 + }, + { + "epoch": 0.14, + "grad_norm": 1.1380070535962723, + "learning_rate": 1.939886493902165e-05, + "loss": 0.2385, + "step": 2706 + }, + { + "epoch": 0.14, + "grad_norm": 0.8633683152623378, + "learning_rate": 1.9398302393158853e-05, + "loss": 0.214, + "step": 2707 + }, + { + "epoch": 0.14, + "grad_norm": 1.3079801081816878, + "learning_rate": 1.939773959236657e-05, + "loss": 0.246, + "step": 2708 + }, + { + "epoch": 0.14, + "grad_norm": 1.4099778762372743, + "learning_rate": 1.9397176536660074e-05, + "loss": 0.2494, + "step": 2709 + }, + { + "epoch": 0.14, + "grad_norm": 1.3922786096799684, + "learning_rate": 1.939661322605463e-05, + "loss": 0.2666, + "step": 2710 + }, + { + "epoch": 0.14, + "grad_norm": 1.2057784347406664, + "learning_rate": 1.9396049660565525e-05, + "loss": 0.2277, + "step": 2711 + }, + { + "epoch": 0.14, + "grad_norm": 1.6404060910870868, + "learning_rate": 1.939548584020804e-05, + "loss": 0.2288, + "step": 2712 + }, + { + "epoch": 0.14, + "grad_norm": 0.9016999206905085, + "learning_rate": 1.9394921764997475e-05, + "loss": 0.1847, + "step": 2713 + }, + { + "epoch": 0.14, + "grad_norm": 1.162953170396771, + "learning_rate": 1.939435743494912e-05, + "loss": 0.2327, + "step": 2714 + }, + { + "epoch": 0.14, + "grad_norm": 1.1998203345890663, + "learning_rate": 1.9393792850078294e-05, + "loss": 0.2468, + "step": 2715 + }, + { + "epoch": 0.14, + "grad_norm": 1.3223511057603083, + "learning_rate": 1.9393228010400303e-05, + "loss": 0.2008, + "step": 2716 + }, + { + "epoch": 0.14, + "grad_norm": 1.2970323911231219, + "learning_rate": 1.9392662915930476e-05, + "loss": 0.252, + "step": 2717 + }, + { + "epoch": 0.14, + "grad_norm": 0.9859759934321387, + "learning_rate": 1.9392097566684132e-05, + "loss": 0.2326, + "step": 2718 + }, + { + "epoch": 0.14, + "grad_norm": 1.5788317329342427, + "learning_rate": 1.9391531962676614e-05, + "loss": 0.2219, + "step": 2719 + }, + { + "epoch": 0.14, + "grad_norm": 1.4030978767199909, + "learning_rate": 1.939096610392326e-05, + "loss": 0.2487, + "step": 2720 + }, + { + "epoch": 0.14, + "grad_norm": 1.0004712075775568, + "learning_rate": 1.939039999043942e-05, + "loss": 0.2151, + "step": 2721 + }, + { + "epoch": 0.14, + "grad_norm": 0.9521128417870931, + "learning_rate": 1.938983362224045e-05, + "loss": 0.2365, + "step": 2722 + }, + { + "epoch": 0.14, + "grad_norm": 0.8453050326579191, + "learning_rate": 1.9389266999341717e-05, + "loss": 0.228, + "step": 2723 + }, + { + "epoch": 0.14, + "grad_norm": 0.9700950773605047, + "learning_rate": 1.938870012175858e-05, + "loss": 0.2038, + "step": 2724 + }, + { + "epoch": 0.14, + "grad_norm": 1.3553509430700317, + "learning_rate": 1.9388132989506422e-05, + "loss": 0.2297, + "step": 2725 + }, + { + "epoch": 0.14, + "grad_norm": 0.9893932786413834, + "learning_rate": 1.938756560260063e-05, + "loss": 0.2173, + "step": 2726 + }, + { + "epoch": 0.14, + "grad_norm": 1.219016122502431, + "learning_rate": 1.938699796105659e-05, + "loss": 0.2452, + "step": 2727 + }, + { + "epoch": 0.14, + "grad_norm": 1.1359466605415534, + "learning_rate": 1.93864300648897e-05, + "loss": 0.2274, + "step": 2728 + }, + { + "epoch": 0.14, + "grad_norm": 1.9213735652946478, + "learning_rate": 1.9385861914115365e-05, + "loss": 0.217, + "step": 2729 + }, + { + "epoch": 0.14, + "grad_norm": 1.0375854513096163, + "learning_rate": 1.9385293508748994e-05, + "loss": 0.2354, + "step": 2730 + }, + { + "epoch": 0.14, + "grad_norm": 1.1904506309150737, + "learning_rate": 1.9384724848806007e-05, + "loss": 0.2345, + "step": 2731 + }, + { + "epoch": 0.14, + "grad_norm": 1.0897334514227217, + "learning_rate": 1.938415593430183e-05, + "loss": 0.2282, + "step": 2732 + }, + { + "epoch": 0.14, + "grad_norm": 1.3949774336848564, + "learning_rate": 1.938358676525189e-05, + "loss": 0.2158, + "step": 2733 + }, + { + "epoch": 0.14, + "grad_norm": 1.2083550955177704, + "learning_rate": 1.938301734167163e-05, + "loss": 0.2311, + "step": 2734 + }, + { + "epoch": 0.14, + "grad_norm": 1.1414932076518072, + "learning_rate": 1.9382447663576495e-05, + "loss": 0.2166, + "step": 2735 + }, + { + "epoch": 0.14, + "grad_norm": 1.2718276826173287, + "learning_rate": 1.9381877730981938e-05, + "loss": 0.2304, + "step": 2736 + }, + { + "epoch": 0.14, + "grad_norm": 1.8383903990044212, + "learning_rate": 1.9381307543903416e-05, + "loss": 0.2392, + "step": 2737 + }, + { + "epoch": 0.14, + "grad_norm": 1.1569915189018203, + "learning_rate": 1.93807371023564e-05, + "loss": 0.2274, + "step": 2738 + }, + { + "epoch": 0.14, + "grad_norm": 1.4849041251151516, + "learning_rate": 1.9380166406356357e-05, + "loss": 0.2647, + "step": 2739 + }, + { + "epoch": 0.14, + "grad_norm": 0.9769699732654332, + "learning_rate": 1.9379595455918773e-05, + "loss": 0.2233, + "step": 2740 + }, + { + "epoch": 0.14, + "grad_norm": 1.1559082444358126, + "learning_rate": 1.937902425105913e-05, + "loss": 0.225, + "step": 2741 + }, + { + "epoch": 0.14, + "grad_norm": 0.9116840164315465, + "learning_rate": 1.9378452791792924e-05, + "loss": 0.2057, + "step": 2742 + }, + { + "epoch": 0.14, + "grad_norm": 1.0057407085044325, + "learning_rate": 1.937788107813566e-05, + "loss": 0.2108, + "step": 2743 + }, + { + "epoch": 0.14, + "grad_norm": 1.0099264312055576, + "learning_rate": 1.937730911010284e-05, + "loss": 0.2116, + "step": 2744 + }, + { + "epoch": 0.14, + "grad_norm": 1.4555501133886994, + "learning_rate": 1.9376736887709982e-05, + "loss": 0.216, + "step": 2745 + }, + { + "epoch": 0.14, + "grad_norm": 1.0396223279709482, + "learning_rate": 1.9376164410972604e-05, + "loss": 0.229, + "step": 2746 + }, + { + "epoch": 0.14, + "grad_norm": 1.1136116076973643, + "learning_rate": 1.9375591679906242e-05, + "loss": 0.2507, + "step": 2747 + }, + { + "epoch": 0.14, + "grad_norm": 0.8975991818619756, + "learning_rate": 1.937501869452642e-05, + "loss": 0.2231, + "step": 2748 + }, + { + "epoch": 0.14, + "grad_norm": 1.5743246579158048, + "learning_rate": 1.937444545484869e-05, + "loss": 0.2339, + "step": 2749 + }, + { + "epoch": 0.14, + "grad_norm": 2.469957685828871, + "learning_rate": 1.9373871960888594e-05, + "loss": 0.2389, + "step": 2750 + }, + { + "epoch": 0.14, + "grad_norm": 1.0671056971589354, + "learning_rate": 1.9373298212661697e-05, + "loss": 0.2364, + "step": 2751 + }, + { + "epoch": 0.14, + "grad_norm": 0.88066037520169, + "learning_rate": 1.9372724210183552e-05, + "loss": 0.2524, + "step": 2752 + }, + { + "epoch": 0.14, + "grad_norm": 1.109280660148114, + "learning_rate": 1.9372149953469733e-05, + "loss": 0.2065, + "step": 2753 + }, + { + "epoch": 0.14, + "grad_norm": 1.4632192072371528, + "learning_rate": 1.937157544253582e-05, + "loss": 0.2408, + "step": 2754 + }, + { + "epoch": 0.14, + "grad_norm": 0.9689206571329406, + "learning_rate": 1.9371000677397393e-05, + "loss": 0.2171, + "step": 2755 + }, + { + "epoch": 0.14, + "grad_norm": 1.250810549389868, + "learning_rate": 1.9370425658070043e-05, + "loss": 0.25, + "step": 2756 + }, + { + "epoch": 0.14, + "grad_norm": 0.9785658371917585, + "learning_rate": 1.936985038456937e-05, + "loss": 0.2174, + "step": 2757 + }, + { + "epoch": 0.14, + "grad_norm": 0.9513477898582572, + "learning_rate": 1.936927485691097e-05, + "loss": 0.2498, + "step": 2758 + }, + { + "epoch": 0.14, + "grad_norm": 0.9823602970822934, + "learning_rate": 1.9368699075110467e-05, + "loss": 0.2077, + "step": 2759 + }, + { + "epoch": 0.14, + "grad_norm": 1.01832212753293, + "learning_rate": 1.9368123039183468e-05, + "loss": 0.2258, + "step": 2760 + }, + { + "epoch": 0.14, + "grad_norm": 1.0482144964607565, + "learning_rate": 1.9367546749145605e-05, + "loss": 0.2374, + "step": 2761 + }, + { + "epoch": 0.14, + "grad_norm": 1.1170462467104787, + "learning_rate": 1.9366970205012508e-05, + "loss": 0.218, + "step": 2762 + }, + { + "epoch": 0.14, + "grad_norm": 1.2372709983607768, + "learning_rate": 1.9366393406799813e-05, + "loss": 0.2195, + "step": 2763 + }, + { + "epoch": 0.14, + "grad_norm": 4.606539485103394, + "learning_rate": 1.9365816354523167e-05, + "loss": 0.2416, + "step": 2764 + }, + { + "epoch": 0.14, + "grad_norm": 2.08146758226909, + "learning_rate": 1.9365239048198227e-05, + "loss": 0.2265, + "step": 2765 + }, + { + "epoch": 0.14, + "grad_norm": 0.9354277640682458, + "learning_rate": 1.9364661487840645e-05, + "loss": 0.2565, + "step": 2766 + }, + { + "epoch": 0.14, + "grad_norm": 1.194024150331995, + "learning_rate": 1.9364083673466094e-05, + "loss": 0.2313, + "step": 2767 + }, + { + "epoch": 0.14, + "grad_norm": 1.208037331414121, + "learning_rate": 1.9363505605090243e-05, + "loss": 0.2331, + "step": 2768 + }, + { + "epoch": 0.14, + "grad_norm": 2.654984133731084, + "learning_rate": 1.9362927282728774e-05, + "loss": 0.244, + "step": 2769 + }, + { + "epoch": 0.14, + "grad_norm": 1.1251978450679647, + "learning_rate": 1.9362348706397374e-05, + "loss": 0.2296, + "step": 2770 + }, + { + "epoch": 0.14, + "grad_norm": 1.089003515036792, + "learning_rate": 1.9361769876111734e-05, + "loss": 0.2047, + "step": 2771 + }, + { + "epoch": 0.14, + "grad_norm": 1.0686045742352648, + "learning_rate": 1.936119079188756e-05, + "loss": 0.2195, + "step": 2772 + }, + { + "epoch": 0.14, + "grad_norm": 0.912428797587506, + "learning_rate": 1.936061145374056e-05, + "loss": 0.2152, + "step": 2773 + }, + { + "epoch": 0.14, + "grad_norm": 1.2551911812214693, + "learning_rate": 1.936003186168644e-05, + "loss": 0.2467, + "step": 2774 + }, + { + "epoch": 0.14, + "grad_norm": 0.9804946097373898, + "learning_rate": 1.935945201574093e-05, + "loss": 0.2328, + "step": 2775 + }, + { + "epoch": 0.14, + "grad_norm": 1.2927497100162066, + "learning_rate": 1.9358871915919754e-05, + "loss": 0.2347, + "step": 2776 + }, + { + "epoch": 0.14, + "grad_norm": 1.2909319229029048, + "learning_rate": 1.935829156223865e-05, + "loss": 0.2266, + "step": 2777 + }, + { + "epoch": 0.14, + "grad_norm": 1.1159629090785874, + "learning_rate": 1.935771095471336e-05, + "loss": 0.2208, + "step": 2778 + }, + { + "epoch": 0.14, + "grad_norm": 1.3930226613031322, + "learning_rate": 1.935713009335963e-05, + "loss": 0.2331, + "step": 2779 + }, + { + "epoch": 0.14, + "grad_norm": 1.0452858269400926, + "learning_rate": 1.9356548978193216e-05, + "loss": 0.2501, + "step": 2780 + }, + { + "epoch": 0.14, + "grad_norm": 0.9458649340006015, + "learning_rate": 1.9355967609229886e-05, + "loss": 0.2338, + "step": 2781 + }, + { + "epoch": 0.14, + "grad_norm": 1.2348517530609575, + "learning_rate": 1.9355385986485406e-05, + "loss": 0.2253, + "step": 2782 + }, + { + "epoch": 0.14, + "grad_norm": 1.5834193611796006, + "learning_rate": 1.935480410997555e-05, + "loss": 0.2621, + "step": 2783 + }, + { + "epoch": 0.14, + "grad_norm": 0.8904407124477265, + "learning_rate": 1.9354221979716107e-05, + "loss": 0.2173, + "step": 2784 + }, + { + "epoch": 0.14, + "grad_norm": 1.9469089622240592, + "learning_rate": 1.9353639595722863e-05, + "loss": 0.2164, + "step": 2785 + }, + { + "epoch": 0.14, + "grad_norm": 1.7623120361802205, + "learning_rate": 1.9353056958011613e-05, + "loss": 0.2241, + "step": 2786 + }, + { + "epoch": 0.14, + "grad_norm": 1.343534262397146, + "learning_rate": 1.935247406659817e-05, + "loss": 0.2292, + "step": 2787 + }, + { + "epoch": 0.14, + "grad_norm": 3.9677492958050955, + "learning_rate": 1.935189092149834e-05, + "loss": 0.237, + "step": 2788 + }, + { + "epoch": 0.14, + "grad_norm": 1.0513218979485803, + "learning_rate": 1.9351307522727936e-05, + "loss": 0.2477, + "step": 2789 + }, + { + "epoch": 0.14, + "grad_norm": 1.0806439607517602, + "learning_rate": 1.935072387030279e-05, + "loss": 0.2478, + "step": 2790 + }, + { + "epoch": 0.14, + "grad_norm": 1.0885677043023188, + "learning_rate": 1.9350139964238732e-05, + "loss": 0.2577, + "step": 2791 + }, + { + "epoch": 0.14, + "grad_norm": 0.9436811681196573, + "learning_rate": 1.9349555804551598e-05, + "loss": 0.2148, + "step": 2792 + }, + { + "epoch": 0.14, + "grad_norm": 1.4197577391194087, + "learning_rate": 1.9348971391257235e-05, + "loss": 0.2472, + "step": 2793 + }, + { + "epoch": 0.14, + "grad_norm": 2.0348878799009253, + "learning_rate": 1.9348386724371495e-05, + "loss": 0.2477, + "step": 2794 + }, + { + "epoch": 0.14, + "grad_norm": 1.1077434643852102, + "learning_rate": 1.9347801803910236e-05, + "loss": 0.2239, + "step": 2795 + }, + { + "epoch": 0.14, + "grad_norm": 1.3548339541610674, + "learning_rate": 1.9347216629889326e-05, + "loss": 0.224, + "step": 2796 + }, + { + "epoch": 0.14, + "grad_norm": 1.7495935883278109, + "learning_rate": 1.9346631202324638e-05, + "loss": 0.2304, + "step": 2797 + }, + { + "epoch": 0.14, + "grad_norm": 1.101603564197832, + "learning_rate": 1.9346045521232048e-05, + "loss": 0.2365, + "step": 2798 + }, + { + "epoch": 0.14, + "grad_norm": 1.1903832241877375, + "learning_rate": 1.9345459586627448e-05, + "loss": 0.2371, + "step": 2799 + }, + { + "epoch": 0.14, + "grad_norm": 1.1802158056744014, + "learning_rate": 1.9344873398526733e-05, + "loss": 0.2418, + "step": 2800 + }, + { + "epoch": 0.14, + "grad_norm": 1.503865653521154, + "learning_rate": 1.934428695694579e-05, + "loss": 0.2316, + "step": 2801 + }, + { + "epoch": 0.14, + "grad_norm": 12.801483119089678, + "learning_rate": 1.9343700261900543e-05, + "loss": 0.2447, + "step": 2802 + }, + { + "epoch": 0.14, + "grad_norm": 2.688265735472681, + "learning_rate": 1.9343113313406893e-05, + "loss": 0.2316, + "step": 2803 + }, + { + "epoch": 0.14, + "grad_norm": 1.522621184047923, + "learning_rate": 1.9342526111480772e-05, + "loss": 0.2275, + "step": 2804 + }, + { + "epoch": 0.14, + "grad_norm": 2.6548636105769403, + "learning_rate": 1.9341938656138097e-05, + "loss": 0.2378, + "step": 2805 + }, + { + "epoch": 0.14, + "grad_norm": 1.149292127360877, + "learning_rate": 1.9341350947394812e-05, + "loss": 0.2148, + "step": 2806 + }, + { + "epoch": 0.14, + "grad_norm": 1.0514337747612676, + "learning_rate": 1.9340762985266853e-05, + "loss": 0.2271, + "step": 2807 + }, + { + "epoch": 0.14, + "grad_norm": 1.0551929436991787, + "learning_rate": 1.9340174769770173e-05, + "loss": 0.2237, + "step": 2808 + }, + { + "epoch": 0.14, + "grad_norm": 1.3379299345155393, + "learning_rate": 1.933958630092072e-05, + "loss": 0.2207, + "step": 2809 + }, + { + "epoch": 0.14, + "grad_norm": 1.5583520618210185, + "learning_rate": 1.9338997578734466e-05, + "loss": 0.2106, + "step": 2810 + }, + { + "epoch": 0.14, + "grad_norm": 1.4166387778203358, + "learning_rate": 1.9338408603227374e-05, + "loss": 0.2488, + "step": 2811 + }, + { + "epoch": 0.14, + "grad_norm": 1.2264068153220962, + "learning_rate": 1.9337819374415422e-05, + "loss": 0.2566, + "step": 2812 + }, + { + "epoch": 0.14, + "grad_norm": 1.1433870801860593, + "learning_rate": 1.933722989231459e-05, + "loss": 0.2185, + "step": 2813 + }, + { + "epoch": 0.14, + "grad_norm": 0.8592350172417055, + "learning_rate": 1.933664015694087e-05, + "loss": 0.2098, + "step": 2814 + }, + { + "epoch": 0.14, + "grad_norm": 0.8753480279923114, + "learning_rate": 1.933605016831026e-05, + "loss": 0.2096, + "step": 2815 + }, + { + "epoch": 0.14, + "grad_norm": 1.0621829098299027, + "learning_rate": 1.933545992643876e-05, + "loss": 0.2262, + "step": 2816 + }, + { + "epoch": 0.14, + "grad_norm": 1.294220489663147, + "learning_rate": 1.9334869431342386e-05, + "loss": 0.236, + "step": 2817 + }, + { + "epoch": 0.14, + "grad_norm": 1.387910567461303, + "learning_rate": 1.9334278683037145e-05, + "loss": 0.227, + "step": 2818 + }, + { + "epoch": 0.14, + "grad_norm": 1.102746476766718, + "learning_rate": 1.9333687681539073e-05, + "loss": 0.2447, + "step": 2819 + }, + { + "epoch": 0.14, + "grad_norm": 0.8837216385130431, + "learning_rate": 1.9333096426864194e-05, + "loss": 0.2374, + "step": 2820 + }, + { + "epoch": 0.14, + "grad_norm": 1.0590666816622536, + "learning_rate": 1.9332504919028548e-05, + "loss": 0.2266, + "step": 2821 + }, + { + "epoch": 0.14, + "grad_norm": 1.2261448440402158, + "learning_rate": 1.9331913158048175e-05, + "loss": 0.2126, + "step": 2822 + }, + { + "epoch": 0.14, + "grad_norm": 2.3830268899911844, + "learning_rate": 1.9331321143939134e-05, + "loss": 0.1973, + "step": 2823 + }, + { + "epoch": 0.14, + "grad_norm": 1.2466959722982942, + "learning_rate": 1.933072887671748e-05, + "loss": 0.2186, + "step": 2824 + }, + { + "epoch": 0.14, + "grad_norm": 1.1776701511053869, + "learning_rate": 1.933013635639928e-05, + "loss": 0.2299, + "step": 2825 + }, + { + "epoch": 0.14, + "grad_norm": 1.079444331175208, + "learning_rate": 1.93295435830006e-05, + "loss": 0.243, + "step": 2826 + }, + { + "epoch": 0.14, + "grad_norm": 0.9769968665319854, + "learning_rate": 1.9328950556537523e-05, + "loss": 0.2288, + "step": 2827 + }, + { + "epoch": 0.14, + "grad_norm": 2.233898058764008, + "learning_rate": 1.932835727702614e-05, + "loss": 0.2327, + "step": 2828 + }, + { + "epoch": 0.14, + "grad_norm": 0.9367566360481667, + "learning_rate": 1.9327763744482536e-05, + "loss": 0.2117, + "step": 2829 + }, + { + "epoch": 0.14, + "grad_norm": 1.3818642138448198, + "learning_rate": 1.9327169958922813e-05, + "loss": 0.2181, + "step": 2830 + }, + { + "epoch": 0.14, + "grad_norm": 1.2256836948803158, + "learning_rate": 1.932657592036308e-05, + "loss": 0.2203, + "step": 2831 + }, + { + "epoch": 0.14, + "grad_norm": 1.104918269812186, + "learning_rate": 1.9325981628819448e-05, + "loss": 0.2184, + "step": 2832 + }, + { + "epoch": 0.14, + "grad_norm": 1.4675592439172866, + "learning_rate": 1.9325387084308036e-05, + "loss": 0.2331, + "step": 2833 + }, + { + "epoch": 0.14, + "grad_norm": 3.220652819519909, + "learning_rate": 1.9324792286844977e-05, + "loss": 0.2333, + "step": 2834 + }, + { + "epoch": 0.14, + "grad_norm": 1.2065214985533652, + "learning_rate": 1.9324197236446397e-05, + "loss": 0.2497, + "step": 2835 + }, + { + "epoch": 0.14, + "grad_norm": 1.0117364195372431, + "learning_rate": 1.932360193312844e-05, + "loss": 0.2159, + "step": 2836 + }, + { + "epoch": 0.14, + "grad_norm": 1.1569651541855233, + "learning_rate": 1.9323006376907253e-05, + "loss": 0.2282, + "step": 2837 + }, + { + "epoch": 0.14, + "grad_norm": 1.1876368415555707, + "learning_rate": 1.9322410567798996e-05, + "loss": 0.2212, + "step": 2838 + }, + { + "epoch": 0.14, + "grad_norm": 1.3740372088877062, + "learning_rate": 1.932181450581982e-05, + "loss": 0.2384, + "step": 2839 + }, + { + "epoch": 0.14, + "grad_norm": 1.1016021890316428, + "learning_rate": 1.9321218190985906e-05, + "loss": 0.222, + "step": 2840 + }, + { + "epoch": 0.14, + "grad_norm": 0.9683116783883369, + "learning_rate": 1.9320621623313416e-05, + "loss": 0.2128, + "step": 2841 + }, + { + "epoch": 0.14, + "grad_norm": 1.1316210342046384, + "learning_rate": 1.932002480281854e-05, + "loss": 0.2403, + "step": 2842 + }, + { + "epoch": 0.14, + "grad_norm": 0.9264831837106831, + "learning_rate": 1.9319427729517467e-05, + "loss": 0.2175, + "step": 2843 + }, + { + "epoch": 0.14, + "grad_norm": 1.03768617926261, + "learning_rate": 1.9318830403426388e-05, + "loss": 0.2477, + "step": 2844 + }, + { + "epoch": 0.14, + "grad_norm": 1.2202169153802127, + "learning_rate": 1.9318232824561507e-05, + "loss": 0.2141, + "step": 2845 + }, + { + "epoch": 0.14, + "grad_norm": 0.8568755411257019, + "learning_rate": 1.9317634992939034e-05, + "loss": 0.2263, + "step": 2846 + }, + { + "epoch": 0.14, + "grad_norm": 1.0483047904842462, + "learning_rate": 1.931703690857519e-05, + "loss": 0.2382, + "step": 2847 + }, + { + "epoch": 0.14, + "grad_norm": 2.329723964272318, + "learning_rate": 1.9316438571486188e-05, + "loss": 0.1943, + "step": 2848 + }, + { + "epoch": 0.14, + "grad_norm": 2.3214312045060064, + "learning_rate": 1.9315839981688267e-05, + "loss": 0.2255, + "step": 2849 + }, + { + "epoch": 0.14, + "grad_norm": 1.6453793440023385, + "learning_rate": 1.931524113919766e-05, + "loss": 0.2587, + "step": 2850 + }, + { + "epoch": 0.14, + "grad_norm": 1.2808218991790954, + "learning_rate": 1.931464204403061e-05, + "loss": 0.2426, + "step": 2851 + }, + { + "epoch": 0.15, + "grad_norm": 1.1321376470227988, + "learning_rate": 1.931404269620337e-05, + "loss": 0.2192, + "step": 2852 + }, + { + "epoch": 0.15, + "grad_norm": 0.8540458805915087, + "learning_rate": 1.9313443095732197e-05, + "loss": 0.2372, + "step": 2853 + }, + { + "epoch": 0.15, + "grad_norm": 1.629983136852726, + "learning_rate": 1.9312843242633354e-05, + "loss": 0.2371, + "step": 2854 + }, + { + "epoch": 0.15, + "grad_norm": 0.9867139867736547, + "learning_rate": 1.931224313692311e-05, + "loss": 0.2338, + "step": 2855 + }, + { + "epoch": 0.15, + "grad_norm": 1.0737634184425395, + "learning_rate": 1.9311642778617742e-05, + "loss": 0.231, + "step": 2856 + }, + { + "epoch": 0.15, + "grad_norm": 1.03320041635422, + "learning_rate": 1.931104216773354e-05, + "loss": 0.2224, + "step": 2857 + }, + { + "epoch": 0.15, + "grad_norm": 1.1224550744055295, + "learning_rate": 1.9310441304286794e-05, + "loss": 0.2247, + "step": 2858 + }, + { + "epoch": 0.15, + "grad_norm": 0.9518778508395871, + "learning_rate": 1.9309840188293803e-05, + "loss": 0.2337, + "step": 2859 + }, + { + "epoch": 0.15, + "grad_norm": 0.9511248974571589, + "learning_rate": 1.930923881977087e-05, + "loss": 0.2395, + "step": 2860 + }, + { + "epoch": 0.15, + "grad_norm": 1.1447901689296558, + "learning_rate": 1.9308637198734307e-05, + "loss": 0.22, + "step": 2861 + }, + { + "epoch": 0.15, + "grad_norm": 1.1499640522355206, + "learning_rate": 1.9308035325200436e-05, + "loss": 0.242, + "step": 2862 + }, + { + "epoch": 0.15, + "grad_norm": 1.3407711277898409, + "learning_rate": 1.9307433199185582e-05, + "loss": 0.2421, + "step": 2863 + }, + { + "epoch": 0.15, + "grad_norm": 1.0156304338906854, + "learning_rate": 1.9306830820706074e-05, + "loss": 0.2197, + "step": 2864 + }, + { + "epoch": 0.15, + "grad_norm": 1.974378473587215, + "learning_rate": 1.9306228189778255e-05, + "loss": 0.2275, + "step": 2865 + }, + { + "epoch": 0.15, + "grad_norm": 1.0145603045608926, + "learning_rate": 1.930562530641847e-05, + "loss": 0.1958, + "step": 2866 + }, + { + "epoch": 0.15, + "grad_norm": 1.0589342389369067, + "learning_rate": 1.9305022170643077e-05, + "loss": 0.2191, + "step": 2867 + }, + { + "epoch": 0.15, + "grad_norm": 1.0494184592272218, + "learning_rate": 1.9304418782468427e-05, + "loss": 0.2211, + "step": 2868 + }, + { + "epoch": 0.15, + "grad_norm": 0.826301537596647, + "learning_rate": 1.9303815141910894e-05, + "loss": 0.2075, + "step": 2869 + }, + { + "epoch": 0.15, + "grad_norm": 0.8674197245882275, + "learning_rate": 1.930321124898685e-05, + "loss": 0.2213, + "step": 2870 + }, + { + "epoch": 0.15, + "grad_norm": 1.28917194083532, + "learning_rate": 1.930260710371268e-05, + "loss": 0.2432, + "step": 2871 + }, + { + "epoch": 0.15, + "grad_norm": 1.9229131007885853, + "learning_rate": 1.9302002706104762e-05, + "loss": 0.2072, + "step": 2872 + }, + { + "epoch": 0.15, + "grad_norm": 2.9223149186191164, + "learning_rate": 1.9301398056179493e-05, + "loss": 0.2302, + "step": 2873 + }, + { + "epoch": 0.15, + "grad_norm": 1.6633490926322958, + "learning_rate": 1.930079315395328e-05, + "loss": 0.2199, + "step": 2874 + }, + { + "epoch": 0.15, + "grad_norm": 0.9742814069706348, + "learning_rate": 1.930018799944253e-05, + "loss": 0.2316, + "step": 2875 + }, + { + "epoch": 0.15, + "grad_norm": 1.1254691217888606, + "learning_rate": 1.929958259266365e-05, + "loss": 0.2259, + "step": 2876 + }, + { + "epoch": 0.15, + "grad_norm": 0.8419765118945145, + "learning_rate": 1.9298976933633068e-05, + "loss": 0.2367, + "step": 2877 + }, + { + "epoch": 0.15, + "grad_norm": 1.0637810663668488, + "learning_rate": 1.929837102236721e-05, + "loss": 0.2029, + "step": 2878 + }, + { + "epoch": 0.15, + "grad_norm": 1.1969913921398376, + "learning_rate": 1.9297764858882516e-05, + "loss": 0.219, + "step": 2879 + }, + { + "epoch": 0.15, + "grad_norm": 0.9500843843503507, + "learning_rate": 1.929715844319542e-05, + "loss": 0.204, + "step": 2880 + }, + { + "epoch": 0.15, + "grad_norm": 0.9159934453202914, + "learning_rate": 1.9296551775322383e-05, + "loss": 0.2314, + "step": 2881 + }, + { + "epoch": 0.15, + "grad_norm": 0.9819107607663788, + "learning_rate": 1.9295944855279853e-05, + "loss": 0.2259, + "step": 2882 + }, + { + "epoch": 0.15, + "grad_norm": 1.0649590229898775, + "learning_rate": 1.9295337683084292e-05, + "loss": 0.1982, + "step": 2883 + }, + { + "epoch": 0.15, + "grad_norm": 1.2130419725924906, + "learning_rate": 1.929473025875217e-05, + "loss": 0.23, + "step": 2884 + }, + { + "epoch": 0.15, + "grad_norm": 0.9468141626402687, + "learning_rate": 1.9294122582299964e-05, + "loss": 0.2211, + "step": 2885 + }, + { + "epoch": 0.15, + "grad_norm": 1.0795576693617677, + "learning_rate": 1.929351465374416e-05, + "loss": 0.2469, + "step": 2886 + }, + { + "epoch": 0.15, + "grad_norm": 0.7962182832585174, + "learning_rate": 1.9292906473101246e-05, + "loss": 0.1996, + "step": 2887 + }, + { + "epoch": 0.15, + "grad_norm": 0.9714002340563536, + "learning_rate": 1.929229804038772e-05, + "loss": 0.2417, + "step": 2888 + }, + { + "epoch": 0.15, + "grad_norm": 1.09410087899835, + "learning_rate": 1.9291689355620088e-05, + "loss": 0.2343, + "step": 2889 + }, + { + "epoch": 0.15, + "grad_norm": 0.7838595149854972, + "learning_rate": 1.9291080418814852e-05, + "loss": 0.1895, + "step": 2890 + }, + { + "epoch": 0.15, + "grad_norm": 1.274523284116608, + "learning_rate": 1.9290471229988536e-05, + "loss": 0.2156, + "step": 2891 + }, + { + "epoch": 0.15, + "grad_norm": 0.9507973660783633, + "learning_rate": 1.9289861789157666e-05, + "loss": 0.237, + "step": 2892 + }, + { + "epoch": 0.15, + "grad_norm": 1.1444280596229304, + "learning_rate": 1.9289252096338767e-05, + "loss": 0.2345, + "step": 2893 + }, + { + "epoch": 0.15, + "grad_norm": 0.8911921538107122, + "learning_rate": 1.928864215154838e-05, + "loss": 0.2183, + "step": 2894 + }, + { + "epoch": 0.15, + "grad_norm": 0.9689915619745783, + "learning_rate": 1.928803195480305e-05, + "loss": 0.2347, + "step": 2895 + }, + { + "epoch": 0.15, + "grad_norm": 1.1021079330412795, + "learning_rate": 1.9287421506119332e-05, + "loss": 0.2305, + "step": 2896 + }, + { + "epoch": 0.15, + "grad_norm": 1.0599028799692833, + "learning_rate": 1.9286810805513774e-05, + "loss": 0.2447, + "step": 2897 + }, + { + "epoch": 0.15, + "grad_norm": 1.0162405743349308, + "learning_rate": 1.9286199853002956e-05, + "loss": 0.2046, + "step": 2898 + }, + { + "epoch": 0.15, + "grad_norm": 1.1035504592713803, + "learning_rate": 1.928558864860344e-05, + "loss": 0.2172, + "step": 2899 + }, + { + "epoch": 0.15, + "grad_norm": 1.303054013613804, + "learning_rate": 1.9284977192331807e-05, + "loss": 0.2312, + "step": 2900 + }, + { + "epoch": 0.15, + "grad_norm": 1.2167740655890367, + "learning_rate": 1.9284365484204645e-05, + "loss": 0.2389, + "step": 2901 + }, + { + "epoch": 0.15, + "grad_norm": 1.3669197391194035, + "learning_rate": 1.9283753524238542e-05, + "loss": 0.2293, + "step": 2902 + }, + { + "epoch": 0.15, + "grad_norm": 1.1895888740056975, + "learning_rate": 1.92831413124501e-05, + "loss": 0.2241, + "step": 2903 + }, + { + "epoch": 0.15, + "grad_norm": 1.2522969542985063, + "learning_rate": 1.9282528848855925e-05, + "loss": 0.2601, + "step": 2904 + }, + { + "epoch": 0.15, + "grad_norm": 1.302544121030843, + "learning_rate": 1.9281916133472636e-05, + "loss": 0.1939, + "step": 2905 + }, + { + "epoch": 0.15, + "grad_norm": 1.0861778850757935, + "learning_rate": 1.9281303166316846e-05, + "loss": 0.2684, + "step": 2906 + }, + { + "epoch": 0.15, + "grad_norm": 1.2882850775483696, + "learning_rate": 1.928068994740518e-05, + "loss": 0.2334, + "step": 2907 + }, + { + "epoch": 0.15, + "grad_norm": 1.2321169609007878, + "learning_rate": 1.9280076476754276e-05, + "loss": 0.2164, + "step": 2908 + }, + { + "epoch": 0.15, + "grad_norm": 1.6514277681396714, + "learning_rate": 1.9279462754380774e-05, + "loss": 0.2389, + "step": 2909 + }, + { + "epoch": 0.15, + "grad_norm": 1.1270106887925726, + "learning_rate": 1.927884878030132e-05, + "loss": 0.2225, + "step": 2910 + }, + { + "epoch": 0.15, + "grad_norm": 1.8445104404776635, + "learning_rate": 1.927823455453257e-05, + "loss": 0.2282, + "step": 2911 + }, + { + "epoch": 0.15, + "grad_norm": 1.1310180099501284, + "learning_rate": 1.9277620077091184e-05, + "loss": 0.2357, + "step": 2912 + }, + { + "epoch": 0.15, + "grad_norm": 1.1734832772791386, + "learning_rate": 1.927700534799383e-05, + "loss": 0.2436, + "step": 2913 + }, + { + "epoch": 0.15, + "grad_norm": 1.055450159546421, + "learning_rate": 1.927639036725718e-05, + "loss": 0.2293, + "step": 2914 + }, + { + "epoch": 0.15, + "grad_norm": 1.235235898756223, + "learning_rate": 1.927577513489792e-05, + "loss": 0.2186, + "step": 2915 + }, + { + "epoch": 0.15, + "grad_norm": 1.0584009839980613, + "learning_rate": 1.927515965093273e-05, + "loss": 0.2259, + "step": 2916 + }, + { + "epoch": 0.15, + "grad_norm": 1.0460374119343292, + "learning_rate": 1.9274543915378315e-05, + "loss": 0.2276, + "step": 2917 + }, + { + "epoch": 0.15, + "grad_norm": 1.670885330919633, + "learning_rate": 1.927392792825137e-05, + "loss": 0.2071, + "step": 2918 + }, + { + "epoch": 0.15, + "grad_norm": 1.0249029467969002, + "learning_rate": 1.927331168956861e-05, + "loss": 0.2114, + "step": 2919 + }, + { + "epoch": 0.15, + "grad_norm": 0.9630012806325837, + "learning_rate": 1.9272695199346743e-05, + "loss": 0.2633, + "step": 2920 + }, + { + "epoch": 0.15, + "grad_norm": 1.0913292371471266, + "learning_rate": 1.92720784576025e-05, + "loss": 0.2235, + "step": 2921 + }, + { + "epoch": 0.15, + "grad_norm": 0.8716918819429021, + "learning_rate": 1.92714614643526e-05, + "loss": 0.2295, + "step": 2922 + }, + { + "epoch": 0.15, + "grad_norm": 1.1952869315038404, + "learning_rate": 1.9270844219613785e-05, + "loss": 0.2343, + "step": 2923 + }, + { + "epoch": 0.15, + "grad_norm": 0.9901067479541756, + "learning_rate": 1.9270226723402798e-05, + "loss": 0.2148, + "step": 2924 + }, + { + "epoch": 0.15, + "grad_norm": 0.9465534365045836, + "learning_rate": 1.926960897573639e-05, + "loss": 0.2189, + "step": 2925 + }, + { + "epoch": 0.15, + "grad_norm": 0.9050720063547557, + "learning_rate": 1.926899097663131e-05, + "loss": 0.205, + "step": 2926 + }, + { + "epoch": 0.15, + "grad_norm": 1.3984753431326877, + "learning_rate": 1.926837272610433e-05, + "loss": 0.2278, + "step": 2927 + }, + { + "epoch": 0.15, + "grad_norm": 1.331648313954839, + "learning_rate": 1.9267754224172216e-05, + "loss": 0.2252, + "step": 2928 + }, + { + "epoch": 0.15, + "grad_norm": 1.3474581608803735, + "learning_rate": 1.926713547085174e-05, + "loss": 0.2242, + "step": 2929 + }, + { + "epoch": 0.15, + "grad_norm": 1.1764702056906664, + "learning_rate": 1.9266516466159697e-05, + "loss": 0.2275, + "step": 2930 + }, + { + "epoch": 0.15, + "grad_norm": 0.9102442675036209, + "learning_rate": 1.9265897210112868e-05, + "loss": 0.209, + "step": 2931 + }, + { + "epoch": 0.15, + "grad_norm": 0.9579038746913898, + "learning_rate": 1.9265277702728058e-05, + "loss": 0.2238, + "step": 2932 + }, + { + "epoch": 0.15, + "grad_norm": 1.087611433992817, + "learning_rate": 1.9264657944022063e-05, + "loss": 0.2219, + "step": 2933 + }, + { + "epoch": 0.15, + "grad_norm": 0.9513234172795997, + "learning_rate": 1.92640379340117e-05, + "loss": 0.2293, + "step": 2934 + }, + { + "epoch": 0.15, + "grad_norm": 1.035375087455176, + "learning_rate": 1.9263417672713786e-05, + "loss": 0.2013, + "step": 2935 + }, + { + "epoch": 0.15, + "grad_norm": 0.9867731544458642, + "learning_rate": 1.926279716014514e-05, + "loss": 0.2239, + "step": 2936 + }, + { + "epoch": 0.15, + "grad_norm": 1.004772357407247, + "learning_rate": 1.92621763963226e-05, + "loss": 0.2428, + "step": 2937 + }, + { + "epoch": 0.15, + "grad_norm": 1.2468846047363114, + "learning_rate": 1.9261555381263003e-05, + "loss": 0.264, + "step": 2938 + }, + { + "epoch": 0.15, + "grad_norm": 1.3487744229263021, + "learning_rate": 1.926093411498319e-05, + "loss": 0.2162, + "step": 2939 + }, + { + "epoch": 0.15, + "grad_norm": 1.2469162410816559, + "learning_rate": 1.926031259750002e-05, + "loss": 0.2244, + "step": 2940 + }, + { + "epoch": 0.15, + "grad_norm": 1.1165749330292827, + "learning_rate": 1.9259690828830345e-05, + "loss": 0.2609, + "step": 2941 + }, + { + "epoch": 0.15, + "grad_norm": 0.879048795841088, + "learning_rate": 1.925906880899104e-05, + "loss": 0.2225, + "step": 2942 + }, + { + "epoch": 0.15, + "grad_norm": 1.1138683990180442, + "learning_rate": 1.9258446537998964e-05, + "loss": 0.209, + "step": 2943 + }, + { + "epoch": 0.15, + "grad_norm": 1.563574900308686, + "learning_rate": 1.9257824015871005e-05, + "loss": 0.2385, + "step": 2944 + }, + { + "epoch": 0.15, + "grad_norm": 1.8906959033595399, + "learning_rate": 1.9257201242624045e-05, + "loss": 0.2098, + "step": 2945 + }, + { + "epoch": 0.15, + "grad_norm": 1.030668698944916, + "learning_rate": 1.925657821827498e-05, + "loss": 0.2218, + "step": 2946 + }, + { + "epoch": 0.15, + "grad_norm": 1.096503438378198, + "learning_rate": 1.9255954942840706e-05, + "loss": 0.2421, + "step": 2947 + }, + { + "epoch": 0.15, + "grad_norm": 1.2449681975950357, + "learning_rate": 1.9255331416338134e-05, + "loss": 0.2259, + "step": 2948 + }, + { + "epoch": 0.15, + "grad_norm": 1.067845526085949, + "learning_rate": 1.9254707638784174e-05, + "loss": 0.2051, + "step": 2949 + }, + { + "epoch": 0.15, + "grad_norm": 1.645151745527325, + "learning_rate": 1.9254083610195745e-05, + "loss": 0.2271, + "step": 2950 + }, + { + "epoch": 0.15, + "grad_norm": 1.440117632626847, + "learning_rate": 1.9253459330589776e-05, + "loss": 0.2204, + "step": 2951 + }, + { + "epoch": 0.15, + "grad_norm": 1.294112671200197, + "learning_rate": 1.9252834799983197e-05, + "loss": 0.2179, + "step": 2952 + }, + { + "epoch": 0.15, + "grad_norm": 1.0002684462624296, + "learning_rate": 1.9252210018392957e-05, + "loss": 0.2398, + "step": 2953 + }, + { + "epoch": 0.15, + "grad_norm": 0.9685935114284663, + "learning_rate": 1.9251584985835996e-05, + "loss": 0.2193, + "step": 2954 + }, + { + "epoch": 0.15, + "grad_norm": 1.0508821576329554, + "learning_rate": 1.9250959702329268e-05, + "loss": 0.2246, + "step": 2955 + }, + { + "epoch": 0.15, + "grad_norm": 0.8610381375921086, + "learning_rate": 1.9250334167889737e-05, + "loss": 0.2236, + "step": 2956 + }, + { + "epoch": 0.15, + "grad_norm": 1.260320103814766, + "learning_rate": 1.9249708382534372e-05, + "loss": 0.2235, + "step": 2957 + }, + { + "epoch": 0.15, + "grad_norm": 1.070880397566758, + "learning_rate": 1.924908234628014e-05, + "loss": 0.2206, + "step": 2958 + }, + { + "epoch": 0.15, + "grad_norm": 0.8849634135287198, + "learning_rate": 1.9248456059144028e-05, + "loss": 0.2156, + "step": 2959 + }, + { + "epoch": 0.15, + "grad_norm": 0.8334624823803712, + "learning_rate": 1.9247829521143023e-05, + "loss": 0.2195, + "step": 2960 + }, + { + "epoch": 0.15, + "grad_norm": 0.9487501038077363, + "learning_rate": 1.924720273229412e-05, + "loss": 0.2473, + "step": 2961 + }, + { + "epoch": 0.15, + "grad_norm": 3.294794933510296, + "learning_rate": 1.9246575692614323e-05, + "loss": 0.2442, + "step": 2962 + }, + { + "epoch": 0.15, + "grad_norm": 1.2417909542707162, + "learning_rate": 1.9245948402120634e-05, + "loss": 0.2306, + "step": 2963 + }, + { + "epoch": 0.15, + "grad_norm": 1.0910392732978642, + "learning_rate": 1.9245320860830075e-05, + "loss": 0.2041, + "step": 2964 + }, + { + "epoch": 0.15, + "grad_norm": 0.9316213997274377, + "learning_rate": 1.9244693068759668e-05, + "loss": 0.2398, + "step": 2965 + }, + { + "epoch": 0.15, + "grad_norm": 1.3031353937967904, + "learning_rate": 1.9244065025926434e-05, + "loss": 0.2096, + "step": 2966 + }, + { + "epoch": 0.15, + "grad_norm": 0.7483928647007118, + "learning_rate": 1.9243436732347418e-05, + "loss": 0.2105, + "step": 2967 + }, + { + "epoch": 0.15, + "grad_norm": 1.2147163505835121, + "learning_rate": 1.9242808188039658e-05, + "loss": 0.242, + "step": 2968 + }, + { + "epoch": 0.15, + "grad_norm": 0.9165976744588308, + "learning_rate": 1.92421793930202e-05, + "loss": 0.2287, + "step": 2969 + }, + { + "epoch": 0.15, + "grad_norm": 1.129164002317354, + "learning_rate": 1.924155034730611e-05, + "loss": 0.2114, + "step": 2970 + }, + { + "epoch": 0.15, + "grad_norm": 0.8668190231085972, + "learning_rate": 1.924092105091444e-05, + "loss": 0.219, + "step": 2971 + }, + { + "epoch": 0.15, + "grad_norm": 0.8988161489801656, + "learning_rate": 1.9240291503862266e-05, + "loss": 0.2329, + "step": 2972 + }, + { + "epoch": 0.15, + "grad_norm": 1.5519469682120905, + "learning_rate": 1.9239661706166663e-05, + "loss": 0.2372, + "step": 2973 + }, + { + "epoch": 0.15, + "grad_norm": 0.9317913944468382, + "learning_rate": 1.9239031657844718e-05, + "loss": 0.2376, + "step": 2974 + }, + { + "epoch": 0.15, + "grad_norm": 1.0039810802977571, + "learning_rate": 1.9238401358913513e-05, + "loss": 0.2426, + "step": 2975 + }, + { + "epoch": 0.15, + "grad_norm": 0.8406615489980125, + "learning_rate": 1.923777080939015e-05, + "loss": 0.2081, + "step": 2976 + }, + { + "epoch": 0.15, + "grad_norm": 0.9565145582443978, + "learning_rate": 1.9237140009291733e-05, + "loss": 0.2146, + "step": 2977 + }, + { + "epoch": 0.15, + "grad_norm": 0.9123321433259106, + "learning_rate": 1.9236508958635372e-05, + "loss": 0.2132, + "step": 2978 + }, + { + "epoch": 0.15, + "grad_norm": 1.0533861024137494, + "learning_rate": 1.923587765743818e-05, + "loss": 0.1994, + "step": 2979 + }, + { + "epoch": 0.15, + "grad_norm": 1.2296651007941344, + "learning_rate": 1.923524610571729e-05, + "loss": 0.2353, + "step": 2980 + }, + { + "epoch": 0.15, + "grad_norm": 1.0783244560851948, + "learning_rate": 1.9234614303489823e-05, + "loss": 0.2487, + "step": 2981 + }, + { + "epoch": 0.15, + "grad_norm": 1.1787683750063183, + "learning_rate": 1.9233982250772927e-05, + "loss": 0.247, + "step": 2982 + }, + { + "epoch": 0.15, + "grad_norm": 1.0130160636525272, + "learning_rate": 1.9233349947583735e-05, + "loss": 0.2336, + "step": 2983 + }, + { + "epoch": 0.15, + "grad_norm": 1.4574101701417077, + "learning_rate": 1.923271739393941e-05, + "loss": 0.2048, + "step": 2984 + }, + { + "epoch": 0.15, + "grad_norm": 1.3197654603803377, + "learning_rate": 1.9232084589857103e-05, + "loss": 0.2364, + "step": 2985 + }, + { + "epoch": 0.15, + "grad_norm": 1.207157479828292, + "learning_rate": 1.9231451535353977e-05, + "loss": 0.2134, + "step": 2986 + }, + { + "epoch": 0.15, + "grad_norm": 0.9671518118956477, + "learning_rate": 1.9230818230447207e-05, + "loss": 0.209, + "step": 2987 + }, + { + "epoch": 0.15, + "grad_norm": 0.954083161294547, + "learning_rate": 1.9230184675153974e-05, + "loss": 0.2262, + "step": 2988 + }, + { + "epoch": 0.15, + "grad_norm": 1.0068100034817131, + "learning_rate": 1.9229550869491456e-05, + "loss": 0.2074, + "step": 2989 + }, + { + "epoch": 0.15, + "grad_norm": 1.2304033656255684, + "learning_rate": 1.9228916813476855e-05, + "loss": 0.2365, + "step": 2990 + }, + { + "epoch": 0.15, + "grad_norm": 1.1480477035501895, + "learning_rate": 1.922828250712736e-05, + "loss": 0.2176, + "step": 2991 + }, + { + "epoch": 0.15, + "grad_norm": 1.6072046871345662, + "learning_rate": 1.9227647950460184e-05, + "loss": 0.2173, + "step": 2992 + }, + { + "epoch": 0.15, + "grad_norm": 1.510713170555769, + "learning_rate": 1.9227013143492534e-05, + "loss": 0.2311, + "step": 2993 + }, + { + "epoch": 0.15, + "grad_norm": 1.607032952016177, + "learning_rate": 1.922637808624163e-05, + "loss": 0.2301, + "step": 2994 + }, + { + "epoch": 0.15, + "grad_norm": 1.2996906089038012, + "learning_rate": 1.92257427787247e-05, + "loss": 0.2596, + "step": 2995 + }, + { + "epoch": 0.15, + "grad_norm": 1.0050341574660457, + "learning_rate": 1.922510722095898e-05, + "loss": 0.2361, + "step": 2996 + }, + { + "epoch": 0.15, + "grad_norm": 0.9414520504409785, + "learning_rate": 1.92244714129617e-05, + "loss": 0.2298, + "step": 2997 + }, + { + "epoch": 0.15, + "grad_norm": 0.9724450849054851, + "learning_rate": 1.9223835354750117e-05, + "loss": 0.2228, + "step": 2998 + }, + { + "epoch": 0.15, + "grad_norm": 0.9975235707500363, + "learning_rate": 1.9223199046341477e-05, + "loss": 0.2471, + "step": 2999 + }, + { + "epoch": 0.15, + "grad_norm": 0.9784518821562274, + "learning_rate": 1.922256248775304e-05, + "loss": 0.222, + "step": 3000 + }, + { + "epoch": 0.15, + "grad_norm": 0.8459640070005651, + "learning_rate": 1.9221925679002076e-05, + "loss": 0.2217, + "step": 3001 + }, + { + "epoch": 0.15, + "grad_norm": 1.255243443188498, + "learning_rate": 1.9221288620105857e-05, + "loss": 0.2356, + "step": 3002 + }, + { + "epoch": 0.15, + "grad_norm": 0.8893090922997737, + "learning_rate": 1.9220651311081666e-05, + "loss": 0.2279, + "step": 3003 + }, + { + "epoch": 0.15, + "grad_norm": 0.9402338401958679, + "learning_rate": 1.922001375194678e-05, + "loss": 0.2093, + "step": 3004 + }, + { + "epoch": 0.15, + "grad_norm": 0.8455121896699624, + "learning_rate": 1.9219375942718508e-05, + "loss": 0.2226, + "step": 3005 + }, + { + "epoch": 0.15, + "grad_norm": 1.2765300051535464, + "learning_rate": 1.921873788341414e-05, + "loss": 0.2542, + "step": 3006 + }, + { + "epoch": 0.15, + "grad_norm": 1.2140638684126275, + "learning_rate": 1.9218099574050985e-05, + "loss": 0.2534, + "step": 3007 + }, + { + "epoch": 0.15, + "grad_norm": 1.0750642375996193, + "learning_rate": 1.9217461014646362e-05, + "loss": 0.2138, + "step": 3008 + }, + { + "epoch": 0.15, + "grad_norm": 1.246919525779567, + "learning_rate": 1.9216822205217586e-05, + "loss": 0.2293, + "step": 3009 + }, + { + "epoch": 0.15, + "grad_norm": 1.0217795188677212, + "learning_rate": 1.9216183145781984e-05, + "loss": 0.2211, + "step": 3010 + }, + { + "epoch": 0.15, + "grad_norm": 1.587256853484376, + "learning_rate": 1.92155438363569e-05, + "loss": 0.2676, + "step": 3011 + }, + { + "epoch": 0.15, + "grad_norm": 1.1552003644377378, + "learning_rate": 1.9214904276959664e-05, + "loss": 0.2202, + "step": 3012 + }, + { + "epoch": 0.15, + "grad_norm": 1.110685839383147, + "learning_rate": 1.921426446760763e-05, + "loss": 0.2331, + "step": 3013 + }, + { + "epoch": 0.15, + "grad_norm": 1.7931555285092697, + "learning_rate": 1.9213624408318155e-05, + "loss": 0.2201, + "step": 3014 + }, + { + "epoch": 0.15, + "grad_norm": 1.2477592588381938, + "learning_rate": 1.9212984099108594e-05, + "loss": 0.2276, + "step": 3015 + }, + { + "epoch": 0.15, + "grad_norm": 1.4367511021452408, + "learning_rate": 1.921234353999632e-05, + "loss": 0.2284, + "step": 3016 + }, + { + "epoch": 0.15, + "grad_norm": 1.306302205604074, + "learning_rate": 1.921170273099871e-05, + "loss": 0.2194, + "step": 3017 + }, + { + "epoch": 0.15, + "grad_norm": 1.0782994679740574, + "learning_rate": 1.921106167213314e-05, + "loss": 0.2372, + "step": 3018 + }, + { + "epoch": 0.15, + "grad_norm": 1.0505196944402984, + "learning_rate": 1.9210420363417e-05, + "loss": 0.2144, + "step": 3019 + }, + { + "epoch": 0.15, + "grad_norm": 1.9353288519979084, + "learning_rate": 1.920977880486769e-05, + "loss": 0.2283, + "step": 3020 + }, + { + "epoch": 0.15, + "grad_norm": 0.974558823252189, + "learning_rate": 1.920913699650261e-05, + "loss": 0.217, + "step": 3021 + }, + { + "epoch": 0.15, + "grad_norm": 0.9563677813824926, + "learning_rate": 1.920849493833917e-05, + "loss": 0.2466, + "step": 3022 + }, + { + "epoch": 0.15, + "grad_norm": 0.9999446931006459, + "learning_rate": 1.9207852630394782e-05, + "loss": 0.1926, + "step": 3023 + }, + { + "epoch": 0.15, + "grad_norm": 1.1905647435368694, + "learning_rate": 1.920721007268687e-05, + "loss": 0.211, + "step": 3024 + }, + { + "epoch": 0.15, + "grad_norm": 0.9547156412821325, + "learning_rate": 1.9206567265232867e-05, + "loss": 0.2114, + "step": 3025 + }, + { + "epoch": 0.15, + "grad_norm": 1.7980185646523026, + "learning_rate": 1.920592420805021e-05, + "loss": 0.2466, + "step": 3026 + }, + { + "epoch": 0.15, + "grad_norm": 1.160882813189863, + "learning_rate": 1.9205280901156332e-05, + "loss": 0.203, + "step": 3027 + }, + { + "epoch": 0.15, + "grad_norm": 1.482553328977277, + "learning_rate": 1.9204637344568694e-05, + "loss": 0.2092, + "step": 3028 + }, + { + "epoch": 0.15, + "grad_norm": 1.8412106230735823, + "learning_rate": 1.920399353830475e-05, + "loss": 0.2172, + "step": 3029 + }, + { + "epoch": 0.15, + "grad_norm": 1.029983017735956, + "learning_rate": 1.920334948238196e-05, + "loss": 0.2332, + "step": 3030 + }, + { + "epoch": 0.15, + "grad_norm": 0.9229551534640521, + "learning_rate": 1.9202705176817794e-05, + "loss": 0.2158, + "step": 3031 + }, + { + "epoch": 0.15, + "grad_norm": 1.1423176753684943, + "learning_rate": 1.920206062162973e-05, + "loss": 0.2294, + "step": 3032 + }, + { + "epoch": 0.15, + "grad_norm": 0.9970226971429255, + "learning_rate": 1.9201415816835254e-05, + "loss": 0.2271, + "step": 3033 + }, + { + "epoch": 0.15, + "grad_norm": 1.1127263130100777, + "learning_rate": 1.9200770762451854e-05, + "loss": 0.2265, + "step": 3034 + }, + { + "epoch": 0.15, + "grad_norm": 0.9070022790029587, + "learning_rate": 1.9200125458497025e-05, + "loss": 0.226, + "step": 3035 + }, + { + "epoch": 0.15, + "grad_norm": 1.4140524654210664, + "learning_rate": 1.9199479904988277e-05, + "loss": 0.2193, + "step": 3036 + }, + { + "epoch": 0.15, + "grad_norm": 1.2310860599371727, + "learning_rate": 1.9198834101943115e-05, + "loss": 0.2257, + "step": 3037 + }, + { + "epoch": 0.15, + "grad_norm": 1.2545161274463466, + "learning_rate": 1.9198188049379055e-05, + "loss": 0.2317, + "step": 3038 + }, + { + "epoch": 0.15, + "grad_norm": 1.0034022129693965, + "learning_rate": 1.919754174731363e-05, + "loss": 0.235, + "step": 3039 + }, + { + "epoch": 0.15, + "grad_norm": 1.114243885076107, + "learning_rate": 1.9196895195764363e-05, + "loss": 0.243, + "step": 3040 + }, + { + "epoch": 0.15, + "grad_norm": 1.0156746704279098, + "learning_rate": 1.9196248394748794e-05, + "loss": 0.2241, + "step": 3041 + }, + { + "epoch": 0.15, + "grad_norm": 1.0509811949541532, + "learning_rate": 1.919560134428447e-05, + "loss": 0.2409, + "step": 3042 + }, + { + "epoch": 0.15, + "grad_norm": 0.9396601478975011, + "learning_rate": 1.919495404438894e-05, + "loss": 0.2226, + "step": 3043 + }, + { + "epoch": 0.15, + "grad_norm": 4.312189831656218, + "learning_rate": 1.919430649507976e-05, + "loss": 0.2228, + "step": 3044 + }, + { + "epoch": 0.15, + "grad_norm": 1.1669179726287418, + "learning_rate": 1.9193658696374498e-05, + "loss": 0.2098, + "step": 3045 + }, + { + "epoch": 0.15, + "grad_norm": 1.053822503469039, + "learning_rate": 1.9193010648290725e-05, + "loss": 0.219, + "step": 3046 + }, + { + "epoch": 0.15, + "grad_norm": 2.897292649110749, + "learning_rate": 1.919236235084602e-05, + "loss": 0.2268, + "step": 3047 + }, + { + "epoch": 0.15, + "grad_norm": 1.1825640096018137, + "learning_rate": 1.9191713804057965e-05, + "loss": 0.2277, + "step": 3048 + }, + { + "epoch": 0.16, + "grad_norm": 1.0258482915835343, + "learning_rate": 1.9191065007944153e-05, + "loss": 0.229, + "step": 3049 + }, + { + "epoch": 0.16, + "grad_norm": 1.0150255181566332, + "learning_rate": 1.9190415962522186e-05, + "loss": 0.2122, + "step": 3050 + }, + { + "epoch": 0.16, + "grad_norm": 1.221461841386964, + "learning_rate": 1.9189766667809667e-05, + "loss": 0.2288, + "step": 3051 + }, + { + "epoch": 0.16, + "grad_norm": 1.0912579290474678, + "learning_rate": 1.9189117123824208e-05, + "loss": 0.2157, + "step": 3052 + }, + { + "epoch": 0.16, + "grad_norm": 1.092173960673062, + "learning_rate": 1.9188467330583428e-05, + "loss": 0.2197, + "step": 3053 + }, + { + "epoch": 0.16, + "grad_norm": 1.2105441945424267, + "learning_rate": 1.918781728810495e-05, + "loss": 0.2308, + "step": 3054 + }, + { + "epoch": 0.16, + "grad_norm": 1.0786894300997212, + "learning_rate": 1.9187166996406413e-05, + "loss": 0.2213, + "step": 3055 + }, + { + "epoch": 0.16, + "grad_norm": 1.0532621519315912, + "learning_rate": 1.918651645550545e-05, + "loss": 0.2421, + "step": 3056 + }, + { + "epoch": 0.16, + "grad_norm": 0.9811524305129725, + "learning_rate": 1.9185865665419708e-05, + "loss": 0.2341, + "step": 3057 + }, + { + "epoch": 0.16, + "grad_norm": 1.0652674203246557, + "learning_rate": 1.9185214626166845e-05, + "loss": 0.2249, + "step": 3058 + }, + { + "epoch": 0.16, + "grad_norm": 0.9858359011987853, + "learning_rate": 1.9184563337764516e-05, + "loss": 0.22, + "step": 3059 + }, + { + "epoch": 0.16, + "grad_norm": 0.9134652124007379, + "learning_rate": 1.9183911800230384e-05, + "loss": 0.2318, + "step": 3060 + }, + { + "epoch": 0.16, + "grad_norm": 1.2143347015760846, + "learning_rate": 1.9183260013582126e-05, + "loss": 0.2309, + "step": 3061 + }, + { + "epoch": 0.16, + "grad_norm": 0.9738448412568219, + "learning_rate": 1.9182607977837424e-05, + "loss": 0.2371, + "step": 3062 + }, + { + "epoch": 0.16, + "grad_norm": 0.7966606408722735, + "learning_rate": 1.9181955693013962e-05, + "loss": 0.2107, + "step": 3063 + }, + { + "epoch": 0.16, + "grad_norm": 1.4371746934422849, + "learning_rate": 1.918130315912943e-05, + "loss": 0.2405, + "step": 3064 + }, + { + "epoch": 0.16, + "grad_norm": 0.9880342880142227, + "learning_rate": 1.9180650376201536e-05, + "loss": 0.2291, + "step": 3065 + }, + { + "epoch": 0.16, + "grad_norm": 1.0844528067687451, + "learning_rate": 1.917999734424798e-05, + "loss": 0.2339, + "step": 3066 + }, + { + "epoch": 0.16, + "grad_norm": 1.273986948076706, + "learning_rate": 1.9179344063286475e-05, + "loss": 0.2195, + "step": 3067 + }, + { + "epoch": 0.16, + "grad_norm": 0.8814553758365897, + "learning_rate": 1.917869053333475e-05, + "loss": 0.2086, + "step": 3068 + }, + { + "epoch": 0.16, + "grad_norm": 1.077182563935586, + "learning_rate": 1.9178036754410518e-05, + "loss": 0.2181, + "step": 3069 + }, + { + "epoch": 0.16, + "grad_norm": 0.9425617308554278, + "learning_rate": 1.9177382726531527e-05, + "loss": 0.2067, + "step": 3070 + }, + { + "epoch": 0.16, + "grad_norm": 1.013551998686744, + "learning_rate": 1.9176728449715506e-05, + "loss": 0.2191, + "step": 3071 + }, + { + "epoch": 0.16, + "grad_norm": 1.0285491895566228, + "learning_rate": 1.9176073923980212e-05, + "loss": 0.2442, + "step": 3072 + }, + { + "epoch": 0.16, + "grad_norm": 1.0493606012843992, + "learning_rate": 1.917541914934339e-05, + "loss": 0.206, + "step": 3073 + }, + { + "epoch": 0.16, + "grad_norm": 1.2117368635392565, + "learning_rate": 1.917476412582281e-05, + "loss": 0.2272, + "step": 3074 + }, + { + "epoch": 0.16, + "grad_norm": 1.5386868025963205, + "learning_rate": 1.9174108853436234e-05, + "loss": 0.2331, + "step": 3075 + }, + { + "epoch": 0.16, + "grad_norm": 1.311440053988242, + "learning_rate": 1.9173453332201436e-05, + "loss": 0.2102, + "step": 3076 + }, + { + "epoch": 0.16, + "grad_norm": 1.4581511839772705, + "learning_rate": 1.91727975621362e-05, + "loss": 0.2173, + "step": 3077 + }, + { + "epoch": 0.16, + "grad_norm": 0.9644674644204856, + "learning_rate": 1.917214154325831e-05, + "loss": 0.2111, + "step": 3078 + }, + { + "epoch": 0.16, + "grad_norm": 1.1068412843688058, + "learning_rate": 1.917148527558556e-05, + "loss": 0.2383, + "step": 3079 + }, + { + "epoch": 0.16, + "grad_norm": 1.1228544945994425, + "learning_rate": 1.917082875913576e-05, + "loss": 0.2257, + "step": 3080 + }, + { + "epoch": 0.16, + "grad_norm": 1.1290992454356428, + "learning_rate": 1.9170171993926708e-05, + "loss": 0.2302, + "step": 3081 + }, + { + "epoch": 0.16, + "grad_norm": 1.5171743857569169, + "learning_rate": 1.9169514979976224e-05, + "loss": 0.2399, + "step": 3082 + }, + { + "epoch": 0.16, + "grad_norm": 1.179035837795795, + "learning_rate": 1.9168857717302128e-05, + "loss": 0.2181, + "step": 3083 + }, + { + "epoch": 0.16, + "grad_norm": 0.9525048557256401, + "learning_rate": 1.9168200205922248e-05, + "loss": 0.2206, + "step": 3084 + }, + { + "epoch": 0.16, + "grad_norm": 0.9192591243102435, + "learning_rate": 1.916754244585442e-05, + "loss": 0.2114, + "step": 3085 + }, + { + "epoch": 0.16, + "grad_norm": 1.2411058008519829, + "learning_rate": 1.9166884437116486e-05, + "loss": 0.2307, + "step": 3086 + }, + { + "epoch": 0.16, + "grad_norm": 1.4298363741722075, + "learning_rate": 1.9166226179726294e-05, + "loss": 0.2412, + "step": 3087 + }, + { + "epoch": 0.16, + "grad_norm": 1.051321479238389, + "learning_rate": 1.9165567673701696e-05, + "loss": 0.2105, + "step": 3088 + }, + { + "epoch": 0.16, + "grad_norm": 1.1983398040289723, + "learning_rate": 1.9164908919060562e-05, + "loss": 0.2134, + "step": 3089 + }, + { + "epoch": 0.16, + "grad_norm": 0.9650459782933238, + "learning_rate": 1.9164249915820753e-05, + "loss": 0.2188, + "step": 3090 + }, + { + "epoch": 0.16, + "grad_norm": 1.1929879231929374, + "learning_rate": 1.9163590664000145e-05, + "loss": 0.2292, + "step": 3091 + }, + { + "epoch": 0.16, + "grad_norm": 0.9577148201922246, + "learning_rate": 1.916293116361663e-05, + "loss": 0.2178, + "step": 3092 + }, + { + "epoch": 0.16, + "grad_norm": 1.0158885393736095, + "learning_rate": 1.916227141468808e-05, + "loss": 0.2073, + "step": 3093 + }, + { + "epoch": 0.16, + "grad_norm": 1.2815901739893645, + "learning_rate": 1.9161611417232407e-05, + "loss": 0.1999, + "step": 3094 + }, + { + "epoch": 0.16, + "grad_norm": 1.197536993244301, + "learning_rate": 1.9160951171267508e-05, + "loss": 0.2276, + "step": 3095 + }, + { + "epoch": 0.16, + "grad_norm": 1.2323433276514397, + "learning_rate": 1.9160290676811288e-05, + "loss": 0.2112, + "step": 3096 + }, + { + "epoch": 0.16, + "grad_norm": 1.119380261757278, + "learning_rate": 1.9159629933881666e-05, + "loss": 0.2015, + "step": 3097 + }, + { + "epoch": 0.16, + "grad_norm": 1.045749768250887, + "learning_rate": 1.915896894249657e-05, + "loss": 0.2191, + "step": 3098 + }, + { + "epoch": 0.16, + "grad_norm": 0.9658151735731435, + "learning_rate": 1.9158307702673917e-05, + "loss": 0.2234, + "step": 3099 + }, + { + "epoch": 0.16, + "grad_norm": 1.9059897342936998, + "learning_rate": 1.9157646214431653e-05, + "loss": 0.2434, + "step": 3100 + }, + { + "epoch": 0.16, + "grad_norm": 1.1570549601192839, + "learning_rate": 1.9156984477787717e-05, + "loss": 0.2198, + "step": 3101 + }, + { + "epoch": 0.16, + "grad_norm": 1.142206977481465, + "learning_rate": 1.9156322492760064e-05, + "loss": 0.2396, + "step": 3102 + }, + { + "epoch": 0.16, + "grad_norm": 1.4612871008091388, + "learning_rate": 1.915566025936664e-05, + "loss": 0.2434, + "step": 3103 + }, + { + "epoch": 0.16, + "grad_norm": 1.161012691350098, + "learning_rate": 1.9154997777625418e-05, + "loss": 0.2316, + "step": 3104 + }, + { + "epoch": 0.16, + "grad_norm": 1.0674154619263267, + "learning_rate": 1.9154335047554364e-05, + "loss": 0.2143, + "step": 3105 + }, + { + "epoch": 0.16, + "grad_norm": 1.420357609574483, + "learning_rate": 1.9153672069171454e-05, + "loss": 0.222, + "step": 3106 + }, + { + "epoch": 0.16, + "grad_norm": 1.1620397000841887, + "learning_rate": 1.9153008842494673e-05, + "loss": 0.235, + "step": 3107 + }, + { + "epoch": 0.16, + "grad_norm": 1.1933801258599444, + "learning_rate": 1.9152345367542008e-05, + "loss": 0.221, + "step": 3108 + }, + { + "epoch": 0.16, + "grad_norm": 0.92162970400094, + "learning_rate": 1.915168164433146e-05, + "loss": 0.2049, + "step": 3109 + }, + { + "epoch": 0.16, + "grad_norm": 1.0999983754740923, + "learning_rate": 1.9151017672881032e-05, + "loss": 0.2288, + "step": 3110 + }, + { + "epoch": 0.16, + "grad_norm": 1.7372094733382606, + "learning_rate": 1.915035345320873e-05, + "loss": 0.262, + "step": 3111 + }, + { + "epoch": 0.16, + "grad_norm": 1.158268282961799, + "learning_rate": 1.9149688985332575e-05, + "loss": 0.2077, + "step": 3112 + }, + { + "epoch": 0.16, + "grad_norm": 1.0296742546521, + "learning_rate": 1.914902426927059e-05, + "loss": 0.2285, + "step": 3113 + }, + { + "epoch": 0.16, + "grad_norm": 0.8850667462372587, + "learning_rate": 1.9148359305040802e-05, + "loss": 0.223, + "step": 3114 + }, + { + "epoch": 0.16, + "grad_norm": 1.1648725757139722, + "learning_rate": 1.9147694092661254e-05, + "loss": 0.222, + "step": 3115 + }, + { + "epoch": 0.16, + "grad_norm": 1.1864890985053547, + "learning_rate": 1.914702863214999e-05, + "loss": 0.2313, + "step": 3116 + }, + { + "epoch": 0.16, + "grad_norm": 1.9597519082119035, + "learning_rate": 1.9146362923525053e-05, + "loss": 0.2367, + "step": 3117 + }, + { + "epoch": 0.16, + "grad_norm": 1.0768884920382897, + "learning_rate": 1.9145696966804505e-05, + "loss": 0.2137, + "step": 3118 + }, + { + "epoch": 0.16, + "grad_norm": 1.2765280153968082, + "learning_rate": 1.914503076200641e-05, + "loss": 0.2335, + "step": 3119 + }, + { + "epoch": 0.16, + "grad_norm": 0.891702028568063, + "learning_rate": 1.9144364309148842e-05, + "loss": 0.2163, + "step": 3120 + }, + { + "epoch": 0.16, + "grad_norm": 1.0401173324989785, + "learning_rate": 1.9143697608249873e-05, + "loss": 0.2443, + "step": 3121 + }, + { + "epoch": 0.16, + "grad_norm": 1.150292160815212, + "learning_rate": 1.914303065932759e-05, + "loss": 0.2442, + "step": 3122 + }, + { + "epoch": 0.16, + "grad_norm": 0.9544448291925663, + "learning_rate": 1.9142363462400087e-05, + "loss": 0.2108, + "step": 3123 + }, + { + "epoch": 0.16, + "grad_norm": 0.9610639977235427, + "learning_rate": 1.914169601748546e-05, + "loss": 0.2076, + "step": 3124 + }, + { + "epoch": 0.16, + "grad_norm": 1.0201336974255446, + "learning_rate": 1.9141028324601808e-05, + "loss": 0.2314, + "step": 3125 + }, + { + "epoch": 0.16, + "grad_norm": 1.189912911424515, + "learning_rate": 1.9140360383767248e-05, + "loss": 0.1984, + "step": 3126 + }, + { + "epoch": 0.16, + "grad_norm": 1.2961454181303826, + "learning_rate": 1.9139692194999894e-05, + "loss": 0.2383, + "step": 3127 + }, + { + "epoch": 0.16, + "grad_norm": 1.4274584800275039, + "learning_rate": 1.9139023758317875e-05, + "loss": 0.235, + "step": 3128 + }, + { + "epoch": 0.16, + "grad_norm": 0.8855836301343938, + "learning_rate": 1.913835507373932e-05, + "loss": 0.2156, + "step": 3129 + }, + { + "epoch": 0.16, + "grad_norm": 1.0934329027469938, + "learning_rate": 1.9137686141282368e-05, + "loss": 0.2233, + "step": 3130 + }, + { + "epoch": 0.16, + "grad_norm": 0.8189168623294839, + "learning_rate": 1.9137016960965164e-05, + "loss": 0.1943, + "step": 3131 + }, + { + "epoch": 0.16, + "grad_norm": 1.2129777212990402, + "learning_rate": 1.9136347532805855e-05, + "loss": 0.2486, + "step": 3132 + }, + { + "epoch": 0.16, + "grad_norm": 0.8058648848724157, + "learning_rate": 1.9135677856822606e-05, + "loss": 0.2119, + "step": 3133 + }, + { + "epoch": 0.16, + "grad_norm": 1.01784769091552, + "learning_rate": 1.9135007933033583e-05, + "loss": 0.2201, + "step": 3134 + }, + { + "epoch": 0.16, + "grad_norm": 0.9792483281736258, + "learning_rate": 1.913433776145695e-05, + "loss": 0.238, + "step": 3135 + }, + { + "epoch": 0.16, + "grad_norm": 1.0007834040170969, + "learning_rate": 1.9133667342110887e-05, + "loss": 0.1993, + "step": 3136 + }, + { + "epoch": 0.16, + "grad_norm": 1.1539922532606464, + "learning_rate": 1.9132996675013583e-05, + "loss": 0.2184, + "step": 3137 + }, + { + "epoch": 0.16, + "grad_norm": 1.1508044653257241, + "learning_rate": 1.913232576018323e-05, + "loss": 0.2313, + "step": 3138 + }, + { + "epoch": 0.16, + "grad_norm": 1.069508988072164, + "learning_rate": 1.9131654597638024e-05, + "loss": 0.2287, + "step": 3139 + }, + { + "epoch": 0.16, + "grad_norm": 1.374642745001945, + "learning_rate": 1.9130983187396174e-05, + "loss": 0.2417, + "step": 3140 + }, + { + "epoch": 0.16, + "grad_norm": 1.131383765935505, + "learning_rate": 1.9130311529475886e-05, + "loss": 0.2367, + "step": 3141 + }, + { + "epoch": 0.16, + "grad_norm": 0.9307771276706998, + "learning_rate": 1.9129639623895382e-05, + "loss": 0.2307, + "step": 3142 + }, + { + "epoch": 0.16, + "grad_norm": 1.3643681521091495, + "learning_rate": 1.9128967470672887e-05, + "loss": 0.2675, + "step": 3143 + }, + { + "epoch": 0.16, + "grad_norm": 1.0992636255726658, + "learning_rate": 1.9128295069826636e-05, + "loss": 0.238, + "step": 3144 + }, + { + "epoch": 0.16, + "grad_norm": 0.8847005171553731, + "learning_rate": 1.9127622421374866e-05, + "loss": 0.2194, + "step": 3145 + }, + { + "epoch": 0.16, + "grad_norm": 1.2081783271673872, + "learning_rate": 1.912694952533582e-05, + "loss": 0.2383, + "step": 3146 + }, + { + "epoch": 0.16, + "grad_norm": 0.9796553656595477, + "learning_rate": 1.9126276381727752e-05, + "loss": 0.2247, + "step": 3147 + }, + { + "epoch": 0.16, + "grad_norm": 0.8756885079427202, + "learning_rate": 1.9125602990568925e-05, + "loss": 0.1946, + "step": 3148 + }, + { + "epoch": 0.16, + "grad_norm": 1.2317980453228476, + "learning_rate": 1.91249293518776e-05, + "loss": 0.2202, + "step": 3149 + }, + { + "epoch": 0.16, + "grad_norm": 1.0438633428794166, + "learning_rate": 1.9124255465672053e-05, + "loss": 0.2118, + "step": 3150 + }, + { + "epoch": 0.16, + "grad_norm": 0.9645029574938986, + "learning_rate": 1.9123581331970558e-05, + "loss": 0.2419, + "step": 3151 + }, + { + "epoch": 0.16, + "grad_norm": 1.0684720842692546, + "learning_rate": 1.9122906950791406e-05, + "loss": 0.2385, + "step": 3152 + }, + { + "epoch": 0.16, + "grad_norm": 1.3288111295644136, + "learning_rate": 1.9122232322152883e-05, + "loss": 0.1971, + "step": 3153 + }, + { + "epoch": 0.16, + "grad_norm": 1.033806895095567, + "learning_rate": 1.91215574460733e-05, + "loss": 0.2276, + "step": 3154 + }, + { + "epoch": 0.16, + "grad_norm": 3.0238508770319936, + "learning_rate": 1.9120882322570952e-05, + "loss": 0.2428, + "step": 3155 + }, + { + "epoch": 0.16, + "grad_norm": 1.03641331436747, + "learning_rate": 1.912020695166416e-05, + "loss": 0.2319, + "step": 3156 + }, + { + "epoch": 0.16, + "grad_norm": 1.5371657052324845, + "learning_rate": 1.9119531333371233e-05, + "loss": 0.2649, + "step": 3157 + }, + { + "epoch": 0.16, + "grad_norm": 0.9765508272616171, + "learning_rate": 1.9118855467710507e-05, + "loss": 0.1991, + "step": 3158 + }, + { + "epoch": 0.16, + "grad_norm": 1.1342684309424567, + "learning_rate": 1.911817935470031e-05, + "loss": 0.2457, + "step": 3159 + }, + { + "epoch": 0.16, + "grad_norm": 1.332260724706468, + "learning_rate": 1.9117502994358984e-05, + "loss": 0.2313, + "step": 3160 + }, + { + "epoch": 0.16, + "grad_norm": 1.2189330783389225, + "learning_rate": 1.9116826386704873e-05, + "loss": 0.2363, + "step": 3161 + }, + { + "epoch": 0.16, + "grad_norm": 2.037243374045091, + "learning_rate": 1.9116149531756333e-05, + "loss": 0.2506, + "step": 3162 + }, + { + "epoch": 0.16, + "grad_norm": 1.4394206019917581, + "learning_rate": 1.9115472429531722e-05, + "loss": 0.2078, + "step": 3163 + }, + { + "epoch": 0.16, + "grad_norm": 1.2227594911150255, + "learning_rate": 1.911479508004941e-05, + "loss": 0.2286, + "step": 3164 + }, + { + "epoch": 0.16, + "grad_norm": 0.8577485490636886, + "learning_rate": 1.911411748332776e-05, + "loss": 0.2209, + "step": 3165 + }, + { + "epoch": 0.16, + "grad_norm": 1.2941653244360138, + "learning_rate": 1.9113439639385164e-05, + "loss": 0.2426, + "step": 3166 + }, + { + "epoch": 0.16, + "grad_norm": 1.0202153420555122, + "learning_rate": 1.9112761548239996e-05, + "loss": 0.2191, + "step": 3167 + }, + { + "epoch": 0.16, + "grad_norm": 1.6811743560837742, + "learning_rate": 1.911208320991066e-05, + "loss": 0.2217, + "step": 3168 + }, + { + "epoch": 0.16, + "grad_norm": 1.4294763694872188, + "learning_rate": 1.9111404624415554e-05, + "loss": 0.2437, + "step": 3169 + }, + { + "epoch": 0.16, + "grad_norm": 2.3818248282341945, + "learning_rate": 1.9110725791773085e-05, + "loss": 0.2393, + "step": 3170 + }, + { + "epoch": 0.16, + "grad_norm": 0.9783655888403653, + "learning_rate": 1.911004671200166e-05, + "loss": 0.2074, + "step": 3171 + }, + { + "epoch": 0.16, + "grad_norm": 1.3146336646097434, + "learning_rate": 1.9109367385119705e-05, + "loss": 0.2205, + "step": 3172 + }, + { + "epoch": 0.16, + "grad_norm": 0.8869805809256699, + "learning_rate": 1.9108687811145645e-05, + "loss": 0.2042, + "step": 3173 + }, + { + "epoch": 0.16, + "grad_norm": 1.0622836441472836, + "learning_rate": 1.9108007990097913e-05, + "loss": 0.2256, + "step": 3174 + }, + { + "epoch": 0.16, + "grad_norm": 0.9475793238096982, + "learning_rate": 1.910732792199495e-05, + "loss": 0.2319, + "step": 3175 + }, + { + "epoch": 0.16, + "grad_norm": 1.0748427136473677, + "learning_rate": 1.9106647606855203e-05, + "loss": 0.2387, + "step": 3176 + }, + { + "epoch": 0.16, + "grad_norm": 1.0525166947996967, + "learning_rate": 1.9105967044697125e-05, + "loss": 0.2305, + "step": 3177 + }, + { + "epoch": 0.16, + "grad_norm": 1.0904798663674182, + "learning_rate": 1.9105286235539178e-05, + "loss": 0.2356, + "step": 3178 + }, + { + "epoch": 0.16, + "grad_norm": 1.1674600619281295, + "learning_rate": 1.9104605179399827e-05, + "loss": 0.2236, + "step": 3179 + }, + { + "epoch": 0.16, + "grad_norm": 1.090127583465506, + "learning_rate": 1.9103923876297544e-05, + "loss": 0.2298, + "step": 3180 + }, + { + "epoch": 0.16, + "grad_norm": 0.8327437385704604, + "learning_rate": 1.9103242326250815e-05, + "loss": 0.2066, + "step": 3181 + }, + { + "epoch": 0.16, + "grad_norm": 1.1699184536903102, + "learning_rate": 1.9102560529278122e-05, + "loss": 0.234, + "step": 3182 + }, + { + "epoch": 0.16, + "grad_norm": 1.1169478692759198, + "learning_rate": 1.910187848539796e-05, + "loss": 0.2445, + "step": 3183 + }, + { + "epoch": 0.16, + "grad_norm": 0.8226074400974731, + "learning_rate": 1.9101196194628834e-05, + "loss": 0.1799, + "step": 3184 + }, + { + "epoch": 0.16, + "grad_norm": 0.9114950516377383, + "learning_rate": 1.9100513656989244e-05, + "loss": 0.2268, + "step": 3185 + }, + { + "epoch": 0.16, + "grad_norm": 0.9785174645275201, + "learning_rate": 1.9099830872497707e-05, + "loss": 0.2279, + "step": 3186 + }, + { + "epoch": 0.16, + "grad_norm": 0.8498906357021067, + "learning_rate": 1.909914784117274e-05, + "loss": 0.1969, + "step": 3187 + }, + { + "epoch": 0.16, + "grad_norm": 1.16707129988209, + "learning_rate": 1.9098464563032878e-05, + "loss": 0.243, + "step": 3188 + }, + { + "epoch": 0.16, + "grad_norm": 1.1269165149984086, + "learning_rate": 1.9097781038096652e-05, + "loss": 0.2092, + "step": 3189 + }, + { + "epoch": 0.16, + "grad_norm": 1.2750144293869767, + "learning_rate": 1.9097097266382598e-05, + "loss": 0.2094, + "step": 3190 + }, + { + "epoch": 0.16, + "grad_norm": 1.0660217584543055, + "learning_rate": 1.909641324790927e-05, + "loss": 0.2109, + "step": 3191 + }, + { + "epoch": 0.16, + "grad_norm": 1.122843501272006, + "learning_rate": 1.909572898269522e-05, + "loss": 0.2205, + "step": 3192 + }, + { + "epoch": 0.16, + "grad_norm": 1.1216680184980137, + "learning_rate": 1.9095044470759004e-05, + "loss": 0.2398, + "step": 3193 + }, + { + "epoch": 0.16, + "grad_norm": 2.32198193611617, + "learning_rate": 1.9094359712119192e-05, + "loss": 0.2416, + "step": 3194 + }, + { + "epoch": 0.16, + "grad_norm": 1.2047762964034372, + "learning_rate": 1.9093674706794363e-05, + "loss": 0.2091, + "step": 3195 + }, + { + "epoch": 0.16, + "grad_norm": 1.0298224869937613, + "learning_rate": 1.9092989454803094e-05, + "loss": 0.2606, + "step": 3196 + }, + { + "epoch": 0.16, + "grad_norm": 1.5457240549213624, + "learning_rate": 1.909230395616397e-05, + "loss": 0.2052, + "step": 3197 + }, + { + "epoch": 0.16, + "grad_norm": 0.9502655176707185, + "learning_rate": 1.909161821089559e-05, + "loss": 0.2274, + "step": 3198 + }, + { + "epoch": 0.16, + "grad_norm": 1.01536728621932, + "learning_rate": 1.9090932219016548e-05, + "loss": 0.2049, + "step": 3199 + }, + { + "epoch": 0.16, + "grad_norm": 2.2487157303403302, + "learning_rate": 1.909024598054546e-05, + "loss": 0.2306, + "step": 3200 + }, + { + "epoch": 0.16, + "grad_norm": 1.2078412801534302, + "learning_rate": 1.9089559495500934e-05, + "loss": 0.2371, + "step": 3201 + }, + { + "epoch": 0.16, + "grad_norm": 1.0483351379753305, + "learning_rate": 1.90888727639016e-05, + "loss": 0.2389, + "step": 3202 + }, + { + "epoch": 0.16, + "grad_norm": 1.128869956503603, + "learning_rate": 1.908818578576607e-05, + "loss": 0.235, + "step": 3203 + }, + { + "epoch": 0.16, + "grad_norm": 1.0102531146450082, + "learning_rate": 1.9087498561112992e-05, + "loss": 0.211, + "step": 3204 + }, + { + "epoch": 0.16, + "grad_norm": 1.0299810783657681, + "learning_rate": 1.9086811089961e-05, + "loss": 0.2387, + "step": 3205 + }, + { + "epoch": 0.16, + "grad_norm": 0.9802814387541344, + "learning_rate": 1.9086123372328748e-05, + "loss": 0.2237, + "step": 3206 + }, + { + "epoch": 0.16, + "grad_norm": 0.9266592538656312, + "learning_rate": 1.9085435408234882e-05, + "loss": 0.2249, + "step": 3207 + }, + { + "epoch": 0.16, + "grad_norm": 0.9220119641068419, + "learning_rate": 1.9084747197698068e-05, + "loss": 0.2266, + "step": 3208 + }, + { + "epoch": 0.16, + "grad_norm": 1.1514474631890381, + "learning_rate": 1.9084058740736974e-05, + "loss": 0.2119, + "step": 3209 + }, + { + "epoch": 0.16, + "grad_norm": 1.1158632704978646, + "learning_rate": 1.9083370037370276e-05, + "loss": 0.219, + "step": 3210 + }, + { + "epoch": 0.16, + "grad_norm": 1.0141902460312189, + "learning_rate": 1.908268108761665e-05, + "loss": 0.2253, + "step": 3211 + }, + { + "epoch": 0.16, + "grad_norm": 1.6050271110549044, + "learning_rate": 1.9081991891494787e-05, + "loss": 0.2286, + "step": 3212 + }, + { + "epoch": 0.16, + "grad_norm": 1.2109911266896056, + "learning_rate": 1.908130244902338e-05, + "loss": 0.2159, + "step": 3213 + }, + { + "epoch": 0.16, + "grad_norm": 1.3546715824770614, + "learning_rate": 1.9080612760221134e-05, + "loss": 0.2547, + "step": 3214 + }, + { + "epoch": 0.16, + "grad_norm": 1.7959685045832672, + "learning_rate": 1.907992282510675e-05, + "loss": 0.2115, + "step": 3215 + }, + { + "epoch": 0.16, + "grad_norm": 1.5087742014255319, + "learning_rate": 1.9079232643698947e-05, + "loss": 0.2251, + "step": 3216 + }, + { + "epoch": 0.16, + "grad_norm": 0.924666360606091, + "learning_rate": 1.907854221601645e-05, + "loss": 0.2053, + "step": 3217 + }, + { + "epoch": 0.16, + "grad_norm": 0.9530973107451134, + "learning_rate": 1.9077851542077978e-05, + "loss": 0.2252, + "step": 3218 + }, + { + "epoch": 0.16, + "grad_norm": 0.950666139394692, + "learning_rate": 1.9077160621902274e-05, + "loss": 0.2505, + "step": 3219 + }, + { + "epoch": 0.16, + "grad_norm": 1.5103529027736473, + "learning_rate": 1.9076469455508072e-05, + "loss": 0.2258, + "step": 3220 + }, + { + "epoch": 0.16, + "grad_norm": 1.0096711939190677, + "learning_rate": 1.9075778042914126e-05, + "loss": 0.2177, + "step": 3221 + }, + { + "epoch": 0.16, + "grad_norm": 1.0032885320723282, + "learning_rate": 1.9075086384139187e-05, + "loss": 0.2226, + "step": 3222 + }, + { + "epoch": 0.16, + "grad_norm": 1.206005456150721, + "learning_rate": 1.9074394479202017e-05, + "loss": 0.2112, + "step": 3223 + }, + { + "epoch": 0.16, + "grad_norm": 1.150609400759136, + "learning_rate": 1.9073702328121382e-05, + "loss": 0.2051, + "step": 3224 + }, + { + "epoch": 0.16, + "grad_norm": 1.1756724863924575, + "learning_rate": 1.9073009930916063e-05, + "loss": 0.2261, + "step": 3225 + }, + { + "epoch": 0.16, + "grad_norm": 1.5673489315487212, + "learning_rate": 1.9072317287604837e-05, + "loss": 0.1954, + "step": 3226 + }, + { + "epoch": 0.16, + "grad_norm": 1.568033424099215, + "learning_rate": 1.907162439820649e-05, + "loss": 0.2572, + "step": 3227 + }, + { + "epoch": 0.16, + "grad_norm": 0.9642478242450317, + "learning_rate": 1.907093126273982e-05, + "loss": 0.2057, + "step": 3228 + }, + { + "epoch": 0.16, + "grad_norm": 6.618424478919842, + "learning_rate": 1.907023788122363e-05, + "loss": 0.2433, + "step": 3229 + }, + { + "epoch": 0.16, + "grad_norm": 0.9986359526257098, + "learning_rate": 1.906954425367672e-05, + "loss": 0.2171, + "step": 3230 + }, + { + "epoch": 0.16, + "grad_norm": 1.1235099739225607, + "learning_rate": 1.906885038011791e-05, + "loss": 0.2228, + "step": 3231 + }, + { + "epoch": 0.16, + "grad_norm": 1.154482508979427, + "learning_rate": 1.906815626056602e-05, + "loss": 0.2396, + "step": 3232 + }, + { + "epoch": 0.16, + "grad_norm": 1.3890043410430053, + "learning_rate": 1.9067461895039888e-05, + "loss": 0.2621, + "step": 3233 + }, + { + "epoch": 0.16, + "grad_norm": 1.358399667722323, + "learning_rate": 1.906676728355833e-05, + "loss": 0.2182, + "step": 3234 + }, + { + "epoch": 0.16, + "grad_norm": 1.0794146703989658, + "learning_rate": 1.9066072426140203e-05, + "loss": 0.2255, + "step": 3235 + }, + { + "epoch": 0.16, + "grad_norm": 0.9904688704649632, + "learning_rate": 1.9065377322804347e-05, + "loss": 0.227, + "step": 3236 + }, + { + "epoch": 0.16, + "grad_norm": 1.558693059373171, + "learning_rate": 1.9064681973569622e-05, + "loss": 0.2473, + "step": 3237 + }, + { + "epoch": 0.16, + "grad_norm": 0.9947430603467428, + "learning_rate": 1.9063986378454884e-05, + "loss": 0.2325, + "step": 3238 + }, + { + "epoch": 0.16, + "grad_norm": 1.2265249569014518, + "learning_rate": 1.9063290537479004e-05, + "loss": 0.2128, + "step": 3239 + }, + { + "epoch": 0.16, + "grad_norm": 1.0611571246887315, + "learning_rate": 1.9062594450660857e-05, + "loss": 0.2107, + "step": 3240 + }, + { + "epoch": 0.16, + "grad_norm": 1.1814888608068674, + "learning_rate": 1.9061898118019326e-05, + "loss": 0.2456, + "step": 3241 + }, + { + "epoch": 0.16, + "grad_norm": 2.7326384660130874, + "learning_rate": 1.9061201539573292e-05, + "loss": 0.2249, + "step": 3242 + }, + { + "epoch": 0.16, + "grad_norm": 1.0426774919663655, + "learning_rate": 1.9060504715341654e-05, + "loss": 0.2328, + "step": 3243 + }, + { + "epoch": 0.16, + "grad_norm": 1.318233062490341, + "learning_rate": 1.905980764534332e-05, + "loss": 0.2281, + "step": 3244 + }, + { + "epoch": 0.17, + "grad_norm": 1.2924652569164976, + "learning_rate": 1.9059110329597185e-05, + "loss": 0.2243, + "step": 3245 + }, + { + "epoch": 0.17, + "grad_norm": 1.2512314642191262, + "learning_rate": 1.9058412768122175e-05, + "loss": 0.2308, + "step": 3246 + }, + { + "epoch": 0.17, + "grad_norm": 1.083747561104402, + "learning_rate": 1.9057714960937205e-05, + "loss": 0.2267, + "step": 3247 + }, + { + "epoch": 0.17, + "grad_norm": 1.0294046112909658, + "learning_rate": 1.9057016908061205e-05, + "loss": 0.2287, + "step": 3248 + }, + { + "epoch": 0.17, + "grad_norm": 1.3535822475892574, + "learning_rate": 1.905631860951311e-05, + "loss": 0.2515, + "step": 3249 + }, + { + "epoch": 0.17, + "grad_norm": 2.7848478161203745, + "learning_rate": 1.905562006531186e-05, + "loss": 0.2369, + "step": 3250 + }, + { + "epoch": 0.17, + "grad_norm": 1.3252968240520733, + "learning_rate": 1.905492127547641e-05, + "loss": 0.2383, + "step": 3251 + }, + { + "epoch": 0.17, + "grad_norm": 1.093520045965069, + "learning_rate": 1.9054222240025706e-05, + "loss": 0.2449, + "step": 3252 + }, + { + "epoch": 0.17, + "grad_norm": 1.0924413513876863, + "learning_rate": 1.905352295897871e-05, + "loss": 0.2235, + "step": 3253 + }, + { + "epoch": 0.17, + "grad_norm": 1.1210881431514452, + "learning_rate": 1.9052823432354396e-05, + "loss": 0.2335, + "step": 3254 + }, + { + "epoch": 0.17, + "grad_norm": 1.2761817272497882, + "learning_rate": 1.905212366017173e-05, + "loss": 0.2387, + "step": 3255 + }, + { + "epoch": 0.17, + "grad_norm": 1.6487483979669733, + "learning_rate": 1.90514236424497e-05, + "loss": 0.213, + "step": 3256 + }, + { + "epoch": 0.17, + "grad_norm": 1.0463799652349561, + "learning_rate": 1.9050723379207296e-05, + "loss": 0.2427, + "step": 3257 + }, + { + "epoch": 0.17, + "grad_norm": 0.9613364989367936, + "learning_rate": 1.9050022870463507e-05, + "loss": 0.2282, + "step": 3258 + }, + { + "epoch": 0.17, + "grad_norm": 1.0254909889735437, + "learning_rate": 1.9049322116237336e-05, + "loss": 0.2341, + "step": 3259 + }, + { + "epoch": 0.17, + "grad_norm": 1.4414193791955492, + "learning_rate": 1.9048621116547793e-05, + "loss": 0.2174, + "step": 3260 + }, + { + "epoch": 0.17, + "grad_norm": 1.5035822960911616, + "learning_rate": 1.904791987141389e-05, + "loss": 0.2139, + "step": 3261 + }, + { + "epoch": 0.17, + "grad_norm": 0.9153873984869315, + "learning_rate": 1.9047218380854652e-05, + "loss": 0.2077, + "step": 3262 + }, + { + "epoch": 0.17, + "grad_norm": 1.204895202902941, + "learning_rate": 1.90465166448891e-05, + "loss": 0.2203, + "step": 3263 + }, + { + "epoch": 0.17, + "grad_norm": 1.235198349138494, + "learning_rate": 1.9045814663536275e-05, + "loss": 0.2553, + "step": 3264 + }, + { + "epoch": 0.17, + "grad_norm": 1.6304314205413217, + "learning_rate": 1.9045112436815217e-05, + "loss": 0.2391, + "step": 3265 + }, + { + "epoch": 0.17, + "grad_norm": 0.9506320321314072, + "learning_rate": 1.904440996474497e-05, + "loss": 0.2381, + "step": 3266 + }, + { + "epoch": 0.17, + "grad_norm": 1.1184506110462116, + "learning_rate": 1.9043707247344596e-05, + "loss": 0.212, + "step": 3267 + }, + { + "epoch": 0.17, + "grad_norm": 1.058991829044029, + "learning_rate": 1.904300428463315e-05, + "loss": 0.2115, + "step": 3268 + }, + { + "epoch": 0.17, + "grad_norm": 1.5336373043447653, + "learning_rate": 1.9042301076629704e-05, + "loss": 0.2197, + "step": 3269 + }, + { + "epoch": 0.17, + "grad_norm": 1.1747866988506088, + "learning_rate": 1.9041597623353327e-05, + "loss": 0.2236, + "step": 3270 + }, + { + "epoch": 0.17, + "grad_norm": 1.3912454174431848, + "learning_rate": 1.9040893924823108e-05, + "loss": 0.2098, + "step": 3271 + }, + { + "epoch": 0.17, + "grad_norm": 1.2756488798101397, + "learning_rate": 1.9040189981058128e-05, + "loss": 0.2524, + "step": 3272 + }, + { + "epoch": 0.17, + "grad_norm": 1.8518406486526697, + "learning_rate": 1.9039485792077485e-05, + "loss": 0.2237, + "step": 3273 + }, + { + "epoch": 0.17, + "grad_norm": 1.2949126687880619, + "learning_rate": 1.903878135790028e-05, + "loss": 0.2297, + "step": 3274 + }, + { + "epoch": 0.17, + "grad_norm": 1.193263514417983, + "learning_rate": 1.903807667854562e-05, + "loss": 0.2105, + "step": 3275 + }, + { + "epoch": 0.17, + "grad_norm": 1.3225038029048628, + "learning_rate": 1.9037371754032618e-05, + "loss": 0.2345, + "step": 3276 + }, + { + "epoch": 0.17, + "grad_norm": 1.1300634224130286, + "learning_rate": 1.9036666584380398e-05, + "loss": 0.1938, + "step": 3277 + }, + { + "epoch": 0.17, + "grad_norm": 0.9816663216784832, + "learning_rate": 1.9035961169608087e-05, + "loss": 0.247, + "step": 3278 + }, + { + "epoch": 0.17, + "grad_norm": 1.5861016137562751, + "learning_rate": 1.9035255509734816e-05, + "loss": 0.2081, + "step": 3279 + }, + { + "epoch": 0.17, + "grad_norm": 1.082690305307336, + "learning_rate": 1.903454960477973e-05, + "loss": 0.219, + "step": 3280 + }, + { + "epoch": 0.17, + "grad_norm": 1.3516616270012032, + "learning_rate": 1.903384345476198e-05, + "loss": 0.1982, + "step": 3281 + }, + { + "epoch": 0.17, + "grad_norm": 1.1284573755467393, + "learning_rate": 1.9033137059700712e-05, + "loss": 0.2256, + "step": 3282 + }, + { + "epoch": 0.17, + "grad_norm": 1.0071641984300603, + "learning_rate": 1.9032430419615094e-05, + "loss": 0.2387, + "step": 3283 + }, + { + "epoch": 0.17, + "grad_norm": 0.9908369990968336, + "learning_rate": 1.9031723534524287e-05, + "loss": 0.2043, + "step": 3284 + }, + { + "epoch": 0.17, + "grad_norm": 0.9434326851921663, + "learning_rate": 1.903101640444747e-05, + "loss": 0.1993, + "step": 3285 + }, + { + "epoch": 0.17, + "grad_norm": 0.9552024564147497, + "learning_rate": 1.9030309029403825e-05, + "loss": 0.2083, + "step": 3286 + }, + { + "epoch": 0.17, + "grad_norm": 1.7097234946706654, + "learning_rate": 1.9029601409412536e-05, + "loss": 0.2347, + "step": 3287 + }, + { + "epoch": 0.17, + "grad_norm": 1.2451337932069761, + "learning_rate": 1.90288935444928e-05, + "loss": 0.2595, + "step": 3288 + }, + { + "epoch": 0.17, + "grad_norm": 1.1152757703933747, + "learning_rate": 1.9028185434663814e-05, + "loss": 0.221, + "step": 3289 + }, + { + "epoch": 0.17, + "grad_norm": 0.8721790062954682, + "learning_rate": 1.902747707994479e-05, + "loss": 0.2246, + "step": 3290 + }, + { + "epoch": 0.17, + "grad_norm": 0.9772443493974295, + "learning_rate": 1.902676848035494e-05, + "loss": 0.2276, + "step": 3291 + }, + { + "epoch": 0.17, + "grad_norm": 1.1658165944676975, + "learning_rate": 1.9026059635913484e-05, + "loss": 0.2275, + "step": 3292 + }, + { + "epoch": 0.17, + "grad_norm": 1.4208291073865067, + "learning_rate": 1.9025350546639654e-05, + "loss": 0.2043, + "step": 3293 + }, + { + "epoch": 0.17, + "grad_norm": 1.1546386268942097, + "learning_rate": 1.902464121255268e-05, + "loss": 0.2295, + "step": 3294 + }, + { + "epoch": 0.17, + "grad_norm": 0.9581823924399043, + "learning_rate": 1.90239316336718e-05, + "loss": 0.2291, + "step": 3295 + }, + { + "epoch": 0.17, + "grad_norm": 0.9678091509808123, + "learning_rate": 1.9023221810016268e-05, + "loss": 0.2213, + "step": 3296 + }, + { + "epoch": 0.17, + "grad_norm": 0.9801111140281928, + "learning_rate": 1.9022511741605334e-05, + "loss": 0.2178, + "step": 3297 + }, + { + "epoch": 0.17, + "grad_norm": 1.2702708802151763, + "learning_rate": 1.9021801428458258e-05, + "loss": 0.2208, + "step": 3298 + }, + { + "epoch": 0.17, + "grad_norm": 0.9622585836212532, + "learning_rate": 1.902109087059431e-05, + "loss": 0.2191, + "step": 3299 + }, + { + "epoch": 0.17, + "grad_norm": 1.0074093143700964, + "learning_rate": 1.902038006803276e-05, + "loss": 0.2198, + "step": 3300 + }, + { + "epoch": 0.17, + "grad_norm": 1.0348778645975119, + "learning_rate": 1.9019669020792896e-05, + "loss": 0.2068, + "step": 3301 + }, + { + "epoch": 0.17, + "grad_norm": 0.878794499122924, + "learning_rate": 1.9018957728893997e-05, + "loss": 0.1931, + "step": 3302 + }, + { + "epoch": 0.17, + "grad_norm": 0.9688383797935949, + "learning_rate": 1.901824619235536e-05, + "loss": 0.2115, + "step": 3303 + }, + { + "epoch": 0.17, + "grad_norm": 1.1918226103761322, + "learning_rate": 1.9017534411196286e-05, + "loss": 0.2262, + "step": 3304 + }, + { + "epoch": 0.17, + "grad_norm": 0.909382788046008, + "learning_rate": 1.9016822385436086e-05, + "loss": 0.2242, + "step": 3305 + }, + { + "epoch": 0.17, + "grad_norm": 0.8945541647722558, + "learning_rate": 1.9016110115094064e-05, + "loss": 0.2313, + "step": 3306 + }, + { + "epoch": 0.17, + "grad_norm": 0.9286600500193802, + "learning_rate": 1.9015397600189548e-05, + "loss": 0.2113, + "step": 3307 + }, + { + "epoch": 0.17, + "grad_norm": 1.0063058750169587, + "learning_rate": 1.9014684840741863e-05, + "loss": 0.2239, + "step": 3308 + }, + { + "epoch": 0.17, + "grad_norm": 1.1075815585913915, + "learning_rate": 1.9013971836770342e-05, + "loss": 0.2347, + "step": 3309 + }, + { + "epoch": 0.17, + "grad_norm": 0.9537824608305381, + "learning_rate": 1.9013258588294324e-05, + "loss": 0.2021, + "step": 3310 + }, + { + "epoch": 0.17, + "grad_norm": 1.3363127593510533, + "learning_rate": 1.9012545095333163e-05, + "loss": 0.2246, + "step": 3311 + }, + { + "epoch": 0.17, + "grad_norm": 1.0966856534204155, + "learning_rate": 1.9011831357906204e-05, + "loss": 0.2427, + "step": 3312 + }, + { + "epoch": 0.17, + "grad_norm": 1.0293981595871236, + "learning_rate": 1.901111737603281e-05, + "loss": 0.2134, + "step": 3313 + }, + { + "epoch": 0.17, + "grad_norm": 1.2481574644414424, + "learning_rate": 1.9010403149732347e-05, + "loss": 0.2696, + "step": 3314 + }, + { + "epoch": 0.17, + "grad_norm": 0.9090088403265032, + "learning_rate": 1.900968867902419e-05, + "loss": 0.214, + "step": 3315 + }, + { + "epoch": 0.17, + "grad_norm": 1.228778878223172, + "learning_rate": 1.9008973963927722e-05, + "loss": 0.2327, + "step": 3316 + }, + { + "epoch": 0.17, + "grad_norm": 1.6815691699994177, + "learning_rate": 1.9008259004462326e-05, + "loss": 0.2281, + "step": 3317 + }, + { + "epoch": 0.17, + "grad_norm": 1.0204039327333942, + "learning_rate": 1.9007543800647395e-05, + "loss": 0.2123, + "step": 3318 + }, + { + "epoch": 0.17, + "grad_norm": 0.9460619920119041, + "learning_rate": 1.9006828352502328e-05, + "loss": 0.257, + "step": 3319 + }, + { + "epoch": 0.17, + "grad_norm": 1.270528757519636, + "learning_rate": 1.9006112660046535e-05, + "loss": 0.2336, + "step": 3320 + }, + { + "epoch": 0.17, + "grad_norm": 3.407866180437645, + "learning_rate": 1.9005396723299426e-05, + "loss": 0.2366, + "step": 3321 + }, + { + "epoch": 0.17, + "grad_norm": 1.1886401077079094, + "learning_rate": 1.9004680542280423e-05, + "loss": 0.2178, + "step": 3322 + }, + { + "epoch": 0.17, + "grad_norm": 1.3281765035685824, + "learning_rate": 1.900396411700895e-05, + "loss": 0.2115, + "step": 3323 + }, + { + "epoch": 0.17, + "grad_norm": 1.3100309021873207, + "learning_rate": 1.9003247447504447e-05, + "loss": 0.2379, + "step": 3324 + }, + { + "epoch": 0.17, + "grad_norm": 2.9845075767531712, + "learning_rate": 1.900253053378634e-05, + "loss": 0.233, + "step": 3325 + }, + { + "epoch": 0.17, + "grad_norm": 2.6700218837971317, + "learning_rate": 1.9001813375874093e-05, + "loss": 0.2322, + "step": 3326 + }, + { + "epoch": 0.17, + "grad_norm": 1.0300567032531562, + "learning_rate": 1.900109597378714e-05, + "loss": 0.2444, + "step": 3327 + }, + { + "epoch": 0.17, + "grad_norm": 1.10901757705295, + "learning_rate": 1.900037832754496e-05, + "loss": 0.2259, + "step": 3328 + }, + { + "epoch": 0.17, + "grad_norm": 0.9447163714069051, + "learning_rate": 1.8999660437167003e-05, + "loss": 0.2366, + "step": 3329 + }, + { + "epoch": 0.17, + "grad_norm": 1.307139230075517, + "learning_rate": 1.8998942302672753e-05, + "loss": 0.256, + "step": 3330 + }, + { + "epoch": 0.17, + "grad_norm": 1.371731200627975, + "learning_rate": 1.8998223924081683e-05, + "loss": 0.2393, + "step": 3331 + }, + { + "epoch": 0.17, + "grad_norm": 1.1905128490939072, + "learning_rate": 1.899750530141328e-05, + "loss": 0.2536, + "step": 3332 + }, + { + "epoch": 0.17, + "grad_norm": 0.9503692709073092, + "learning_rate": 1.8996786434687035e-05, + "loss": 0.2219, + "step": 3333 + }, + { + "epoch": 0.17, + "grad_norm": 0.9439331749855291, + "learning_rate": 1.8996067323922454e-05, + "loss": 0.2323, + "step": 3334 + }, + { + "epoch": 0.17, + "grad_norm": 0.9532307715531738, + "learning_rate": 1.8995347969139034e-05, + "loss": 0.2252, + "step": 3335 + }, + { + "epoch": 0.17, + "grad_norm": 1.552957222597281, + "learning_rate": 1.8994628370356296e-05, + "loss": 0.2267, + "step": 3336 + }, + { + "epoch": 0.17, + "grad_norm": 1.2716668963571127, + "learning_rate": 1.8993908527593756e-05, + "loss": 0.2372, + "step": 3337 + }, + { + "epoch": 0.17, + "grad_norm": 0.7846699126598632, + "learning_rate": 1.899318844087094e-05, + "loss": 0.196, + "step": 3338 + }, + { + "epoch": 0.17, + "grad_norm": 1.0129496638676174, + "learning_rate": 1.8992468110207376e-05, + "loss": 0.2155, + "step": 3339 + }, + { + "epoch": 0.17, + "grad_norm": 0.8783271979901689, + "learning_rate": 1.8991747535622607e-05, + "loss": 0.2038, + "step": 3340 + }, + { + "epoch": 0.17, + "grad_norm": 0.9169434293805285, + "learning_rate": 1.8991026717136182e-05, + "loss": 0.2245, + "step": 3341 + }, + { + "epoch": 0.17, + "grad_norm": 0.9595660483435009, + "learning_rate": 1.8990305654767646e-05, + "loss": 0.2497, + "step": 3342 + }, + { + "epoch": 0.17, + "grad_norm": 1.5644155971430844, + "learning_rate": 1.8989584348536563e-05, + "loss": 0.201, + "step": 3343 + }, + { + "epoch": 0.17, + "grad_norm": 1.3322471354041998, + "learning_rate": 1.8988862798462496e-05, + "loss": 0.2064, + "step": 3344 + }, + { + "epoch": 0.17, + "grad_norm": 0.9960819465156675, + "learning_rate": 1.8988141004565017e-05, + "loss": 0.2317, + "step": 3345 + }, + { + "epoch": 0.17, + "grad_norm": 1.025625915335841, + "learning_rate": 1.8987418966863708e-05, + "loss": 0.223, + "step": 3346 + }, + { + "epoch": 0.17, + "grad_norm": 0.9463078148518489, + "learning_rate": 1.898669668537815e-05, + "loss": 0.2218, + "step": 3347 + }, + { + "epoch": 0.17, + "grad_norm": 1.0451425798298368, + "learning_rate": 1.898597416012794e-05, + "loss": 0.2416, + "step": 3348 + }, + { + "epoch": 0.17, + "grad_norm": 1.220357318287464, + "learning_rate": 1.898525139113267e-05, + "loss": 0.2352, + "step": 3349 + }, + { + "epoch": 0.17, + "grad_norm": 0.9201741657190172, + "learning_rate": 1.898452837841195e-05, + "loss": 0.1981, + "step": 3350 + }, + { + "epoch": 0.17, + "grad_norm": 1.1030367729470085, + "learning_rate": 1.898380512198539e-05, + "loss": 0.206, + "step": 3351 + }, + { + "epoch": 0.17, + "grad_norm": 1.018621044627046, + "learning_rate": 1.898308162187261e-05, + "loss": 0.208, + "step": 3352 + }, + { + "epoch": 0.17, + "grad_norm": 1.0250380740128227, + "learning_rate": 1.898235787809323e-05, + "loss": 0.2217, + "step": 3353 + }, + { + "epoch": 0.17, + "grad_norm": 1.0443360909922148, + "learning_rate": 1.8981633890666886e-05, + "loss": 0.2061, + "step": 3354 + }, + { + "epoch": 0.17, + "grad_norm": 1.118094985175832, + "learning_rate": 1.8980909659613217e-05, + "loss": 0.242, + "step": 3355 + }, + { + "epoch": 0.17, + "grad_norm": 1.1719394317035994, + "learning_rate": 1.8980185184951864e-05, + "loss": 0.2262, + "step": 3356 + }, + { + "epoch": 0.17, + "grad_norm": 1.1119499336976413, + "learning_rate": 1.8979460466702483e-05, + "loss": 0.2172, + "step": 3357 + }, + { + "epoch": 0.17, + "grad_norm": 0.9199146540350096, + "learning_rate": 1.897873550488473e-05, + "loss": 0.2207, + "step": 3358 + }, + { + "epoch": 0.17, + "grad_norm": 1.597157202627985, + "learning_rate": 1.8978010299518268e-05, + "loss": 0.2236, + "step": 3359 + }, + { + "epoch": 0.17, + "grad_norm": 1.145656969444609, + "learning_rate": 1.897728485062277e-05, + "loss": 0.2448, + "step": 3360 + }, + { + "epoch": 0.17, + "grad_norm": 1.196306172255373, + "learning_rate": 1.8976559158217913e-05, + "loss": 0.2421, + "step": 3361 + }, + { + "epoch": 0.17, + "grad_norm": 1.0543796183789402, + "learning_rate": 1.8975833222323383e-05, + "loss": 0.2004, + "step": 3362 + }, + { + "epoch": 0.17, + "grad_norm": 1.1382328752417115, + "learning_rate": 1.897510704295887e-05, + "loss": 0.2205, + "step": 3363 + }, + { + "epoch": 0.17, + "grad_norm": 1.2594780069794822, + "learning_rate": 1.897438062014407e-05, + "loss": 0.2492, + "step": 3364 + }, + { + "epoch": 0.17, + "grad_norm": 1.1099254758261405, + "learning_rate": 1.897365395389869e-05, + "loss": 0.2082, + "step": 3365 + }, + { + "epoch": 0.17, + "grad_norm": 1.105618387840811, + "learning_rate": 1.8972927044242438e-05, + "loss": 0.2275, + "step": 3366 + }, + { + "epoch": 0.17, + "grad_norm": 1.2596995074456943, + "learning_rate": 1.8972199891195034e-05, + "loss": 0.2188, + "step": 3367 + }, + { + "epoch": 0.17, + "grad_norm": 1.1453752628087497, + "learning_rate": 1.8971472494776203e-05, + "loss": 0.2037, + "step": 3368 + }, + { + "epoch": 0.17, + "grad_norm": 1.850769972503201, + "learning_rate": 1.8970744855005674e-05, + "loss": 0.2566, + "step": 3369 + }, + { + "epoch": 0.17, + "grad_norm": 1.539628585952559, + "learning_rate": 1.897001697190318e-05, + "loss": 0.2761, + "step": 3370 + }, + { + "epoch": 0.17, + "grad_norm": 1.1468763974627258, + "learning_rate": 1.8969288845488473e-05, + "loss": 0.2271, + "step": 3371 + }, + { + "epoch": 0.17, + "grad_norm": 1.2157208655996792, + "learning_rate": 1.8968560475781297e-05, + "loss": 0.2655, + "step": 3372 + }, + { + "epoch": 0.17, + "grad_norm": 1.1547576211357717, + "learning_rate": 1.8967831862801414e-05, + "loss": 0.213, + "step": 3373 + }, + { + "epoch": 0.17, + "grad_norm": 1.2452954267397751, + "learning_rate": 1.8967103006568583e-05, + "loss": 0.2269, + "step": 3374 + }, + { + "epoch": 0.17, + "grad_norm": 1.1873167847152575, + "learning_rate": 1.8966373907102577e-05, + "loss": 0.2443, + "step": 3375 + }, + { + "epoch": 0.17, + "grad_norm": 1.0945146479285706, + "learning_rate": 1.8965644564423173e-05, + "loss": 0.2098, + "step": 3376 + }, + { + "epoch": 0.17, + "grad_norm": 1.59138745969418, + "learning_rate": 1.8964914978550154e-05, + "loss": 0.2093, + "step": 3377 + }, + { + "epoch": 0.17, + "grad_norm": 1.0840798791455932, + "learning_rate": 1.896418514950331e-05, + "loss": 0.2356, + "step": 3378 + }, + { + "epoch": 0.17, + "grad_norm": 1.207943842119613, + "learning_rate": 1.8963455077302435e-05, + "loss": 0.2311, + "step": 3379 + }, + { + "epoch": 0.17, + "grad_norm": 1.1114184292207892, + "learning_rate": 1.896272476196734e-05, + "loss": 0.2061, + "step": 3380 + }, + { + "epoch": 0.17, + "grad_norm": 1.6099084050038648, + "learning_rate": 1.8961994203517822e-05, + "loss": 0.2319, + "step": 3381 + }, + { + "epoch": 0.17, + "grad_norm": 0.9832513731356815, + "learning_rate": 1.896126340197371e-05, + "loss": 0.2509, + "step": 3382 + }, + { + "epoch": 0.17, + "grad_norm": 1.2736609473875997, + "learning_rate": 1.896053235735482e-05, + "loss": 0.2466, + "step": 3383 + }, + { + "epoch": 0.17, + "grad_norm": 1.028911036798015, + "learning_rate": 1.8959801069680986e-05, + "loss": 0.2225, + "step": 3384 + }, + { + "epoch": 0.17, + "grad_norm": 1.378791568014497, + "learning_rate": 1.8959069538972043e-05, + "loss": 0.203, + "step": 3385 + }, + { + "epoch": 0.17, + "grad_norm": 1.2369113654728618, + "learning_rate": 1.895833776524783e-05, + "loss": 0.2125, + "step": 3386 + }, + { + "epoch": 0.17, + "grad_norm": 1.192780451638109, + "learning_rate": 1.89576057485282e-05, + "loss": 0.2195, + "step": 3387 + }, + { + "epoch": 0.17, + "grad_norm": 1.3540915753515144, + "learning_rate": 1.8956873488833008e-05, + "loss": 0.2378, + "step": 3388 + }, + { + "epoch": 0.17, + "grad_norm": 1.194988330128871, + "learning_rate": 1.8956140986182116e-05, + "loss": 0.2271, + "step": 3389 + }, + { + "epoch": 0.17, + "grad_norm": 1.473213780154375, + "learning_rate": 1.8955408240595396e-05, + "loss": 0.2136, + "step": 3390 + }, + { + "epoch": 0.17, + "grad_norm": 1.8201258083185985, + "learning_rate": 1.8954675252092717e-05, + "loss": 0.2231, + "step": 3391 + }, + { + "epoch": 0.17, + "grad_norm": 1.8603983223800908, + "learning_rate": 1.895394202069397e-05, + "loss": 0.2228, + "step": 3392 + }, + { + "epoch": 0.17, + "grad_norm": 2.894489027911129, + "learning_rate": 1.895320854641904e-05, + "loss": 0.2111, + "step": 3393 + }, + { + "epoch": 0.17, + "grad_norm": 1.1816876791342712, + "learning_rate": 1.8952474829287825e-05, + "loss": 0.2426, + "step": 3394 + }, + { + "epoch": 0.17, + "grad_norm": 1.155318420616888, + "learning_rate": 1.895174086932022e-05, + "loss": 0.2128, + "step": 3395 + }, + { + "epoch": 0.17, + "grad_norm": 0.9597525424228827, + "learning_rate": 1.895100666653614e-05, + "loss": 0.2171, + "step": 3396 + }, + { + "epoch": 0.17, + "grad_norm": 0.9415740622946664, + "learning_rate": 1.8950272220955497e-05, + "loss": 0.2316, + "step": 3397 + }, + { + "epoch": 0.17, + "grad_norm": 0.9863523292989811, + "learning_rate": 1.8949537532598213e-05, + "loss": 0.2078, + "step": 3398 + }, + { + "epoch": 0.17, + "grad_norm": 1.6101817215152627, + "learning_rate": 1.8948802601484224e-05, + "loss": 0.2297, + "step": 3399 + }, + { + "epoch": 0.17, + "grad_norm": 1.2594625137553501, + "learning_rate": 1.8948067427633456e-05, + "loss": 0.2318, + "step": 3400 + }, + { + "epoch": 0.17, + "grad_norm": 1.7816854513161775, + "learning_rate": 1.8947332011065853e-05, + "loss": 0.1975, + "step": 3401 + }, + { + "epoch": 0.17, + "grad_norm": 1.1061855335081938, + "learning_rate": 1.8946596351801363e-05, + "loss": 0.2419, + "step": 3402 + }, + { + "epoch": 0.17, + "grad_norm": 1.5434594029241404, + "learning_rate": 1.8945860449859945e-05, + "loss": 0.2057, + "step": 3403 + }, + { + "epoch": 0.17, + "grad_norm": 1.1371130973679873, + "learning_rate": 1.8945124305261555e-05, + "loss": 0.2249, + "step": 3404 + }, + { + "epoch": 0.17, + "grad_norm": 0.9609027280934379, + "learning_rate": 1.8944387918026162e-05, + "loss": 0.2081, + "step": 3405 + }, + { + "epoch": 0.17, + "grad_norm": 0.9246659917659515, + "learning_rate": 1.8943651288173743e-05, + "loss": 0.2175, + "step": 3406 + }, + { + "epoch": 0.17, + "grad_norm": 0.9675906075566838, + "learning_rate": 1.8942914415724275e-05, + "loss": 0.2108, + "step": 3407 + }, + { + "epoch": 0.17, + "grad_norm": 1.167654837285515, + "learning_rate": 1.8942177300697753e-05, + "loss": 0.2064, + "step": 3408 + }, + { + "epoch": 0.17, + "grad_norm": 2.30419671209794, + "learning_rate": 1.8941439943114162e-05, + "loss": 0.1932, + "step": 3409 + }, + { + "epoch": 0.17, + "grad_norm": 1.6847133694504224, + "learning_rate": 1.8940702342993512e-05, + "loss": 0.2183, + "step": 3410 + }, + { + "epoch": 0.17, + "grad_norm": 1.2953727771799972, + "learning_rate": 1.8939964500355806e-05, + "loss": 0.2059, + "step": 3411 + }, + { + "epoch": 0.17, + "grad_norm": 1.3106937767502531, + "learning_rate": 1.8939226415221054e-05, + "loss": 0.205, + "step": 3412 + }, + { + "epoch": 0.17, + "grad_norm": 1.592550504669995, + "learning_rate": 1.893848808760928e-05, + "loss": 0.1927, + "step": 3413 + }, + { + "epoch": 0.17, + "grad_norm": 1.4358442707709609, + "learning_rate": 1.8937749517540516e-05, + "loss": 0.2047, + "step": 3414 + }, + { + "epoch": 0.17, + "grad_norm": 1.2130697178931753, + "learning_rate": 1.8937010705034788e-05, + "loss": 0.2185, + "step": 3415 + }, + { + "epoch": 0.17, + "grad_norm": 1.204625251601906, + "learning_rate": 1.8936271650112143e-05, + "loss": 0.2199, + "step": 3416 + }, + { + "epoch": 0.17, + "grad_norm": 1.648059762048116, + "learning_rate": 1.8935532352792624e-05, + "loss": 0.2435, + "step": 3417 + }, + { + "epoch": 0.17, + "grad_norm": 1.5113204599402539, + "learning_rate": 1.8934792813096283e-05, + "loss": 0.2171, + "step": 3418 + }, + { + "epoch": 0.17, + "grad_norm": 1.08312601172551, + "learning_rate": 1.8934053031043185e-05, + "loss": 0.2164, + "step": 3419 + }, + { + "epoch": 0.17, + "grad_norm": 2.8703153818528033, + "learning_rate": 1.8933313006653392e-05, + "loss": 0.2151, + "step": 3420 + }, + { + "epoch": 0.17, + "grad_norm": 0.9451239356452712, + "learning_rate": 1.893257273994698e-05, + "loss": 0.2394, + "step": 3421 + }, + { + "epoch": 0.17, + "grad_norm": 1.2083503347871671, + "learning_rate": 1.8931832230944026e-05, + "loss": 0.2349, + "step": 3422 + }, + { + "epoch": 0.17, + "grad_norm": 1.2617570854350872, + "learning_rate": 1.8931091479664622e-05, + "loss": 0.2264, + "step": 3423 + }, + { + "epoch": 0.17, + "grad_norm": 1.1220719798890133, + "learning_rate": 1.8930350486128855e-05, + "loss": 0.2257, + "step": 3424 + }, + { + "epoch": 0.17, + "grad_norm": 2.210152001777275, + "learning_rate": 1.8929609250356827e-05, + "loss": 0.2366, + "step": 3425 + }, + { + "epoch": 0.17, + "grad_norm": 1.5213675641639302, + "learning_rate": 1.8928867772368644e-05, + "loss": 0.2119, + "step": 3426 + }, + { + "epoch": 0.17, + "grad_norm": 1.161765898603026, + "learning_rate": 1.892812605218442e-05, + "loss": 0.2024, + "step": 3427 + }, + { + "epoch": 0.17, + "grad_norm": 1.038853006912805, + "learning_rate": 1.8927384089824267e-05, + "loss": 0.1975, + "step": 3428 + }, + { + "epoch": 0.17, + "grad_norm": 1.2892374237322095, + "learning_rate": 1.8926641885308325e-05, + "loss": 0.2194, + "step": 3429 + }, + { + "epoch": 0.17, + "grad_norm": 1.5090525000593058, + "learning_rate": 1.8925899438656708e-05, + "loss": 0.2007, + "step": 3430 + }, + { + "epoch": 0.17, + "grad_norm": 1.3066340896057012, + "learning_rate": 1.892515674988957e-05, + "loss": 0.2439, + "step": 3431 + }, + { + "epoch": 0.17, + "grad_norm": 2.6703291899021413, + "learning_rate": 1.892441381902705e-05, + "loss": 0.2068, + "step": 3432 + }, + { + "epoch": 0.17, + "grad_norm": 1.470448296567249, + "learning_rate": 1.8923670646089303e-05, + "loss": 0.218, + "step": 3433 + }, + { + "epoch": 0.17, + "grad_norm": 1.3832623440700667, + "learning_rate": 1.8922927231096482e-05, + "loss": 0.2101, + "step": 3434 + }, + { + "epoch": 0.17, + "grad_norm": 1.2149799232129805, + "learning_rate": 1.892218357406876e-05, + "loss": 0.2379, + "step": 3435 + }, + { + "epoch": 0.17, + "grad_norm": 1.2009435208243155, + "learning_rate": 1.89214396750263e-05, + "loss": 0.2305, + "step": 3436 + }, + { + "epoch": 0.17, + "grad_norm": 1.9612695423719364, + "learning_rate": 1.892069553398929e-05, + "loss": 0.2457, + "step": 3437 + }, + { + "epoch": 0.17, + "grad_norm": 1.1203701248483504, + "learning_rate": 1.8919951150977908e-05, + "loss": 0.2138, + "step": 3438 + }, + { + "epoch": 0.17, + "grad_norm": 1.1046369072938969, + "learning_rate": 1.8919206526012346e-05, + "loss": 0.222, + "step": 3439 + }, + { + "epoch": 0.17, + "grad_norm": 1.0531311533915446, + "learning_rate": 1.8918461659112805e-05, + "loss": 0.2276, + "step": 3440 + }, + { + "epoch": 0.17, + "grad_norm": 1.291918364659948, + "learning_rate": 1.8917716550299485e-05, + "loss": 0.2109, + "step": 3441 + }, + { + "epoch": 0.18, + "grad_norm": 1.5619486000971663, + "learning_rate": 1.8916971199592603e-05, + "loss": 0.222, + "step": 3442 + }, + { + "epoch": 0.18, + "grad_norm": 2.11363710814902, + "learning_rate": 1.891622560701237e-05, + "loss": 0.2221, + "step": 3443 + }, + { + "epoch": 0.18, + "grad_norm": 1.8271041690104084, + "learning_rate": 1.8915479772579017e-05, + "loss": 0.223, + "step": 3444 + }, + { + "epoch": 0.18, + "grad_norm": 1.1204943235932574, + "learning_rate": 1.891473369631277e-05, + "loss": 0.2328, + "step": 3445 + }, + { + "epoch": 0.18, + "grad_norm": 1.0413365146492488, + "learning_rate": 1.891398737823387e-05, + "loss": 0.2104, + "step": 3446 + }, + { + "epoch": 0.18, + "grad_norm": 1.2649950767159048, + "learning_rate": 1.8913240818362556e-05, + "loss": 0.2119, + "step": 3447 + }, + { + "epoch": 0.18, + "grad_norm": 1.4830297934477925, + "learning_rate": 1.8912494016719084e-05, + "loss": 0.2071, + "step": 3448 + }, + { + "epoch": 0.18, + "grad_norm": 1.1955636236263514, + "learning_rate": 1.8911746973323706e-05, + "loss": 0.2253, + "step": 3449 + }, + { + "epoch": 0.18, + "grad_norm": 1.268887849540954, + "learning_rate": 1.8910999688196688e-05, + "loss": 0.2314, + "step": 3450 + }, + { + "epoch": 0.18, + "grad_norm": 1.8540019787261037, + "learning_rate": 1.8910252161358302e-05, + "loss": 0.2186, + "step": 3451 + }, + { + "epoch": 0.18, + "grad_norm": 1.3500922534295132, + "learning_rate": 1.8909504392828822e-05, + "loss": 0.193, + "step": 3452 + }, + { + "epoch": 0.18, + "grad_norm": 1.1737633956260982, + "learning_rate": 1.8908756382628534e-05, + "loss": 0.2178, + "step": 3453 + }, + { + "epoch": 0.18, + "grad_norm": 1.5649160231502703, + "learning_rate": 1.8908008130777724e-05, + "loss": 0.2198, + "step": 3454 + }, + { + "epoch": 0.18, + "grad_norm": 10.954757815776317, + "learning_rate": 1.890725963729669e-05, + "loss": 0.2328, + "step": 3455 + }, + { + "epoch": 0.18, + "grad_norm": 1.5228254564372452, + "learning_rate": 1.8906510902205736e-05, + "loss": 0.2107, + "step": 3456 + }, + { + "epoch": 0.18, + "grad_norm": 1.0442097517235953, + "learning_rate": 1.890576192552517e-05, + "loss": 0.1938, + "step": 3457 + }, + { + "epoch": 0.18, + "grad_norm": 1.3410754929651154, + "learning_rate": 1.890501270727531e-05, + "loss": 0.205, + "step": 3458 + }, + { + "epoch": 0.18, + "grad_norm": 1.1872730608250062, + "learning_rate": 1.8904263247476478e-05, + "loss": 0.2256, + "step": 3459 + }, + { + "epoch": 0.18, + "grad_norm": 5.085472544504074, + "learning_rate": 1.8903513546149e-05, + "loss": 0.1979, + "step": 3460 + }, + { + "epoch": 0.18, + "grad_norm": 3.133249139850197, + "learning_rate": 1.8902763603313213e-05, + "loss": 0.2256, + "step": 3461 + }, + { + "epoch": 0.18, + "grad_norm": 3.24180581737015, + "learning_rate": 1.8902013418989464e-05, + "loss": 0.2318, + "step": 3462 + }, + { + "epoch": 0.18, + "grad_norm": 1.8748478086219742, + "learning_rate": 1.89012629931981e-05, + "loss": 0.2209, + "step": 3463 + }, + { + "epoch": 0.18, + "grad_norm": 1.6952368589548503, + "learning_rate": 1.890051232595947e-05, + "loss": 0.2124, + "step": 3464 + }, + { + "epoch": 0.18, + "grad_norm": 1.1286193647776563, + "learning_rate": 1.8899761417293944e-05, + "loss": 0.214, + "step": 3465 + }, + { + "epoch": 0.18, + "grad_norm": 2.3691731443740385, + "learning_rate": 1.8899010267221884e-05, + "loss": 0.236, + "step": 3466 + }, + { + "epoch": 0.18, + "grad_norm": 5.106432902418719, + "learning_rate": 1.8898258875763668e-05, + "loss": 0.1958, + "step": 3467 + }, + { + "epoch": 0.18, + "grad_norm": 2.42216513069837, + "learning_rate": 1.889750724293968e-05, + "loss": 0.2178, + "step": 3468 + }, + { + "epoch": 0.18, + "grad_norm": 1.039236057239844, + "learning_rate": 1.88967553687703e-05, + "loss": 0.211, + "step": 3469 + }, + { + "epoch": 0.18, + "grad_norm": 1.1630241938144983, + "learning_rate": 1.8896003253275934e-05, + "loss": 0.2031, + "step": 3470 + }, + { + "epoch": 0.18, + "grad_norm": 1.133200488856877, + "learning_rate": 1.8895250896476976e-05, + "loss": 0.2157, + "step": 3471 + }, + { + "epoch": 0.18, + "grad_norm": 1.4087899695633774, + "learning_rate": 1.8894498298393835e-05, + "loss": 0.2696, + "step": 3472 + }, + { + "epoch": 0.18, + "grad_norm": 1.7305735648730336, + "learning_rate": 1.8893745459046922e-05, + "loss": 0.2323, + "step": 3473 + }, + { + "epoch": 0.18, + "grad_norm": 3.142204157272831, + "learning_rate": 1.8892992378456664e-05, + "loss": 0.2282, + "step": 3474 + }, + { + "epoch": 0.18, + "grad_norm": 1.1118952952593542, + "learning_rate": 1.8892239056643487e-05, + "loss": 0.2036, + "step": 3475 + }, + { + "epoch": 0.18, + "grad_norm": 1.0687130740086, + "learning_rate": 1.8891485493627823e-05, + "loss": 0.2208, + "step": 3476 + }, + { + "epoch": 0.18, + "grad_norm": 1.6734111624578498, + "learning_rate": 1.8890731689430108e-05, + "loss": 0.2395, + "step": 3477 + }, + { + "epoch": 0.18, + "grad_norm": 1.3438494750974113, + "learning_rate": 1.88899776440708e-05, + "loss": 0.2236, + "step": 3478 + }, + { + "epoch": 0.18, + "grad_norm": 1.2048291784648975, + "learning_rate": 1.8889223357570342e-05, + "loss": 0.2327, + "step": 3479 + }, + { + "epoch": 0.18, + "grad_norm": 2.0046820562623493, + "learning_rate": 1.88884688299492e-05, + "loss": 0.2235, + "step": 3480 + }, + { + "epoch": 0.18, + "grad_norm": 1.5238429554318127, + "learning_rate": 1.8887714061227838e-05, + "loss": 0.2112, + "step": 3481 + }, + { + "epoch": 0.18, + "grad_norm": 1.213570258302131, + "learning_rate": 1.8886959051426733e-05, + "loss": 0.1763, + "step": 3482 + }, + { + "epoch": 0.18, + "grad_norm": 1.0069459742245659, + "learning_rate": 1.8886203800566357e-05, + "loss": 0.2067, + "step": 3483 + }, + { + "epoch": 0.18, + "grad_norm": 1.198997190207289, + "learning_rate": 1.8885448308667204e-05, + "loss": 0.2202, + "step": 3484 + }, + { + "epoch": 0.18, + "grad_norm": 1.1565023181231882, + "learning_rate": 1.8884692575749762e-05, + "loss": 0.2215, + "step": 3485 + }, + { + "epoch": 0.18, + "grad_norm": 1.2061530708342951, + "learning_rate": 1.8883936601834533e-05, + "loss": 0.1979, + "step": 3486 + }, + { + "epoch": 0.18, + "grad_norm": 1.0102406613321953, + "learning_rate": 1.8883180386942022e-05, + "loss": 0.2072, + "step": 3487 + }, + { + "epoch": 0.18, + "grad_norm": 2.6719172697318467, + "learning_rate": 1.8882423931092745e-05, + "loss": 0.2198, + "step": 3488 + }, + { + "epoch": 0.18, + "grad_norm": 1.55648843029096, + "learning_rate": 1.888166723430721e-05, + "loss": 0.2062, + "step": 3489 + }, + { + "epoch": 0.18, + "grad_norm": 1.2375859182455549, + "learning_rate": 1.8880910296605956e-05, + "loss": 0.1965, + "step": 3490 + }, + { + "epoch": 0.18, + "grad_norm": 1.2731145678375624, + "learning_rate": 1.8880153118009505e-05, + "loss": 0.2256, + "step": 3491 + }, + { + "epoch": 0.18, + "grad_norm": 1.164193051691617, + "learning_rate": 1.8879395698538402e-05, + "loss": 0.2163, + "step": 3492 + }, + { + "epoch": 0.18, + "grad_norm": 1.3166783793707386, + "learning_rate": 1.8878638038213186e-05, + "loss": 0.2129, + "step": 3493 + }, + { + "epoch": 0.18, + "grad_norm": 1.1521288362770579, + "learning_rate": 1.8877880137054413e-05, + "loss": 0.2269, + "step": 3494 + }, + { + "epoch": 0.18, + "grad_norm": 1.1986007406716817, + "learning_rate": 1.8877121995082638e-05, + "loss": 0.223, + "step": 3495 + }, + { + "epoch": 0.18, + "grad_norm": 1.0455207674415665, + "learning_rate": 1.887636361231843e-05, + "loss": 0.2218, + "step": 3496 + }, + { + "epoch": 0.18, + "grad_norm": 0.975037927396976, + "learning_rate": 1.887560498878236e-05, + "loss": 0.205, + "step": 3497 + }, + { + "epoch": 0.18, + "grad_norm": 0.99183527876214, + "learning_rate": 1.8874846124495e-05, + "loss": 0.2056, + "step": 3498 + }, + { + "epoch": 0.18, + "grad_norm": 1.3402372514211767, + "learning_rate": 1.8874087019476937e-05, + "loss": 0.2011, + "step": 3499 + }, + { + "epoch": 0.18, + "grad_norm": 1.3365592978089498, + "learning_rate": 1.8873327673748765e-05, + "loss": 0.2356, + "step": 3500 + }, + { + "epoch": 0.18, + "grad_norm": 1.4976108619117818, + "learning_rate": 1.8872568087331074e-05, + "loss": 0.2298, + "step": 3501 + }, + { + "epoch": 0.18, + "grad_norm": 1.0639877022857973, + "learning_rate": 1.8871808260244476e-05, + "loss": 0.2193, + "step": 3502 + }, + { + "epoch": 0.18, + "grad_norm": 1.029750538822256, + "learning_rate": 1.8871048192509576e-05, + "loss": 0.2353, + "step": 3503 + }, + { + "epoch": 0.18, + "grad_norm": 1.5591330622484874, + "learning_rate": 1.8870287884147e-05, + "loss": 0.2064, + "step": 3504 + }, + { + "epoch": 0.18, + "grad_norm": 1.2796091932736209, + "learning_rate": 1.8869527335177354e-05, + "loss": 0.2481, + "step": 3505 + }, + { + "epoch": 0.18, + "grad_norm": 1.1622857079794304, + "learning_rate": 1.8868766545621286e-05, + "loss": 0.2504, + "step": 3506 + }, + { + "epoch": 0.18, + "grad_norm": 1.1477897384360691, + "learning_rate": 1.886800551549942e-05, + "loss": 0.2014, + "step": 3507 + }, + { + "epoch": 0.18, + "grad_norm": 1.1667763938280227, + "learning_rate": 1.88672442448324e-05, + "loss": 0.2078, + "step": 3508 + }, + { + "epoch": 0.18, + "grad_norm": 1.137503241315197, + "learning_rate": 1.886648273364089e-05, + "loss": 0.2268, + "step": 3509 + }, + { + "epoch": 0.18, + "grad_norm": 1.5931312444608623, + "learning_rate": 1.886572098194553e-05, + "loss": 0.2399, + "step": 3510 + }, + { + "epoch": 0.18, + "grad_norm": 0.998589900804111, + "learning_rate": 1.8864958989766982e-05, + "loss": 0.2177, + "step": 3511 + }, + { + "epoch": 0.18, + "grad_norm": 1.0766991349805863, + "learning_rate": 1.8864196757125926e-05, + "loss": 0.219, + "step": 3512 + }, + { + "epoch": 0.18, + "grad_norm": 1.2669927981226383, + "learning_rate": 1.886343428404303e-05, + "loss": 0.2455, + "step": 3513 + }, + { + "epoch": 0.18, + "grad_norm": 1.3821721941809648, + "learning_rate": 1.8862671570538983e-05, + "loss": 0.2291, + "step": 3514 + }, + { + "epoch": 0.18, + "grad_norm": 1.9922061065377965, + "learning_rate": 1.8861908616634465e-05, + "loss": 0.2355, + "step": 3515 + }, + { + "epoch": 0.18, + "grad_norm": 0.9647774432744618, + "learning_rate": 1.8861145422350175e-05, + "loss": 0.2225, + "step": 3516 + }, + { + "epoch": 0.18, + "grad_norm": 1.2450905932652228, + "learning_rate": 1.8860381987706815e-05, + "loss": 0.1923, + "step": 3517 + }, + { + "epoch": 0.18, + "grad_norm": 1.174949296255601, + "learning_rate": 1.8859618312725097e-05, + "loss": 0.2273, + "step": 3518 + }, + { + "epoch": 0.18, + "grad_norm": 0.9185629784663724, + "learning_rate": 1.885885439742573e-05, + "loss": 0.2421, + "step": 3519 + }, + { + "epoch": 0.18, + "grad_norm": 1.0252428894923662, + "learning_rate": 1.8858090241829435e-05, + "loss": 0.2276, + "step": 3520 + }, + { + "epoch": 0.18, + "grad_norm": 1.3017122731058193, + "learning_rate": 1.8857325845956943e-05, + "loss": 0.2324, + "step": 3521 + }, + { + "epoch": 0.18, + "grad_norm": 1.3801729559240783, + "learning_rate": 1.8856561209828985e-05, + "loss": 0.2102, + "step": 3522 + }, + { + "epoch": 0.18, + "grad_norm": 1.518892942443358, + "learning_rate": 1.8855796333466306e-05, + "loss": 0.2125, + "step": 3523 + }, + { + "epoch": 0.18, + "grad_norm": 2.623769747136324, + "learning_rate": 1.8855031216889654e-05, + "loss": 0.2158, + "step": 3524 + }, + { + "epoch": 0.18, + "grad_norm": 1.0182322618961068, + "learning_rate": 1.8854265860119777e-05, + "loss": 0.2311, + "step": 3525 + }, + { + "epoch": 0.18, + "grad_norm": 1.6124376494565882, + "learning_rate": 1.8853500263177438e-05, + "loss": 0.243, + "step": 3526 + }, + { + "epoch": 0.18, + "grad_norm": 1.3650878655188263, + "learning_rate": 1.8852734426083407e-05, + "loss": 0.2351, + "step": 3527 + }, + { + "epoch": 0.18, + "grad_norm": 1.3001949420466634, + "learning_rate": 1.8851968348858452e-05, + "loss": 0.2381, + "step": 3528 + }, + { + "epoch": 0.18, + "grad_norm": 1.0896299953265736, + "learning_rate": 1.8851202031523357e-05, + "loss": 0.1966, + "step": 3529 + }, + { + "epoch": 0.18, + "grad_norm": 1.149107350644368, + "learning_rate": 1.8850435474098903e-05, + "loss": 0.2114, + "step": 3530 + }, + { + "epoch": 0.18, + "grad_norm": 1.1056201887813963, + "learning_rate": 1.8849668676605892e-05, + "loss": 0.2094, + "step": 3531 + }, + { + "epoch": 0.18, + "grad_norm": 1.0996511570022964, + "learning_rate": 1.884890163906512e-05, + "loss": 0.2075, + "step": 3532 + }, + { + "epoch": 0.18, + "grad_norm": 1.2245004022629145, + "learning_rate": 1.8848134361497385e-05, + "loss": 0.2215, + "step": 3533 + }, + { + "epoch": 0.18, + "grad_norm": 0.9834659334969303, + "learning_rate": 1.8847366843923512e-05, + "loss": 0.2498, + "step": 3534 + }, + { + "epoch": 0.18, + "grad_norm": 1.034031100069306, + "learning_rate": 1.8846599086364307e-05, + "loss": 0.2281, + "step": 3535 + }, + { + "epoch": 0.18, + "grad_norm": 1.1124308025623266, + "learning_rate": 1.8845831088840607e-05, + "loss": 0.2255, + "step": 3536 + }, + { + "epoch": 0.18, + "grad_norm": 0.9862681621323288, + "learning_rate": 1.884506285137324e-05, + "loss": 0.205, + "step": 3537 + }, + { + "epoch": 0.18, + "grad_norm": 0.9765289155940603, + "learning_rate": 1.884429437398304e-05, + "loss": 0.2293, + "step": 3538 + }, + { + "epoch": 0.18, + "grad_norm": 1.1166529919622326, + "learning_rate": 1.8843525656690856e-05, + "loss": 0.2419, + "step": 3539 + }, + { + "epoch": 0.18, + "grad_norm": 1.310916406449081, + "learning_rate": 1.884275669951754e-05, + "loss": 0.2174, + "step": 3540 + }, + { + "epoch": 0.18, + "grad_norm": 1.1772746264552305, + "learning_rate": 1.8841987502483947e-05, + "loss": 0.2124, + "step": 3541 + }, + { + "epoch": 0.18, + "grad_norm": 2.001411267184766, + "learning_rate": 1.8841218065610946e-05, + "loss": 0.2046, + "step": 3542 + }, + { + "epoch": 0.18, + "grad_norm": 1.7958941047302297, + "learning_rate": 1.8840448388919404e-05, + "loss": 0.2354, + "step": 3543 + }, + { + "epoch": 0.18, + "grad_norm": 1.3784337374121243, + "learning_rate": 1.8839678472430202e-05, + "loss": 0.2234, + "step": 3544 + }, + { + "epoch": 0.18, + "grad_norm": 1.386781588632047, + "learning_rate": 1.883890831616422e-05, + "loss": 0.2139, + "step": 3545 + }, + { + "epoch": 0.18, + "grad_norm": 1.0081326543848246, + "learning_rate": 1.8838137920142353e-05, + "loss": 0.2172, + "step": 3546 + }, + { + "epoch": 0.18, + "grad_norm": 1.925414447020237, + "learning_rate": 1.8837367284385495e-05, + "loss": 0.2212, + "step": 3547 + }, + { + "epoch": 0.18, + "grad_norm": 1.3165789878342848, + "learning_rate": 1.883659640891455e-05, + "loss": 0.218, + "step": 3548 + }, + { + "epoch": 0.18, + "grad_norm": 1.4404730700778872, + "learning_rate": 1.883582529375043e-05, + "loss": 0.2222, + "step": 3549 + }, + { + "epoch": 0.18, + "grad_norm": 1.5777733140194132, + "learning_rate": 1.883505393891405e-05, + "loss": 0.247, + "step": 3550 + }, + { + "epoch": 0.18, + "grad_norm": 1.363219632836876, + "learning_rate": 1.883428234442633e-05, + "loss": 0.2386, + "step": 3551 + }, + { + "epoch": 0.18, + "grad_norm": 2.3569503082449117, + "learning_rate": 1.8833510510308205e-05, + "loss": 0.2317, + "step": 3552 + }, + { + "epoch": 0.18, + "grad_norm": 1.2587246937536893, + "learning_rate": 1.8832738436580606e-05, + "loss": 0.2188, + "step": 3553 + }, + { + "epoch": 0.18, + "grad_norm": 1.12233768922461, + "learning_rate": 1.883196612326448e-05, + "loss": 0.2236, + "step": 3554 + }, + { + "epoch": 0.18, + "grad_norm": 1.2126151051536773, + "learning_rate": 1.8831193570380773e-05, + "loss": 0.2086, + "step": 3555 + }, + { + "epoch": 0.18, + "grad_norm": 1.2393899252829297, + "learning_rate": 1.883042077795044e-05, + "loss": 0.2391, + "step": 3556 + }, + { + "epoch": 0.18, + "grad_norm": 1.4836398610286818, + "learning_rate": 1.8829647745994445e-05, + "loss": 0.2554, + "step": 3557 + }, + { + "epoch": 0.18, + "grad_norm": 0.9733825299769355, + "learning_rate": 1.882887447453376e-05, + "loss": 0.2076, + "step": 3558 + }, + { + "epoch": 0.18, + "grad_norm": 1.169740840648058, + "learning_rate": 1.8828100963589357e-05, + "loss": 0.1992, + "step": 3559 + }, + { + "epoch": 0.18, + "grad_norm": 1.0783743006030566, + "learning_rate": 1.882732721318221e-05, + "loss": 0.2232, + "step": 3560 + }, + { + "epoch": 0.18, + "grad_norm": 1.2929527357581567, + "learning_rate": 1.882655322333332e-05, + "loss": 0.2312, + "step": 3561 + }, + { + "epoch": 0.18, + "grad_norm": 0.8276497602849175, + "learning_rate": 1.8825778994063672e-05, + "loss": 0.1998, + "step": 3562 + }, + { + "epoch": 0.18, + "grad_norm": 1.822612547922766, + "learning_rate": 1.882500452539427e-05, + "loss": 0.2186, + "step": 3563 + }, + { + "epoch": 0.18, + "grad_norm": 1.185368918244924, + "learning_rate": 1.8824229817346124e-05, + "loss": 0.2269, + "step": 3564 + }, + { + "epoch": 0.18, + "grad_norm": 1.044763669380613, + "learning_rate": 1.8823454869940243e-05, + "loss": 0.2022, + "step": 3565 + }, + { + "epoch": 0.18, + "grad_norm": 1.6220896032433259, + "learning_rate": 1.8822679683197654e-05, + "loss": 0.2309, + "step": 3566 + }, + { + "epoch": 0.18, + "grad_norm": 0.9932681947347778, + "learning_rate": 1.882190425713938e-05, + "loss": 0.2171, + "step": 3567 + }, + { + "epoch": 0.18, + "grad_norm": 1.3632488336960016, + "learning_rate": 1.882112859178645e-05, + "loss": 0.2188, + "step": 3568 + }, + { + "epoch": 0.18, + "grad_norm": 1.6684759325774463, + "learning_rate": 1.8820352687159912e-05, + "loss": 0.2151, + "step": 3569 + }, + { + "epoch": 0.18, + "grad_norm": 1.1266573469024315, + "learning_rate": 1.881957654328081e-05, + "loss": 0.2164, + "step": 3570 + }, + { + "epoch": 0.18, + "grad_norm": 1.2447540550144214, + "learning_rate": 1.8818800160170193e-05, + "loss": 0.2361, + "step": 3571 + }, + { + "epoch": 0.18, + "grad_norm": 1.3002904781282625, + "learning_rate": 1.8818023537849124e-05, + "loss": 0.2265, + "step": 3572 + }, + { + "epoch": 0.18, + "grad_norm": 1.0874912434200428, + "learning_rate": 1.8817246676338674e-05, + "loss": 0.2042, + "step": 3573 + }, + { + "epoch": 0.18, + "grad_norm": 1.2065451355021113, + "learning_rate": 1.8816469575659905e-05, + "loss": 0.1963, + "step": 3574 + }, + { + "epoch": 0.18, + "grad_norm": 1.6011427686798987, + "learning_rate": 1.8815692235833903e-05, + "loss": 0.2542, + "step": 3575 + }, + { + "epoch": 0.18, + "grad_norm": 1.0630131633964672, + "learning_rate": 1.881491465688175e-05, + "loss": 0.1862, + "step": 3576 + }, + { + "epoch": 0.18, + "grad_norm": 1.192757320656651, + "learning_rate": 1.881413683882454e-05, + "loss": 0.2016, + "step": 3577 + }, + { + "epoch": 0.18, + "grad_norm": 1.0317469350067587, + "learning_rate": 1.881335878168337e-05, + "loss": 0.2074, + "step": 3578 + }, + { + "epoch": 0.18, + "grad_norm": 1.239604331744985, + "learning_rate": 1.881258048547934e-05, + "loss": 0.2103, + "step": 3579 + }, + { + "epoch": 0.18, + "grad_norm": 1.3107029936706969, + "learning_rate": 1.8811801950233576e-05, + "loss": 0.2162, + "step": 3580 + }, + { + "epoch": 0.18, + "grad_norm": 1.0157843501686425, + "learning_rate": 1.881102317596718e-05, + "loss": 0.2354, + "step": 3581 + }, + { + "epoch": 0.18, + "grad_norm": 2.9623200968488543, + "learning_rate": 1.8810244162701282e-05, + "loss": 0.229, + "step": 3582 + }, + { + "epoch": 0.18, + "grad_norm": 1.1685273424380547, + "learning_rate": 1.8809464910457018e-05, + "loss": 0.244, + "step": 3583 + }, + { + "epoch": 0.18, + "grad_norm": 1.1103339740841685, + "learning_rate": 1.880868541925552e-05, + "loss": 0.2169, + "step": 3584 + }, + { + "epoch": 0.18, + "grad_norm": 1.311960033700126, + "learning_rate": 1.8807905689117932e-05, + "loss": 0.2723, + "step": 3585 + }, + { + "epoch": 0.18, + "grad_norm": 1.1836207348653984, + "learning_rate": 1.8807125720065402e-05, + "loss": 0.2174, + "step": 3586 + }, + { + "epoch": 0.18, + "grad_norm": 1.006572134286583, + "learning_rate": 1.8806345512119094e-05, + "loss": 0.2188, + "step": 3587 + }, + { + "epoch": 0.18, + "grad_norm": 0.9780003684667589, + "learning_rate": 1.880556506530016e-05, + "loss": 0.2171, + "step": 3588 + }, + { + "epoch": 0.18, + "grad_norm": 1.1732560568868864, + "learning_rate": 1.8804784379629782e-05, + "loss": 0.1826, + "step": 3589 + }, + { + "epoch": 0.18, + "grad_norm": 0.9094677603104601, + "learning_rate": 1.880400345512913e-05, + "loss": 0.2436, + "step": 3590 + }, + { + "epoch": 0.18, + "grad_norm": 1.5987479896408254, + "learning_rate": 1.8803222291819384e-05, + "loss": 0.2424, + "step": 3591 + }, + { + "epoch": 0.18, + "grad_norm": 1.9268098947402248, + "learning_rate": 1.8802440889721738e-05, + "loss": 0.2332, + "step": 3592 + }, + { + "epoch": 0.18, + "grad_norm": 1.1590799031555827, + "learning_rate": 1.8801659248857387e-05, + "loss": 0.2174, + "step": 3593 + }, + { + "epoch": 0.18, + "grad_norm": 0.8734401012949662, + "learning_rate": 1.880087736924753e-05, + "loss": 0.1889, + "step": 3594 + }, + { + "epoch": 0.18, + "grad_norm": 0.9763292582262146, + "learning_rate": 1.8800095250913378e-05, + "loss": 0.2301, + "step": 3595 + }, + { + "epoch": 0.18, + "grad_norm": 1.1036759320154053, + "learning_rate": 1.8799312893876144e-05, + "loss": 0.2354, + "step": 3596 + }, + { + "epoch": 0.18, + "grad_norm": 1.2068573877020075, + "learning_rate": 1.8798530298157053e-05, + "loss": 0.2257, + "step": 3597 + }, + { + "epoch": 0.18, + "grad_norm": 1.3724862448248425, + "learning_rate": 1.879774746377733e-05, + "loss": 0.2239, + "step": 3598 + }, + { + "epoch": 0.18, + "grad_norm": 1.4278671837999257, + "learning_rate": 1.8796964390758208e-05, + "loss": 0.2295, + "step": 3599 + }, + { + "epoch": 0.18, + "grad_norm": 1.0753048332681465, + "learning_rate": 1.879618107912093e-05, + "loss": 0.2333, + "step": 3600 + }, + { + "epoch": 0.18, + "grad_norm": 1.435980345300096, + "learning_rate": 1.8795397528886744e-05, + "loss": 0.195, + "step": 3601 + }, + { + "epoch": 0.18, + "grad_norm": 1.4710044713102228, + "learning_rate": 1.8794613740076905e-05, + "loss": 0.2436, + "step": 3602 + }, + { + "epoch": 0.18, + "grad_norm": 1.1614473226565285, + "learning_rate": 1.8793829712712674e-05, + "loss": 0.2332, + "step": 3603 + }, + { + "epoch": 0.18, + "grad_norm": 1.1045520516660092, + "learning_rate": 1.879304544681531e-05, + "loss": 0.224, + "step": 3604 + }, + { + "epoch": 0.18, + "grad_norm": 1.319759137203225, + "learning_rate": 1.8792260942406093e-05, + "loss": 0.2268, + "step": 3605 + }, + { + "epoch": 0.18, + "grad_norm": 1.1487889779994411, + "learning_rate": 1.87914761995063e-05, + "loss": 0.2067, + "step": 3606 + }, + { + "epoch": 0.18, + "grad_norm": 1.3210062460129575, + "learning_rate": 1.8790691218137223e-05, + "loss": 0.1936, + "step": 3607 + }, + { + "epoch": 0.18, + "grad_norm": 1.0186501739958207, + "learning_rate": 1.8789905998320148e-05, + "loss": 0.2092, + "step": 3608 + }, + { + "epoch": 0.18, + "grad_norm": 1.1235348787736643, + "learning_rate": 1.8789120540076377e-05, + "loss": 0.2477, + "step": 3609 + }, + { + "epoch": 0.18, + "grad_norm": 0.8505143793933634, + "learning_rate": 1.8788334843427213e-05, + "loss": 0.2175, + "step": 3610 + }, + { + "epoch": 0.18, + "grad_norm": 1.0042245499894789, + "learning_rate": 1.878754890839397e-05, + "loss": 0.2083, + "step": 3611 + }, + { + "epoch": 0.18, + "grad_norm": 1.088541889713761, + "learning_rate": 1.8786762734997967e-05, + "loss": 0.2161, + "step": 3612 + }, + { + "epoch": 0.18, + "grad_norm": 1.7685156296862572, + "learning_rate": 1.878597632326053e-05, + "loss": 0.2166, + "step": 3613 + }, + { + "epoch": 0.18, + "grad_norm": 1.12394810734318, + "learning_rate": 1.8785189673202987e-05, + "loss": 0.2276, + "step": 3614 + }, + { + "epoch": 0.18, + "grad_norm": 2.3965025337178076, + "learning_rate": 1.8784402784846683e-05, + "loss": 0.2035, + "step": 3615 + }, + { + "epoch": 0.18, + "grad_norm": 1.0159742971193932, + "learning_rate": 1.8783615658212954e-05, + "loss": 0.2333, + "step": 3616 + }, + { + "epoch": 0.18, + "grad_norm": 1.3262982321738739, + "learning_rate": 1.8782828293323148e-05, + "loss": 0.2175, + "step": 3617 + }, + { + "epoch": 0.18, + "grad_norm": 0.9727403288284343, + "learning_rate": 1.8782040690198638e-05, + "loss": 0.2162, + "step": 3618 + }, + { + "epoch": 0.18, + "grad_norm": 1.147172083948449, + "learning_rate": 1.8781252848860774e-05, + "loss": 0.1996, + "step": 3619 + }, + { + "epoch": 0.18, + "grad_norm": 1.6406618327820583, + "learning_rate": 1.878046476933093e-05, + "loss": 0.2053, + "step": 3620 + }, + { + "epoch": 0.18, + "grad_norm": 1.1381954215972985, + "learning_rate": 1.8779676451630483e-05, + "loss": 0.2293, + "step": 3621 + }, + { + "epoch": 0.18, + "grad_norm": 1.2442005673537484, + "learning_rate": 1.877888789578082e-05, + "loss": 0.22, + "step": 3622 + }, + { + "epoch": 0.18, + "grad_norm": 1.12587064239475, + "learning_rate": 1.877809910180332e-05, + "loss": 0.2325, + "step": 3623 + }, + { + "epoch": 0.18, + "grad_norm": 0.9723723171760922, + "learning_rate": 1.8777310069719395e-05, + "loss": 0.216, + "step": 3624 + }, + { + "epoch": 0.18, + "grad_norm": 0.8174785819072351, + "learning_rate": 1.8776520799550432e-05, + "loss": 0.2109, + "step": 3625 + }, + { + "epoch": 0.18, + "grad_norm": 0.9122603631781808, + "learning_rate": 1.8775731291317848e-05, + "loss": 0.2309, + "step": 3626 + }, + { + "epoch": 0.18, + "grad_norm": 0.8848068716660894, + "learning_rate": 1.877494154504306e-05, + "loss": 0.2194, + "step": 3627 + }, + { + "epoch": 0.18, + "grad_norm": 0.9278675771741361, + "learning_rate": 1.8774151560747483e-05, + "loss": 0.2183, + "step": 3628 + }, + { + "epoch": 0.18, + "grad_norm": 1.9191651194789452, + "learning_rate": 1.8773361338452552e-05, + "loss": 0.2161, + "step": 3629 + }, + { + "epoch": 0.18, + "grad_norm": 1.0679724192636588, + "learning_rate": 1.87725708781797e-05, + "loss": 0.2244, + "step": 3630 + }, + { + "epoch": 0.18, + "grad_norm": 1.0051450683135599, + "learning_rate": 1.8771780179950365e-05, + "loss": 0.236, + "step": 3631 + }, + { + "epoch": 0.18, + "grad_norm": 1.054895456949411, + "learning_rate": 1.8770989243785996e-05, + "loss": 0.2241, + "step": 3632 + }, + { + "epoch": 0.18, + "grad_norm": 0.9056539769716427, + "learning_rate": 1.8770198069708053e-05, + "loss": 0.2322, + "step": 3633 + }, + { + "epoch": 0.18, + "grad_norm": 1.0234613563851183, + "learning_rate": 1.8769406657737987e-05, + "loss": 0.2316, + "step": 3634 + }, + { + "epoch": 0.18, + "grad_norm": 0.9956590739601004, + "learning_rate": 1.8768615007897274e-05, + "loss": 0.199, + "step": 3635 + }, + { + "epoch": 0.18, + "grad_norm": 0.8869981627351374, + "learning_rate": 1.8767823120207382e-05, + "loss": 0.2109, + "step": 3636 + }, + { + "epoch": 0.18, + "grad_norm": 0.9934235815819739, + "learning_rate": 1.876703099468979e-05, + "loss": 0.2304, + "step": 3637 + }, + { + "epoch": 0.18, + "grad_norm": 1.4633457424825305, + "learning_rate": 1.8766238631365993e-05, + "loss": 0.2083, + "step": 3638 + }, + { + "epoch": 0.19, + "grad_norm": 1.1406976495870418, + "learning_rate": 1.8765446030257475e-05, + "loss": 0.2153, + "step": 3639 + }, + { + "epoch": 0.19, + "grad_norm": 0.9879296696273988, + "learning_rate": 1.8764653191385737e-05, + "loss": 0.2085, + "step": 3640 + }, + { + "epoch": 0.19, + "grad_norm": 0.910282344054396, + "learning_rate": 1.876386011477229e-05, + "loss": 0.2094, + "step": 3641 + }, + { + "epoch": 0.19, + "grad_norm": 1.0136285939917482, + "learning_rate": 1.8763066800438638e-05, + "loss": 0.2312, + "step": 3642 + }, + { + "epoch": 0.19, + "grad_norm": 0.9982969903185347, + "learning_rate": 1.8762273248406308e-05, + "loss": 0.2199, + "step": 3643 + }, + { + "epoch": 0.19, + "grad_norm": 1.0218645304104221, + "learning_rate": 1.8761479458696817e-05, + "loss": 0.2435, + "step": 3644 + }, + { + "epoch": 0.19, + "grad_norm": 0.801950250637673, + "learning_rate": 1.87606854313317e-05, + "loss": 0.2279, + "step": 3645 + }, + { + "epoch": 0.19, + "grad_norm": 0.9830240715600229, + "learning_rate": 1.87598911663325e-05, + "loss": 0.202, + "step": 3646 + }, + { + "epoch": 0.19, + "grad_norm": 1.062684827845211, + "learning_rate": 1.8759096663720757e-05, + "loss": 0.2038, + "step": 3647 + }, + { + "epoch": 0.19, + "grad_norm": 0.8424173998687214, + "learning_rate": 1.8758301923518022e-05, + "loss": 0.2075, + "step": 3648 + }, + { + "epoch": 0.19, + "grad_norm": 1.2659538205110992, + "learning_rate": 1.8757506945745853e-05, + "loss": 0.2135, + "step": 3649 + }, + { + "epoch": 0.19, + "grad_norm": 0.8779279555202277, + "learning_rate": 1.875671173042581e-05, + "loss": 0.2124, + "step": 3650 + }, + { + "epoch": 0.19, + "grad_norm": 1.0724684529868374, + "learning_rate": 1.875591627757947e-05, + "loss": 0.2271, + "step": 3651 + }, + { + "epoch": 0.19, + "grad_norm": 0.997713641099804, + "learning_rate": 1.8755120587228407e-05, + "loss": 0.211, + "step": 3652 + }, + { + "epoch": 0.19, + "grad_norm": 0.9509704006661915, + "learning_rate": 1.87543246593942e-05, + "loss": 0.2153, + "step": 3653 + }, + { + "epoch": 0.19, + "grad_norm": 0.9876044502985013, + "learning_rate": 1.8753528494098448e-05, + "loss": 0.2446, + "step": 3654 + }, + { + "epoch": 0.19, + "grad_norm": 0.9074009768086965, + "learning_rate": 1.8752732091362737e-05, + "loss": 0.2376, + "step": 3655 + }, + { + "epoch": 0.19, + "grad_norm": 0.8466323825950363, + "learning_rate": 1.8751935451208672e-05, + "loss": 0.2198, + "step": 3656 + }, + { + "epoch": 0.19, + "grad_norm": 0.9408926489341524, + "learning_rate": 1.875113857365787e-05, + "loss": 0.2191, + "step": 3657 + }, + { + "epoch": 0.19, + "grad_norm": 0.9072872274928622, + "learning_rate": 1.8750341458731934e-05, + "loss": 0.2319, + "step": 3658 + }, + { + "epoch": 0.19, + "grad_norm": 0.9542440451622091, + "learning_rate": 1.8749544106452493e-05, + "loss": 0.1976, + "step": 3659 + }, + { + "epoch": 0.19, + "grad_norm": 0.7645091657265274, + "learning_rate": 1.8748746516841173e-05, + "loss": 0.2083, + "step": 3660 + }, + { + "epoch": 0.19, + "grad_norm": 1.156226521004843, + "learning_rate": 1.8747948689919613e-05, + "loss": 0.2487, + "step": 3661 + }, + { + "epoch": 0.19, + "grad_norm": 0.955620834216678, + "learning_rate": 1.8747150625709447e-05, + "loss": 0.2347, + "step": 3662 + }, + { + "epoch": 0.19, + "grad_norm": 0.9942108737792252, + "learning_rate": 1.874635232423233e-05, + "loss": 0.2062, + "step": 3663 + }, + { + "epoch": 0.19, + "grad_norm": 1.007020797262451, + "learning_rate": 1.874555378550991e-05, + "loss": 0.2038, + "step": 3664 + }, + { + "epoch": 0.19, + "grad_norm": 0.9402919883523149, + "learning_rate": 1.874475500956385e-05, + "loss": 0.2045, + "step": 3665 + }, + { + "epoch": 0.19, + "grad_norm": 1.3820371958137763, + "learning_rate": 1.8743955996415816e-05, + "loss": 0.2158, + "step": 3666 + }, + { + "epoch": 0.19, + "grad_norm": 1.3681512159679738, + "learning_rate": 1.8743156746087484e-05, + "loss": 0.2455, + "step": 3667 + }, + { + "epoch": 0.19, + "grad_norm": 0.8308239350203164, + "learning_rate": 1.874235725860053e-05, + "loss": 0.1835, + "step": 3668 + }, + { + "epoch": 0.19, + "grad_norm": 0.9181442338099888, + "learning_rate": 1.874155753397664e-05, + "loss": 0.2669, + "step": 3669 + }, + { + "epoch": 0.19, + "grad_norm": 1.5206039594663598, + "learning_rate": 1.874075757223751e-05, + "loss": 0.2338, + "step": 3670 + }, + { + "epoch": 0.19, + "grad_norm": 1.1480071255535726, + "learning_rate": 1.8739957373404835e-05, + "loss": 0.2121, + "step": 3671 + }, + { + "epoch": 0.19, + "grad_norm": 1.389739264223165, + "learning_rate": 1.8739156937500323e-05, + "loss": 0.2205, + "step": 3672 + }, + { + "epoch": 0.19, + "grad_norm": 1.238895768277098, + "learning_rate": 1.8738356264545685e-05, + "loss": 0.218, + "step": 3673 + }, + { + "epoch": 0.19, + "grad_norm": 0.9526703867083856, + "learning_rate": 1.8737555354562644e-05, + "loss": 0.2024, + "step": 3674 + }, + { + "epoch": 0.19, + "grad_norm": 0.8969813658287388, + "learning_rate": 1.873675420757292e-05, + "loss": 0.2295, + "step": 3675 + }, + { + "epoch": 0.19, + "grad_norm": 1.2168913052504897, + "learning_rate": 1.873595282359824e-05, + "loss": 0.2147, + "step": 3676 + }, + { + "epoch": 0.19, + "grad_norm": 1.021148886487509, + "learning_rate": 1.873515120266035e-05, + "loss": 0.2271, + "step": 3677 + }, + { + "epoch": 0.19, + "grad_norm": 1.089162382736061, + "learning_rate": 1.8734349344780985e-05, + "loss": 0.2538, + "step": 3678 + }, + { + "epoch": 0.19, + "grad_norm": 1.087846460938777, + "learning_rate": 1.8733547249981904e-05, + "loss": 0.2473, + "step": 3679 + }, + { + "epoch": 0.19, + "grad_norm": 1.6091696891202787, + "learning_rate": 1.8732744918284866e-05, + "loss": 0.2228, + "step": 3680 + }, + { + "epoch": 0.19, + "grad_norm": 0.9178567817841584, + "learning_rate": 1.873194234971162e-05, + "loss": 0.1982, + "step": 3681 + }, + { + "epoch": 0.19, + "grad_norm": 0.9954008637114219, + "learning_rate": 1.8731139544283952e-05, + "loss": 0.1894, + "step": 3682 + }, + { + "epoch": 0.19, + "grad_norm": 0.8191302667217233, + "learning_rate": 1.873033650202363e-05, + "loss": 0.2321, + "step": 3683 + }, + { + "epoch": 0.19, + "grad_norm": 1.7547033099045546, + "learning_rate": 1.872953322295243e-05, + "loss": 0.2222, + "step": 3684 + }, + { + "epoch": 0.19, + "grad_norm": 1.0682176638270313, + "learning_rate": 1.8728729707092156e-05, + "loss": 0.2266, + "step": 3685 + }, + { + "epoch": 0.19, + "grad_norm": 1.0770283192110661, + "learning_rate": 1.8727925954464588e-05, + "loss": 0.209, + "step": 3686 + }, + { + "epoch": 0.19, + "grad_norm": 1.0130660718947984, + "learning_rate": 1.8727121965091542e-05, + "loss": 0.2055, + "step": 3687 + }, + { + "epoch": 0.19, + "grad_norm": 1.1315284288169722, + "learning_rate": 1.8726317738994817e-05, + "loss": 0.2378, + "step": 3688 + }, + { + "epoch": 0.19, + "grad_norm": 1.0757318688843531, + "learning_rate": 1.8725513276196232e-05, + "loss": 0.214, + "step": 3689 + }, + { + "epoch": 0.19, + "grad_norm": 0.8001330749908178, + "learning_rate": 1.8724708576717607e-05, + "loss": 0.2101, + "step": 3690 + }, + { + "epoch": 0.19, + "grad_norm": 1.2760143890057158, + "learning_rate": 1.872390364058077e-05, + "loss": 0.2274, + "step": 3691 + }, + { + "epoch": 0.19, + "grad_norm": 1.198424221315609, + "learning_rate": 1.872309846780755e-05, + "loss": 0.2008, + "step": 3692 + }, + { + "epoch": 0.19, + "grad_norm": 1.085392171878298, + "learning_rate": 1.8722293058419794e-05, + "loss": 0.2215, + "step": 3693 + }, + { + "epoch": 0.19, + "grad_norm": 1.0167219653500086, + "learning_rate": 1.8721487412439344e-05, + "loss": 0.2413, + "step": 3694 + }, + { + "epoch": 0.19, + "grad_norm": 1.6452475011732255, + "learning_rate": 1.8720681529888057e-05, + "loss": 0.2311, + "step": 3695 + }, + { + "epoch": 0.19, + "grad_norm": 1.148723352911266, + "learning_rate": 1.8719875410787793e-05, + "loss": 0.2149, + "step": 3696 + }, + { + "epoch": 0.19, + "grad_norm": 0.9979650627009616, + "learning_rate": 1.8719069055160415e-05, + "loss": 0.2114, + "step": 3697 + }, + { + "epoch": 0.19, + "grad_norm": 0.878102123527887, + "learning_rate": 1.8718262463027795e-05, + "loss": 0.2167, + "step": 3698 + }, + { + "epoch": 0.19, + "grad_norm": 0.8587460522038438, + "learning_rate": 1.8717455634411813e-05, + "loss": 0.2157, + "step": 3699 + }, + { + "epoch": 0.19, + "grad_norm": 0.9939679426048222, + "learning_rate": 1.8716648569334355e-05, + "loss": 0.2142, + "step": 3700 + }, + { + "epoch": 0.19, + "grad_norm": 1.0933099196445442, + "learning_rate": 1.8715841267817313e-05, + "loss": 0.24, + "step": 3701 + }, + { + "epoch": 0.19, + "grad_norm": 0.9072361235671731, + "learning_rate": 1.8715033729882585e-05, + "loss": 0.2267, + "step": 3702 + }, + { + "epoch": 0.19, + "grad_norm": 1.5972984097976861, + "learning_rate": 1.8714225955552074e-05, + "loss": 0.2271, + "step": 3703 + }, + { + "epoch": 0.19, + "grad_norm": 1.195176661277982, + "learning_rate": 1.8713417944847688e-05, + "loss": 0.2464, + "step": 3704 + }, + { + "epoch": 0.19, + "grad_norm": 0.964239160509171, + "learning_rate": 1.871260969779135e-05, + "loss": 0.224, + "step": 3705 + }, + { + "epoch": 0.19, + "grad_norm": 1.1009244798701132, + "learning_rate": 1.8711801214404986e-05, + "loss": 0.2277, + "step": 3706 + }, + { + "epoch": 0.19, + "grad_norm": 1.4679413308921536, + "learning_rate": 1.871099249471052e-05, + "loss": 0.2207, + "step": 3707 + }, + { + "epoch": 0.19, + "grad_norm": 1.4215098692749524, + "learning_rate": 1.871018353872989e-05, + "loss": 0.2172, + "step": 3708 + }, + { + "epoch": 0.19, + "grad_norm": 1.0315434809422932, + "learning_rate": 1.870937434648504e-05, + "loss": 0.221, + "step": 3709 + }, + { + "epoch": 0.19, + "grad_norm": 0.8370794377256259, + "learning_rate": 1.8708564917997917e-05, + "loss": 0.2127, + "step": 3710 + }, + { + "epoch": 0.19, + "grad_norm": 1.3562200726557778, + "learning_rate": 1.870775525329048e-05, + "loss": 0.2168, + "step": 3711 + }, + { + "epoch": 0.19, + "grad_norm": 1.0137792718910874, + "learning_rate": 1.870694535238469e-05, + "loss": 0.2214, + "step": 3712 + }, + { + "epoch": 0.19, + "grad_norm": 0.9076632568937337, + "learning_rate": 1.8706135215302517e-05, + "loss": 0.2201, + "step": 3713 + }, + { + "epoch": 0.19, + "grad_norm": 0.878009141999452, + "learning_rate": 1.8705324842065933e-05, + "loss": 0.2232, + "step": 3714 + }, + { + "epoch": 0.19, + "grad_norm": 1.0509396224366285, + "learning_rate": 1.870451423269692e-05, + "loss": 0.212, + "step": 3715 + }, + { + "epoch": 0.19, + "grad_norm": 1.043258822287421, + "learning_rate": 1.870370338721747e-05, + "loss": 0.212, + "step": 3716 + }, + { + "epoch": 0.19, + "grad_norm": 1.0121568442288489, + "learning_rate": 1.870289230564957e-05, + "loss": 0.2585, + "step": 3717 + }, + { + "epoch": 0.19, + "grad_norm": 1.0953299936197005, + "learning_rate": 1.870208098801523e-05, + "loss": 0.2228, + "step": 3718 + }, + { + "epoch": 0.19, + "grad_norm": 1.0607744241849235, + "learning_rate": 1.8701269434336447e-05, + "loss": 0.2489, + "step": 3719 + }, + { + "epoch": 0.19, + "grad_norm": 0.8472590526760381, + "learning_rate": 1.870045764463524e-05, + "loss": 0.2173, + "step": 3720 + }, + { + "epoch": 0.19, + "grad_norm": 1.0576878592684522, + "learning_rate": 1.8699645618933628e-05, + "loss": 0.2177, + "step": 3721 + }, + { + "epoch": 0.19, + "grad_norm": 1.0360486988527098, + "learning_rate": 1.869883335725364e-05, + "loss": 0.2238, + "step": 3722 + }, + { + "epoch": 0.19, + "grad_norm": 1.1969945924023768, + "learning_rate": 1.8698020859617303e-05, + "loss": 0.229, + "step": 3723 + }, + { + "epoch": 0.19, + "grad_norm": 1.1733345648184206, + "learning_rate": 1.869720812604666e-05, + "loss": 0.2289, + "step": 3724 + }, + { + "epoch": 0.19, + "grad_norm": 1.1734416691118115, + "learning_rate": 1.8696395156563753e-05, + "loss": 0.2121, + "step": 3725 + }, + { + "epoch": 0.19, + "grad_norm": 1.7741114933790696, + "learning_rate": 1.8695581951190636e-05, + "loss": 0.2237, + "step": 3726 + }, + { + "epoch": 0.19, + "grad_norm": 0.8425428534413996, + "learning_rate": 1.8694768509949374e-05, + "loss": 0.2082, + "step": 3727 + }, + { + "epoch": 0.19, + "grad_norm": 0.908533200343883, + "learning_rate": 1.8693954832862017e-05, + "loss": 0.2088, + "step": 3728 + }, + { + "epoch": 0.19, + "grad_norm": 1.084859595161081, + "learning_rate": 1.869314091995065e-05, + "loss": 0.191, + "step": 3729 + }, + { + "epoch": 0.19, + "grad_norm": 1.840271807447316, + "learning_rate": 1.8692326771237344e-05, + "loss": 0.258, + "step": 3730 + }, + { + "epoch": 0.19, + "grad_norm": 0.9170431065446714, + "learning_rate": 1.8691512386744183e-05, + "loss": 0.225, + "step": 3731 + }, + { + "epoch": 0.19, + "grad_norm": 1.0010752864096515, + "learning_rate": 1.8690697766493252e-05, + "loss": 0.2519, + "step": 3732 + }, + { + "epoch": 0.19, + "grad_norm": 0.8686061614515938, + "learning_rate": 1.8689882910506658e-05, + "loss": 0.2076, + "step": 3733 + }, + { + "epoch": 0.19, + "grad_norm": 1.0749405681204596, + "learning_rate": 1.8689067818806503e-05, + "loss": 0.2127, + "step": 3734 + }, + { + "epoch": 0.19, + "grad_norm": 1.0258137289693825, + "learning_rate": 1.8688252491414886e-05, + "loss": 0.2108, + "step": 3735 + }, + { + "epoch": 0.19, + "grad_norm": 1.0969570804965412, + "learning_rate": 1.868743692835393e-05, + "loss": 0.1829, + "step": 3736 + }, + { + "epoch": 0.19, + "grad_norm": 1.259486363094936, + "learning_rate": 1.868662112964576e-05, + "loss": 0.2184, + "step": 3737 + }, + { + "epoch": 0.19, + "grad_norm": 1.0370119946877436, + "learning_rate": 1.8685805095312498e-05, + "loss": 0.2101, + "step": 3738 + }, + { + "epoch": 0.19, + "grad_norm": 1.0349854099522016, + "learning_rate": 1.8684988825376285e-05, + "loss": 0.21, + "step": 3739 + }, + { + "epoch": 0.19, + "grad_norm": 1.2262944485867913, + "learning_rate": 1.8684172319859258e-05, + "loss": 0.2563, + "step": 3740 + }, + { + "epoch": 0.19, + "grad_norm": 1.1655799061302887, + "learning_rate": 1.8683355578783567e-05, + "loss": 0.2179, + "step": 3741 + }, + { + "epoch": 0.19, + "grad_norm": 2.4736756001428297, + "learning_rate": 1.868253860217137e-05, + "loss": 0.2407, + "step": 3742 + }, + { + "epoch": 0.19, + "grad_norm": 0.9410716912521949, + "learning_rate": 1.8681721390044817e-05, + "loss": 0.2069, + "step": 3743 + }, + { + "epoch": 0.19, + "grad_norm": 1.4025378391460672, + "learning_rate": 1.868090394242608e-05, + "loss": 0.2414, + "step": 3744 + }, + { + "epoch": 0.19, + "grad_norm": 1.313299060245258, + "learning_rate": 1.8680086259337337e-05, + "loss": 0.2157, + "step": 3745 + }, + { + "epoch": 0.19, + "grad_norm": 0.920574490398171, + "learning_rate": 1.8679268340800764e-05, + "loss": 0.2423, + "step": 3746 + }, + { + "epoch": 0.19, + "grad_norm": 1.0078079186913158, + "learning_rate": 1.8678450186838545e-05, + "loss": 0.2178, + "step": 3747 + }, + { + "epoch": 0.19, + "grad_norm": 0.8418987504966208, + "learning_rate": 1.8677631797472874e-05, + "loss": 0.2322, + "step": 3748 + }, + { + "epoch": 0.19, + "grad_norm": 1.2665665598701452, + "learning_rate": 1.867681317272595e-05, + "loss": 0.2214, + "step": 3749 + }, + { + "epoch": 0.19, + "grad_norm": 0.9829511796364977, + "learning_rate": 1.867599431261998e-05, + "loss": 0.2184, + "step": 3750 + }, + { + "epoch": 0.19, + "grad_norm": 1.3460645041914767, + "learning_rate": 1.8675175217177176e-05, + "loss": 0.2331, + "step": 3751 + }, + { + "epoch": 0.19, + "grad_norm": 0.9805157466720837, + "learning_rate": 1.867435588641975e-05, + "loss": 0.2293, + "step": 3752 + }, + { + "epoch": 0.19, + "grad_norm": 1.234993024768991, + "learning_rate": 1.8673536320369936e-05, + "loss": 0.2311, + "step": 3753 + }, + { + "epoch": 0.19, + "grad_norm": 1.373040649077192, + "learning_rate": 1.8672716519049957e-05, + "loss": 0.2258, + "step": 3754 + }, + { + "epoch": 0.19, + "grad_norm": 1.0665080193056045, + "learning_rate": 1.867189648248205e-05, + "loss": 0.2163, + "step": 3755 + }, + { + "epoch": 0.19, + "grad_norm": 1.42465957114889, + "learning_rate": 1.8671076210688467e-05, + "loss": 0.2035, + "step": 3756 + }, + { + "epoch": 0.19, + "grad_norm": 1.2238898901999282, + "learning_rate": 1.867025570369145e-05, + "loss": 0.2401, + "step": 3757 + }, + { + "epoch": 0.19, + "grad_norm": 1.176117512236095, + "learning_rate": 1.8669434961513256e-05, + "loss": 0.2117, + "step": 3758 + }, + { + "epoch": 0.19, + "grad_norm": 1.1875416816543796, + "learning_rate": 1.866861398417615e-05, + "loss": 0.2207, + "step": 3759 + }, + { + "epoch": 0.19, + "grad_norm": 1.2449151104729497, + "learning_rate": 1.8667792771702397e-05, + "loss": 0.2156, + "step": 3760 + }, + { + "epoch": 0.19, + "grad_norm": 0.8601865869175008, + "learning_rate": 1.8666971324114277e-05, + "loss": 0.1938, + "step": 3761 + }, + { + "epoch": 0.19, + "grad_norm": 1.0584334643359323, + "learning_rate": 1.866614964143407e-05, + "loss": 0.2206, + "step": 3762 + }, + { + "epoch": 0.19, + "grad_norm": 0.9599952315995963, + "learning_rate": 1.8665327723684065e-05, + "loss": 0.2334, + "step": 3763 + }, + { + "epoch": 0.19, + "grad_norm": 3.068548285779253, + "learning_rate": 1.8664505570886557e-05, + "loss": 0.2138, + "step": 3764 + }, + { + "epoch": 0.19, + "grad_norm": 0.9137698886234791, + "learning_rate": 1.8663683183063846e-05, + "loss": 0.2418, + "step": 3765 + }, + { + "epoch": 0.19, + "grad_norm": 0.8822421507354169, + "learning_rate": 1.8662860560238238e-05, + "loss": 0.2357, + "step": 3766 + }, + { + "epoch": 0.19, + "grad_norm": 0.9839602464096109, + "learning_rate": 1.866203770243205e-05, + "loss": 0.219, + "step": 3767 + }, + { + "epoch": 0.19, + "grad_norm": 1.017921041143888, + "learning_rate": 1.86612146096676e-05, + "loss": 0.1929, + "step": 3768 + }, + { + "epoch": 0.19, + "grad_norm": 0.9666760157968136, + "learning_rate": 1.8660391281967213e-05, + "loss": 0.2327, + "step": 3769 + }, + { + "epoch": 0.19, + "grad_norm": 1.2427749818036062, + "learning_rate": 1.8659567719353223e-05, + "loss": 0.2206, + "step": 3770 + }, + { + "epoch": 0.19, + "grad_norm": 1.0818309717884633, + "learning_rate": 1.865874392184797e-05, + "loss": 0.2248, + "step": 3771 + }, + { + "epoch": 0.19, + "grad_norm": 1.5483282083497667, + "learning_rate": 1.86579198894738e-05, + "loss": 0.2569, + "step": 3772 + }, + { + "epoch": 0.19, + "grad_norm": 1.0491292439109805, + "learning_rate": 1.8657095622253064e-05, + "loss": 0.2199, + "step": 3773 + }, + { + "epoch": 0.19, + "grad_norm": 0.98191139928569, + "learning_rate": 1.8656271120208118e-05, + "loss": 0.2415, + "step": 3774 + }, + { + "epoch": 0.19, + "grad_norm": 0.8249343376451618, + "learning_rate": 1.8655446383361332e-05, + "loss": 0.2419, + "step": 3775 + }, + { + "epoch": 0.19, + "grad_norm": 1.0439989508003285, + "learning_rate": 1.8654621411735072e-05, + "loss": 0.2125, + "step": 3776 + }, + { + "epoch": 0.19, + "grad_norm": 0.9956541278544652, + "learning_rate": 1.8653796205351717e-05, + "loss": 0.2423, + "step": 3777 + }, + { + "epoch": 0.19, + "grad_norm": 0.7951566731952017, + "learning_rate": 1.865297076423365e-05, + "loss": 0.2042, + "step": 3778 + }, + { + "epoch": 0.19, + "grad_norm": 1.0345974375671212, + "learning_rate": 1.8652145088403267e-05, + "loss": 0.2209, + "step": 3779 + }, + { + "epoch": 0.19, + "grad_norm": 0.9969738180774795, + "learning_rate": 1.8651319177882957e-05, + "loss": 0.2287, + "step": 3780 + }, + { + "epoch": 0.19, + "grad_norm": 1.4389658031539585, + "learning_rate": 1.8650493032695124e-05, + "loss": 0.2236, + "step": 3781 + }, + { + "epoch": 0.19, + "grad_norm": 1.134999809634415, + "learning_rate": 1.8649666652862183e-05, + "loss": 0.2247, + "step": 3782 + }, + { + "epoch": 0.19, + "grad_norm": 0.9820850253804223, + "learning_rate": 1.864884003840654e-05, + "loss": 0.2137, + "step": 3783 + }, + { + "epoch": 0.19, + "grad_norm": 0.8792425222701588, + "learning_rate": 1.8648013189350628e-05, + "loss": 0.2228, + "step": 3784 + }, + { + "epoch": 0.19, + "grad_norm": 1.2159676018930894, + "learning_rate": 1.864718610571687e-05, + "loss": 0.2443, + "step": 3785 + }, + { + "epoch": 0.19, + "grad_norm": 0.9169076017661532, + "learning_rate": 1.8646358787527697e-05, + "loss": 0.197, + "step": 3786 + }, + { + "epoch": 0.19, + "grad_norm": 1.1325155925959844, + "learning_rate": 1.8645531234805554e-05, + "loss": 0.2191, + "step": 3787 + }, + { + "epoch": 0.19, + "grad_norm": 1.2026712867680391, + "learning_rate": 1.864470344757289e-05, + "loss": 0.2246, + "step": 3788 + }, + { + "epoch": 0.19, + "grad_norm": 1.1107125406261882, + "learning_rate": 1.8643875425852155e-05, + "loss": 0.2394, + "step": 3789 + }, + { + "epoch": 0.19, + "grad_norm": 1.4123550933251017, + "learning_rate": 1.864304716966581e-05, + "loss": 0.2606, + "step": 3790 + }, + { + "epoch": 0.19, + "grad_norm": 1.325865887213283, + "learning_rate": 1.8642218679036324e-05, + "loss": 0.2172, + "step": 3791 + }, + { + "epoch": 0.19, + "grad_norm": 1.0620807363636982, + "learning_rate": 1.8641389953986165e-05, + "loss": 0.2409, + "step": 3792 + }, + { + "epoch": 0.19, + "grad_norm": 1.071205185857897, + "learning_rate": 1.864056099453782e-05, + "loss": 0.2386, + "step": 3793 + }, + { + "epoch": 0.19, + "grad_norm": 0.8456964313282443, + "learning_rate": 1.8639731800713766e-05, + "loss": 0.1992, + "step": 3794 + }, + { + "epoch": 0.19, + "grad_norm": 1.1931661849786743, + "learning_rate": 1.8638902372536502e-05, + "loss": 0.2136, + "step": 3795 + }, + { + "epoch": 0.19, + "grad_norm": 1.120405301713219, + "learning_rate": 1.8638072710028523e-05, + "loss": 0.2087, + "step": 3796 + }, + { + "epoch": 0.19, + "grad_norm": 0.9879170452045153, + "learning_rate": 1.8637242813212334e-05, + "loss": 0.2056, + "step": 3797 + }, + { + "epoch": 0.19, + "grad_norm": 3.2824930991489314, + "learning_rate": 1.8636412682110445e-05, + "loss": 0.2151, + "step": 3798 + }, + { + "epoch": 0.19, + "grad_norm": 1.293469054776063, + "learning_rate": 1.8635582316745374e-05, + "loss": 0.2052, + "step": 3799 + }, + { + "epoch": 0.19, + "grad_norm": 1.025378532115027, + "learning_rate": 1.8634751717139644e-05, + "loss": 0.2118, + "step": 3800 + }, + { + "epoch": 0.19, + "grad_norm": 0.7830661665580738, + "learning_rate": 1.863392088331579e-05, + "loss": 0.2128, + "step": 3801 + }, + { + "epoch": 0.19, + "grad_norm": 1.3021193311965715, + "learning_rate": 1.863308981529634e-05, + "loss": 0.2381, + "step": 3802 + }, + { + "epoch": 0.19, + "grad_norm": 0.762346379086396, + "learning_rate": 1.863225851310384e-05, + "loss": 0.2063, + "step": 3803 + }, + { + "epoch": 0.19, + "grad_norm": 1.4515520631191818, + "learning_rate": 1.8631426976760844e-05, + "loss": 0.2143, + "step": 3804 + }, + { + "epoch": 0.19, + "grad_norm": 1.171511742865271, + "learning_rate": 1.8630595206289904e-05, + "loss": 0.227, + "step": 3805 + }, + { + "epoch": 0.19, + "grad_norm": 0.8935558090507881, + "learning_rate": 1.862976320171358e-05, + "loss": 0.2127, + "step": 3806 + }, + { + "epoch": 0.19, + "grad_norm": 0.9868740593439831, + "learning_rate": 1.8628930963054444e-05, + "loss": 0.2191, + "step": 3807 + }, + { + "epoch": 0.19, + "grad_norm": 0.9324619016671554, + "learning_rate": 1.8628098490335064e-05, + "loss": 0.2184, + "step": 3808 + }, + { + "epoch": 0.19, + "grad_norm": 1.002216078090249, + "learning_rate": 1.8627265783578028e-05, + "loss": 0.2105, + "step": 3809 + }, + { + "epoch": 0.19, + "grad_norm": 1.1699929258731216, + "learning_rate": 1.862643284280592e-05, + "loss": 0.2267, + "step": 3810 + }, + { + "epoch": 0.19, + "grad_norm": 0.837422894813569, + "learning_rate": 1.8625599668041334e-05, + "loss": 0.1931, + "step": 3811 + }, + { + "epoch": 0.19, + "grad_norm": 0.9597112271853646, + "learning_rate": 1.862476625930687e-05, + "loss": 0.222, + "step": 3812 + }, + { + "epoch": 0.19, + "grad_norm": 0.9219677680827443, + "learning_rate": 1.8623932616625133e-05, + "loss": 0.2037, + "step": 3813 + }, + { + "epoch": 0.19, + "grad_norm": 0.9555404866784017, + "learning_rate": 1.8623098740018736e-05, + "loss": 0.244, + "step": 3814 + }, + { + "epoch": 0.19, + "grad_norm": 0.8565637533356807, + "learning_rate": 1.86222646295103e-05, + "loss": 0.2166, + "step": 3815 + }, + { + "epoch": 0.19, + "grad_norm": 1.0481091626759864, + "learning_rate": 1.8621430285122447e-05, + "loss": 0.231, + "step": 3816 + }, + { + "epoch": 0.19, + "grad_norm": 0.8774766428514378, + "learning_rate": 1.8620595706877816e-05, + "loss": 0.214, + "step": 3817 + }, + { + "epoch": 0.19, + "grad_norm": 0.9865676986752081, + "learning_rate": 1.8619760894799034e-05, + "loss": 0.2236, + "step": 3818 + }, + { + "epoch": 0.19, + "grad_norm": 1.2433378952932157, + "learning_rate": 1.8618925848908757e-05, + "loss": 0.2178, + "step": 3819 + }, + { + "epoch": 0.19, + "grad_norm": 0.8438590361016094, + "learning_rate": 1.861809056922962e-05, + "loss": 0.2188, + "step": 3820 + }, + { + "epoch": 0.19, + "grad_norm": 0.9083392931246398, + "learning_rate": 1.8617255055784298e-05, + "loss": 0.2152, + "step": 3821 + }, + { + "epoch": 0.19, + "grad_norm": 0.9041176626279603, + "learning_rate": 1.8616419308595443e-05, + "loss": 0.2471, + "step": 3822 + }, + { + "epoch": 0.19, + "grad_norm": 0.8617167966436966, + "learning_rate": 1.8615583327685726e-05, + "loss": 0.2185, + "step": 3823 + }, + { + "epoch": 0.19, + "grad_norm": 0.831294907600138, + "learning_rate": 1.8614747113077826e-05, + "loss": 0.238, + "step": 3824 + }, + { + "epoch": 0.19, + "grad_norm": 1.1312425674849975, + "learning_rate": 1.8613910664794424e-05, + "loss": 0.2646, + "step": 3825 + }, + { + "epoch": 0.19, + "grad_norm": 0.9087120849416299, + "learning_rate": 1.861307398285821e-05, + "loss": 0.2043, + "step": 3826 + }, + { + "epoch": 0.19, + "grad_norm": 1.0711854562301435, + "learning_rate": 1.8612237067291878e-05, + "loss": 0.2137, + "step": 3827 + }, + { + "epoch": 0.19, + "grad_norm": 1.1537888299444026, + "learning_rate": 1.8611399918118124e-05, + "loss": 0.2275, + "step": 3828 + }, + { + "epoch": 0.19, + "grad_norm": 1.053077637022216, + "learning_rate": 1.8610562535359667e-05, + "loss": 0.2573, + "step": 3829 + }, + { + "epoch": 0.19, + "grad_norm": 0.8794836263083736, + "learning_rate": 1.8609724919039213e-05, + "loss": 0.2217, + "step": 3830 + }, + { + "epoch": 0.19, + "grad_norm": 1.1927003302097705, + "learning_rate": 1.8608887069179483e-05, + "loss": 0.2239, + "step": 3831 + }, + { + "epoch": 0.19, + "grad_norm": 0.9961929880291288, + "learning_rate": 1.8608048985803205e-05, + "loss": 0.2101, + "step": 3832 + }, + { + "epoch": 0.19, + "grad_norm": 0.7476771727973698, + "learning_rate": 1.8607210668933114e-05, + "loss": 0.2028, + "step": 3833 + }, + { + "epoch": 0.19, + "grad_norm": 1.1708729140370346, + "learning_rate": 1.8606372118591943e-05, + "loss": 0.2418, + "step": 3834 + }, + { + "epoch": 0.2, + "grad_norm": 1.95154019222484, + "learning_rate": 1.8605533334802448e-05, + "loss": 0.22, + "step": 3835 + }, + { + "epoch": 0.2, + "grad_norm": 0.9420981207262242, + "learning_rate": 1.8604694317587372e-05, + "loss": 0.1962, + "step": 3836 + }, + { + "epoch": 0.2, + "grad_norm": 0.7574527137216649, + "learning_rate": 1.8603855066969478e-05, + "loss": 0.2019, + "step": 3837 + }, + { + "epoch": 0.2, + "grad_norm": 1.1338212264538146, + "learning_rate": 1.860301558297153e-05, + "loss": 0.234, + "step": 3838 + }, + { + "epoch": 0.2, + "grad_norm": 0.9869099042527179, + "learning_rate": 1.8602175865616296e-05, + "loss": 0.233, + "step": 3839 + }, + { + "epoch": 0.2, + "grad_norm": 0.9117557983918922, + "learning_rate": 1.8601335914926558e-05, + "loss": 0.2072, + "step": 3840 + }, + { + "epoch": 0.2, + "grad_norm": 1.0213142783338018, + "learning_rate": 1.8600495730925095e-05, + "loss": 0.2166, + "step": 3841 + }, + { + "epoch": 0.2, + "grad_norm": 1.0960729307387658, + "learning_rate": 1.8599655313634702e-05, + "loss": 0.1988, + "step": 3842 + }, + { + "epoch": 0.2, + "grad_norm": 1.2800084319729785, + "learning_rate": 1.859881466307817e-05, + "loss": 0.2403, + "step": 3843 + }, + { + "epoch": 0.2, + "grad_norm": 1.643017787270073, + "learning_rate": 1.8597973779278307e-05, + "loss": 0.2231, + "step": 3844 + }, + { + "epoch": 0.2, + "grad_norm": 1.0880616948226376, + "learning_rate": 1.859713266225792e-05, + "loss": 0.2414, + "step": 3845 + }, + { + "epoch": 0.2, + "grad_norm": 0.9851831423429016, + "learning_rate": 1.859629131203982e-05, + "loss": 0.2098, + "step": 3846 + }, + { + "epoch": 0.2, + "grad_norm": 0.9979121868241856, + "learning_rate": 1.859544972864684e-05, + "loss": 0.223, + "step": 3847 + }, + { + "epoch": 0.2, + "grad_norm": 0.9845690582968679, + "learning_rate": 1.8594607912101797e-05, + "loss": 0.2172, + "step": 3848 + }, + { + "epoch": 0.2, + "grad_norm": 1.2156036882618944, + "learning_rate": 1.8593765862427526e-05, + "loss": 0.2268, + "step": 3849 + }, + { + "epoch": 0.2, + "grad_norm": 2.037311775310428, + "learning_rate": 1.8592923579646874e-05, + "loss": 0.2036, + "step": 3850 + }, + { + "epoch": 0.2, + "grad_norm": 2.254690899134614, + "learning_rate": 1.8592081063782685e-05, + "loss": 0.2185, + "step": 3851 + }, + { + "epoch": 0.2, + "grad_norm": 1.3888197734442886, + "learning_rate": 1.8591238314857806e-05, + "loss": 0.2096, + "step": 3852 + }, + { + "epoch": 0.2, + "grad_norm": 1.7310357928446478, + "learning_rate": 1.859039533289511e-05, + "loss": 0.2426, + "step": 3853 + }, + { + "epoch": 0.2, + "grad_norm": 0.8752949059665149, + "learning_rate": 1.858955211791745e-05, + "loss": 0.2248, + "step": 3854 + }, + { + "epoch": 0.2, + "grad_norm": 0.9799619031728044, + "learning_rate": 1.858870866994771e-05, + "loss": 0.2234, + "step": 3855 + }, + { + "epoch": 0.2, + "grad_norm": 0.8881141779724255, + "learning_rate": 1.8587864989008758e-05, + "loss": 0.2308, + "step": 3856 + }, + { + "epoch": 0.2, + "grad_norm": 0.929858899344705, + "learning_rate": 1.8587021075123482e-05, + "loss": 0.1902, + "step": 3857 + }, + { + "epoch": 0.2, + "grad_norm": 0.8424469468404533, + "learning_rate": 1.8586176928314774e-05, + "loss": 0.2072, + "step": 3858 + }, + { + "epoch": 0.2, + "grad_norm": 1.7706289278768073, + "learning_rate": 1.858533254860553e-05, + "loss": 0.223, + "step": 3859 + }, + { + "epoch": 0.2, + "grad_norm": 1.2901367072560834, + "learning_rate": 1.8584487936018663e-05, + "loss": 0.2194, + "step": 3860 + }, + { + "epoch": 0.2, + "grad_norm": 0.9724146249544461, + "learning_rate": 1.8583643090577072e-05, + "loss": 0.2297, + "step": 3861 + }, + { + "epoch": 0.2, + "grad_norm": 0.9272187008477141, + "learning_rate": 1.8582798012303674e-05, + "loss": 0.2279, + "step": 3862 + }, + { + "epoch": 0.2, + "grad_norm": 0.8680660636380514, + "learning_rate": 1.85819527012214e-05, + "loss": 0.2178, + "step": 3863 + }, + { + "epoch": 0.2, + "grad_norm": 1.0452921030928661, + "learning_rate": 1.8581107157353175e-05, + "loss": 0.2579, + "step": 3864 + }, + { + "epoch": 0.2, + "grad_norm": 0.8615649767726768, + "learning_rate": 1.8580261380721932e-05, + "loss": 0.2024, + "step": 3865 + }, + { + "epoch": 0.2, + "grad_norm": 1.2325980969486496, + "learning_rate": 1.8579415371350613e-05, + "loss": 0.2322, + "step": 3866 + }, + { + "epoch": 0.2, + "grad_norm": 0.8933378457355367, + "learning_rate": 1.8578569129262168e-05, + "loss": 0.2031, + "step": 3867 + }, + { + "epoch": 0.2, + "grad_norm": 0.9453357111940053, + "learning_rate": 1.857772265447955e-05, + "loss": 0.2287, + "step": 3868 + }, + { + "epoch": 0.2, + "grad_norm": 0.9394950082674527, + "learning_rate": 1.8576875947025725e-05, + "loss": 0.2162, + "step": 3869 + }, + { + "epoch": 0.2, + "grad_norm": 1.3055556466692884, + "learning_rate": 1.8576029006923653e-05, + "loss": 0.2314, + "step": 3870 + }, + { + "epoch": 0.2, + "grad_norm": 0.9320739710953067, + "learning_rate": 1.8575181834196308e-05, + "loss": 0.2099, + "step": 3871 + }, + { + "epoch": 0.2, + "grad_norm": 1.2266250587407823, + "learning_rate": 1.857433442886667e-05, + "loss": 0.2115, + "step": 3872 + }, + { + "epoch": 0.2, + "grad_norm": 1.034361450688478, + "learning_rate": 1.8573486790957732e-05, + "loss": 0.2377, + "step": 3873 + }, + { + "epoch": 0.2, + "grad_norm": 1.061886386036358, + "learning_rate": 1.8572638920492476e-05, + "loss": 0.2303, + "step": 3874 + }, + { + "epoch": 0.2, + "grad_norm": 0.9135745761327706, + "learning_rate": 1.857179081749391e-05, + "loss": 0.227, + "step": 3875 + }, + { + "epoch": 0.2, + "grad_norm": 2.692975173577015, + "learning_rate": 1.8570942481985027e-05, + "loss": 0.2338, + "step": 3876 + }, + { + "epoch": 0.2, + "grad_norm": 1.114073471726701, + "learning_rate": 1.857009391398885e-05, + "loss": 0.2414, + "step": 3877 + }, + { + "epoch": 0.2, + "grad_norm": 0.8604903674257696, + "learning_rate": 1.856924511352839e-05, + "loss": 0.2341, + "step": 3878 + }, + { + "epoch": 0.2, + "grad_norm": 1.0061238541965627, + "learning_rate": 1.8568396080626673e-05, + "loss": 0.2605, + "step": 3879 + }, + { + "epoch": 0.2, + "grad_norm": 0.8186410015687037, + "learning_rate": 1.8567546815306726e-05, + "loss": 0.2205, + "step": 3880 + }, + { + "epoch": 0.2, + "grad_norm": 0.9094401156407541, + "learning_rate": 1.856669731759159e-05, + "loss": 0.2217, + "step": 3881 + }, + { + "epoch": 0.2, + "grad_norm": 0.8157865383260736, + "learning_rate": 1.8565847587504305e-05, + "loss": 0.2128, + "step": 3882 + }, + { + "epoch": 0.2, + "grad_norm": 1.0473730645092414, + "learning_rate": 1.856499762506792e-05, + "loss": 0.213, + "step": 3883 + }, + { + "epoch": 0.2, + "grad_norm": 0.9802341644805417, + "learning_rate": 1.8564147430305493e-05, + "loss": 0.272, + "step": 3884 + }, + { + "epoch": 0.2, + "grad_norm": 0.9530522304023589, + "learning_rate": 1.8563297003240078e-05, + "loss": 0.225, + "step": 3885 + }, + { + "epoch": 0.2, + "grad_norm": 0.9902349476942686, + "learning_rate": 1.8562446343894753e-05, + "loss": 0.214, + "step": 3886 + }, + { + "epoch": 0.2, + "grad_norm": 0.9234130370394749, + "learning_rate": 1.8561595452292587e-05, + "loss": 0.2148, + "step": 3887 + }, + { + "epoch": 0.2, + "grad_norm": 0.948780172751885, + "learning_rate": 1.8560744328456657e-05, + "loss": 0.2093, + "step": 3888 + }, + { + "epoch": 0.2, + "grad_norm": 1.1596350769709731, + "learning_rate": 1.855989297241006e-05, + "loss": 0.2314, + "step": 3889 + }, + { + "epoch": 0.2, + "grad_norm": 0.8493065367210507, + "learning_rate": 1.8559041384175874e-05, + "loss": 0.2287, + "step": 3890 + }, + { + "epoch": 0.2, + "grad_norm": 1.0826434951585162, + "learning_rate": 1.8558189563777213e-05, + "loss": 0.2296, + "step": 3891 + }, + { + "epoch": 0.2, + "grad_norm": 1.233129074251226, + "learning_rate": 1.8557337511237178e-05, + "loss": 0.2124, + "step": 3892 + }, + { + "epoch": 0.2, + "grad_norm": 2.275827478065935, + "learning_rate": 1.8556485226578877e-05, + "loss": 0.2059, + "step": 3893 + }, + { + "epoch": 0.2, + "grad_norm": 1.0039858133301296, + "learning_rate": 1.8555632709825427e-05, + "loss": 0.2189, + "step": 3894 + }, + { + "epoch": 0.2, + "grad_norm": 1.298069959049279, + "learning_rate": 1.8554779960999963e-05, + "loss": 0.2339, + "step": 3895 + }, + { + "epoch": 0.2, + "grad_norm": 0.9895375285393244, + "learning_rate": 1.8553926980125608e-05, + "loss": 0.2092, + "step": 3896 + }, + { + "epoch": 0.2, + "grad_norm": 1.2551382881470876, + "learning_rate": 1.85530737672255e-05, + "loss": 0.2152, + "step": 3897 + }, + { + "epoch": 0.2, + "grad_norm": 1.0977374431389633, + "learning_rate": 1.8552220322322782e-05, + "loss": 0.235, + "step": 3898 + }, + { + "epoch": 0.2, + "grad_norm": 1.1044374060859814, + "learning_rate": 1.8551366645440604e-05, + "loss": 0.2008, + "step": 3899 + }, + { + "epoch": 0.2, + "grad_norm": 1.2014769038354687, + "learning_rate": 1.8550512736602125e-05, + "loss": 0.2217, + "step": 3900 + }, + { + "epoch": 0.2, + "grad_norm": 0.9532092971485372, + "learning_rate": 1.8549658595830505e-05, + "loss": 0.2228, + "step": 3901 + }, + { + "epoch": 0.2, + "grad_norm": 0.841470746178915, + "learning_rate": 1.8548804223148914e-05, + "loss": 0.2394, + "step": 3902 + }, + { + "epoch": 0.2, + "grad_norm": 1.1083325341107813, + "learning_rate": 1.854794961858052e-05, + "loss": 0.2163, + "step": 3903 + }, + { + "epoch": 0.2, + "grad_norm": 0.9741102853470924, + "learning_rate": 1.8547094782148513e-05, + "loss": 0.2372, + "step": 3904 + }, + { + "epoch": 0.2, + "grad_norm": 0.9752661376202427, + "learning_rate": 1.8546239713876077e-05, + "loss": 0.2322, + "step": 3905 + }, + { + "epoch": 0.2, + "grad_norm": 1.077642457000483, + "learning_rate": 1.8545384413786406e-05, + "loss": 0.2166, + "step": 3906 + }, + { + "epoch": 0.2, + "grad_norm": 1.112417457963183, + "learning_rate": 1.85445288819027e-05, + "loss": 0.2521, + "step": 3907 + }, + { + "epoch": 0.2, + "grad_norm": 1.133826703746313, + "learning_rate": 1.8543673118248167e-05, + "loss": 0.207, + "step": 3908 + }, + { + "epoch": 0.2, + "grad_norm": 0.9714603769415149, + "learning_rate": 1.854281712284602e-05, + "loss": 0.2076, + "step": 3909 + }, + { + "epoch": 0.2, + "grad_norm": 1.0225184907059015, + "learning_rate": 1.8541960895719473e-05, + "loss": 0.228, + "step": 3910 + }, + { + "epoch": 0.2, + "grad_norm": 1.420720984524292, + "learning_rate": 1.8541104436891756e-05, + "loss": 0.2016, + "step": 3911 + }, + { + "epoch": 0.2, + "grad_norm": 0.9235109665556697, + "learning_rate": 1.8540247746386095e-05, + "loss": 0.2355, + "step": 3912 + }, + { + "epoch": 0.2, + "grad_norm": 1.0854904386542399, + "learning_rate": 1.8539390824225735e-05, + "loss": 0.2196, + "step": 3913 + }, + { + "epoch": 0.2, + "grad_norm": 0.9777382832585857, + "learning_rate": 1.8538533670433912e-05, + "loss": 0.2034, + "step": 3914 + }, + { + "epoch": 0.2, + "grad_norm": 1.0064862201866491, + "learning_rate": 1.8537676285033886e-05, + "loss": 0.1908, + "step": 3915 + }, + { + "epoch": 0.2, + "grad_norm": 0.9964750671495186, + "learning_rate": 1.8536818668048906e-05, + "loss": 0.2261, + "step": 3916 + }, + { + "epoch": 0.2, + "grad_norm": 1.0298903324507094, + "learning_rate": 1.8535960819502243e-05, + "loss": 0.2136, + "step": 3917 + }, + { + "epoch": 0.2, + "grad_norm": 1.023825717413679, + "learning_rate": 1.853510273941715e-05, + "loss": 0.2548, + "step": 3918 + }, + { + "epoch": 0.2, + "grad_norm": 1.0799038437526578, + "learning_rate": 1.8534244427816924e-05, + "loss": 0.2313, + "step": 3919 + }, + { + "epoch": 0.2, + "grad_norm": 1.683700267828652, + "learning_rate": 1.853338588472483e-05, + "loss": 0.2098, + "step": 3920 + }, + { + "epoch": 0.2, + "grad_norm": 1.002263888788098, + "learning_rate": 1.8532527110164163e-05, + "loss": 0.2272, + "step": 3921 + }, + { + "epoch": 0.2, + "grad_norm": 0.8523969928905435, + "learning_rate": 1.8531668104158215e-05, + "loss": 0.1953, + "step": 3922 + }, + { + "epoch": 0.2, + "grad_norm": 0.9766389662463112, + "learning_rate": 1.853080886673029e-05, + "loss": 0.2032, + "step": 3923 + }, + { + "epoch": 0.2, + "grad_norm": 0.9793413109004094, + "learning_rate": 1.8529949397903692e-05, + "loss": 0.2229, + "step": 3924 + }, + { + "epoch": 0.2, + "grad_norm": 1.0853519331853911, + "learning_rate": 1.8529089697701735e-05, + "loss": 0.2243, + "step": 3925 + }, + { + "epoch": 0.2, + "grad_norm": 1.0091941317388682, + "learning_rate": 1.8528229766147737e-05, + "loss": 0.2156, + "step": 3926 + }, + { + "epoch": 0.2, + "grad_norm": 1.428407186069211, + "learning_rate": 1.8527369603265027e-05, + "loss": 0.2173, + "step": 3927 + }, + { + "epoch": 0.2, + "grad_norm": 0.9884423582417243, + "learning_rate": 1.852650920907693e-05, + "loss": 0.2074, + "step": 3928 + }, + { + "epoch": 0.2, + "grad_norm": 1.0081667501207707, + "learning_rate": 1.8525648583606797e-05, + "loss": 0.2362, + "step": 3929 + }, + { + "epoch": 0.2, + "grad_norm": 0.9400868797070588, + "learning_rate": 1.8524787726877956e-05, + "loss": 0.2298, + "step": 3930 + }, + { + "epoch": 0.2, + "grad_norm": 1.0042055292298442, + "learning_rate": 1.8523926638913772e-05, + "loss": 0.207, + "step": 3931 + }, + { + "epoch": 0.2, + "grad_norm": 1.0274160199055828, + "learning_rate": 1.8523065319737597e-05, + "loss": 0.2311, + "step": 3932 + }, + { + "epoch": 0.2, + "grad_norm": 0.8839768613869868, + "learning_rate": 1.8522203769372794e-05, + "loss": 0.2022, + "step": 3933 + }, + { + "epoch": 0.2, + "grad_norm": 0.9604317794149866, + "learning_rate": 1.8521341987842726e-05, + "loss": 0.2192, + "step": 3934 + }, + { + "epoch": 0.2, + "grad_norm": 1.1657719535264452, + "learning_rate": 1.8520479975170783e-05, + "loss": 0.228, + "step": 3935 + }, + { + "epoch": 0.2, + "grad_norm": 1.0916660032667729, + "learning_rate": 1.8519617731380334e-05, + "loss": 0.2314, + "step": 3936 + }, + { + "epoch": 0.2, + "grad_norm": 0.9529536523373129, + "learning_rate": 1.8518755256494776e-05, + "loss": 0.2706, + "step": 3937 + }, + { + "epoch": 0.2, + "grad_norm": 0.9799853626728516, + "learning_rate": 1.85178925505375e-05, + "loss": 0.2381, + "step": 3938 + }, + { + "epoch": 0.2, + "grad_norm": 1.987670708955151, + "learning_rate": 1.851702961353191e-05, + "loss": 0.222, + "step": 3939 + }, + { + "epoch": 0.2, + "grad_norm": 1.0361197104887185, + "learning_rate": 1.8516166445501405e-05, + "loss": 0.2459, + "step": 3940 + }, + { + "epoch": 0.2, + "grad_norm": 1.1496121131841177, + "learning_rate": 1.8515303046469407e-05, + "loss": 0.2305, + "step": 3941 + }, + { + "epoch": 0.2, + "grad_norm": 1.175067670408769, + "learning_rate": 1.851443941645933e-05, + "loss": 0.2046, + "step": 3942 + }, + { + "epoch": 0.2, + "grad_norm": 1.0394576988286925, + "learning_rate": 1.851357555549461e-05, + "loss": 0.2249, + "step": 3943 + }, + { + "epoch": 0.2, + "grad_norm": 1.1378105288243487, + "learning_rate": 1.8512711463598666e-05, + "loss": 0.2161, + "step": 3944 + }, + { + "epoch": 0.2, + "grad_norm": 0.9691228729639899, + "learning_rate": 1.8511847140794944e-05, + "loss": 0.2451, + "step": 3945 + }, + { + "epoch": 0.2, + "grad_norm": 0.9107941557336705, + "learning_rate": 1.8510982587106888e-05, + "loss": 0.2163, + "step": 3946 + }, + { + "epoch": 0.2, + "grad_norm": 0.8438094197302133, + "learning_rate": 1.8510117802557948e-05, + "loss": 0.2172, + "step": 3947 + }, + { + "epoch": 0.2, + "grad_norm": 1.122561032622386, + "learning_rate": 1.8509252787171585e-05, + "loss": 0.2091, + "step": 3948 + }, + { + "epoch": 0.2, + "grad_norm": 1.1670322241173445, + "learning_rate": 1.8508387540971258e-05, + "loss": 0.2265, + "step": 3949 + }, + { + "epoch": 0.2, + "grad_norm": 0.8737935359329443, + "learning_rate": 1.8507522063980436e-05, + "loss": 0.2014, + "step": 3950 + }, + { + "epoch": 0.2, + "grad_norm": 0.957459298710243, + "learning_rate": 1.85066563562226e-05, + "loss": 0.2232, + "step": 3951 + }, + { + "epoch": 0.2, + "grad_norm": 0.9140556809479499, + "learning_rate": 1.850579041772123e-05, + "loss": 0.2213, + "step": 3952 + }, + { + "epoch": 0.2, + "grad_norm": 0.9341055775625612, + "learning_rate": 1.850492424849981e-05, + "loss": 0.2475, + "step": 3953 + }, + { + "epoch": 0.2, + "grad_norm": 1.8285233367924434, + "learning_rate": 1.8504057848581846e-05, + "loss": 0.2267, + "step": 3954 + }, + { + "epoch": 0.2, + "grad_norm": 1.2274016809994188, + "learning_rate": 1.850319121799083e-05, + "loss": 0.2167, + "step": 3955 + }, + { + "epoch": 0.2, + "grad_norm": 0.9569895683683748, + "learning_rate": 1.850232435675027e-05, + "loss": 0.2139, + "step": 3956 + }, + { + "epoch": 0.2, + "grad_norm": 0.9707813355784036, + "learning_rate": 1.8501457264883684e-05, + "loss": 0.2247, + "step": 3957 + }, + { + "epoch": 0.2, + "grad_norm": 1.6606571832523476, + "learning_rate": 1.850058994241459e-05, + "loss": 0.207, + "step": 3958 + }, + { + "epoch": 0.2, + "grad_norm": 0.9890119440101421, + "learning_rate": 1.8499722389366513e-05, + "loss": 0.2126, + "step": 3959 + }, + { + "epoch": 0.2, + "grad_norm": 0.8781210948864995, + "learning_rate": 1.8498854605762982e-05, + "loss": 0.2236, + "step": 3960 + }, + { + "epoch": 0.2, + "grad_norm": 1.2677885365794521, + "learning_rate": 1.8497986591627546e-05, + "loss": 0.2384, + "step": 3961 + }, + { + "epoch": 0.2, + "grad_norm": 0.8925478669144093, + "learning_rate": 1.849711834698374e-05, + "loss": 0.192, + "step": 3962 + }, + { + "epoch": 0.2, + "grad_norm": 1.096343147470085, + "learning_rate": 1.849624987185512e-05, + "loss": 0.1939, + "step": 3963 + }, + { + "epoch": 0.2, + "grad_norm": 0.9426657053588514, + "learning_rate": 1.8495381166265243e-05, + "loss": 0.2222, + "step": 3964 + }, + { + "epoch": 0.2, + "grad_norm": 1.0983582096020919, + "learning_rate": 1.849451223023767e-05, + "loss": 0.2271, + "step": 3965 + }, + { + "epoch": 0.2, + "grad_norm": 1.1097098319224445, + "learning_rate": 1.8493643063795973e-05, + "loss": 0.21, + "step": 3966 + }, + { + "epoch": 0.2, + "grad_norm": 0.9815198326231998, + "learning_rate": 1.8492773666963734e-05, + "loss": 0.197, + "step": 3967 + }, + { + "epoch": 0.2, + "grad_norm": 1.1798340581543854, + "learning_rate": 1.8491904039764523e-05, + "loss": 0.2297, + "step": 3968 + }, + { + "epoch": 0.2, + "grad_norm": 1.0813074125313995, + "learning_rate": 1.849103418222194e-05, + "loss": 0.24, + "step": 3969 + }, + { + "epoch": 0.2, + "grad_norm": 1.0502962640861704, + "learning_rate": 1.849016409435957e-05, + "loss": 0.2127, + "step": 3970 + }, + { + "epoch": 0.2, + "grad_norm": 1.005448488445372, + "learning_rate": 1.848929377620102e-05, + "loss": 0.2212, + "step": 3971 + }, + { + "epoch": 0.2, + "grad_norm": 1.3691830425652256, + "learning_rate": 1.84884232277699e-05, + "loss": 0.2317, + "step": 3972 + }, + { + "epoch": 0.2, + "grad_norm": 1.4197298065121104, + "learning_rate": 1.848755244908982e-05, + "loss": 0.2427, + "step": 3973 + }, + { + "epoch": 0.2, + "grad_norm": 1.2433237283809655, + "learning_rate": 1.84866814401844e-05, + "loss": 0.2357, + "step": 3974 + }, + { + "epoch": 0.2, + "grad_norm": 1.5182774149915292, + "learning_rate": 1.8485810201077266e-05, + "loss": 0.2133, + "step": 3975 + }, + { + "epoch": 0.2, + "grad_norm": 1.0417444382667709, + "learning_rate": 1.8484938731792052e-05, + "loss": 0.247, + "step": 3976 + }, + { + "epoch": 0.2, + "grad_norm": 1.1087194985346591, + "learning_rate": 1.8484067032352394e-05, + "loss": 0.2289, + "step": 3977 + }, + { + "epoch": 0.2, + "grad_norm": 3.227029058062171, + "learning_rate": 1.848319510278194e-05, + "loss": 0.2142, + "step": 3978 + }, + { + "epoch": 0.2, + "grad_norm": 1.0410937941188434, + "learning_rate": 1.848232294310434e-05, + "loss": 0.207, + "step": 3979 + }, + { + "epoch": 0.2, + "grad_norm": 1.013795250561338, + "learning_rate": 1.848145055334325e-05, + "loss": 0.2171, + "step": 3980 + }, + { + "epoch": 0.2, + "grad_norm": 0.9561721192043167, + "learning_rate": 1.8480577933522337e-05, + "loss": 0.2293, + "step": 3981 + }, + { + "epoch": 0.2, + "grad_norm": 0.9544153277598909, + "learning_rate": 1.8479705083665265e-05, + "loss": 0.1992, + "step": 3982 + }, + { + "epoch": 0.2, + "grad_norm": 2.4377696993777693, + "learning_rate": 1.8478832003795718e-05, + "loss": 0.2259, + "step": 3983 + }, + { + "epoch": 0.2, + "grad_norm": 1.0252596856842724, + "learning_rate": 1.847795869393737e-05, + "loss": 0.2228, + "step": 3984 + }, + { + "epoch": 0.2, + "grad_norm": 1.1646243348780456, + "learning_rate": 1.8477085154113913e-05, + "loss": 0.2211, + "step": 3985 + }, + { + "epoch": 0.2, + "grad_norm": 1.435138324835618, + "learning_rate": 1.8476211384349048e-05, + "loss": 0.2141, + "step": 3986 + }, + { + "epoch": 0.2, + "grad_norm": 1.1034704971175715, + "learning_rate": 1.8475337384666464e-05, + "loss": 0.2461, + "step": 3987 + }, + { + "epoch": 0.2, + "grad_norm": 1.040306523846338, + "learning_rate": 1.8474463155089882e-05, + "loss": 0.2177, + "step": 3988 + }, + { + "epoch": 0.2, + "grad_norm": 0.9668078218640307, + "learning_rate": 1.8473588695643002e-05, + "loss": 0.2249, + "step": 3989 + }, + { + "epoch": 0.2, + "grad_norm": 0.8420323004225635, + "learning_rate": 1.8472714006349554e-05, + "loss": 0.2322, + "step": 3990 + }, + { + "epoch": 0.2, + "grad_norm": 1.0936358241608921, + "learning_rate": 1.8471839087233255e-05, + "loss": 0.241, + "step": 3991 + }, + { + "epoch": 0.2, + "grad_norm": 1.139357797659001, + "learning_rate": 1.8470963938317846e-05, + "loss": 0.2545, + "step": 3992 + }, + { + "epoch": 0.2, + "grad_norm": 0.9151370967374843, + "learning_rate": 1.847008855962706e-05, + "loss": 0.2387, + "step": 3993 + }, + { + "epoch": 0.2, + "grad_norm": 0.9739624100709218, + "learning_rate": 1.8469212951184646e-05, + "loss": 0.2377, + "step": 3994 + }, + { + "epoch": 0.2, + "grad_norm": 0.9524532724244088, + "learning_rate": 1.846833711301435e-05, + "loss": 0.2261, + "step": 3995 + }, + { + "epoch": 0.2, + "grad_norm": 0.8491076395346986, + "learning_rate": 1.8467461045139933e-05, + "loss": 0.2342, + "step": 3996 + }, + { + "epoch": 0.2, + "grad_norm": 1.129923799376718, + "learning_rate": 1.8466584747585153e-05, + "loss": 0.2223, + "step": 3997 + }, + { + "epoch": 0.2, + "grad_norm": 1.7812650811136905, + "learning_rate": 1.846570822037379e-05, + "loss": 0.2453, + "step": 3998 + }, + { + "epoch": 0.2, + "grad_norm": 0.9166033090870708, + "learning_rate": 1.846483146352961e-05, + "loss": 0.2068, + "step": 3999 + }, + { + "epoch": 0.2, + "grad_norm": 1.3533118562066813, + "learning_rate": 1.84639544770764e-05, + "loss": 0.2234, + "step": 4000 + }, + { + "epoch": 0.2, + "grad_norm": 1.0150576918701262, + "learning_rate": 1.8463077261037946e-05, + "loss": 0.2315, + "step": 4001 + }, + { + "epoch": 0.2, + "grad_norm": 0.7586040200761496, + "learning_rate": 1.846219981543804e-05, + "loss": 0.1942, + "step": 4002 + }, + { + "epoch": 0.2, + "grad_norm": 1.086690733510189, + "learning_rate": 1.8461322140300487e-05, + "loss": 0.2162, + "step": 4003 + }, + { + "epoch": 0.2, + "grad_norm": 1.0870007848264334, + "learning_rate": 1.8460444235649097e-05, + "loss": 0.2274, + "step": 4004 + }, + { + "epoch": 0.2, + "grad_norm": 1.0814676375496997, + "learning_rate": 1.8459566101507675e-05, + "loss": 0.2228, + "step": 4005 + }, + { + "epoch": 0.2, + "grad_norm": 1.2847826029249587, + "learning_rate": 1.845868773790005e-05, + "loss": 0.2466, + "step": 4006 + }, + { + "epoch": 0.2, + "grad_norm": 1.0485152699205598, + "learning_rate": 1.845780914485004e-05, + "loss": 0.2193, + "step": 4007 + }, + { + "epoch": 0.2, + "grad_norm": 1.3231739436000518, + "learning_rate": 1.8456930322381476e-05, + "loss": 0.1984, + "step": 4008 + }, + { + "epoch": 0.2, + "grad_norm": 1.1393536976170808, + "learning_rate": 1.8456051270518204e-05, + "loss": 0.1964, + "step": 4009 + }, + { + "epoch": 0.2, + "grad_norm": 0.9858193805660505, + "learning_rate": 1.845517198928406e-05, + "loss": 0.2093, + "step": 4010 + }, + { + "epoch": 0.2, + "grad_norm": 2.102746359893761, + "learning_rate": 1.8454292478702898e-05, + "loss": 0.2292, + "step": 4011 + }, + { + "epoch": 0.2, + "grad_norm": 1.2060706071043537, + "learning_rate": 1.8453412738798577e-05, + "loss": 0.2282, + "step": 4012 + }, + { + "epoch": 0.2, + "grad_norm": 0.9578800418566528, + "learning_rate": 1.8452532769594956e-05, + "loss": 0.1901, + "step": 4013 + }, + { + "epoch": 0.2, + "grad_norm": 1.0015965893238714, + "learning_rate": 1.845165257111591e-05, + "loss": 0.2223, + "step": 4014 + }, + { + "epoch": 0.2, + "grad_norm": 2.23810803358979, + "learning_rate": 1.845077214338531e-05, + "loss": 0.1973, + "step": 4015 + }, + { + "epoch": 0.2, + "grad_norm": 0.8611856599209402, + "learning_rate": 1.8449891486427037e-05, + "loss": 0.2108, + "step": 4016 + }, + { + "epoch": 0.2, + "grad_norm": 1.0767077470178619, + "learning_rate": 1.844901060026498e-05, + "loss": 0.2134, + "step": 4017 + }, + { + "epoch": 0.2, + "grad_norm": 0.9266494162460877, + "learning_rate": 1.844812948492303e-05, + "loss": 0.1851, + "step": 4018 + }, + { + "epoch": 0.2, + "grad_norm": 1.0603717922287728, + "learning_rate": 1.8447248140425093e-05, + "loss": 0.2049, + "step": 4019 + }, + { + "epoch": 0.2, + "grad_norm": 1.3324952507743006, + "learning_rate": 1.8446366566795075e-05, + "loss": 0.2196, + "step": 4020 + }, + { + "epoch": 0.2, + "grad_norm": 0.9255293165654925, + "learning_rate": 1.8445484764056886e-05, + "loss": 0.2316, + "step": 4021 + }, + { + "epoch": 0.2, + "grad_norm": 1.216017989027153, + "learning_rate": 1.8444602732234448e-05, + "loss": 0.2204, + "step": 4022 + }, + { + "epoch": 0.2, + "grad_norm": 1.389842141852318, + "learning_rate": 1.844372047135168e-05, + "loss": 0.2344, + "step": 4023 + }, + { + "epoch": 0.2, + "grad_norm": 1.4110691252305398, + "learning_rate": 1.844283798143252e-05, + "loss": 0.1954, + "step": 4024 + }, + { + "epoch": 0.2, + "grad_norm": 1.005032913346493, + "learning_rate": 1.84419552625009e-05, + "loss": 0.2107, + "step": 4025 + }, + { + "epoch": 0.2, + "grad_norm": 0.9342081803585582, + "learning_rate": 1.8441072314580768e-05, + "loss": 0.2152, + "step": 4026 + }, + { + "epoch": 0.2, + "grad_norm": 1.53410098202529, + "learning_rate": 1.8440189137696075e-05, + "loss": 0.2326, + "step": 4027 + }, + { + "epoch": 0.2, + "grad_norm": 1.281389367713628, + "learning_rate": 1.843930573187077e-05, + "loss": 0.2088, + "step": 4028 + }, + { + "epoch": 0.2, + "grad_norm": 0.9824915018152038, + "learning_rate": 1.8438422097128824e-05, + "loss": 0.221, + "step": 4029 + }, + { + "epoch": 0.2, + "grad_norm": 0.9399054436239845, + "learning_rate": 1.8437538233494205e-05, + "loss": 0.2168, + "step": 4030 + }, + { + "epoch": 0.2, + "grad_norm": 1.0611693014693109, + "learning_rate": 1.8436654140990877e-05, + "loss": 0.2201, + "step": 4031 + }, + { + "epoch": 0.21, + "grad_norm": 2.6926024066852987, + "learning_rate": 1.8435769819642835e-05, + "loss": 0.2168, + "step": 4032 + }, + { + "epoch": 0.21, + "grad_norm": 1.425124544776791, + "learning_rate": 1.843488526947406e-05, + "loss": 0.2332, + "step": 4033 + }, + { + "epoch": 0.21, + "grad_norm": 1.2986176817240906, + "learning_rate": 1.843400049050854e-05, + "loss": 0.2054, + "step": 4034 + }, + { + "epoch": 0.21, + "grad_norm": 1.4592279022760886, + "learning_rate": 1.8433115482770286e-05, + "loss": 0.1938, + "step": 4035 + }, + { + "epoch": 0.21, + "grad_norm": 1.1344940384527544, + "learning_rate": 1.8432230246283298e-05, + "loss": 0.2134, + "step": 4036 + }, + { + "epoch": 0.21, + "grad_norm": 1.4920957263868553, + "learning_rate": 1.8431344781071586e-05, + "loss": 0.2299, + "step": 4037 + }, + { + "epoch": 0.21, + "grad_norm": 1.6332568235688114, + "learning_rate": 1.8430459087159172e-05, + "loss": 0.2037, + "step": 4038 + }, + { + "epoch": 0.21, + "grad_norm": 1.0305365277043756, + "learning_rate": 1.842957316457008e-05, + "loss": 0.2396, + "step": 4039 + }, + { + "epoch": 0.21, + "grad_norm": 1.2429204314009332, + "learning_rate": 1.8428687013328338e-05, + "loss": 0.2146, + "step": 4040 + }, + { + "epoch": 0.21, + "grad_norm": 1.46064835422682, + "learning_rate": 1.8427800633457984e-05, + "loss": 0.213, + "step": 4041 + }, + { + "epoch": 0.21, + "grad_norm": 1.315399648856405, + "learning_rate": 1.8426914024983064e-05, + "loss": 0.2228, + "step": 4042 + }, + { + "epoch": 0.21, + "grad_norm": 1.148265946387433, + "learning_rate": 1.8426027187927622e-05, + "loss": 0.2231, + "step": 4043 + }, + { + "epoch": 0.21, + "grad_norm": 1.2784427788505968, + "learning_rate": 1.842514012231572e-05, + "loss": 0.214, + "step": 4044 + }, + { + "epoch": 0.21, + "grad_norm": 1.3232033473579259, + "learning_rate": 1.8424252828171415e-05, + "loss": 0.2424, + "step": 4045 + }, + { + "epoch": 0.21, + "grad_norm": 0.8993518609330069, + "learning_rate": 1.842336530551878e-05, + "loss": 0.2106, + "step": 4046 + }, + { + "epoch": 0.21, + "grad_norm": 0.9717114990888179, + "learning_rate": 1.8422477554381877e-05, + "loss": 0.2137, + "step": 4047 + }, + { + "epoch": 0.21, + "grad_norm": 1.1005113781951843, + "learning_rate": 1.84215895747848e-05, + "loss": 0.2485, + "step": 4048 + }, + { + "epoch": 0.21, + "grad_norm": 1.78659538266863, + "learning_rate": 1.842070136675163e-05, + "loss": 0.2314, + "step": 4049 + }, + { + "epoch": 0.21, + "grad_norm": 1.0992894999940652, + "learning_rate": 1.8419812930306456e-05, + "loss": 0.2355, + "step": 4050 + }, + { + "epoch": 0.21, + "grad_norm": 0.9472424028153374, + "learning_rate": 1.8418924265473387e-05, + "loss": 0.2097, + "step": 4051 + }, + { + "epoch": 0.21, + "grad_norm": 1.2428118737564922, + "learning_rate": 1.8418035372276518e-05, + "loss": 0.2048, + "step": 4052 + }, + { + "epoch": 0.21, + "grad_norm": 1.115620427326506, + "learning_rate": 1.8417146250739963e-05, + "loss": 0.245, + "step": 4053 + }, + { + "epoch": 0.21, + "grad_norm": 1.1725585511883763, + "learning_rate": 1.8416256900887846e-05, + "loss": 0.2305, + "step": 4054 + }, + { + "epoch": 0.21, + "grad_norm": 1.164010195877797, + "learning_rate": 1.841536732274428e-05, + "loss": 0.2317, + "step": 4055 + }, + { + "epoch": 0.21, + "grad_norm": 0.9901967708490926, + "learning_rate": 1.8414477516333404e-05, + "loss": 0.2073, + "step": 4056 + }, + { + "epoch": 0.21, + "grad_norm": 1.203022719344018, + "learning_rate": 1.8413587481679343e-05, + "loss": 0.221, + "step": 4057 + }, + { + "epoch": 0.21, + "grad_norm": 1.1685669183493879, + "learning_rate": 1.841269721880625e-05, + "loss": 0.2065, + "step": 4058 + }, + { + "epoch": 0.21, + "grad_norm": 1.1292187794920885, + "learning_rate": 1.8411806727738276e-05, + "loss": 0.2128, + "step": 4059 + }, + { + "epoch": 0.21, + "grad_norm": 1.0460899988252699, + "learning_rate": 1.8410916008499563e-05, + "loss": 0.2065, + "step": 4060 + }, + { + "epoch": 0.21, + "grad_norm": 0.9625875477372454, + "learning_rate": 1.8410025061114278e-05, + "loss": 0.1914, + "step": 4061 + }, + { + "epoch": 0.21, + "grad_norm": 1.013377246352948, + "learning_rate": 1.840913388560659e-05, + "loss": 0.2486, + "step": 4062 + }, + { + "epoch": 0.21, + "grad_norm": 1.054808112700336, + "learning_rate": 1.840824248200067e-05, + "loss": 0.2036, + "step": 4063 + }, + { + "epoch": 0.21, + "grad_norm": 0.915720731760347, + "learning_rate": 1.8407350850320698e-05, + "loss": 0.215, + "step": 4064 + }, + { + "epoch": 0.21, + "grad_norm": 0.9131373890768805, + "learning_rate": 1.840645899059086e-05, + "loss": 0.2176, + "step": 4065 + }, + { + "epoch": 0.21, + "grad_norm": 1.4499765627745933, + "learning_rate": 1.8405566902835345e-05, + "loss": 0.2072, + "step": 4066 + }, + { + "epoch": 0.21, + "grad_norm": 1.0072741469965514, + "learning_rate": 1.8404674587078355e-05, + "loss": 0.216, + "step": 4067 + }, + { + "epoch": 0.21, + "grad_norm": 1.116054906081107, + "learning_rate": 1.840378204334409e-05, + "loss": 0.2289, + "step": 4068 + }, + { + "epoch": 0.21, + "grad_norm": 6.393753900145944, + "learning_rate": 1.8402889271656764e-05, + "loss": 0.2156, + "step": 4069 + }, + { + "epoch": 0.21, + "grad_norm": 1.7990680573099183, + "learning_rate": 1.8401996272040593e-05, + "loss": 0.2357, + "step": 4070 + }, + { + "epoch": 0.21, + "grad_norm": 0.9247437119879226, + "learning_rate": 1.8401103044519798e-05, + "loss": 0.2104, + "step": 4071 + }, + { + "epoch": 0.21, + "grad_norm": 1.0478663800704129, + "learning_rate": 1.8400209589118606e-05, + "loss": 0.2605, + "step": 4072 + }, + { + "epoch": 0.21, + "grad_norm": 1.606507652653388, + "learning_rate": 1.839931590586126e-05, + "loss": 0.2146, + "step": 4073 + }, + { + "epoch": 0.21, + "grad_norm": 0.8734486664573401, + "learning_rate": 1.8398421994771987e-05, + "loss": 0.2386, + "step": 4074 + }, + { + "epoch": 0.21, + "grad_norm": 0.8981227604859134, + "learning_rate": 1.839752785587505e-05, + "loss": 0.248, + "step": 4075 + }, + { + "epoch": 0.21, + "grad_norm": 0.8195126193179083, + "learning_rate": 1.8396633489194693e-05, + "loss": 0.2247, + "step": 4076 + }, + { + "epoch": 0.21, + "grad_norm": 0.9562878691843798, + "learning_rate": 1.839573889475518e-05, + "loss": 0.2231, + "step": 4077 + }, + { + "epoch": 0.21, + "grad_norm": 1.0949687933363768, + "learning_rate": 1.8394844072580772e-05, + "loss": 0.2342, + "step": 4078 + }, + { + "epoch": 0.21, + "grad_norm": 0.8892593014012868, + "learning_rate": 1.8393949022695747e-05, + "loss": 0.1962, + "step": 4079 + }, + { + "epoch": 0.21, + "grad_norm": 1.1401884052275038, + "learning_rate": 1.839305374512438e-05, + "loss": 0.2407, + "step": 4080 + }, + { + "epoch": 0.21, + "grad_norm": 0.9840977904794602, + "learning_rate": 1.8392158239890957e-05, + "loss": 0.2285, + "step": 4081 + }, + { + "epoch": 0.21, + "grad_norm": 1.0557285677899844, + "learning_rate": 1.839126250701977e-05, + "loss": 0.1971, + "step": 4082 + }, + { + "epoch": 0.21, + "grad_norm": 1.0082003899112544, + "learning_rate": 1.8390366546535107e-05, + "loss": 0.2373, + "step": 4083 + }, + { + "epoch": 0.21, + "grad_norm": 1.482950351936631, + "learning_rate": 1.8389470358461282e-05, + "loss": 0.217, + "step": 4084 + }, + { + "epoch": 0.21, + "grad_norm": 1.6604273182636915, + "learning_rate": 1.8388573942822597e-05, + "loss": 0.1997, + "step": 4085 + }, + { + "epoch": 0.21, + "grad_norm": 1.061341683760589, + "learning_rate": 1.8387677299643374e-05, + "loss": 0.2058, + "step": 4086 + }, + { + "epoch": 0.21, + "grad_norm": 0.8804008446399976, + "learning_rate": 1.838678042894793e-05, + "loss": 0.2171, + "step": 4087 + }, + { + "epoch": 0.21, + "grad_norm": 1.159037826236392, + "learning_rate": 1.838588333076059e-05, + "loss": 0.2258, + "step": 4088 + }, + { + "epoch": 0.21, + "grad_norm": 1.3007209396237813, + "learning_rate": 1.8384986005105694e-05, + "loss": 0.197, + "step": 4089 + }, + { + "epoch": 0.21, + "grad_norm": 1.1739064368395806, + "learning_rate": 1.838408845200758e-05, + "loss": 0.2115, + "step": 4090 + }, + { + "epoch": 0.21, + "grad_norm": 1.3309068976708802, + "learning_rate": 1.8383190671490586e-05, + "loss": 0.2085, + "step": 4091 + }, + { + "epoch": 0.21, + "grad_norm": 1.0072035766635603, + "learning_rate": 1.838229266357908e-05, + "loss": 0.2207, + "step": 4092 + }, + { + "epoch": 0.21, + "grad_norm": 1.0280329947360802, + "learning_rate": 1.8381394428297406e-05, + "loss": 0.2161, + "step": 4093 + }, + { + "epoch": 0.21, + "grad_norm": 0.9584474084596254, + "learning_rate": 1.8380495965669938e-05, + "loss": 0.2323, + "step": 4094 + }, + { + "epoch": 0.21, + "grad_norm": 1.192651766726789, + "learning_rate": 1.8379597275721043e-05, + "loss": 0.2065, + "step": 4095 + }, + { + "epoch": 0.21, + "grad_norm": 1.279939931690827, + "learning_rate": 1.83786983584751e-05, + "loss": 0.2361, + "step": 4096 + }, + { + "epoch": 0.21, + "grad_norm": 1.6723687874802045, + "learning_rate": 1.8377799213956487e-05, + "loss": 0.2095, + "step": 4097 + }, + { + "epoch": 0.21, + "grad_norm": 1.0774746273066567, + "learning_rate": 1.83768998421896e-05, + "loss": 0.2343, + "step": 4098 + }, + { + "epoch": 0.21, + "grad_norm": 2.8065607882690657, + "learning_rate": 1.837600024319883e-05, + "loss": 0.2407, + "step": 4099 + }, + { + "epoch": 0.21, + "grad_norm": 0.9560382672546652, + "learning_rate": 1.8375100417008584e-05, + "loss": 0.2235, + "step": 4100 + }, + { + "epoch": 0.21, + "grad_norm": 0.9020948472107335, + "learning_rate": 1.8374200363643263e-05, + "loss": 0.2353, + "step": 4101 + }, + { + "epoch": 0.21, + "grad_norm": 0.9690265583262132, + "learning_rate": 1.8373300083127284e-05, + "loss": 0.2275, + "step": 4102 + }, + { + "epoch": 0.21, + "grad_norm": 2.3507623603560543, + "learning_rate": 1.8372399575485068e-05, + "loss": 0.2636, + "step": 4103 + }, + { + "epoch": 0.21, + "grad_norm": 1.4806389422531303, + "learning_rate": 1.837149884074104e-05, + "loss": 0.1946, + "step": 4104 + }, + { + "epoch": 0.21, + "grad_norm": 1.002612736176136, + "learning_rate": 1.8370597878919633e-05, + "loss": 0.2273, + "step": 4105 + }, + { + "epoch": 0.21, + "grad_norm": 1.0672473118919987, + "learning_rate": 1.8369696690045288e-05, + "loss": 0.2122, + "step": 4106 + }, + { + "epoch": 0.21, + "grad_norm": 1.029358419541676, + "learning_rate": 1.8368795274142446e-05, + "loss": 0.2098, + "step": 4107 + }, + { + "epoch": 0.21, + "grad_norm": 0.8667773516314685, + "learning_rate": 1.8367893631235558e-05, + "loss": 0.2162, + "step": 4108 + }, + { + "epoch": 0.21, + "grad_norm": 0.8290441652256468, + "learning_rate": 1.8366991761349084e-05, + "loss": 0.2286, + "step": 4109 + }, + { + "epoch": 0.21, + "grad_norm": 1.6958139792602507, + "learning_rate": 1.8366089664507488e-05, + "loss": 0.219, + "step": 4110 + }, + { + "epoch": 0.21, + "grad_norm": 1.04419481834506, + "learning_rate": 1.836518734073523e-05, + "loss": 0.2209, + "step": 4111 + }, + { + "epoch": 0.21, + "grad_norm": 0.9153247072007938, + "learning_rate": 1.8364284790056804e-05, + "loss": 0.246, + "step": 4112 + }, + { + "epoch": 0.21, + "grad_norm": 1.0906657376798792, + "learning_rate": 1.8363382012496672e-05, + "loss": 0.1906, + "step": 4113 + }, + { + "epoch": 0.21, + "grad_norm": 1.0728346171084582, + "learning_rate": 1.8362479008079334e-05, + "loss": 0.2121, + "step": 4114 + }, + { + "epoch": 0.21, + "grad_norm": 1.0315598757042255, + "learning_rate": 1.836157577682928e-05, + "loss": 0.2437, + "step": 4115 + }, + { + "epoch": 0.21, + "grad_norm": 0.9828449756288894, + "learning_rate": 1.8360672318771013e-05, + "loss": 0.2268, + "step": 4116 + }, + { + "epoch": 0.21, + "grad_norm": 1.1873815853782363, + "learning_rate": 1.8359768633929035e-05, + "loss": 0.217, + "step": 4117 + }, + { + "epoch": 0.21, + "grad_norm": 0.8210484261375784, + "learning_rate": 1.835886472232786e-05, + "loss": 0.1881, + "step": 4118 + }, + { + "epoch": 0.21, + "grad_norm": 1.2900341454161641, + "learning_rate": 1.835796058399201e-05, + "loss": 0.2366, + "step": 4119 + }, + { + "epoch": 0.21, + "grad_norm": 1.0392124418066642, + "learning_rate": 1.8357056218946003e-05, + "loss": 0.2439, + "step": 4120 + }, + { + "epoch": 0.21, + "grad_norm": 0.8318283207469718, + "learning_rate": 1.8356151627214377e-05, + "loss": 0.206, + "step": 4121 + }, + { + "epoch": 0.21, + "grad_norm": 0.827521381720995, + "learning_rate": 1.8355246808821664e-05, + "loss": 0.2123, + "step": 4122 + }, + { + "epoch": 0.21, + "grad_norm": 1.0971340042134967, + "learning_rate": 1.835434176379241e-05, + "loss": 0.208, + "step": 4123 + }, + { + "epoch": 0.21, + "grad_norm": 1.0739765981940956, + "learning_rate": 1.8353436492151165e-05, + "loss": 0.2095, + "step": 4124 + }, + { + "epoch": 0.21, + "grad_norm": 0.6935205221026808, + "learning_rate": 1.8352530993922483e-05, + "loss": 0.1864, + "step": 4125 + }, + { + "epoch": 0.21, + "grad_norm": 1.247650082249108, + "learning_rate": 1.8351625269130927e-05, + "loss": 0.2093, + "step": 4126 + }, + { + "epoch": 0.21, + "grad_norm": 0.9056832888194429, + "learning_rate": 1.8350719317801062e-05, + "loss": 0.214, + "step": 4127 + }, + { + "epoch": 0.21, + "grad_norm": 1.3095563898978886, + "learning_rate": 1.8349813139957464e-05, + "loss": 0.2077, + "step": 4128 + }, + { + "epoch": 0.21, + "grad_norm": 0.9821047572278827, + "learning_rate": 1.8348906735624716e-05, + "loss": 0.2112, + "step": 4129 + }, + { + "epoch": 0.21, + "grad_norm": 1.0638206523680762, + "learning_rate": 1.8348000104827396e-05, + "loss": 0.2321, + "step": 4130 + }, + { + "epoch": 0.21, + "grad_norm": 0.7993847668276511, + "learning_rate": 1.8347093247590106e-05, + "loss": 0.199, + "step": 4131 + }, + { + "epoch": 0.21, + "grad_norm": 0.9723068373547615, + "learning_rate": 1.834618616393744e-05, + "loss": 0.1977, + "step": 4132 + }, + { + "epoch": 0.21, + "grad_norm": 1.2098802854352386, + "learning_rate": 1.8345278853894e-05, + "loss": 0.2031, + "step": 4133 + }, + { + "epoch": 0.21, + "grad_norm": 0.9292414406170786, + "learning_rate": 1.8344371317484402e-05, + "loss": 0.2152, + "step": 4134 + }, + { + "epoch": 0.21, + "grad_norm": 1.0821646783609271, + "learning_rate": 1.834346355473326e-05, + "loss": 0.2294, + "step": 4135 + }, + { + "epoch": 0.21, + "grad_norm": 0.9828080305508093, + "learning_rate": 1.83425555656652e-05, + "loss": 0.2166, + "step": 4136 + }, + { + "epoch": 0.21, + "grad_norm": 1.0148458077937668, + "learning_rate": 1.8341647350304844e-05, + "loss": 0.2078, + "step": 4137 + }, + { + "epoch": 0.21, + "grad_norm": 0.8506156892427307, + "learning_rate": 1.8340738908676837e-05, + "loss": 0.2113, + "step": 4138 + }, + { + "epoch": 0.21, + "grad_norm": 0.9936066991084263, + "learning_rate": 1.8339830240805814e-05, + "loss": 0.2344, + "step": 4139 + }, + { + "epoch": 0.21, + "grad_norm": 1.1197923087940107, + "learning_rate": 1.8338921346716426e-05, + "loss": 0.2049, + "step": 4140 + }, + { + "epoch": 0.21, + "grad_norm": 0.8916498553865597, + "learning_rate": 1.8338012226433322e-05, + "loss": 0.2569, + "step": 4141 + }, + { + "epoch": 0.21, + "grad_norm": 0.9060713658218383, + "learning_rate": 1.833710287998117e-05, + "loss": 0.2112, + "step": 4142 + }, + { + "epoch": 0.21, + "grad_norm": 1.4070229822572158, + "learning_rate": 1.833619330738463e-05, + "loss": 0.2375, + "step": 4143 + }, + { + "epoch": 0.21, + "grad_norm": 0.9140861151714124, + "learning_rate": 1.8335283508668375e-05, + "loss": 0.2301, + "step": 4144 + }, + { + "epoch": 0.21, + "grad_norm": 1.1048618491381426, + "learning_rate": 1.8334373483857083e-05, + "loss": 0.2266, + "step": 4145 + }, + { + "epoch": 0.21, + "grad_norm": 0.9368278246902417, + "learning_rate": 1.8333463232975442e-05, + "loss": 0.2336, + "step": 4146 + }, + { + "epoch": 0.21, + "grad_norm": 0.9103085291862169, + "learning_rate": 1.833255275604814e-05, + "loss": 0.2229, + "step": 4147 + }, + { + "epoch": 0.21, + "grad_norm": 0.8832449772404098, + "learning_rate": 1.8331642053099873e-05, + "loss": 0.2189, + "step": 4148 + }, + { + "epoch": 0.21, + "grad_norm": 0.8529873970875689, + "learning_rate": 1.8330731124155347e-05, + "loss": 0.231, + "step": 4149 + }, + { + "epoch": 0.21, + "grad_norm": 0.9615479076879223, + "learning_rate": 1.8329819969239263e-05, + "loss": 0.2316, + "step": 4150 + }, + { + "epoch": 0.21, + "grad_norm": 0.8526259123087873, + "learning_rate": 1.8328908588376345e-05, + "loss": 0.2168, + "step": 4151 + }, + { + "epoch": 0.21, + "grad_norm": 1.0171620974352265, + "learning_rate": 1.832799698159131e-05, + "loss": 0.2518, + "step": 4152 + }, + { + "epoch": 0.21, + "grad_norm": 1.078204216193984, + "learning_rate": 1.832708514890889e-05, + "loss": 0.2242, + "step": 4153 + }, + { + "epoch": 0.21, + "grad_norm": 0.9815351765693545, + "learning_rate": 1.8326173090353815e-05, + "loss": 0.2537, + "step": 4154 + }, + { + "epoch": 0.21, + "grad_norm": 1.7385114890983147, + "learning_rate": 1.832526080595082e-05, + "loss": 0.2249, + "step": 4155 + }, + { + "epoch": 0.21, + "grad_norm": 1.0942431223529165, + "learning_rate": 1.8324348295724658e-05, + "loss": 0.2312, + "step": 4156 + }, + { + "epoch": 0.21, + "grad_norm": 0.905791708407855, + "learning_rate": 1.8323435559700077e-05, + "loss": 0.2104, + "step": 4157 + }, + { + "epoch": 0.21, + "grad_norm": 1.0635355993872433, + "learning_rate": 1.832252259790184e-05, + "loss": 0.2407, + "step": 4158 + }, + { + "epoch": 0.21, + "grad_norm": 0.9400388503599094, + "learning_rate": 1.8321609410354705e-05, + "loss": 0.2014, + "step": 4159 + }, + { + "epoch": 0.21, + "grad_norm": 1.1323684487566767, + "learning_rate": 1.8320695997083443e-05, + "loss": 0.2531, + "step": 4160 + }, + { + "epoch": 0.21, + "grad_norm": 0.9332268296880125, + "learning_rate": 1.8319782358112836e-05, + "loss": 0.206, + "step": 4161 + }, + { + "epoch": 0.21, + "grad_norm": 0.9945749482242312, + "learning_rate": 1.8318868493467657e-05, + "loss": 0.2189, + "step": 4162 + }, + { + "epoch": 0.21, + "grad_norm": 1.1305659185344676, + "learning_rate": 1.8317954403172708e-05, + "loss": 0.2187, + "step": 4163 + }, + { + "epoch": 0.21, + "grad_norm": 1.1061371469329477, + "learning_rate": 1.831704008725277e-05, + "loss": 0.2221, + "step": 4164 + }, + { + "epoch": 0.21, + "grad_norm": 0.8551129861299791, + "learning_rate": 1.8316125545732653e-05, + "loss": 0.1808, + "step": 4165 + }, + { + "epoch": 0.21, + "grad_norm": 0.9663446712855458, + "learning_rate": 1.831521077863716e-05, + "loss": 0.2057, + "step": 4166 + }, + { + "epoch": 0.21, + "grad_norm": 0.9287018223950793, + "learning_rate": 1.8314295785991103e-05, + "loss": 0.2417, + "step": 4167 + }, + { + "epoch": 0.21, + "grad_norm": 1.200372329075485, + "learning_rate": 1.8313380567819306e-05, + "loss": 0.2132, + "step": 4168 + }, + { + "epoch": 0.21, + "grad_norm": 1.015871584887287, + "learning_rate": 1.831246512414659e-05, + "loss": 0.2127, + "step": 4169 + }, + { + "epoch": 0.21, + "grad_norm": 1.0873608708788338, + "learning_rate": 1.8311549454997788e-05, + "loss": 0.2173, + "step": 4170 + }, + { + "epoch": 0.21, + "grad_norm": 0.8060995006211655, + "learning_rate": 1.8310633560397737e-05, + "loss": 0.2203, + "step": 4171 + }, + { + "epoch": 0.21, + "grad_norm": 0.9484747795626762, + "learning_rate": 1.8309717440371282e-05, + "loss": 0.2061, + "step": 4172 + }, + { + "epoch": 0.21, + "grad_norm": 1.011706199342892, + "learning_rate": 1.8308801094943275e-05, + "loss": 0.2099, + "step": 4173 + }, + { + "epoch": 0.21, + "grad_norm": 1.0158471698512874, + "learning_rate": 1.8307884524138564e-05, + "loss": 0.2141, + "step": 4174 + }, + { + "epoch": 0.21, + "grad_norm": 1.0120488489692896, + "learning_rate": 1.830696772798202e-05, + "loss": 0.2109, + "step": 4175 + }, + { + "epoch": 0.21, + "grad_norm": 1.1944801073571805, + "learning_rate": 1.8306050706498503e-05, + "loss": 0.2105, + "step": 4176 + }, + { + "epoch": 0.21, + "grad_norm": 1.1529388807643852, + "learning_rate": 1.8305133459712897e-05, + "loss": 0.2308, + "step": 4177 + }, + { + "epoch": 0.21, + "grad_norm": 0.95321287948936, + "learning_rate": 1.8304215987650074e-05, + "loss": 0.2175, + "step": 4178 + }, + { + "epoch": 0.21, + "grad_norm": 1.0593397775678555, + "learning_rate": 1.830329829033492e-05, + "loss": 0.2177, + "step": 4179 + }, + { + "epoch": 0.21, + "grad_norm": 2.0005744492424244, + "learning_rate": 1.8302380367792336e-05, + "loss": 0.2282, + "step": 4180 + }, + { + "epoch": 0.21, + "grad_norm": 0.9528885425476983, + "learning_rate": 1.8301462220047213e-05, + "loss": 0.2404, + "step": 4181 + }, + { + "epoch": 0.21, + "grad_norm": 1.4416274627260237, + "learning_rate": 1.830054384712446e-05, + "loss": 0.2366, + "step": 4182 + }, + { + "epoch": 0.21, + "grad_norm": 0.8464081363401605, + "learning_rate": 1.8299625249048986e-05, + "loss": 0.2069, + "step": 4183 + }, + { + "epoch": 0.21, + "grad_norm": 0.9436797273759833, + "learning_rate": 1.8298706425845707e-05, + "loss": 0.2426, + "step": 4184 + }, + { + "epoch": 0.21, + "grad_norm": 0.7711041358692556, + "learning_rate": 1.829778737753955e-05, + "loss": 0.2209, + "step": 4185 + }, + { + "epoch": 0.21, + "grad_norm": 0.9606908868034619, + "learning_rate": 1.8296868104155437e-05, + "loss": 0.1989, + "step": 4186 + }, + { + "epoch": 0.21, + "grad_norm": 0.8278759406694916, + "learning_rate": 1.8295948605718316e-05, + "loss": 0.2192, + "step": 4187 + }, + { + "epoch": 0.21, + "grad_norm": 0.921098878397764, + "learning_rate": 1.8295028882253113e-05, + "loss": 0.2132, + "step": 4188 + }, + { + "epoch": 0.21, + "grad_norm": 0.8795989807205208, + "learning_rate": 1.8294108933784788e-05, + "loss": 0.2211, + "step": 4189 + }, + { + "epoch": 0.21, + "grad_norm": 1.2470788534129764, + "learning_rate": 1.8293188760338285e-05, + "loss": 0.188, + "step": 4190 + }, + { + "epoch": 0.21, + "grad_norm": 0.9763265103504567, + "learning_rate": 1.829226836193857e-05, + "loss": 0.2235, + "step": 4191 + }, + { + "epoch": 0.21, + "grad_norm": 1.1680172459987113, + "learning_rate": 1.829134773861061e-05, + "loss": 0.2086, + "step": 4192 + }, + { + "epoch": 0.21, + "grad_norm": 0.9284572674758612, + "learning_rate": 1.8290426890379372e-05, + "loss": 0.2337, + "step": 4193 + }, + { + "epoch": 0.21, + "grad_norm": 0.8362341065194638, + "learning_rate": 1.8289505817269838e-05, + "loss": 0.199, + "step": 4194 + }, + { + "epoch": 0.21, + "grad_norm": 0.884264449424886, + "learning_rate": 1.8288584519306985e-05, + "loss": 0.2095, + "step": 4195 + }, + { + "epoch": 0.21, + "grad_norm": 1.1174268482330303, + "learning_rate": 1.8287662996515815e-05, + "loss": 0.239, + "step": 4196 + }, + { + "epoch": 0.21, + "grad_norm": 0.9739645939599759, + "learning_rate": 1.8286741248921317e-05, + "loss": 0.2161, + "step": 4197 + }, + { + "epoch": 0.21, + "grad_norm": 0.96075065186622, + "learning_rate": 1.8285819276548494e-05, + "loss": 0.2154, + "step": 4198 + }, + { + "epoch": 0.21, + "grad_norm": 0.7959212458472489, + "learning_rate": 1.8284897079422356e-05, + "loss": 0.2084, + "step": 4199 + }, + { + "epoch": 0.21, + "grad_norm": 1.171455556259447, + "learning_rate": 1.8283974657567915e-05, + "loss": 0.2254, + "step": 4200 + }, + { + "epoch": 0.21, + "grad_norm": 0.943038970137586, + "learning_rate": 1.8283052011010195e-05, + "loss": 0.1983, + "step": 4201 + }, + { + "epoch": 0.21, + "grad_norm": 0.8424994263691186, + "learning_rate": 1.828212913977422e-05, + "loss": 0.2031, + "step": 4202 + }, + { + "epoch": 0.21, + "grad_norm": 0.9446157746444535, + "learning_rate": 1.8281206043885024e-05, + "loss": 0.2401, + "step": 4203 + }, + { + "epoch": 0.21, + "grad_norm": 1.1912114330547572, + "learning_rate": 1.8280282723367647e-05, + "loss": 0.2461, + "step": 4204 + }, + { + "epoch": 0.21, + "grad_norm": 0.9212399950951802, + "learning_rate": 1.8279359178247134e-05, + "loss": 0.2282, + "step": 4205 + }, + { + "epoch": 0.21, + "grad_norm": 0.9175859980076985, + "learning_rate": 1.8278435408548538e-05, + "loss": 0.2271, + "step": 4206 + }, + { + "epoch": 0.21, + "grad_norm": 0.8472401194222476, + "learning_rate": 1.827751141429691e-05, + "loss": 0.2393, + "step": 4207 + }, + { + "epoch": 0.21, + "grad_norm": 0.866124853602909, + "learning_rate": 1.827658719551732e-05, + "loss": 0.2121, + "step": 4208 + }, + { + "epoch": 0.21, + "grad_norm": 0.8963703373904084, + "learning_rate": 1.8275662752234834e-05, + "loss": 0.2013, + "step": 4209 + }, + { + "epoch": 0.21, + "grad_norm": 1.0791283238506562, + "learning_rate": 1.8274738084474525e-05, + "loss": 0.2207, + "step": 4210 + }, + { + "epoch": 0.21, + "grad_norm": 0.8172063107163877, + "learning_rate": 1.8273813192261483e-05, + "loss": 0.2197, + "step": 4211 + }, + { + "epoch": 0.21, + "grad_norm": 1.1167127205533103, + "learning_rate": 1.827288807562079e-05, + "loss": 0.2292, + "step": 4212 + }, + { + "epoch": 0.21, + "grad_norm": 0.765006058341567, + "learning_rate": 1.8271962734577536e-05, + "loss": 0.2197, + "step": 4213 + }, + { + "epoch": 0.21, + "grad_norm": 0.7670932099972653, + "learning_rate": 1.827103716915683e-05, + "loss": 0.2094, + "step": 4214 + }, + { + "epoch": 0.21, + "grad_norm": 0.9312624220759745, + "learning_rate": 1.8270111379383773e-05, + "loss": 0.2108, + "step": 4215 + }, + { + "epoch": 0.21, + "grad_norm": 0.9058587404838406, + "learning_rate": 1.826918536528348e-05, + "loss": 0.2213, + "step": 4216 + }, + { + "epoch": 0.21, + "grad_norm": 0.8238733963219388, + "learning_rate": 1.8268259126881064e-05, + "loss": 0.2249, + "step": 4217 + }, + { + "epoch": 0.21, + "grad_norm": 0.960822749130999, + "learning_rate": 1.8267332664201653e-05, + "loss": 0.2311, + "step": 4218 + }, + { + "epoch": 0.21, + "grad_norm": 1.242662687452094, + "learning_rate": 1.8266405977270377e-05, + "loss": 0.2289, + "step": 4219 + }, + { + "epoch": 0.21, + "grad_norm": 0.9535625295217197, + "learning_rate": 1.826547906611237e-05, + "loss": 0.2116, + "step": 4220 + }, + { + "epoch": 0.21, + "grad_norm": 0.8201907851659748, + "learning_rate": 1.826455193075278e-05, + "loss": 0.1827, + "step": 4221 + }, + { + "epoch": 0.21, + "grad_norm": 0.9915348756782004, + "learning_rate": 1.826362457121675e-05, + "loss": 0.2064, + "step": 4222 + }, + { + "epoch": 0.21, + "grad_norm": 0.783989783933111, + "learning_rate": 1.8262696987529434e-05, + "loss": 0.2036, + "step": 4223 + }, + { + "epoch": 0.21, + "grad_norm": 0.8067961548800195, + "learning_rate": 1.8261769179716e-05, + "loss": 0.2137, + "step": 4224 + }, + { + "epoch": 0.21, + "grad_norm": 0.7977405884808337, + "learning_rate": 1.826084114780161e-05, + "loss": 0.2121, + "step": 4225 + }, + { + "epoch": 0.21, + "grad_norm": 0.8556151782037259, + "learning_rate": 1.8259912891811433e-05, + "loss": 0.2025, + "step": 4226 + }, + { + "epoch": 0.21, + "grad_norm": 0.9531497815790603, + "learning_rate": 1.8258984411770656e-05, + "loss": 0.2398, + "step": 4227 + }, + { + "epoch": 0.22, + "grad_norm": 1.0105056727131227, + "learning_rate": 1.825805570770446e-05, + "loss": 0.2234, + "step": 4228 + }, + { + "epoch": 0.22, + "grad_norm": 0.7200577937137307, + "learning_rate": 1.825712677963804e-05, + "loss": 0.2025, + "step": 4229 + }, + { + "epoch": 0.22, + "grad_norm": 0.8241487265119518, + "learning_rate": 1.8256197627596582e-05, + "loss": 0.2261, + "step": 4230 + }, + { + "epoch": 0.22, + "grad_norm": 0.9253391184043739, + "learning_rate": 1.8255268251605303e-05, + "loss": 0.2122, + "step": 4231 + }, + { + "epoch": 0.22, + "grad_norm": 1.201686032556116, + "learning_rate": 1.8254338651689402e-05, + "loss": 0.2139, + "step": 4232 + }, + { + "epoch": 0.22, + "grad_norm": 0.8768725928195028, + "learning_rate": 1.82534088278741e-05, + "loss": 0.205, + "step": 4233 + }, + { + "epoch": 0.22, + "grad_norm": 1.059742002088647, + "learning_rate": 1.825247878018462e-05, + "loss": 0.1921, + "step": 4234 + }, + { + "epoch": 0.22, + "grad_norm": 0.916012985496521, + "learning_rate": 1.8251548508646186e-05, + "loss": 0.2451, + "step": 4235 + }, + { + "epoch": 0.22, + "grad_norm": 0.9015541425224542, + "learning_rate": 1.825061801328403e-05, + "loss": 0.2123, + "step": 4236 + }, + { + "epoch": 0.22, + "grad_norm": 1.0308071881686403, + "learning_rate": 1.8249687294123396e-05, + "loss": 0.2194, + "step": 4237 + }, + { + "epoch": 0.22, + "grad_norm": 0.9070747617664877, + "learning_rate": 1.8248756351189533e-05, + "loss": 0.1742, + "step": 4238 + }, + { + "epoch": 0.22, + "grad_norm": 0.8358651708734051, + "learning_rate": 1.8247825184507683e-05, + "loss": 0.2048, + "step": 4239 + }, + { + "epoch": 0.22, + "grad_norm": 0.8122009336805338, + "learning_rate": 1.8246893794103113e-05, + "loss": 0.2149, + "step": 4240 + }, + { + "epoch": 0.22, + "grad_norm": 1.5371587945589475, + "learning_rate": 1.8245962180001075e-05, + "loss": 0.2427, + "step": 4241 + }, + { + "epoch": 0.22, + "grad_norm": 3.0558139292489592, + "learning_rate": 1.824503034222685e-05, + "loss": 0.219, + "step": 4242 + }, + { + "epoch": 0.22, + "grad_norm": 0.8009101214707464, + "learning_rate": 1.8244098280805715e-05, + "loss": 0.2196, + "step": 4243 + }, + { + "epoch": 0.22, + "grad_norm": 0.7883653453666257, + "learning_rate": 1.8243165995762947e-05, + "loss": 0.2169, + "step": 4244 + }, + { + "epoch": 0.22, + "grad_norm": 1.4303484593440836, + "learning_rate": 1.8242233487123837e-05, + "loss": 0.2248, + "step": 4245 + }, + { + "epoch": 0.22, + "grad_norm": 0.9900955967331986, + "learning_rate": 1.8241300754913677e-05, + "loss": 0.2525, + "step": 4246 + }, + { + "epoch": 0.22, + "grad_norm": 1.0160311186185382, + "learning_rate": 1.824036779915777e-05, + "loss": 0.1988, + "step": 4247 + }, + { + "epoch": 0.22, + "grad_norm": 0.7568400522467681, + "learning_rate": 1.8239434619881418e-05, + "loss": 0.201, + "step": 4248 + }, + { + "epoch": 0.22, + "grad_norm": 1.0200762232544731, + "learning_rate": 1.8238501217109938e-05, + "loss": 0.2425, + "step": 4249 + }, + { + "epoch": 0.22, + "grad_norm": 0.7002983112880843, + "learning_rate": 1.8237567590868645e-05, + "loss": 0.196, + "step": 4250 + }, + { + "epoch": 0.22, + "grad_norm": 0.9417901704321855, + "learning_rate": 1.823663374118287e-05, + "loss": 0.1868, + "step": 4251 + }, + { + "epoch": 0.22, + "grad_norm": 1.0660457871013291, + "learning_rate": 1.8235699668077938e-05, + "loss": 0.2183, + "step": 4252 + }, + { + "epoch": 0.22, + "grad_norm": 1.3133422232494105, + "learning_rate": 1.8234765371579186e-05, + "loss": 0.2164, + "step": 4253 + }, + { + "epoch": 0.22, + "grad_norm": 1.766017062175045, + "learning_rate": 1.8233830851711957e-05, + "loss": 0.2402, + "step": 4254 + }, + { + "epoch": 0.22, + "grad_norm": 1.474811583060738, + "learning_rate": 1.8232896108501606e-05, + "loss": 0.2328, + "step": 4255 + }, + { + "epoch": 0.22, + "grad_norm": 0.7342884869301584, + "learning_rate": 1.823196114197348e-05, + "loss": 0.2072, + "step": 4256 + }, + { + "epoch": 0.22, + "grad_norm": 1.1920093532059117, + "learning_rate": 1.823102595215294e-05, + "loss": 0.238, + "step": 4257 + }, + { + "epoch": 0.22, + "grad_norm": 0.9002978568597515, + "learning_rate": 1.823009053906536e-05, + "loss": 0.2045, + "step": 4258 + }, + { + "epoch": 0.22, + "grad_norm": 1.0125908254260827, + "learning_rate": 1.8229154902736113e-05, + "loss": 0.2097, + "step": 4259 + }, + { + "epoch": 0.22, + "grad_norm": 0.9305152927324657, + "learning_rate": 1.822821904319057e-05, + "loss": 0.237, + "step": 4260 + }, + { + "epoch": 0.22, + "grad_norm": 0.9620551761878576, + "learning_rate": 1.822728296045412e-05, + "loss": 0.2567, + "step": 4261 + }, + { + "epoch": 0.22, + "grad_norm": 0.9060764864986361, + "learning_rate": 1.8226346654552155e-05, + "loss": 0.2139, + "step": 4262 + }, + { + "epoch": 0.22, + "grad_norm": 0.8190218386720736, + "learning_rate": 1.8225410125510073e-05, + "loss": 0.2387, + "step": 4263 + }, + { + "epoch": 0.22, + "grad_norm": 0.7968878989634448, + "learning_rate": 1.8224473373353276e-05, + "loss": 0.189, + "step": 4264 + }, + { + "epoch": 0.22, + "grad_norm": 0.9643006351217062, + "learning_rate": 1.8223536398107177e-05, + "loss": 0.2083, + "step": 4265 + }, + { + "epoch": 0.22, + "grad_norm": 1.0532074306008665, + "learning_rate": 1.8222599199797188e-05, + "loss": 0.2094, + "step": 4266 + }, + { + "epoch": 0.22, + "grad_norm": 0.9184139480026474, + "learning_rate": 1.822166177844873e-05, + "loss": 0.1897, + "step": 4267 + }, + { + "epoch": 0.22, + "grad_norm": 1.0147986932673607, + "learning_rate": 1.8220724134087232e-05, + "loss": 0.2154, + "step": 4268 + }, + { + "epoch": 0.22, + "grad_norm": 0.9907113242466752, + "learning_rate": 1.8219786266738125e-05, + "loss": 0.1967, + "step": 4269 + }, + { + "epoch": 0.22, + "grad_norm": 0.8044965495570748, + "learning_rate": 1.8218848176426857e-05, + "loss": 0.2316, + "step": 4270 + }, + { + "epoch": 0.22, + "grad_norm": 0.9013298789933274, + "learning_rate": 1.8217909863178868e-05, + "loss": 0.2534, + "step": 4271 + }, + { + "epoch": 0.22, + "grad_norm": 0.8488274163325105, + "learning_rate": 1.8216971327019603e-05, + "loss": 0.2254, + "step": 4272 + }, + { + "epoch": 0.22, + "grad_norm": 0.8406913678522667, + "learning_rate": 1.821603256797453e-05, + "loss": 0.1963, + "step": 4273 + }, + { + "epoch": 0.22, + "grad_norm": 1.6575549638758935, + "learning_rate": 1.821509358606911e-05, + "loss": 0.2213, + "step": 4274 + }, + { + "epoch": 0.22, + "grad_norm": 0.8083485169095919, + "learning_rate": 1.8214154381328815e-05, + "loss": 0.2073, + "step": 4275 + }, + { + "epoch": 0.22, + "grad_norm": 0.9830926770574077, + "learning_rate": 1.8213214953779114e-05, + "loss": 0.2458, + "step": 4276 + }, + { + "epoch": 0.22, + "grad_norm": 0.7787523267533243, + "learning_rate": 1.8212275303445498e-05, + "loss": 0.2295, + "step": 4277 + }, + { + "epoch": 0.22, + "grad_norm": 0.8838887511848333, + "learning_rate": 1.8211335430353444e-05, + "loss": 0.2107, + "step": 4278 + }, + { + "epoch": 0.22, + "grad_norm": 1.0056438528043299, + "learning_rate": 1.821039533452846e-05, + "loss": 0.2183, + "step": 4279 + }, + { + "epoch": 0.22, + "grad_norm": 0.6772895668147175, + "learning_rate": 1.8209455015996034e-05, + "loss": 0.1904, + "step": 4280 + }, + { + "epoch": 0.22, + "grad_norm": 0.8927283512952466, + "learning_rate": 1.820851447478168e-05, + "loss": 0.2229, + "step": 4281 + }, + { + "epoch": 0.22, + "grad_norm": 0.9770840012710763, + "learning_rate": 1.8207573710910905e-05, + "loss": 0.2418, + "step": 4282 + }, + { + "epoch": 0.22, + "grad_norm": 0.8916316262230806, + "learning_rate": 1.820663272440923e-05, + "loss": 0.2156, + "step": 4283 + }, + { + "epoch": 0.22, + "grad_norm": 1.2917448104325193, + "learning_rate": 1.8205691515302183e-05, + "loss": 0.2447, + "step": 4284 + }, + { + "epoch": 0.22, + "grad_norm": 0.9632645005230199, + "learning_rate": 1.8204750083615283e-05, + "loss": 0.2363, + "step": 4285 + }, + { + "epoch": 0.22, + "grad_norm": 0.8966795248674965, + "learning_rate": 1.8203808429374078e-05, + "loss": 0.2229, + "step": 4286 + }, + { + "epoch": 0.22, + "grad_norm": 1.1035150706057364, + "learning_rate": 1.8202866552604104e-05, + "loss": 0.2508, + "step": 4287 + }, + { + "epoch": 0.22, + "grad_norm": 1.4050937171557807, + "learning_rate": 1.8201924453330914e-05, + "loss": 0.186, + "step": 4288 + }, + { + "epoch": 0.22, + "grad_norm": 1.0667736728743291, + "learning_rate": 1.8200982131580058e-05, + "loss": 0.2044, + "step": 4289 + }, + { + "epoch": 0.22, + "grad_norm": 0.8705957052526101, + "learning_rate": 1.82000395873771e-05, + "loss": 0.2234, + "step": 4290 + }, + { + "epoch": 0.22, + "grad_norm": 0.7867090078526178, + "learning_rate": 1.8199096820747603e-05, + "loss": 0.2106, + "step": 4291 + }, + { + "epoch": 0.22, + "grad_norm": 0.904053305616991, + "learning_rate": 1.819815383171714e-05, + "loss": 0.2097, + "step": 4292 + }, + { + "epoch": 0.22, + "grad_norm": 0.9641010472452598, + "learning_rate": 1.8197210620311297e-05, + "loss": 0.2055, + "step": 4293 + }, + { + "epoch": 0.22, + "grad_norm": 2.3678290393221864, + "learning_rate": 1.819626718655565e-05, + "loss": 0.2159, + "step": 4294 + }, + { + "epoch": 0.22, + "grad_norm": 1.1708498945803798, + "learning_rate": 1.819532353047579e-05, + "loss": 0.2329, + "step": 4295 + }, + { + "epoch": 0.22, + "grad_norm": 1.1560113389228808, + "learning_rate": 1.819437965209732e-05, + "loss": 0.2324, + "step": 4296 + }, + { + "epoch": 0.22, + "grad_norm": 0.9184708309489177, + "learning_rate": 1.819343555144584e-05, + "loss": 0.2311, + "step": 4297 + }, + { + "epoch": 0.22, + "grad_norm": 0.9343546421034021, + "learning_rate": 1.8192491228546953e-05, + "loss": 0.199, + "step": 4298 + }, + { + "epoch": 0.22, + "grad_norm": 0.8418429083232742, + "learning_rate": 1.8191546683426283e-05, + "loss": 0.2182, + "step": 4299 + }, + { + "epoch": 0.22, + "grad_norm": 1.1363584516751166, + "learning_rate": 1.8190601916109444e-05, + "loss": 0.2071, + "step": 4300 + }, + { + "epoch": 0.22, + "grad_norm": 0.9556669160393405, + "learning_rate": 1.8189656926622066e-05, + "loss": 0.227, + "step": 4301 + }, + { + "epoch": 0.22, + "grad_norm": 1.0732355260283306, + "learning_rate": 1.8188711714989785e-05, + "loss": 0.2038, + "step": 4302 + }, + { + "epoch": 0.22, + "grad_norm": 0.7696215465058844, + "learning_rate": 1.818776628123823e-05, + "loss": 0.1983, + "step": 4303 + }, + { + "epoch": 0.22, + "grad_norm": 1.2331616863270052, + "learning_rate": 1.818682062539306e-05, + "loss": 0.2271, + "step": 4304 + }, + { + "epoch": 0.22, + "grad_norm": 1.0942386697318467, + "learning_rate": 1.8185874747479912e-05, + "loss": 0.2177, + "step": 4305 + }, + { + "epoch": 0.22, + "grad_norm": 1.4450879115009336, + "learning_rate": 1.818492864752445e-05, + "loss": 0.2128, + "step": 4306 + }, + { + "epoch": 0.22, + "grad_norm": 0.8904155962150285, + "learning_rate": 1.8183982325552338e-05, + "loss": 0.2184, + "step": 4307 + }, + { + "epoch": 0.22, + "grad_norm": 0.9143306634271516, + "learning_rate": 1.8183035781589238e-05, + "loss": 0.2208, + "step": 4308 + }, + { + "epoch": 0.22, + "grad_norm": 0.9853438320265109, + "learning_rate": 1.8182089015660836e-05, + "loss": 0.2013, + "step": 4309 + }, + { + "epoch": 0.22, + "grad_norm": 0.9777431632030403, + "learning_rate": 1.8181142027792807e-05, + "loss": 0.229, + "step": 4310 + }, + { + "epoch": 0.22, + "grad_norm": 0.8076202137496955, + "learning_rate": 1.8180194818010833e-05, + "loss": 0.193, + "step": 4311 + }, + { + "epoch": 0.22, + "grad_norm": 0.8673930948175105, + "learning_rate": 1.8179247386340617e-05, + "loss": 0.2152, + "step": 4312 + }, + { + "epoch": 0.22, + "grad_norm": 1.3583227306954313, + "learning_rate": 1.8178299732807853e-05, + "loss": 0.2024, + "step": 4313 + }, + { + "epoch": 0.22, + "grad_norm": 0.9205591976032004, + "learning_rate": 1.8177351857438248e-05, + "loss": 0.2488, + "step": 4314 + }, + { + "epoch": 0.22, + "grad_norm": 1.082096516321356, + "learning_rate": 1.817640376025751e-05, + "loss": 0.2381, + "step": 4315 + }, + { + "epoch": 0.22, + "grad_norm": 1.4396388035174683, + "learning_rate": 1.817545544129136e-05, + "loss": 0.247, + "step": 4316 + }, + { + "epoch": 0.22, + "grad_norm": 0.8504382943282013, + "learning_rate": 1.8174506900565518e-05, + "loss": 0.1785, + "step": 4317 + }, + { + "epoch": 0.22, + "grad_norm": 0.9081660689283023, + "learning_rate": 1.8173558138105717e-05, + "loss": 0.2023, + "step": 4318 + }, + { + "epoch": 0.22, + "grad_norm": 1.446106615498979, + "learning_rate": 1.8172609153937685e-05, + "loss": 0.2047, + "step": 4319 + }, + { + "epoch": 0.22, + "grad_norm": 1.005259860103065, + "learning_rate": 1.8171659948087175e-05, + "loss": 0.2266, + "step": 4320 + }, + { + "epoch": 0.22, + "grad_norm": 1.111929062773582, + "learning_rate": 1.8170710520579923e-05, + "loss": 0.2047, + "step": 4321 + }, + { + "epoch": 0.22, + "grad_norm": 0.9763553401215549, + "learning_rate": 1.8169760871441683e-05, + "loss": 0.199, + "step": 4322 + }, + { + "epoch": 0.22, + "grad_norm": 1.2920743833932382, + "learning_rate": 1.8168811000698224e-05, + "loss": 0.2212, + "step": 4323 + }, + { + "epoch": 0.22, + "grad_norm": 1.0361696005188632, + "learning_rate": 1.8167860908375307e-05, + "loss": 0.1859, + "step": 4324 + }, + { + "epoch": 0.22, + "grad_norm": 0.9516237535805719, + "learning_rate": 1.8166910594498695e-05, + "loss": 0.2199, + "step": 4325 + }, + { + "epoch": 0.22, + "grad_norm": 1.7244801875399887, + "learning_rate": 1.8165960059094174e-05, + "loss": 0.2319, + "step": 4326 + }, + { + "epoch": 0.22, + "grad_norm": 1.051239459001659, + "learning_rate": 1.8165009302187526e-05, + "loss": 0.2182, + "step": 4327 + }, + { + "epoch": 0.22, + "grad_norm": 0.8694760347180661, + "learning_rate": 1.816405832380454e-05, + "loss": 0.2181, + "step": 4328 + }, + { + "epoch": 0.22, + "grad_norm": 0.9093718969648045, + "learning_rate": 1.8163107123971012e-05, + "loss": 0.199, + "step": 4329 + }, + { + "epoch": 0.22, + "grad_norm": 0.7777573023668687, + "learning_rate": 1.816215570271274e-05, + "loss": 0.1974, + "step": 4330 + }, + { + "epoch": 0.22, + "grad_norm": 4.887007737951543, + "learning_rate": 1.8161204060055533e-05, + "loss": 0.2242, + "step": 4331 + }, + { + "epoch": 0.22, + "grad_norm": 1.0061088241270033, + "learning_rate": 1.8160252196025205e-05, + "loss": 0.2336, + "step": 4332 + }, + { + "epoch": 0.22, + "grad_norm": 0.9102452692339625, + "learning_rate": 1.8159300110647576e-05, + "loss": 0.1983, + "step": 4333 + }, + { + "epoch": 0.22, + "grad_norm": 1.9184937177752597, + "learning_rate": 1.815834780394847e-05, + "loss": 0.1995, + "step": 4334 + }, + { + "epoch": 0.22, + "grad_norm": 0.8389755034590195, + "learning_rate": 1.8157395275953722e-05, + "loss": 0.22, + "step": 4335 + }, + { + "epoch": 0.22, + "grad_norm": 0.9664717962957975, + "learning_rate": 1.8156442526689165e-05, + "loss": 0.2253, + "step": 4336 + }, + { + "epoch": 0.22, + "grad_norm": 1.206033123512191, + "learning_rate": 1.8155489556180643e-05, + "loss": 0.2136, + "step": 4337 + }, + { + "epoch": 0.22, + "grad_norm": 0.9918052284301853, + "learning_rate": 1.8154536364454003e-05, + "loss": 0.2266, + "step": 4338 + }, + { + "epoch": 0.22, + "grad_norm": 0.8834020921272379, + "learning_rate": 1.8153582951535108e-05, + "loss": 0.2002, + "step": 4339 + }, + { + "epoch": 0.22, + "grad_norm": 1.474921529252916, + "learning_rate": 1.8152629317449814e-05, + "loss": 0.2103, + "step": 4340 + }, + { + "epoch": 0.22, + "grad_norm": 0.7762391625825757, + "learning_rate": 1.815167546222399e-05, + "loss": 0.1926, + "step": 4341 + }, + { + "epoch": 0.22, + "grad_norm": 1.0448223957540366, + "learning_rate": 1.8150721385883508e-05, + "loss": 0.2103, + "step": 4342 + }, + { + "epoch": 0.22, + "grad_norm": 0.8368969397679634, + "learning_rate": 1.8149767088454248e-05, + "loss": 0.205, + "step": 4343 + }, + { + "epoch": 0.22, + "grad_norm": 1.2714716714784662, + "learning_rate": 1.8148812569962095e-05, + "loss": 0.1951, + "step": 4344 + }, + { + "epoch": 0.22, + "grad_norm": 1.3140737363744213, + "learning_rate": 1.814785783043294e-05, + "loss": 0.211, + "step": 4345 + }, + { + "epoch": 0.22, + "grad_norm": 0.909422860508571, + "learning_rate": 1.8146902869892682e-05, + "loss": 0.2139, + "step": 4346 + }, + { + "epoch": 0.22, + "grad_norm": 0.9128532924458792, + "learning_rate": 1.8145947688367224e-05, + "loss": 0.2059, + "step": 4347 + }, + { + "epoch": 0.22, + "grad_norm": 1.4055777535594336, + "learning_rate": 1.8144992285882478e-05, + "loss": 0.208, + "step": 4348 + }, + { + "epoch": 0.22, + "grad_norm": 1.6393726545336686, + "learning_rate": 1.814403666246435e-05, + "loss": 0.2216, + "step": 4349 + }, + { + "epoch": 0.22, + "grad_norm": 1.012495449839806, + "learning_rate": 1.814308081813877e-05, + "loss": 0.227, + "step": 4350 + }, + { + "epoch": 0.22, + "grad_norm": 0.8809379247618583, + "learning_rate": 1.8142124752931662e-05, + "loss": 0.2018, + "step": 4351 + }, + { + "epoch": 0.22, + "grad_norm": 1.0518298070909586, + "learning_rate": 1.8141168466868962e-05, + "loss": 0.2372, + "step": 4352 + }, + { + "epoch": 0.22, + "grad_norm": 1.185197093186988, + "learning_rate": 1.8140211959976608e-05, + "loss": 0.221, + "step": 4353 + }, + { + "epoch": 0.22, + "grad_norm": 0.8956554633281654, + "learning_rate": 1.8139255232280545e-05, + "loss": 0.2144, + "step": 4354 + }, + { + "epoch": 0.22, + "grad_norm": 0.8595694807094763, + "learning_rate": 1.813829828380672e-05, + "loss": 0.2269, + "step": 4355 + }, + { + "epoch": 0.22, + "grad_norm": 0.8742860722630785, + "learning_rate": 1.81373411145811e-05, + "loss": 0.207, + "step": 4356 + }, + { + "epoch": 0.22, + "grad_norm": 0.8565448494079153, + "learning_rate": 1.8136383724629637e-05, + "loss": 0.203, + "step": 4357 + }, + { + "epoch": 0.22, + "grad_norm": 0.7449865604280017, + "learning_rate": 1.8135426113978312e-05, + "loss": 0.1995, + "step": 4358 + }, + { + "epoch": 0.22, + "grad_norm": 0.9507927834239618, + "learning_rate": 1.8134468282653092e-05, + "loss": 0.2466, + "step": 4359 + }, + { + "epoch": 0.22, + "grad_norm": 0.8884456657081311, + "learning_rate": 1.8133510230679956e-05, + "loss": 0.2351, + "step": 4360 + }, + { + "epoch": 0.22, + "grad_norm": 0.9767879397277819, + "learning_rate": 1.8132551958084902e-05, + "loss": 0.2417, + "step": 4361 + }, + { + "epoch": 0.22, + "grad_norm": 1.028331879327762, + "learning_rate": 1.8131593464893913e-05, + "loss": 0.194, + "step": 4362 + }, + { + "epoch": 0.22, + "grad_norm": 0.9500463986143157, + "learning_rate": 1.813063475113299e-05, + "loss": 0.2152, + "step": 4363 + }, + { + "epoch": 0.22, + "grad_norm": 1.1014894639629662, + "learning_rate": 1.8129675816828144e-05, + "loss": 0.2095, + "step": 4364 + }, + { + "epoch": 0.22, + "grad_norm": 0.8469646740472858, + "learning_rate": 1.8128716662005385e-05, + "loss": 0.2113, + "step": 4365 + }, + { + "epoch": 0.22, + "grad_norm": 0.8171205513709684, + "learning_rate": 1.812775728669072e-05, + "loss": 0.217, + "step": 4366 + }, + { + "epoch": 0.22, + "grad_norm": 1.052823131487029, + "learning_rate": 1.812679769091018e-05, + "loss": 0.2206, + "step": 4367 + }, + { + "epoch": 0.22, + "grad_norm": 1.0175056353860659, + "learning_rate": 1.81258378746898e-05, + "loss": 0.2229, + "step": 4368 + }, + { + "epoch": 0.22, + "grad_norm": 0.8699858373898745, + "learning_rate": 1.8124877838055604e-05, + "loss": 0.2248, + "step": 4369 + }, + { + "epoch": 0.22, + "grad_norm": 0.9018639095903852, + "learning_rate": 1.8123917581033642e-05, + "loss": 0.2302, + "step": 4370 + }, + { + "epoch": 0.22, + "grad_norm": 0.8339848039989683, + "learning_rate": 1.8122957103649953e-05, + "loss": 0.2337, + "step": 4371 + }, + { + "epoch": 0.22, + "grad_norm": 0.8561161698245993, + "learning_rate": 1.81219964059306e-05, + "loss": 0.1902, + "step": 4372 + }, + { + "epoch": 0.22, + "grad_norm": 0.9040620220822301, + "learning_rate": 1.8121035487901627e-05, + "loss": 0.2363, + "step": 4373 + }, + { + "epoch": 0.22, + "grad_norm": 0.8117035779864158, + "learning_rate": 1.8120074349589114e-05, + "loss": 0.2052, + "step": 4374 + }, + { + "epoch": 0.22, + "grad_norm": 0.7757265435166741, + "learning_rate": 1.8119112991019123e-05, + "loss": 0.1952, + "step": 4375 + }, + { + "epoch": 0.22, + "grad_norm": 1.3018928098117228, + "learning_rate": 1.811815141221773e-05, + "loss": 0.2489, + "step": 4376 + }, + { + "epoch": 0.22, + "grad_norm": 0.8514347367780373, + "learning_rate": 1.8117189613211023e-05, + "loss": 0.2428, + "step": 4377 + }, + { + "epoch": 0.22, + "grad_norm": 0.9570291743456296, + "learning_rate": 1.8116227594025092e-05, + "loss": 0.198, + "step": 4378 + }, + { + "epoch": 0.22, + "grad_norm": 0.6959724960679516, + "learning_rate": 1.811526535468603e-05, + "loss": 0.2027, + "step": 4379 + }, + { + "epoch": 0.22, + "grad_norm": 1.0305907382430886, + "learning_rate": 1.8114302895219936e-05, + "loss": 0.2047, + "step": 4380 + }, + { + "epoch": 0.22, + "grad_norm": 1.171345519575611, + "learning_rate": 1.8113340215652916e-05, + "loss": 0.2283, + "step": 4381 + }, + { + "epoch": 0.22, + "grad_norm": 0.8155710544954411, + "learning_rate": 1.8112377316011086e-05, + "loss": 0.2179, + "step": 4382 + }, + { + "epoch": 0.22, + "grad_norm": 0.7938806830241489, + "learning_rate": 1.8111414196320562e-05, + "loss": 0.1821, + "step": 4383 + }, + { + "epoch": 0.22, + "grad_norm": 0.8821549536458148, + "learning_rate": 1.811045085660747e-05, + "loss": 0.2104, + "step": 4384 + }, + { + "epoch": 0.22, + "grad_norm": 0.9977566410273979, + "learning_rate": 1.8109487296897944e-05, + "loss": 0.2039, + "step": 4385 + }, + { + "epoch": 0.22, + "grad_norm": 0.8429772323990313, + "learning_rate": 1.8108523517218112e-05, + "loss": 0.2079, + "step": 4386 + }, + { + "epoch": 0.22, + "grad_norm": 0.932220713953027, + "learning_rate": 1.8107559517594123e-05, + "loss": 0.2242, + "step": 4387 + }, + { + "epoch": 0.22, + "grad_norm": 0.9463348327375277, + "learning_rate": 1.8106595298052124e-05, + "loss": 0.2109, + "step": 4388 + }, + { + "epoch": 0.22, + "grad_norm": 1.058926891012995, + "learning_rate": 1.810563085861827e-05, + "loss": 0.221, + "step": 4389 + }, + { + "epoch": 0.22, + "grad_norm": 0.7967025288772267, + "learning_rate": 1.810466619931872e-05, + "loss": 0.2069, + "step": 4390 + }, + { + "epoch": 0.22, + "grad_norm": 1.3536066297444167, + "learning_rate": 1.8103701320179644e-05, + "loss": 0.2321, + "step": 4391 + }, + { + "epoch": 0.22, + "grad_norm": 0.943141752961332, + "learning_rate": 1.8102736221227212e-05, + "loss": 0.2142, + "step": 4392 + }, + { + "epoch": 0.22, + "grad_norm": 0.8920841817138594, + "learning_rate": 1.81017709024876e-05, + "loss": 0.2064, + "step": 4393 + }, + { + "epoch": 0.22, + "grad_norm": 0.7738233796062852, + "learning_rate": 1.8100805363986996e-05, + "loss": 0.2137, + "step": 4394 + }, + { + "epoch": 0.22, + "grad_norm": 1.1456208241941903, + "learning_rate": 1.809983960575159e-05, + "loss": 0.2198, + "step": 4395 + }, + { + "epoch": 0.22, + "grad_norm": 0.7602743082590431, + "learning_rate": 1.8098873627807576e-05, + "loss": 0.2049, + "step": 4396 + }, + { + "epoch": 0.22, + "grad_norm": 0.8663595195507451, + "learning_rate": 1.8097907430181157e-05, + "loss": 0.2049, + "step": 4397 + }, + { + "epoch": 0.22, + "grad_norm": 0.9999632680595777, + "learning_rate": 1.8096941012898545e-05, + "loss": 0.2197, + "step": 4398 + }, + { + "epoch": 0.22, + "grad_norm": 0.899024685249853, + "learning_rate": 1.8095974375985945e-05, + "loss": 0.2233, + "step": 4399 + }, + { + "epoch": 0.22, + "grad_norm": 0.9689895282474176, + "learning_rate": 1.8095007519469583e-05, + "loss": 0.2149, + "step": 4400 + }, + { + "epoch": 0.22, + "grad_norm": 0.8861221836852705, + "learning_rate": 1.8094040443375692e-05, + "loss": 0.201, + "step": 4401 + }, + { + "epoch": 0.22, + "grad_norm": 1.147376630225734, + "learning_rate": 1.8093073147730492e-05, + "loss": 0.2381, + "step": 4402 + }, + { + "epoch": 0.22, + "grad_norm": 0.9875533678572314, + "learning_rate": 1.8092105632560227e-05, + "loss": 0.2176, + "step": 4403 + }, + { + "epoch": 0.22, + "grad_norm": 1.8599201761775375, + "learning_rate": 1.8091137897891138e-05, + "loss": 0.2246, + "step": 4404 + }, + { + "epoch": 0.22, + "grad_norm": 2.548059073446093, + "learning_rate": 1.8090169943749477e-05, + "loss": 0.2429, + "step": 4405 + }, + { + "epoch": 0.22, + "grad_norm": 1.4045767740844242, + "learning_rate": 1.80892017701615e-05, + "loss": 0.2402, + "step": 4406 + }, + { + "epoch": 0.22, + "grad_norm": 1.0615450153346424, + "learning_rate": 1.808823337715347e-05, + "loss": 0.2311, + "step": 4407 + }, + { + "epoch": 0.22, + "grad_norm": 1.4170629407778401, + "learning_rate": 1.8087264764751645e-05, + "loss": 0.1938, + "step": 4408 + }, + { + "epoch": 0.22, + "grad_norm": 0.9763221746153268, + "learning_rate": 1.8086295932982315e-05, + "loss": 0.2535, + "step": 4409 + }, + { + "epoch": 0.22, + "grad_norm": 0.7656690291051992, + "learning_rate": 1.8085326881871747e-05, + "loss": 0.1918, + "step": 4410 + }, + { + "epoch": 0.22, + "grad_norm": 1.136065388243066, + "learning_rate": 1.8084357611446232e-05, + "loss": 0.2491, + "step": 4411 + }, + { + "epoch": 0.22, + "grad_norm": 1.082286482252834, + "learning_rate": 1.808338812173206e-05, + "loss": 0.1978, + "step": 4412 + }, + { + "epoch": 0.22, + "grad_norm": 1.6479564360605128, + "learning_rate": 1.8082418412755527e-05, + "loss": 0.2176, + "step": 4413 + }, + { + "epoch": 0.22, + "grad_norm": 0.8694686452576879, + "learning_rate": 1.808144848454294e-05, + "loss": 0.2314, + "step": 4414 + }, + { + "epoch": 0.22, + "grad_norm": 0.7858990839349932, + "learning_rate": 1.8080478337120604e-05, + "loss": 0.2163, + "step": 4415 + }, + { + "epoch": 0.22, + "grad_norm": 0.839738455990263, + "learning_rate": 1.807950797051484e-05, + "loss": 0.1958, + "step": 4416 + }, + { + "epoch": 0.22, + "grad_norm": 1.2351394659797974, + "learning_rate": 1.8078537384751968e-05, + "loss": 0.2261, + "step": 4417 + }, + { + "epoch": 0.22, + "grad_norm": 0.9296011925545019, + "learning_rate": 1.8077566579858306e-05, + "loss": 0.2095, + "step": 4418 + }, + { + "epoch": 0.22, + "grad_norm": 0.9096268661088964, + "learning_rate": 1.80765955558602e-05, + "loss": 0.2349, + "step": 4419 + }, + { + "epoch": 0.22, + "grad_norm": 0.9189740778262212, + "learning_rate": 1.8075624312783984e-05, + "loss": 0.2157, + "step": 4420 + }, + { + "epoch": 0.22, + "grad_norm": 0.8732626554297037, + "learning_rate": 1.8074652850656e-05, + "loss": 0.2223, + "step": 4421 + }, + { + "epoch": 0.22, + "grad_norm": 2.5194779373667795, + "learning_rate": 1.80736811695026e-05, + "loss": 0.237, + "step": 4422 + }, + { + "epoch": 0.22, + "grad_norm": 2.011478056498499, + "learning_rate": 1.807270926935014e-05, + "loss": 0.2526, + "step": 4423 + }, + { + "epoch": 0.22, + "grad_norm": 1.072836618698817, + "learning_rate": 1.8071737150224993e-05, + "loss": 0.2207, + "step": 4424 + }, + { + "epoch": 0.23, + "grad_norm": 1.003240262280801, + "learning_rate": 1.8070764812153518e-05, + "loss": 0.2195, + "step": 4425 + }, + { + "epoch": 0.23, + "grad_norm": 0.8553296303325555, + "learning_rate": 1.8069792255162088e-05, + "loss": 0.2143, + "step": 4426 + }, + { + "epoch": 0.23, + "grad_norm": 0.9706401479122935, + "learning_rate": 1.8068819479277087e-05, + "loss": 0.2185, + "step": 4427 + }, + { + "epoch": 0.23, + "grad_norm": 0.8534705006698775, + "learning_rate": 1.8067846484524905e-05, + "loss": 0.1879, + "step": 4428 + }, + { + "epoch": 0.23, + "grad_norm": 0.8578306627076671, + "learning_rate": 1.806687327093193e-05, + "loss": 0.2213, + "step": 4429 + }, + { + "epoch": 0.23, + "grad_norm": 0.9165415623957645, + "learning_rate": 1.806589983852456e-05, + "loss": 0.2171, + "step": 4430 + }, + { + "epoch": 0.23, + "grad_norm": 1.0313737327911443, + "learning_rate": 1.8064926187329205e-05, + "loss": 0.2255, + "step": 4431 + }, + { + "epoch": 0.23, + "grad_norm": 0.9197916913770108, + "learning_rate": 1.8063952317372265e-05, + "loss": 0.1972, + "step": 4432 + }, + { + "epoch": 0.23, + "grad_norm": 1.3479992755805241, + "learning_rate": 1.8062978228680172e-05, + "loss": 0.2628, + "step": 4433 + }, + { + "epoch": 0.23, + "grad_norm": 0.8254972992938581, + "learning_rate": 1.8062003921279335e-05, + "loss": 0.2115, + "step": 4434 + }, + { + "epoch": 0.23, + "grad_norm": 0.8963891068322509, + "learning_rate": 1.8061029395196186e-05, + "loss": 0.2027, + "step": 4435 + }, + { + "epoch": 0.23, + "grad_norm": 0.9369312287584682, + "learning_rate": 1.8060054650457157e-05, + "loss": 0.2128, + "step": 4436 + }, + { + "epoch": 0.23, + "grad_norm": 1.4957509743791233, + "learning_rate": 1.8059079687088695e-05, + "loss": 0.1977, + "step": 4437 + }, + { + "epoch": 0.23, + "grad_norm": 0.8374581910764776, + "learning_rate": 1.805810450511724e-05, + "loss": 0.2171, + "step": 4438 + }, + { + "epoch": 0.23, + "grad_norm": 0.9093962161701271, + "learning_rate": 1.8057129104569246e-05, + "loss": 0.2155, + "step": 4439 + }, + { + "epoch": 0.23, + "grad_norm": 1.0819334654052752, + "learning_rate": 1.8056153485471167e-05, + "loss": 0.1863, + "step": 4440 + }, + { + "epoch": 0.23, + "grad_norm": 0.8524882699421736, + "learning_rate": 1.805517764784947e-05, + "loss": 0.2006, + "step": 4441 + }, + { + "epoch": 0.23, + "grad_norm": 1.1360487230080902, + "learning_rate": 1.8054201591730627e-05, + "loss": 0.2413, + "step": 4442 + }, + { + "epoch": 0.23, + "grad_norm": 0.91748364528214, + "learning_rate": 1.8053225317141106e-05, + "loss": 0.1981, + "step": 4443 + }, + { + "epoch": 0.23, + "grad_norm": 0.9272715829052295, + "learning_rate": 1.80522488241074e-05, + "loss": 0.2271, + "step": 4444 + }, + { + "epoch": 0.23, + "grad_norm": 0.8682592457592926, + "learning_rate": 1.8051272112655983e-05, + "loss": 0.2152, + "step": 4445 + }, + { + "epoch": 0.23, + "grad_norm": 0.9262847244346153, + "learning_rate": 1.805029518281336e-05, + "loss": 0.2093, + "step": 4446 + }, + { + "epoch": 0.23, + "grad_norm": 0.9188337809176588, + "learning_rate": 1.8049318034606025e-05, + "loss": 0.2187, + "step": 4447 + }, + { + "epoch": 0.23, + "grad_norm": 1.2028573072889497, + "learning_rate": 1.8048340668060483e-05, + "loss": 0.2444, + "step": 4448 + }, + { + "epoch": 0.23, + "grad_norm": 0.8717286742127507, + "learning_rate": 1.8047363083203245e-05, + "loss": 0.2009, + "step": 4449 + }, + { + "epoch": 0.23, + "grad_norm": 0.8561068852096136, + "learning_rate": 1.8046385280060832e-05, + "loss": 0.2097, + "step": 4450 + }, + { + "epoch": 0.23, + "grad_norm": 1.0878027107170953, + "learning_rate": 1.8045407258659763e-05, + "loss": 0.2059, + "step": 4451 + }, + { + "epoch": 0.23, + "grad_norm": 1.0482682779855905, + "learning_rate": 1.8044429019026565e-05, + "loss": 0.2468, + "step": 4452 + }, + { + "epoch": 0.23, + "grad_norm": 1.1067602177028681, + "learning_rate": 1.8043450561187775e-05, + "loss": 0.2366, + "step": 4453 + }, + { + "epoch": 0.23, + "grad_norm": 0.9203132473066146, + "learning_rate": 1.8042471885169934e-05, + "loss": 0.2343, + "step": 4454 + }, + { + "epoch": 0.23, + "grad_norm": 0.9013155181915874, + "learning_rate": 1.8041492990999595e-05, + "loss": 0.2291, + "step": 4455 + }, + { + "epoch": 0.23, + "grad_norm": 1.002047390111081, + "learning_rate": 1.8040513878703296e-05, + "loss": 0.2341, + "step": 4456 + }, + { + "epoch": 0.23, + "grad_norm": 0.7729887234708337, + "learning_rate": 1.803953454830761e-05, + "loss": 0.2187, + "step": 4457 + }, + { + "epoch": 0.23, + "grad_norm": 0.8295487988881244, + "learning_rate": 1.803855499983909e-05, + "loss": 0.2292, + "step": 4458 + }, + { + "epoch": 0.23, + "grad_norm": 0.9514396998810606, + "learning_rate": 1.803757523332432e-05, + "loss": 0.2199, + "step": 4459 + }, + { + "epoch": 0.23, + "grad_norm": 0.8581480115536537, + "learning_rate": 1.8036595248789862e-05, + "loss": 0.2185, + "step": 4460 + }, + { + "epoch": 0.23, + "grad_norm": 0.9062317373994906, + "learning_rate": 1.8035615046262305e-05, + "loss": 0.2406, + "step": 4461 + }, + { + "epoch": 0.23, + "grad_norm": 0.7518853352440985, + "learning_rate": 1.8034634625768234e-05, + "loss": 0.2094, + "step": 4462 + }, + { + "epoch": 0.23, + "grad_norm": 0.7640860466551985, + "learning_rate": 1.803365398733425e-05, + "loss": 0.2243, + "step": 4463 + }, + { + "epoch": 0.23, + "grad_norm": 1.0101277920294256, + "learning_rate": 1.803267313098694e-05, + "loss": 0.237, + "step": 4464 + }, + { + "epoch": 0.23, + "grad_norm": 1.1549794199372658, + "learning_rate": 1.8031692056752926e-05, + "loss": 0.207, + "step": 4465 + }, + { + "epoch": 0.23, + "grad_norm": 1.0995658668374693, + "learning_rate": 1.8030710764658804e-05, + "loss": 0.2352, + "step": 4466 + }, + { + "epoch": 0.23, + "grad_norm": 0.9735906734273259, + "learning_rate": 1.8029729254731204e-05, + "loss": 0.1905, + "step": 4467 + }, + { + "epoch": 0.23, + "grad_norm": 0.8437175055191298, + "learning_rate": 1.8028747526996745e-05, + "loss": 0.2186, + "step": 4468 + }, + { + "epoch": 0.23, + "grad_norm": 0.7296734364054673, + "learning_rate": 1.802776558148205e-05, + "loss": 0.2166, + "step": 4469 + }, + { + "epoch": 0.23, + "grad_norm": 1.047129361115201, + "learning_rate": 1.8026783418213768e-05, + "loss": 0.2186, + "step": 4470 + }, + { + "epoch": 0.23, + "grad_norm": 0.8087153659486408, + "learning_rate": 1.8025801037218528e-05, + "loss": 0.22, + "step": 4471 + }, + { + "epoch": 0.23, + "grad_norm": 0.8948010111876558, + "learning_rate": 1.8024818438522984e-05, + "loss": 0.2275, + "step": 4472 + }, + { + "epoch": 0.23, + "grad_norm": 2.4481573277116695, + "learning_rate": 1.802383562215378e-05, + "loss": 0.2121, + "step": 4473 + }, + { + "epoch": 0.23, + "grad_norm": 0.8894301960584611, + "learning_rate": 1.802285258813759e-05, + "loss": 0.2199, + "step": 4474 + }, + { + "epoch": 0.23, + "grad_norm": 0.8852898290343183, + "learning_rate": 1.8021869336501066e-05, + "loss": 0.2371, + "step": 4475 + }, + { + "epoch": 0.23, + "grad_norm": 0.861758973541769, + "learning_rate": 1.8020885867270883e-05, + "loss": 0.2244, + "step": 4476 + }, + { + "epoch": 0.23, + "grad_norm": 0.8802216161853176, + "learning_rate": 1.8019902180473717e-05, + "loss": 0.2146, + "step": 4477 + }, + { + "epoch": 0.23, + "grad_norm": 0.7756772290225344, + "learning_rate": 1.8018918276136255e-05, + "loss": 0.1897, + "step": 4478 + }, + { + "epoch": 0.23, + "grad_norm": 0.7696394101610075, + "learning_rate": 1.801793415428518e-05, + "loss": 0.2345, + "step": 4479 + }, + { + "epoch": 0.23, + "grad_norm": 0.9654896233324601, + "learning_rate": 1.8016949814947188e-05, + "loss": 0.2156, + "step": 4480 + }, + { + "epoch": 0.23, + "grad_norm": 0.9399179508195146, + "learning_rate": 1.801596525814898e-05, + "loss": 0.2044, + "step": 4481 + }, + { + "epoch": 0.23, + "grad_norm": 0.8600710403521159, + "learning_rate": 1.8014980483917263e-05, + "loss": 0.2086, + "step": 4482 + }, + { + "epoch": 0.23, + "grad_norm": 1.4090094998911113, + "learning_rate": 1.8013995492278744e-05, + "loss": 0.1984, + "step": 4483 + }, + { + "epoch": 0.23, + "grad_norm": 0.7783023424568493, + "learning_rate": 1.8013010283260146e-05, + "loss": 0.2126, + "step": 4484 + }, + { + "epoch": 0.23, + "grad_norm": 0.9956937185992838, + "learning_rate": 1.801202485688819e-05, + "loss": 0.1867, + "step": 4485 + }, + { + "epoch": 0.23, + "grad_norm": 0.9183789056572046, + "learning_rate": 1.801103921318961e-05, + "loss": 0.205, + "step": 4486 + }, + { + "epoch": 0.23, + "grad_norm": 0.9357084137452508, + "learning_rate": 1.801005335219114e-05, + "loss": 0.2161, + "step": 4487 + }, + { + "epoch": 0.23, + "grad_norm": 0.8767707480218008, + "learning_rate": 1.8009067273919518e-05, + "loss": 0.2093, + "step": 4488 + }, + { + "epoch": 0.23, + "grad_norm": 0.8962239859524315, + "learning_rate": 1.8008080978401493e-05, + "loss": 0.2267, + "step": 4489 + }, + { + "epoch": 0.23, + "grad_norm": 0.719171880216484, + "learning_rate": 1.8007094465663823e-05, + "loss": 0.2018, + "step": 4490 + }, + { + "epoch": 0.23, + "grad_norm": 1.2807368660385212, + "learning_rate": 1.8006107735733258e-05, + "loss": 0.2312, + "step": 4491 + }, + { + "epoch": 0.23, + "grad_norm": 1.0189680130523846, + "learning_rate": 1.8005120788636574e-05, + "loss": 0.2144, + "step": 4492 + }, + { + "epoch": 0.23, + "grad_norm": 0.8109524372881051, + "learning_rate": 1.8004133624400535e-05, + "loss": 0.1963, + "step": 4493 + }, + { + "epoch": 0.23, + "grad_norm": 0.7260968412969371, + "learning_rate": 1.8003146243051916e-05, + "loss": 0.2186, + "step": 4494 + }, + { + "epoch": 0.23, + "grad_norm": 0.8667090314175855, + "learning_rate": 1.8002158644617508e-05, + "loss": 0.1815, + "step": 4495 + }, + { + "epoch": 0.23, + "grad_norm": 0.8716288269195112, + "learning_rate": 1.800117082912409e-05, + "loss": 0.2088, + "step": 4496 + }, + { + "epoch": 0.23, + "grad_norm": 0.9035607795569606, + "learning_rate": 1.8000182796598466e-05, + "loss": 0.2078, + "step": 4497 + }, + { + "epoch": 0.23, + "grad_norm": 1.0125622963855248, + "learning_rate": 1.7999194547067426e-05, + "loss": 0.2469, + "step": 4498 + }, + { + "epoch": 0.23, + "grad_norm": 1.291516448269081, + "learning_rate": 1.7998206080557786e-05, + "loss": 0.2066, + "step": 4499 + }, + { + "epoch": 0.23, + "grad_norm": 1.3954274607458725, + "learning_rate": 1.7997217397096355e-05, + "loss": 0.182, + "step": 4500 + }, + { + "epoch": 0.23, + "grad_norm": 1.0939380315378482, + "learning_rate": 1.7996228496709948e-05, + "loss": 0.2223, + "step": 4501 + }, + { + "epoch": 0.23, + "grad_norm": 1.1328049643732674, + "learning_rate": 1.7995239379425393e-05, + "loss": 0.2029, + "step": 4502 + }, + { + "epoch": 0.23, + "grad_norm": 0.8570053201008193, + "learning_rate": 1.7994250045269518e-05, + "loss": 0.2048, + "step": 4503 + }, + { + "epoch": 0.23, + "grad_norm": 1.4699377915752037, + "learning_rate": 1.799326049426916e-05, + "loss": 0.2187, + "step": 4504 + }, + { + "epoch": 0.23, + "grad_norm": 0.8427132865080169, + "learning_rate": 1.7992270726451158e-05, + "loss": 0.2206, + "step": 4505 + }, + { + "epoch": 0.23, + "grad_norm": 1.0109750267372164, + "learning_rate": 1.7991280741842362e-05, + "loss": 0.2087, + "step": 4506 + }, + { + "epoch": 0.23, + "grad_norm": 0.8486429289414997, + "learning_rate": 1.799029054046962e-05, + "loss": 0.2015, + "step": 4507 + }, + { + "epoch": 0.23, + "grad_norm": 0.8440700530295214, + "learning_rate": 1.7989300122359802e-05, + "loss": 0.1891, + "step": 4508 + }, + { + "epoch": 0.23, + "grad_norm": 1.3246815330558233, + "learning_rate": 1.7988309487539767e-05, + "loss": 0.2286, + "step": 4509 + }, + { + "epoch": 0.23, + "grad_norm": 1.0044511865123302, + "learning_rate": 1.7987318636036382e-05, + "loss": 0.2147, + "step": 4510 + }, + { + "epoch": 0.23, + "grad_norm": 0.9616052245640965, + "learning_rate": 1.798632756787653e-05, + "loss": 0.2006, + "step": 4511 + }, + { + "epoch": 0.23, + "grad_norm": 1.007859581807647, + "learning_rate": 1.7985336283087092e-05, + "loss": 0.2149, + "step": 4512 + }, + { + "epoch": 0.23, + "grad_norm": 0.9269182161077623, + "learning_rate": 1.7984344781694957e-05, + "loss": 0.2327, + "step": 4513 + }, + { + "epoch": 0.23, + "grad_norm": 1.1873968613561616, + "learning_rate": 1.7983353063727014e-05, + "loss": 0.1958, + "step": 4514 + }, + { + "epoch": 0.23, + "grad_norm": 0.7675397374303244, + "learning_rate": 1.7982361129210172e-05, + "loss": 0.2071, + "step": 4515 + }, + { + "epoch": 0.23, + "grad_norm": 0.8842111254876002, + "learning_rate": 1.7981368978171336e-05, + "loss": 0.2177, + "step": 4516 + }, + { + "epoch": 0.23, + "grad_norm": 0.9063416103174554, + "learning_rate": 1.798037661063741e-05, + "loss": 0.2146, + "step": 4517 + }, + { + "epoch": 0.23, + "grad_norm": 0.8730461639506438, + "learning_rate": 1.7979384026635323e-05, + "loss": 0.2323, + "step": 4518 + }, + { + "epoch": 0.23, + "grad_norm": 0.9719249466422303, + "learning_rate": 1.7978391226191993e-05, + "loss": 0.2191, + "step": 4519 + }, + { + "epoch": 0.23, + "grad_norm": 1.098487152706154, + "learning_rate": 1.7977398209334347e-05, + "loss": 0.2052, + "step": 4520 + }, + { + "epoch": 0.23, + "grad_norm": 0.7818334103477671, + "learning_rate": 1.7976404976089327e-05, + "loss": 0.2216, + "step": 4521 + }, + { + "epoch": 0.23, + "grad_norm": 1.3452405241980792, + "learning_rate": 1.797541152648387e-05, + "loss": 0.2454, + "step": 4522 + }, + { + "epoch": 0.23, + "grad_norm": 0.9202422091727288, + "learning_rate": 1.7974417860544924e-05, + "loss": 0.2089, + "step": 4523 + }, + { + "epoch": 0.23, + "grad_norm": 1.0015645377017552, + "learning_rate": 1.797342397829944e-05, + "loss": 0.222, + "step": 4524 + }, + { + "epoch": 0.23, + "grad_norm": 0.944338652787783, + "learning_rate": 1.797242987977439e-05, + "loss": 0.2476, + "step": 4525 + }, + { + "epoch": 0.23, + "grad_norm": 0.887805161891117, + "learning_rate": 1.7971435564996723e-05, + "loss": 0.2149, + "step": 4526 + }, + { + "epoch": 0.23, + "grad_norm": 1.463548393298575, + "learning_rate": 1.7970441033993416e-05, + "loss": 0.2617, + "step": 4527 + }, + { + "epoch": 0.23, + "grad_norm": 1.064025199962864, + "learning_rate": 1.7969446286791448e-05, + "loss": 0.2037, + "step": 4528 + }, + { + "epoch": 0.23, + "grad_norm": 0.7289751041322153, + "learning_rate": 1.7968451323417798e-05, + "loss": 0.1879, + "step": 4529 + }, + { + "epoch": 0.23, + "grad_norm": 1.1059318897182686, + "learning_rate": 1.7967456143899453e-05, + "loss": 0.2267, + "step": 4530 + }, + { + "epoch": 0.23, + "grad_norm": 0.8711198948794134, + "learning_rate": 1.7966460748263412e-05, + "loss": 0.2093, + "step": 4531 + }, + { + "epoch": 0.23, + "grad_norm": 0.8817964338927127, + "learning_rate": 1.7965465136536677e-05, + "loss": 0.2207, + "step": 4532 + }, + { + "epoch": 0.23, + "grad_norm": 1.087682593067581, + "learning_rate": 1.7964469308746246e-05, + "loss": 0.2616, + "step": 4533 + }, + { + "epoch": 0.23, + "grad_norm": 0.9860911239425852, + "learning_rate": 1.7963473264919137e-05, + "loss": 0.2439, + "step": 4534 + }, + { + "epoch": 0.23, + "grad_norm": 0.8686097317747554, + "learning_rate": 1.7962477005082367e-05, + "loss": 0.2096, + "step": 4535 + }, + { + "epoch": 0.23, + "grad_norm": 0.851414230084223, + "learning_rate": 1.7961480529262956e-05, + "loss": 0.232, + "step": 4536 + }, + { + "epoch": 0.23, + "grad_norm": 0.9712965457220927, + "learning_rate": 1.7960483837487935e-05, + "loss": 0.2178, + "step": 4537 + }, + { + "epoch": 0.23, + "grad_norm": 0.7603316153751536, + "learning_rate": 1.7959486929784344e-05, + "loss": 0.2008, + "step": 4538 + }, + { + "epoch": 0.23, + "grad_norm": 1.305590804409271, + "learning_rate": 1.7958489806179214e-05, + "loss": 0.2322, + "step": 4539 + }, + { + "epoch": 0.23, + "grad_norm": 1.5399175125976559, + "learning_rate": 1.7957492466699606e-05, + "loss": 0.2103, + "step": 4540 + }, + { + "epoch": 0.23, + "grad_norm": 2.4627194244171826, + "learning_rate": 1.7956494911372564e-05, + "loss": 0.2199, + "step": 4541 + }, + { + "epoch": 0.23, + "grad_norm": 0.8683388348736799, + "learning_rate": 1.7955497140225145e-05, + "loss": 0.204, + "step": 4542 + }, + { + "epoch": 0.23, + "grad_norm": 0.9920027394506458, + "learning_rate": 1.7954499153284418e-05, + "loss": 0.214, + "step": 4543 + }, + { + "epoch": 0.23, + "grad_norm": 0.859917912663988, + "learning_rate": 1.7953500950577453e-05, + "loss": 0.1982, + "step": 4544 + }, + { + "epoch": 0.23, + "grad_norm": 1.0837324969574404, + "learning_rate": 1.7952502532131326e-05, + "loss": 0.2313, + "step": 4545 + }, + { + "epoch": 0.23, + "grad_norm": 0.8575927627504152, + "learning_rate": 1.795150389797312e-05, + "loss": 0.1942, + "step": 4546 + }, + { + "epoch": 0.23, + "grad_norm": 1.0625777986683949, + "learning_rate": 1.7950505048129917e-05, + "loss": 0.2335, + "step": 4547 + }, + { + "epoch": 0.23, + "grad_norm": 1.0713708397729715, + "learning_rate": 1.794950598262882e-05, + "loss": 0.2093, + "step": 4548 + }, + { + "epoch": 0.23, + "grad_norm": 1.229848965244706, + "learning_rate": 1.794850670149692e-05, + "loss": 0.2176, + "step": 4549 + }, + { + "epoch": 0.23, + "grad_norm": 1.1091741040774479, + "learning_rate": 1.794750720476133e-05, + "loss": 0.231, + "step": 4550 + }, + { + "epoch": 0.23, + "grad_norm": 1.2156486310229675, + "learning_rate": 1.7946507492449158e-05, + "loss": 0.2472, + "step": 4551 + }, + { + "epoch": 0.23, + "grad_norm": 1.1075155329929134, + "learning_rate": 1.794550756458752e-05, + "loss": 0.2283, + "step": 4552 + }, + { + "epoch": 0.23, + "grad_norm": 1.3998141024285211, + "learning_rate": 1.794450742120354e-05, + "loss": 0.2225, + "step": 4553 + }, + { + "epoch": 0.23, + "grad_norm": 1.011647270613291, + "learning_rate": 1.794350706232435e-05, + "loss": 0.2363, + "step": 4554 + }, + { + "epoch": 0.23, + "grad_norm": 0.9205082012421598, + "learning_rate": 1.794250648797708e-05, + "loss": 0.1958, + "step": 4555 + }, + { + "epoch": 0.23, + "grad_norm": 0.9069710312413244, + "learning_rate": 1.7941505698188873e-05, + "loss": 0.2098, + "step": 4556 + }, + { + "epoch": 0.23, + "grad_norm": 1.4467541990079829, + "learning_rate": 1.7940504692986876e-05, + "loss": 0.2196, + "step": 4557 + }, + { + "epoch": 0.23, + "grad_norm": 0.993067278432018, + "learning_rate": 1.793950347239824e-05, + "loss": 0.2426, + "step": 4558 + }, + { + "epoch": 0.23, + "grad_norm": 0.8076871561454932, + "learning_rate": 1.7938502036450128e-05, + "loss": 0.2241, + "step": 4559 + }, + { + "epoch": 0.23, + "grad_norm": 1.2539845527913558, + "learning_rate": 1.7937500385169695e-05, + "loss": 0.2151, + "step": 4560 + }, + { + "epoch": 0.23, + "grad_norm": 0.8088486414006637, + "learning_rate": 1.793649851858412e-05, + "loss": 0.2041, + "step": 4561 + }, + { + "epoch": 0.23, + "grad_norm": 1.186444873543506, + "learning_rate": 1.793549643672057e-05, + "loss": 0.2007, + "step": 4562 + }, + { + "epoch": 0.23, + "grad_norm": 1.041431065850028, + "learning_rate": 1.793449413960623e-05, + "loss": 0.2139, + "step": 4563 + }, + { + "epoch": 0.23, + "grad_norm": 1.5800775392251105, + "learning_rate": 1.7933491627268297e-05, + "loss": 0.2198, + "step": 4564 + }, + { + "epoch": 0.23, + "grad_norm": 0.9856604566007479, + "learning_rate": 1.793248889973395e-05, + "loss": 0.2261, + "step": 4565 + }, + { + "epoch": 0.23, + "grad_norm": 1.2768031361915468, + "learning_rate": 1.793148595703039e-05, + "loss": 0.211, + "step": 4566 + }, + { + "epoch": 0.23, + "grad_norm": 1.2841177512879824, + "learning_rate": 1.793048279918483e-05, + "loss": 0.1976, + "step": 4567 + }, + { + "epoch": 0.23, + "grad_norm": 0.9419078723736817, + "learning_rate": 1.7929479426224473e-05, + "loss": 0.2049, + "step": 4568 + }, + { + "epoch": 0.23, + "grad_norm": 1.17988154802931, + "learning_rate": 1.792847583817654e-05, + "loss": 0.204, + "step": 4569 + }, + { + "epoch": 0.23, + "grad_norm": 1.230600233347415, + "learning_rate": 1.7927472035068252e-05, + "loss": 0.2062, + "step": 4570 + }, + { + "epoch": 0.23, + "grad_norm": 0.9443095266324063, + "learning_rate": 1.792646801692684e-05, + "loss": 0.2091, + "step": 4571 + }, + { + "epoch": 0.23, + "grad_norm": 0.8913426583688561, + "learning_rate": 1.7925463783779534e-05, + "loss": 0.2403, + "step": 4572 + }, + { + "epoch": 0.23, + "grad_norm": 1.1466558580416424, + "learning_rate": 1.7924459335653575e-05, + "loss": 0.1967, + "step": 4573 + }, + { + "epoch": 0.23, + "grad_norm": 0.8535354994960708, + "learning_rate": 1.7923454672576208e-05, + "loss": 0.2254, + "step": 4574 + }, + { + "epoch": 0.23, + "grad_norm": 0.9666574182720151, + "learning_rate": 1.7922449794574686e-05, + "loss": 0.2471, + "step": 4575 + }, + { + "epoch": 0.23, + "grad_norm": 1.0837227290752445, + "learning_rate": 1.7921444701676267e-05, + "loss": 0.2148, + "step": 4576 + }, + { + "epoch": 0.23, + "grad_norm": 2.0686405368447454, + "learning_rate": 1.7920439393908212e-05, + "loss": 0.2152, + "step": 4577 + }, + { + "epoch": 0.23, + "grad_norm": 0.9850592832279286, + "learning_rate": 1.791943387129779e-05, + "loss": 0.2447, + "step": 4578 + }, + { + "epoch": 0.23, + "grad_norm": 1.155939191794011, + "learning_rate": 1.7918428133872278e-05, + "loss": 0.2238, + "step": 4579 + }, + { + "epoch": 0.23, + "grad_norm": 1.1574756641823711, + "learning_rate": 1.7917422181658954e-05, + "loss": 0.2134, + "step": 4580 + }, + { + "epoch": 0.23, + "grad_norm": 1.2070656126027186, + "learning_rate": 1.791641601468511e-05, + "loss": 0.1976, + "step": 4581 + }, + { + "epoch": 0.23, + "grad_norm": 2.436803255730702, + "learning_rate": 1.791540963297803e-05, + "loss": 0.2095, + "step": 4582 + }, + { + "epoch": 0.23, + "grad_norm": 0.9244624436617368, + "learning_rate": 1.791440303656502e-05, + "loss": 0.226, + "step": 4583 + }, + { + "epoch": 0.23, + "grad_norm": 1.9845022785819262, + "learning_rate": 1.791339622547338e-05, + "loss": 0.2091, + "step": 4584 + }, + { + "epoch": 0.23, + "grad_norm": 2.5238633874008007, + "learning_rate": 1.791238919973042e-05, + "loss": 0.2155, + "step": 4585 + }, + { + "epoch": 0.23, + "grad_norm": 1.213475206616372, + "learning_rate": 1.7911381959363456e-05, + "loss": 0.2218, + "step": 4586 + }, + { + "epoch": 0.23, + "grad_norm": 0.8457478367488127, + "learning_rate": 1.7910374504399812e-05, + "loss": 0.2093, + "step": 4587 + }, + { + "epoch": 0.23, + "grad_norm": 0.9261967112688656, + "learning_rate": 1.7909366834866805e-05, + "loss": 0.2165, + "step": 4588 + }, + { + "epoch": 0.23, + "grad_norm": 1.1208944218946013, + "learning_rate": 1.7908358950791784e-05, + "loss": 0.2366, + "step": 4589 + }, + { + "epoch": 0.23, + "grad_norm": 1.0587552863356118, + "learning_rate": 1.7907350852202078e-05, + "loss": 0.1997, + "step": 4590 + }, + { + "epoch": 0.23, + "grad_norm": 1.7929672703579542, + "learning_rate": 1.7906342539125032e-05, + "loss": 0.2123, + "step": 4591 + }, + { + "epoch": 0.23, + "grad_norm": 1.0807088767456519, + "learning_rate": 1.7905334011588e-05, + "loss": 0.214, + "step": 4592 + }, + { + "epoch": 0.23, + "grad_norm": 1.1805389793523693, + "learning_rate": 1.7904325269618335e-05, + "loss": 0.251, + "step": 4593 + }, + { + "epoch": 0.23, + "grad_norm": 1.530345554012429, + "learning_rate": 1.7903316313243398e-05, + "loss": 0.2266, + "step": 4594 + }, + { + "epoch": 0.23, + "grad_norm": 1.2375683235074402, + "learning_rate": 1.790230714249056e-05, + "loss": 0.2357, + "step": 4595 + }, + { + "epoch": 0.23, + "grad_norm": 1.2876949405743958, + "learning_rate": 1.7901297757387198e-05, + "loss": 0.2271, + "step": 4596 + }, + { + "epoch": 0.23, + "grad_norm": 1.0113586380944988, + "learning_rate": 1.7900288157960687e-05, + "loss": 0.2156, + "step": 4597 + }, + { + "epoch": 0.23, + "grad_norm": 0.9330662813880115, + "learning_rate": 1.7899278344238414e-05, + "loss": 0.2374, + "step": 4598 + }, + { + "epoch": 0.23, + "grad_norm": 1.393123277375178, + "learning_rate": 1.7898268316247767e-05, + "loss": 0.2137, + "step": 4599 + }, + { + "epoch": 0.23, + "grad_norm": 0.9077590609258267, + "learning_rate": 1.7897258074016152e-05, + "loss": 0.2313, + "step": 4600 + }, + { + "epoch": 0.23, + "grad_norm": 1.2412729292962352, + "learning_rate": 1.789624761757096e-05, + "loss": 0.2221, + "step": 4601 + }, + { + "epoch": 0.23, + "grad_norm": 1.0000254167505898, + "learning_rate": 1.7895236946939605e-05, + "loss": 0.2125, + "step": 4602 + }, + { + "epoch": 0.23, + "grad_norm": 1.1119944212749695, + "learning_rate": 1.7894226062149504e-05, + "loss": 0.1924, + "step": 4603 + }, + { + "epoch": 0.23, + "grad_norm": 0.9252582068282856, + "learning_rate": 1.7893214963228075e-05, + "loss": 0.2079, + "step": 4604 + }, + { + "epoch": 0.23, + "grad_norm": 1.6891110956771893, + "learning_rate": 1.7892203650202747e-05, + "loss": 0.2346, + "step": 4605 + }, + { + "epoch": 0.23, + "grad_norm": 1.1950376349533391, + "learning_rate": 1.7891192123100945e-05, + "loss": 0.2271, + "step": 4606 + }, + { + "epoch": 0.23, + "grad_norm": 1.437835315691099, + "learning_rate": 1.7890180381950113e-05, + "loss": 0.2208, + "step": 4607 + }, + { + "epoch": 0.23, + "grad_norm": 1.3801696979354348, + "learning_rate": 1.7889168426777693e-05, + "loss": 0.2373, + "step": 4608 + }, + { + "epoch": 0.23, + "grad_norm": 1.109176689844192, + "learning_rate": 1.7888156257611134e-05, + "loss": 0.1968, + "step": 4609 + }, + { + "epoch": 0.23, + "grad_norm": 1.687963492958192, + "learning_rate": 1.7887143874477887e-05, + "loss": 0.2459, + "step": 4610 + }, + { + "epoch": 0.23, + "grad_norm": 1.7422820151768772, + "learning_rate": 1.788613127740542e-05, + "loss": 0.2015, + "step": 4611 + }, + { + "epoch": 0.23, + "grad_norm": 1.7380741907169548, + "learning_rate": 1.7885118466421198e-05, + "loss": 0.215, + "step": 4612 + }, + { + "epoch": 0.23, + "grad_norm": 1.904065245765066, + "learning_rate": 1.788410544155269e-05, + "loss": 0.2017, + "step": 4613 + }, + { + "epoch": 0.23, + "grad_norm": 1.3133917809519462, + "learning_rate": 1.788309220282738e-05, + "loss": 0.1889, + "step": 4614 + }, + { + "epoch": 0.23, + "grad_norm": 1.5825665488429745, + "learning_rate": 1.788207875027274e-05, + "loss": 0.2168, + "step": 4615 + }, + { + "epoch": 0.23, + "grad_norm": 1.4843139736052093, + "learning_rate": 1.788106508391628e-05, + "loss": 0.2074, + "step": 4616 + }, + { + "epoch": 0.23, + "grad_norm": 1.7859163752239253, + "learning_rate": 1.788005120378548e-05, + "loss": 0.2277, + "step": 4617 + }, + { + "epoch": 0.23, + "grad_norm": 1.1062328359599314, + "learning_rate": 1.787903710990784e-05, + "loss": 0.2155, + "step": 4618 + }, + { + "epoch": 0.23, + "grad_norm": 1.509252746197654, + "learning_rate": 1.7878022802310882e-05, + "loss": 0.1734, + "step": 4619 + }, + { + "epoch": 0.23, + "grad_norm": 1.3313165535638438, + "learning_rate": 1.7877008281022107e-05, + "loss": 0.2105, + "step": 4620 + }, + { + "epoch": 0.23, + "grad_norm": 1.8325763249283111, + "learning_rate": 1.7875993546069036e-05, + "loss": 0.1965, + "step": 4621 + }, + { + "epoch": 0.24, + "grad_norm": 1.154417787978655, + "learning_rate": 1.7874978597479196e-05, + "loss": 0.2303, + "step": 4622 + }, + { + "epoch": 0.24, + "grad_norm": 1.319512505731216, + "learning_rate": 1.7873963435280122e-05, + "loss": 0.2245, + "step": 4623 + }, + { + "epoch": 0.24, + "grad_norm": 1.9450188451477706, + "learning_rate": 1.787294805949934e-05, + "loss": 0.2398, + "step": 4624 + }, + { + "epoch": 0.24, + "grad_norm": 1.4580963839574002, + "learning_rate": 1.7871932470164396e-05, + "loss": 0.2106, + "step": 4625 + }, + { + "epoch": 0.24, + "grad_norm": 1.0446207440063051, + "learning_rate": 1.787091666730284e-05, + "loss": 0.2261, + "step": 4626 + }, + { + "epoch": 0.24, + "grad_norm": 1.675123317131691, + "learning_rate": 1.7869900650942228e-05, + "loss": 0.1998, + "step": 4627 + }, + { + "epoch": 0.24, + "grad_norm": 1.4430054449111767, + "learning_rate": 1.7868884421110115e-05, + "loss": 0.2103, + "step": 4628 + }, + { + "epoch": 0.24, + "grad_norm": 0.9541529011227019, + "learning_rate": 1.7867867977834067e-05, + "loss": 0.2249, + "step": 4629 + }, + { + "epoch": 0.24, + "grad_norm": 1.3281638875589192, + "learning_rate": 1.7866851321141655e-05, + "loss": 0.2276, + "step": 4630 + }, + { + "epoch": 0.24, + "grad_norm": 1.151641575951757, + "learning_rate": 1.7865834451060458e-05, + "loss": 0.2107, + "step": 4631 + }, + { + "epoch": 0.24, + "grad_norm": 1.2337527355105584, + "learning_rate": 1.7864817367618058e-05, + "loss": 0.2081, + "step": 4632 + }, + { + "epoch": 0.24, + "grad_norm": 1.5264640015634976, + "learning_rate": 1.7863800070842038e-05, + "loss": 0.1938, + "step": 4633 + }, + { + "epoch": 0.24, + "grad_norm": 1.1466772922197281, + "learning_rate": 1.7862782560760004e-05, + "loss": 0.2127, + "step": 4634 + }, + { + "epoch": 0.24, + "grad_norm": 11.273989830855673, + "learning_rate": 1.7861764837399544e-05, + "loss": 0.1881, + "step": 4635 + }, + { + "epoch": 0.24, + "grad_norm": 1.3201828598781473, + "learning_rate": 1.786074690078827e-05, + "loss": 0.2431, + "step": 4636 + }, + { + "epoch": 0.24, + "grad_norm": 1.5748727581569109, + "learning_rate": 1.785972875095379e-05, + "loss": 0.1946, + "step": 4637 + }, + { + "epoch": 0.24, + "grad_norm": 1.450493532647831, + "learning_rate": 1.785871038792373e-05, + "loss": 0.1972, + "step": 4638 + }, + { + "epoch": 0.24, + "grad_norm": 1.2015481182386216, + "learning_rate": 1.7857691811725702e-05, + "loss": 0.2096, + "step": 4639 + }, + { + "epoch": 0.24, + "grad_norm": 2.25848049603337, + "learning_rate": 1.785667302238734e-05, + "loss": 0.2265, + "step": 4640 + }, + { + "epoch": 0.24, + "grad_norm": 4.349525557841808, + "learning_rate": 1.785565401993628e-05, + "loss": 0.2234, + "step": 4641 + }, + { + "epoch": 0.24, + "grad_norm": 1.8130057352094755, + "learning_rate": 1.785463480440016e-05, + "loss": 0.2088, + "step": 4642 + }, + { + "epoch": 0.24, + "grad_norm": 1.8040832321063347, + "learning_rate": 1.7853615375806627e-05, + "loss": 0.2371, + "step": 4643 + }, + { + "epoch": 0.24, + "grad_norm": 1.4228604220338537, + "learning_rate": 1.7852595734183333e-05, + "loss": 0.216, + "step": 4644 + }, + { + "epoch": 0.24, + "grad_norm": 2.310556142843705, + "learning_rate": 1.7851575879557937e-05, + "loss": 0.2051, + "step": 4645 + }, + { + "epoch": 0.24, + "grad_norm": 1.7051940112440398, + "learning_rate": 1.78505558119581e-05, + "loss": 0.1949, + "step": 4646 + }, + { + "epoch": 0.24, + "grad_norm": 1.366779457579173, + "learning_rate": 1.7849535531411498e-05, + "loss": 0.1994, + "step": 4647 + }, + { + "epoch": 0.24, + "grad_norm": 10.223823201496756, + "learning_rate": 1.7848515037945797e-05, + "loss": 0.2092, + "step": 4648 + }, + { + "epoch": 0.24, + "grad_norm": 1.9146218576980645, + "learning_rate": 1.784749433158868e-05, + "loss": 0.2279, + "step": 4649 + }, + { + "epoch": 0.24, + "grad_norm": 1.8574756201466331, + "learning_rate": 1.7846473412367845e-05, + "loss": 0.2066, + "step": 4650 + }, + { + "epoch": 0.24, + "grad_norm": 5.236915808421173, + "learning_rate": 1.7845452280310967e-05, + "loss": 0.217, + "step": 4651 + }, + { + "epoch": 0.24, + "grad_norm": 1.721888692196491, + "learning_rate": 1.784443093544576e-05, + "loss": 0.2415, + "step": 4652 + }, + { + "epoch": 0.24, + "grad_norm": 1.487684047623923, + "learning_rate": 1.7843409377799914e-05, + "loss": 0.2167, + "step": 4653 + }, + { + "epoch": 0.24, + "grad_norm": 1.7412624169564745, + "learning_rate": 1.7842387607401148e-05, + "loss": 0.2083, + "step": 4654 + }, + { + "epoch": 0.24, + "grad_norm": 1.40719210246589, + "learning_rate": 1.7841365624277176e-05, + "loss": 0.2482, + "step": 4655 + }, + { + "epoch": 0.24, + "grad_norm": 1.529733771727649, + "learning_rate": 1.7840343428455716e-05, + "loss": 0.2119, + "step": 4656 + }, + { + "epoch": 0.24, + "grad_norm": 1.7667059024593066, + "learning_rate": 1.78393210199645e-05, + "loss": 0.2074, + "step": 4657 + }, + { + "epoch": 0.24, + "grad_norm": 1.1288614053658006, + "learning_rate": 1.7838298398831263e-05, + "loss": 0.2383, + "step": 4658 + }, + { + "epoch": 0.24, + "grad_norm": 1.026408149145776, + "learning_rate": 1.783727556508373e-05, + "loss": 0.2058, + "step": 4659 + }, + { + "epoch": 0.24, + "grad_norm": 1.4706443339849393, + "learning_rate": 1.783625251874966e-05, + "loss": 0.2228, + "step": 4660 + }, + { + "epoch": 0.24, + "grad_norm": 1.0917198179045484, + "learning_rate": 1.78352292598568e-05, + "loss": 0.2419, + "step": 4661 + }, + { + "epoch": 0.24, + "grad_norm": 0.9710987685726439, + "learning_rate": 1.78342057884329e-05, + "loss": 0.2331, + "step": 4662 + }, + { + "epoch": 0.24, + "grad_norm": 1.2486942535582013, + "learning_rate": 1.7833182104505727e-05, + "loss": 0.2098, + "step": 4663 + }, + { + "epoch": 0.24, + "grad_norm": 1.4638630936861796, + "learning_rate": 1.7832158208103046e-05, + "loss": 0.2116, + "step": 4664 + }, + { + "epoch": 0.24, + "grad_norm": 1.2763911256155431, + "learning_rate": 1.7831134099252633e-05, + "loss": 0.228, + "step": 4665 + }, + { + "epoch": 0.24, + "grad_norm": 1.2850151021671732, + "learning_rate": 1.7830109777982264e-05, + "loss": 0.2158, + "step": 4666 + }, + { + "epoch": 0.24, + "grad_norm": 1.3120744860308333, + "learning_rate": 1.7829085244319722e-05, + "loss": 0.2157, + "step": 4667 + }, + { + "epoch": 0.24, + "grad_norm": 1.495724487270258, + "learning_rate": 1.7828060498292807e-05, + "loss": 0.2049, + "step": 4668 + }, + { + "epoch": 0.24, + "grad_norm": 1.1252993166897158, + "learning_rate": 1.7827035539929304e-05, + "loss": 0.221, + "step": 4669 + }, + { + "epoch": 0.24, + "grad_norm": 1.0361332579638716, + "learning_rate": 1.7826010369257023e-05, + "loss": 0.2397, + "step": 4670 + }, + { + "epoch": 0.24, + "grad_norm": 0.9062157745950375, + "learning_rate": 1.7824984986303767e-05, + "loss": 0.2221, + "step": 4671 + }, + { + "epoch": 0.24, + "grad_norm": 1.0599935506985416, + "learning_rate": 1.782395939109735e-05, + "loss": 0.1988, + "step": 4672 + }, + { + "epoch": 0.24, + "grad_norm": 1.0761451179044146, + "learning_rate": 1.7822933583665595e-05, + "loss": 0.1832, + "step": 4673 + }, + { + "epoch": 0.24, + "grad_norm": 1.2453986166897961, + "learning_rate": 1.782190756403632e-05, + "loss": 0.2137, + "step": 4674 + }, + { + "epoch": 0.24, + "grad_norm": 0.9699109057442683, + "learning_rate": 1.7820881332237366e-05, + "loss": 0.2233, + "step": 4675 + }, + { + "epoch": 0.24, + "grad_norm": 1.0480747153682368, + "learning_rate": 1.7819854888296563e-05, + "loss": 0.2394, + "step": 4676 + }, + { + "epoch": 0.24, + "grad_norm": 1.139973735215863, + "learning_rate": 1.7818828232241756e-05, + "loss": 0.2286, + "step": 4677 + }, + { + "epoch": 0.24, + "grad_norm": 1.6594254452523631, + "learning_rate": 1.781780136410079e-05, + "loss": 0.2264, + "step": 4678 + }, + { + "epoch": 0.24, + "grad_norm": 0.892424483882286, + "learning_rate": 1.7816774283901518e-05, + "loss": 0.2145, + "step": 4679 + }, + { + "epoch": 0.24, + "grad_norm": 0.9634119808029931, + "learning_rate": 1.7815746991671804e-05, + "loss": 0.2322, + "step": 4680 + }, + { + "epoch": 0.24, + "grad_norm": 1.0708682845486202, + "learning_rate": 1.781471948743951e-05, + "loss": 0.2455, + "step": 4681 + }, + { + "epoch": 0.24, + "grad_norm": 1.5709346655885017, + "learning_rate": 1.781369177123251e-05, + "loss": 0.2179, + "step": 4682 + }, + { + "epoch": 0.24, + "grad_norm": 1.0173880811846758, + "learning_rate": 1.7812663843078677e-05, + "loss": 0.1937, + "step": 4683 + }, + { + "epoch": 0.24, + "grad_norm": 0.8846051241989114, + "learning_rate": 1.78116357030059e-05, + "loss": 0.1912, + "step": 4684 + }, + { + "epoch": 0.24, + "grad_norm": 1.251193835858283, + "learning_rate": 1.7810607351042062e-05, + "loss": 0.2091, + "step": 4685 + }, + { + "epoch": 0.24, + "grad_norm": 1.1693427107436283, + "learning_rate": 1.780957878721506e-05, + "loss": 0.2114, + "step": 4686 + }, + { + "epoch": 0.24, + "grad_norm": 1.1338587726128928, + "learning_rate": 1.7808550011552788e-05, + "loss": 0.2243, + "step": 4687 + }, + { + "epoch": 0.24, + "grad_norm": 0.8121839280092648, + "learning_rate": 1.780752102408316e-05, + "loss": 0.2332, + "step": 4688 + }, + { + "epoch": 0.24, + "grad_norm": 0.9126963444453843, + "learning_rate": 1.780649182483408e-05, + "loss": 0.2059, + "step": 4689 + }, + { + "epoch": 0.24, + "grad_norm": 0.9287395868561739, + "learning_rate": 1.780546241383347e-05, + "loss": 0.2336, + "step": 4690 + }, + { + "epoch": 0.24, + "grad_norm": 0.861224039628638, + "learning_rate": 1.7804432791109253e-05, + "loss": 0.193, + "step": 4691 + }, + { + "epoch": 0.24, + "grad_norm": 1.2011478859769236, + "learning_rate": 1.7803402956689353e-05, + "loss": 0.2309, + "step": 4692 + }, + { + "epoch": 0.24, + "grad_norm": 0.8349673231029032, + "learning_rate": 1.7802372910601707e-05, + "loss": 0.22, + "step": 4693 + }, + { + "epoch": 0.24, + "grad_norm": 1.1764232090330085, + "learning_rate": 1.7801342652874256e-05, + "loss": 0.2082, + "step": 4694 + }, + { + "epoch": 0.24, + "grad_norm": 1.31740133504077, + "learning_rate": 1.7800312183534946e-05, + "loss": 0.1989, + "step": 4695 + }, + { + "epoch": 0.24, + "grad_norm": 0.8859932028097436, + "learning_rate": 1.7799281502611725e-05, + "loss": 0.2259, + "step": 4696 + }, + { + "epoch": 0.24, + "grad_norm": 1.068731884988904, + "learning_rate": 1.7798250610132555e-05, + "loss": 0.2286, + "step": 4697 + }, + { + "epoch": 0.24, + "grad_norm": 0.9480913590614505, + "learning_rate": 1.7797219506125393e-05, + "loss": 0.2121, + "step": 4698 + }, + { + "epoch": 0.24, + "grad_norm": 1.1259551001945691, + "learning_rate": 1.7796188190618217e-05, + "loss": 0.2447, + "step": 4699 + }, + { + "epoch": 0.24, + "grad_norm": 1.0424075409961195, + "learning_rate": 1.7795156663638993e-05, + "loss": 0.2487, + "step": 4700 + }, + { + "epoch": 0.24, + "grad_norm": 0.822807998801817, + "learning_rate": 1.7794124925215706e-05, + "loss": 0.2308, + "step": 4701 + }, + { + "epoch": 0.24, + "grad_norm": 0.9446368597507817, + "learning_rate": 1.7793092975376337e-05, + "loss": 0.2051, + "step": 4702 + }, + { + "epoch": 0.24, + "grad_norm": 0.8001210824626251, + "learning_rate": 1.779206081414888e-05, + "loss": 0.2054, + "step": 4703 + }, + { + "epoch": 0.24, + "grad_norm": 0.7007932947970836, + "learning_rate": 1.779102844156134e-05, + "loss": 0.2126, + "step": 4704 + }, + { + "epoch": 0.24, + "grad_norm": 1.018612629375446, + "learning_rate": 1.778999585764171e-05, + "loss": 0.1843, + "step": 4705 + }, + { + "epoch": 0.24, + "grad_norm": 2.571513674538329, + "learning_rate": 1.7788963062418e-05, + "loss": 0.2114, + "step": 4706 + }, + { + "epoch": 0.24, + "grad_norm": 0.7412928909282924, + "learning_rate": 1.778793005591823e-05, + "loss": 0.1917, + "step": 4707 + }, + { + "epoch": 0.24, + "grad_norm": 0.8497538991645085, + "learning_rate": 1.7786896838170414e-05, + "loss": 0.1995, + "step": 4708 + }, + { + "epoch": 0.24, + "grad_norm": 0.8581836762327298, + "learning_rate": 1.7785863409202587e-05, + "loss": 0.2296, + "step": 4709 + }, + { + "epoch": 0.24, + "grad_norm": 1.0205692084242963, + "learning_rate": 1.778482976904277e-05, + "loss": 0.2222, + "step": 4710 + }, + { + "epoch": 0.24, + "grad_norm": 0.7773003288749312, + "learning_rate": 1.7783795917719006e-05, + "loss": 0.2123, + "step": 4711 + }, + { + "epoch": 0.24, + "grad_norm": 0.8669931706685519, + "learning_rate": 1.7782761855259343e-05, + "loss": 0.196, + "step": 4712 + }, + { + "epoch": 0.24, + "grad_norm": 1.261557946455817, + "learning_rate": 1.778172758169182e-05, + "loss": 0.2428, + "step": 4713 + }, + { + "epoch": 0.24, + "grad_norm": 0.8542981027231593, + "learning_rate": 1.77806930970445e-05, + "loss": 0.1949, + "step": 4714 + }, + { + "epoch": 0.24, + "grad_norm": 1.1725172257027152, + "learning_rate": 1.7779658401345437e-05, + "loss": 0.2447, + "step": 4715 + }, + { + "epoch": 0.24, + "grad_norm": 0.9171805823393477, + "learning_rate": 1.7778623494622703e-05, + "loss": 0.2291, + "step": 4716 + }, + { + "epoch": 0.24, + "grad_norm": 0.8329476077948494, + "learning_rate": 1.7777588376904367e-05, + "loss": 0.1999, + "step": 4717 + }, + { + "epoch": 0.24, + "grad_norm": 0.8706417523196315, + "learning_rate": 1.777655304821851e-05, + "loss": 0.1986, + "step": 4718 + }, + { + "epoch": 0.24, + "grad_norm": 0.9122189195503517, + "learning_rate": 1.7775517508593208e-05, + "loss": 0.2286, + "step": 4719 + }, + { + "epoch": 0.24, + "grad_norm": 0.9221898541481, + "learning_rate": 1.7774481758056553e-05, + "loss": 0.2033, + "step": 4720 + }, + { + "epoch": 0.24, + "grad_norm": 0.913523118983051, + "learning_rate": 1.7773445796636647e-05, + "loss": 0.2321, + "step": 4721 + }, + { + "epoch": 0.24, + "grad_norm": 1.1853772461053191, + "learning_rate": 1.777240962436158e-05, + "loss": 0.2175, + "step": 4722 + }, + { + "epoch": 0.24, + "grad_norm": 0.9836678716893172, + "learning_rate": 1.7771373241259463e-05, + "loss": 0.2354, + "step": 4723 + }, + { + "epoch": 0.24, + "grad_norm": 4.933538952221682, + "learning_rate": 1.777033664735841e-05, + "loss": 0.2198, + "step": 4724 + }, + { + "epoch": 0.24, + "grad_norm": 0.9053145119436731, + "learning_rate": 1.7769299842686537e-05, + "loss": 0.1993, + "step": 4725 + }, + { + "epoch": 0.24, + "grad_norm": 1.4561874749144696, + "learning_rate": 1.7768262827271967e-05, + "loss": 0.21, + "step": 4726 + }, + { + "epoch": 0.24, + "grad_norm": 0.8923224164256436, + "learning_rate": 1.7767225601142827e-05, + "loss": 0.2417, + "step": 4727 + }, + { + "epoch": 0.24, + "grad_norm": 2.1688299476678536, + "learning_rate": 1.7766188164327255e-05, + "loss": 0.2227, + "step": 4728 + }, + { + "epoch": 0.24, + "grad_norm": 1.1615482229124374, + "learning_rate": 1.7765150516853393e-05, + "loss": 0.2098, + "step": 4729 + }, + { + "epoch": 0.24, + "grad_norm": 0.9914331330545041, + "learning_rate": 1.776411265874938e-05, + "loss": 0.207, + "step": 4730 + }, + { + "epoch": 0.24, + "grad_norm": 1.216108796145146, + "learning_rate": 1.7763074590043373e-05, + "loss": 0.2333, + "step": 4731 + }, + { + "epoch": 0.24, + "grad_norm": 1.0612278807406503, + "learning_rate": 1.7762036310763533e-05, + "loss": 0.2331, + "step": 4732 + }, + { + "epoch": 0.24, + "grad_norm": 0.9308158763736551, + "learning_rate": 1.7760997820938017e-05, + "loss": 0.2139, + "step": 4733 + }, + { + "epoch": 0.24, + "grad_norm": 1.02496181162953, + "learning_rate": 1.7759959120594995e-05, + "loss": 0.2208, + "step": 4734 + }, + { + "epoch": 0.24, + "grad_norm": 0.9621707948335559, + "learning_rate": 1.7758920209762646e-05, + "loss": 0.2095, + "step": 4735 + }, + { + "epoch": 0.24, + "grad_norm": 0.8073536972901817, + "learning_rate": 1.7757881088469152e-05, + "loss": 0.213, + "step": 4736 + }, + { + "epoch": 0.24, + "grad_norm": 3.602279863576911, + "learning_rate": 1.775684175674269e-05, + "loss": 0.1964, + "step": 4737 + }, + { + "epoch": 0.24, + "grad_norm": 0.7919440808875956, + "learning_rate": 1.7755802214611456e-05, + "loss": 0.2221, + "step": 4738 + }, + { + "epoch": 0.24, + "grad_norm": 1.2264881157763754, + "learning_rate": 1.7754762462103653e-05, + "loss": 0.2061, + "step": 4739 + }, + { + "epoch": 0.24, + "grad_norm": 1.0081452576317012, + "learning_rate": 1.775372249924748e-05, + "loss": 0.2341, + "step": 4740 + }, + { + "epoch": 0.24, + "grad_norm": 1.4966706046285734, + "learning_rate": 1.775268232607114e-05, + "loss": 0.1968, + "step": 4741 + }, + { + "epoch": 0.24, + "grad_norm": 0.9282699330047943, + "learning_rate": 1.775164194260286e-05, + "loss": 0.1917, + "step": 4742 + }, + { + "epoch": 0.24, + "grad_norm": 1.0831685554881305, + "learning_rate": 1.7750601348870857e-05, + "loss": 0.2245, + "step": 4743 + }, + { + "epoch": 0.24, + "grad_norm": 1.0721042421741023, + "learning_rate": 1.774956054490335e-05, + "loss": 0.226, + "step": 4744 + }, + { + "epoch": 0.24, + "grad_norm": 1.1269726140008207, + "learning_rate": 1.7748519530728578e-05, + "loss": 0.2066, + "step": 4745 + }, + { + "epoch": 0.24, + "grad_norm": 1.3184525037031192, + "learning_rate": 1.7747478306374774e-05, + "loss": 0.1888, + "step": 4746 + }, + { + "epoch": 0.24, + "grad_norm": 1.447342958685647, + "learning_rate": 1.7746436871870185e-05, + "loss": 0.2092, + "step": 4747 + }, + { + "epoch": 0.24, + "grad_norm": 1.0632538284053596, + "learning_rate": 1.7745395227243057e-05, + "loss": 0.2367, + "step": 4748 + }, + { + "epoch": 0.24, + "grad_norm": 0.8571598963074442, + "learning_rate": 1.7744353372521645e-05, + "loss": 0.1998, + "step": 4749 + }, + { + "epoch": 0.24, + "grad_norm": 1.4045763430768659, + "learning_rate": 1.7743311307734212e-05, + "loss": 0.2075, + "step": 4750 + }, + { + "epoch": 0.24, + "grad_norm": 1.140852461397593, + "learning_rate": 1.7742269032909022e-05, + "loss": 0.2327, + "step": 4751 + }, + { + "epoch": 0.24, + "grad_norm": 1.0450382616949419, + "learning_rate": 1.774122654807435e-05, + "loss": 0.2048, + "step": 4752 + }, + { + "epoch": 0.24, + "grad_norm": 0.9913868061762731, + "learning_rate": 1.7740183853258463e-05, + "loss": 0.2349, + "step": 4753 + }, + { + "epoch": 0.24, + "grad_norm": 0.9902220057154458, + "learning_rate": 1.773914094848966e-05, + "loss": 0.1981, + "step": 4754 + }, + { + "epoch": 0.24, + "grad_norm": 0.9141793025191702, + "learning_rate": 1.7738097833796218e-05, + "loss": 0.225, + "step": 4755 + }, + { + "epoch": 0.24, + "grad_norm": 1.122258777024154, + "learning_rate": 1.7737054509206437e-05, + "loss": 0.2029, + "step": 4756 + }, + { + "epoch": 0.24, + "grad_norm": 0.9388722008012684, + "learning_rate": 1.773601097474861e-05, + "loss": 0.1799, + "step": 4757 + }, + { + "epoch": 0.24, + "grad_norm": 1.1338319108176849, + "learning_rate": 1.7734967230451053e-05, + "loss": 0.198, + "step": 4758 + }, + { + "epoch": 0.24, + "grad_norm": 1.1696782476614866, + "learning_rate": 1.7733923276342072e-05, + "loss": 0.2082, + "step": 4759 + }, + { + "epoch": 0.24, + "grad_norm": 0.9570850227198349, + "learning_rate": 1.7732879112449987e-05, + "loss": 0.1937, + "step": 4760 + }, + { + "epoch": 0.24, + "grad_norm": 0.9987829452675776, + "learning_rate": 1.773183473880312e-05, + "loss": 0.2065, + "step": 4761 + }, + { + "epoch": 0.24, + "grad_norm": 0.8658344198195547, + "learning_rate": 1.7730790155429796e-05, + "loss": 0.2535, + "step": 4762 + }, + { + "epoch": 0.24, + "grad_norm": 1.5105970576810357, + "learning_rate": 1.7729745362358354e-05, + "loss": 0.2292, + "step": 4763 + }, + { + "epoch": 0.24, + "grad_norm": 1.030621919076012, + "learning_rate": 1.772870035961713e-05, + "loss": 0.2424, + "step": 4764 + }, + { + "epoch": 0.24, + "grad_norm": 0.7911133713120762, + "learning_rate": 1.772765514723448e-05, + "loss": 0.2031, + "step": 4765 + }, + { + "epoch": 0.24, + "grad_norm": 0.7733501456620687, + "learning_rate": 1.7726609725238736e-05, + "loss": 0.2015, + "step": 4766 + }, + { + "epoch": 0.24, + "grad_norm": 0.7483844203985907, + "learning_rate": 1.7725564093658273e-05, + "loss": 0.2144, + "step": 4767 + }, + { + "epoch": 0.24, + "grad_norm": 0.832420970371779, + "learning_rate": 1.772451825252145e-05, + "loss": 0.1933, + "step": 4768 + }, + { + "epoch": 0.24, + "grad_norm": 0.9269564472066407, + "learning_rate": 1.7723472201856632e-05, + "loss": 0.2117, + "step": 4769 + }, + { + "epoch": 0.24, + "grad_norm": 0.7062623972950685, + "learning_rate": 1.7722425941692193e-05, + "loss": 0.2011, + "step": 4770 + }, + { + "epoch": 0.24, + "grad_norm": 1.300632009889857, + "learning_rate": 1.7721379472056512e-05, + "loss": 0.235, + "step": 4771 + }, + { + "epoch": 0.24, + "grad_norm": 0.9395077257423332, + "learning_rate": 1.772033279297798e-05, + "loss": 0.214, + "step": 4772 + }, + { + "epoch": 0.24, + "grad_norm": 0.8855510946435825, + "learning_rate": 1.7719285904484984e-05, + "loss": 0.2153, + "step": 4773 + }, + { + "epoch": 0.24, + "grad_norm": 0.9062652247205932, + "learning_rate": 1.771823880660592e-05, + "loss": 0.2036, + "step": 4774 + }, + { + "epoch": 0.24, + "grad_norm": 1.2632706385556125, + "learning_rate": 1.7717191499369195e-05, + "loss": 0.2172, + "step": 4775 + }, + { + "epoch": 0.24, + "grad_norm": 1.0616280497428603, + "learning_rate": 1.7716143982803214e-05, + "loss": 0.214, + "step": 4776 + }, + { + "epoch": 0.24, + "grad_norm": 1.0610019253510121, + "learning_rate": 1.7715096256936387e-05, + "loss": 0.2148, + "step": 4777 + }, + { + "epoch": 0.24, + "grad_norm": 0.9265199693220165, + "learning_rate": 1.7714048321797146e-05, + "loss": 0.194, + "step": 4778 + }, + { + "epoch": 0.24, + "grad_norm": 1.4833546752386788, + "learning_rate": 1.7713000177413905e-05, + "loss": 0.2144, + "step": 4779 + }, + { + "epoch": 0.24, + "grad_norm": 2.119156400854007, + "learning_rate": 1.77119518238151e-05, + "loss": 0.2137, + "step": 4780 + }, + { + "epoch": 0.24, + "grad_norm": 0.8713331110192771, + "learning_rate": 1.7710903261029162e-05, + "loss": 0.1895, + "step": 4781 + }, + { + "epoch": 0.24, + "grad_norm": 1.4736802633037827, + "learning_rate": 1.770985448908454e-05, + "loss": 0.2002, + "step": 4782 + }, + { + "epoch": 0.24, + "grad_norm": 0.9670421103121505, + "learning_rate": 1.770880550800968e-05, + "loss": 0.2246, + "step": 4783 + }, + { + "epoch": 0.24, + "grad_norm": 0.9997782459952628, + "learning_rate": 1.7707756317833037e-05, + "loss": 0.2155, + "step": 4784 + }, + { + "epoch": 0.24, + "grad_norm": 1.0048297808434645, + "learning_rate": 1.7706706918583065e-05, + "loss": 0.22, + "step": 4785 + }, + { + "epoch": 0.24, + "grad_norm": 0.8625274276807509, + "learning_rate": 1.7705657310288234e-05, + "loss": 0.1931, + "step": 4786 + }, + { + "epoch": 0.24, + "grad_norm": 1.0333429133868393, + "learning_rate": 1.7704607492977016e-05, + "loss": 0.2056, + "step": 4787 + }, + { + "epoch": 0.24, + "grad_norm": 0.8165586544427031, + "learning_rate": 1.770355746667788e-05, + "loss": 0.2255, + "step": 4788 + }, + { + "epoch": 0.24, + "grad_norm": 0.9446874307612251, + "learning_rate": 1.7702507231419316e-05, + "loss": 0.2284, + "step": 4789 + }, + { + "epoch": 0.24, + "grad_norm": 1.157327733356626, + "learning_rate": 1.7701456787229805e-05, + "loss": 0.2121, + "step": 4790 + }, + { + "epoch": 0.24, + "grad_norm": 0.9067004989487868, + "learning_rate": 1.7700406134137846e-05, + "loss": 0.2084, + "step": 4791 + }, + { + "epoch": 0.24, + "grad_norm": 1.5375842708722038, + "learning_rate": 1.7699355272171936e-05, + "loss": 0.2111, + "step": 4792 + }, + { + "epoch": 0.24, + "grad_norm": 1.579611023345228, + "learning_rate": 1.769830420136058e-05, + "loss": 0.2197, + "step": 4793 + }, + { + "epoch": 0.24, + "grad_norm": 0.7815056142120229, + "learning_rate": 1.7697252921732288e-05, + "loss": 0.2399, + "step": 4794 + }, + { + "epoch": 0.24, + "grad_norm": 1.0464247436420646, + "learning_rate": 1.7696201433315572e-05, + "loss": 0.2047, + "step": 4795 + }, + { + "epoch": 0.24, + "grad_norm": 0.7803618338517158, + "learning_rate": 1.769514973613896e-05, + "loss": 0.2282, + "step": 4796 + }, + { + "epoch": 0.24, + "grad_norm": 3.0589890669182576, + "learning_rate": 1.7694097830230977e-05, + "loss": 0.217, + "step": 4797 + }, + { + "epoch": 0.24, + "grad_norm": 0.8290031373697013, + "learning_rate": 1.7693045715620154e-05, + "loss": 0.2312, + "step": 4798 + }, + { + "epoch": 0.24, + "grad_norm": 0.9998618652420123, + "learning_rate": 1.7691993392335033e-05, + "loss": 0.1801, + "step": 4799 + }, + { + "epoch": 0.24, + "grad_norm": 2.939470955111584, + "learning_rate": 1.7690940860404158e-05, + "loss": 0.2376, + "step": 4800 + }, + { + "epoch": 0.24, + "grad_norm": 0.7765224504735883, + "learning_rate": 1.7689888119856075e-05, + "loss": 0.1927, + "step": 4801 + }, + { + "epoch": 0.24, + "grad_norm": 0.803742550457734, + "learning_rate": 1.7688835170719346e-05, + "loss": 0.2185, + "step": 4802 + }, + { + "epoch": 0.24, + "grad_norm": 1.0415101084628973, + "learning_rate": 1.7687782013022526e-05, + "loss": 0.1941, + "step": 4803 + }, + { + "epoch": 0.24, + "grad_norm": 0.7424308880860995, + "learning_rate": 1.7686728646794184e-05, + "loss": 0.2005, + "step": 4804 + }, + { + "epoch": 0.24, + "grad_norm": 0.8623911571639182, + "learning_rate": 1.7685675072062894e-05, + "loss": 0.2083, + "step": 4805 + }, + { + "epoch": 0.24, + "grad_norm": 0.8419284534298985, + "learning_rate": 1.7684621288857233e-05, + "loss": 0.2208, + "step": 4806 + }, + { + "epoch": 0.24, + "grad_norm": 1.7300335906306568, + "learning_rate": 1.7683567297205786e-05, + "loss": 0.2105, + "step": 4807 + }, + { + "epoch": 0.24, + "grad_norm": 1.05541219527479, + "learning_rate": 1.7682513097137143e-05, + "loss": 0.2279, + "step": 4808 + }, + { + "epoch": 0.24, + "grad_norm": 1.0619360379113207, + "learning_rate": 1.76814586886799e-05, + "loss": 0.2034, + "step": 4809 + }, + { + "epoch": 0.24, + "grad_norm": 1.0246215374768057, + "learning_rate": 1.7680404071862653e-05, + "loss": 0.2148, + "step": 4810 + }, + { + "epoch": 0.24, + "grad_norm": 0.8031646650335315, + "learning_rate": 1.7679349246714012e-05, + "loss": 0.2003, + "step": 4811 + }, + { + "epoch": 0.24, + "grad_norm": 0.8335112525439298, + "learning_rate": 1.767829421326259e-05, + "loss": 0.2031, + "step": 4812 + }, + { + "epoch": 0.24, + "grad_norm": 1.5996760040622504, + "learning_rate": 1.7677238971537004e-05, + "loss": 0.224, + "step": 4813 + }, + { + "epoch": 0.24, + "grad_norm": 1.0144403532971613, + "learning_rate": 1.7676183521565876e-05, + "loss": 0.2219, + "step": 4814 + }, + { + "epoch": 0.24, + "grad_norm": 1.0481470431681141, + "learning_rate": 1.767512786337784e-05, + "loss": 0.2139, + "step": 4815 + }, + { + "epoch": 0.24, + "grad_norm": 1.3381315237653075, + "learning_rate": 1.7674071997001525e-05, + "loss": 0.2254, + "step": 4816 + }, + { + "epoch": 0.24, + "grad_norm": 0.7657901269899813, + "learning_rate": 1.767301592246557e-05, + "loss": 0.1899, + "step": 4817 + }, + { + "epoch": 0.25, + "grad_norm": 1.2415438690318472, + "learning_rate": 1.767195963979863e-05, + "loss": 0.2357, + "step": 4818 + }, + { + "epoch": 0.25, + "grad_norm": 6.371155851734972, + "learning_rate": 1.767090314902935e-05, + "loss": 0.2256, + "step": 4819 + }, + { + "epoch": 0.25, + "grad_norm": 1.027278506127918, + "learning_rate": 1.7669846450186384e-05, + "loss": 0.2021, + "step": 4820 + }, + { + "epoch": 0.25, + "grad_norm": 1.0454534167228677, + "learning_rate": 1.7668789543298407e-05, + "loss": 0.2124, + "step": 4821 + }, + { + "epoch": 0.25, + "grad_norm": 0.8698202989216001, + "learning_rate": 1.7667732428394077e-05, + "loss": 0.2048, + "step": 4822 + }, + { + "epoch": 0.25, + "grad_norm": 0.8752795249443815, + "learning_rate": 1.7666675105502073e-05, + "loss": 0.2053, + "step": 4823 + }, + { + "epoch": 0.25, + "grad_norm": 1.2287231186117786, + "learning_rate": 1.7665617574651074e-05, + "loss": 0.2013, + "step": 4824 + }, + { + "epoch": 0.25, + "grad_norm": 0.7811101027066473, + "learning_rate": 1.7664559835869763e-05, + "loss": 0.1969, + "step": 4825 + }, + { + "epoch": 0.25, + "grad_norm": 0.9831821506037443, + "learning_rate": 1.7663501889186837e-05, + "loss": 0.2071, + "step": 4826 + }, + { + "epoch": 0.25, + "grad_norm": 1.3321080093873667, + "learning_rate": 1.7662443734630987e-05, + "loss": 0.2057, + "step": 4827 + }, + { + "epoch": 0.25, + "grad_norm": 0.9328654862358803, + "learning_rate": 1.7661385372230918e-05, + "loss": 0.198, + "step": 4828 + }, + { + "epoch": 0.25, + "grad_norm": 0.9866929554002803, + "learning_rate": 1.766032680201534e-05, + "loss": 0.2252, + "step": 4829 + }, + { + "epoch": 0.25, + "grad_norm": 0.8040091299802148, + "learning_rate": 1.7659268024012962e-05, + "loss": 0.2133, + "step": 4830 + }, + { + "epoch": 0.25, + "grad_norm": 0.929652719573642, + "learning_rate": 1.7658209038252507e-05, + "loss": 0.1771, + "step": 4831 + }, + { + "epoch": 0.25, + "grad_norm": 0.9325894915758147, + "learning_rate": 1.76571498447627e-05, + "loss": 0.228, + "step": 4832 + }, + { + "epoch": 0.25, + "grad_norm": 1.0497336023331287, + "learning_rate": 1.765609044357227e-05, + "loss": 0.2139, + "step": 4833 + }, + { + "epoch": 0.25, + "grad_norm": 1.0788425842404294, + "learning_rate": 1.7655030834709954e-05, + "loss": 0.2446, + "step": 4834 + }, + { + "epoch": 0.25, + "grad_norm": 0.8148673513502684, + "learning_rate": 1.7653971018204498e-05, + "loss": 0.2052, + "step": 4835 + }, + { + "epoch": 0.25, + "grad_norm": 1.035118517547822, + "learning_rate": 1.7652910994084642e-05, + "loss": 0.1889, + "step": 4836 + }, + { + "epoch": 0.25, + "grad_norm": 1.8874921404778846, + "learning_rate": 1.7651850762379146e-05, + "loss": 0.2144, + "step": 4837 + }, + { + "epoch": 0.25, + "grad_norm": 0.971609414329634, + "learning_rate": 1.7650790323116764e-05, + "loss": 0.2261, + "step": 4838 + }, + { + "epoch": 0.25, + "grad_norm": 0.9666178939534561, + "learning_rate": 1.764972967632626e-05, + "loss": 0.2039, + "step": 4839 + }, + { + "epoch": 0.25, + "grad_norm": 0.8557814059940211, + "learning_rate": 1.764866882203641e-05, + "loss": 0.2122, + "step": 4840 + }, + { + "epoch": 0.25, + "grad_norm": 0.879809517867278, + "learning_rate": 1.7647607760275987e-05, + "loss": 0.2397, + "step": 4841 + }, + { + "epoch": 0.25, + "grad_norm": 1.1701570979540563, + "learning_rate": 1.764654649107377e-05, + "loss": 0.2348, + "step": 4842 + }, + { + "epoch": 0.25, + "grad_norm": 0.9819385381229103, + "learning_rate": 1.7645485014458545e-05, + "loss": 0.2138, + "step": 4843 + }, + { + "epoch": 0.25, + "grad_norm": 1.0372094524749425, + "learning_rate": 1.764442333045911e-05, + "loss": 0.2172, + "step": 4844 + }, + { + "epoch": 0.25, + "grad_norm": 1.1991618227826033, + "learning_rate": 1.764336143910426e-05, + "loss": 0.245, + "step": 4845 + }, + { + "epoch": 0.25, + "grad_norm": 1.2515952300942674, + "learning_rate": 1.76422993404228e-05, + "loss": 0.2346, + "step": 4846 + }, + { + "epoch": 0.25, + "grad_norm": 1.162773600935501, + "learning_rate": 1.7641237034443535e-05, + "loss": 0.2349, + "step": 4847 + }, + { + "epoch": 0.25, + "grad_norm": 1.198901515461344, + "learning_rate": 1.764017452119529e-05, + "loss": 0.2354, + "step": 4848 + }, + { + "epoch": 0.25, + "grad_norm": 0.9150902917184182, + "learning_rate": 1.7639111800706874e-05, + "loss": 0.2176, + "step": 4849 + }, + { + "epoch": 0.25, + "grad_norm": 0.9261690607074005, + "learning_rate": 1.7638048873007122e-05, + "loss": 0.2096, + "step": 4850 + }, + { + "epoch": 0.25, + "grad_norm": 0.9936123554925016, + "learning_rate": 1.7636985738124862e-05, + "loss": 0.2199, + "step": 4851 + }, + { + "epoch": 0.25, + "grad_norm": 0.8808245515832294, + "learning_rate": 1.7635922396088932e-05, + "loss": 0.1955, + "step": 4852 + }, + { + "epoch": 0.25, + "grad_norm": 1.109091782939819, + "learning_rate": 1.7634858846928174e-05, + "loss": 0.2231, + "step": 4853 + }, + { + "epoch": 0.25, + "grad_norm": 0.8979865795412102, + "learning_rate": 1.7633795090671445e-05, + "loss": 0.2311, + "step": 4854 + }, + { + "epoch": 0.25, + "grad_norm": 2.1646120782227802, + "learning_rate": 1.7632731127347588e-05, + "loss": 0.2348, + "step": 4855 + }, + { + "epoch": 0.25, + "grad_norm": 0.8728535210494023, + "learning_rate": 1.763166695698547e-05, + "loss": 0.2663, + "step": 4856 + }, + { + "epoch": 0.25, + "grad_norm": 0.9821654410268483, + "learning_rate": 1.7630602579613952e-05, + "loss": 0.2136, + "step": 4857 + }, + { + "epoch": 0.25, + "grad_norm": 2.0888253463187754, + "learning_rate": 1.7629537995261913e-05, + "loss": 0.233, + "step": 4858 + }, + { + "epoch": 0.25, + "grad_norm": 1.010011282244793, + "learning_rate": 1.7628473203958217e-05, + "loss": 0.1997, + "step": 4859 + }, + { + "epoch": 0.25, + "grad_norm": 1.0148151126522487, + "learning_rate": 1.7627408205731762e-05, + "loss": 0.1972, + "step": 4860 + }, + { + "epoch": 0.25, + "grad_norm": 1.0225367877080298, + "learning_rate": 1.7626343000611424e-05, + "loss": 0.2238, + "step": 4861 + }, + { + "epoch": 0.25, + "grad_norm": 1.0413589410630688, + "learning_rate": 1.7625277588626105e-05, + "loss": 0.1706, + "step": 4862 + }, + { + "epoch": 0.25, + "grad_norm": 2.9075675650831743, + "learning_rate": 1.76242119698047e-05, + "loss": 0.2213, + "step": 4863 + }, + { + "epoch": 0.25, + "grad_norm": 0.9576335241403465, + "learning_rate": 1.7623146144176114e-05, + "loss": 0.1945, + "step": 4864 + }, + { + "epoch": 0.25, + "grad_norm": 0.9841155313451155, + "learning_rate": 1.7622080111769257e-05, + "loss": 0.2088, + "step": 4865 + }, + { + "epoch": 0.25, + "grad_norm": 0.8088195105556512, + "learning_rate": 1.762101387261305e-05, + "loss": 0.2216, + "step": 4866 + }, + { + "epoch": 0.25, + "grad_norm": 0.7040924319850286, + "learning_rate": 1.7619947426736404e-05, + "loss": 0.2108, + "step": 4867 + }, + { + "epoch": 0.25, + "grad_norm": 0.8647540788705015, + "learning_rate": 1.761888077416826e-05, + "loss": 0.2055, + "step": 4868 + }, + { + "epoch": 0.25, + "grad_norm": 1.4917793213736366, + "learning_rate": 1.7617813914937544e-05, + "loss": 0.1867, + "step": 4869 + }, + { + "epoch": 0.25, + "grad_norm": 0.7589814055129415, + "learning_rate": 1.7616746849073195e-05, + "loss": 0.2267, + "step": 4870 + }, + { + "epoch": 0.25, + "grad_norm": 1.064675978697834, + "learning_rate": 1.7615679576604157e-05, + "loss": 0.2145, + "step": 4871 + }, + { + "epoch": 0.25, + "grad_norm": 1.8149268942899142, + "learning_rate": 1.761461209755938e-05, + "loss": 0.2243, + "step": 4872 + }, + { + "epoch": 0.25, + "grad_norm": 0.9404075709892658, + "learning_rate": 1.761354441196782e-05, + "loss": 0.2033, + "step": 4873 + }, + { + "epoch": 0.25, + "grad_norm": 2.066560634970023, + "learning_rate": 1.7612476519858437e-05, + "loss": 0.1929, + "step": 4874 + }, + { + "epoch": 0.25, + "grad_norm": 0.8873007469232738, + "learning_rate": 1.76114084212602e-05, + "loss": 0.2135, + "step": 4875 + }, + { + "epoch": 0.25, + "grad_norm": 1.0803485202137888, + "learning_rate": 1.761034011620208e-05, + "loss": 0.2024, + "step": 4876 + }, + { + "epoch": 0.25, + "grad_norm": 0.7213845346451273, + "learning_rate": 1.7609271604713055e-05, + "loss": 0.1998, + "step": 4877 + }, + { + "epoch": 0.25, + "grad_norm": 1.121037230608576, + "learning_rate": 1.7608202886822107e-05, + "loss": 0.2315, + "step": 4878 + }, + { + "epoch": 0.25, + "grad_norm": 0.9896700881943471, + "learning_rate": 1.7607133962558226e-05, + "loss": 0.2267, + "step": 4879 + }, + { + "epoch": 0.25, + "grad_norm": 1.266664377396315, + "learning_rate": 1.7606064831950403e-05, + "loss": 0.2078, + "step": 4880 + }, + { + "epoch": 0.25, + "grad_norm": 0.8553061465111234, + "learning_rate": 1.7604995495027645e-05, + "loss": 0.2116, + "step": 4881 + }, + { + "epoch": 0.25, + "grad_norm": 1.042777046982744, + "learning_rate": 1.7603925951818954e-05, + "loss": 0.2112, + "step": 4882 + }, + { + "epoch": 0.25, + "grad_norm": 0.8740583129550602, + "learning_rate": 1.7602856202353346e-05, + "loss": 0.2023, + "step": 4883 + }, + { + "epoch": 0.25, + "grad_norm": 0.9759766435581416, + "learning_rate": 1.760178624665983e-05, + "loss": 0.2227, + "step": 4884 + }, + { + "epoch": 0.25, + "grad_norm": 0.8085400387638324, + "learning_rate": 1.760071608476743e-05, + "loss": 0.2034, + "step": 4885 + }, + { + "epoch": 0.25, + "grad_norm": 0.833384795728059, + "learning_rate": 1.759964571670518e-05, + "loss": 0.2134, + "step": 4886 + }, + { + "epoch": 0.25, + "grad_norm": 0.9648949804215201, + "learning_rate": 1.7598575142502112e-05, + "loss": 0.2157, + "step": 4887 + }, + { + "epoch": 0.25, + "grad_norm": 4.111719389453267, + "learning_rate": 1.7597504362187263e-05, + "loss": 0.1994, + "step": 4888 + }, + { + "epoch": 0.25, + "grad_norm": 0.8866865010030395, + "learning_rate": 1.759643337578968e-05, + "loss": 0.2051, + "step": 4889 + }, + { + "epoch": 0.25, + "grad_norm": 0.8504575300897302, + "learning_rate": 1.759536218333841e-05, + "loss": 0.232, + "step": 4890 + }, + { + "epoch": 0.25, + "grad_norm": 1.0365543055672188, + "learning_rate": 1.7594290784862516e-05, + "loss": 0.2141, + "step": 4891 + }, + { + "epoch": 0.25, + "grad_norm": 1.0281837501246553, + "learning_rate": 1.7593219180391053e-05, + "loss": 0.2039, + "step": 4892 + }, + { + "epoch": 0.25, + "grad_norm": 0.9744470951075427, + "learning_rate": 1.759214736995309e-05, + "loss": 0.2091, + "step": 4893 + }, + { + "epoch": 0.25, + "grad_norm": 0.9084367183414519, + "learning_rate": 1.7591075353577702e-05, + "loss": 0.226, + "step": 4894 + }, + { + "epoch": 0.25, + "grad_norm": 0.8317928554365319, + "learning_rate": 1.7590003131293967e-05, + "loss": 0.2417, + "step": 4895 + }, + { + "epoch": 0.25, + "grad_norm": 0.7541092473554227, + "learning_rate": 1.758893070313097e-05, + "loss": 0.1997, + "step": 4896 + }, + { + "epoch": 0.25, + "grad_norm": 0.7883788654157217, + "learning_rate": 1.7587858069117794e-05, + "loss": 0.17, + "step": 4897 + }, + { + "epoch": 0.25, + "grad_norm": 1.2109666508636387, + "learning_rate": 1.7586785229283543e-05, + "loss": 0.2183, + "step": 4898 + }, + { + "epoch": 0.25, + "grad_norm": 0.721893397519787, + "learning_rate": 1.7585712183657312e-05, + "loss": 0.2016, + "step": 4899 + }, + { + "epoch": 0.25, + "grad_norm": 1.7380981007588767, + "learning_rate": 1.758463893226821e-05, + "loss": 0.2279, + "step": 4900 + }, + { + "epoch": 0.25, + "grad_norm": 1.1129843085791946, + "learning_rate": 1.758356547514535e-05, + "loss": 0.2218, + "step": 4901 + }, + { + "epoch": 0.25, + "grad_norm": 0.9678562315284823, + "learning_rate": 1.7582491812317846e-05, + "loss": 0.1974, + "step": 4902 + }, + { + "epoch": 0.25, + "grad_norm": 1.131902572948813, + "learning_rate": 1.7581417943814827e-05, + "loss": 0.2106, + "step": 4903 + }, + { + "epoch": 0.25, + "grad_norm": 0.9348090511857947, + "learning_rate": 1.7580343869665416e-05, + "loss": 0.2027, + "step": 4904 + }, + { + "epoch": 0.25, + "grad_norm": 1.457110700204861, + "learning_rate": 1.757926958989875e-05, + "loss": 0.211, + "step": 4905 + }, + { + "epoch": 0.25, + "grad_norm": 1.2125401140409062, + "learning_rate": 1.7578195104543964e-05, + "loss": 0.186, + "step": 4906 + }, + { + "epoch": 0.25, + "grad_norm": 1.1174812078893102, + "learning_rate": 1.7577120413630213e-05, + "loss": 0.2141, + "step": 4907 + }, + { + "epoch": 0.25, + "grad_norm": 1.125396472928048, + "learning_rate": 1.757604551718664e-05, + "loss": 0.2079, + "step": 4908 + }, + { + "epoch": 0.25, + "grad_norm": 0.887645588146478, + "learning_rate": 1.7574970415242407e-05, + "loss": 0.2031, + "step": 4909 + }, + { + "epoch": 0.25, + "grad_norm": 1.2491922728175417, + "learning_rate": 1.757389510782667e-05, + "loss": 0.2104, + "step": 4910 + }, + { + "epoch": 0.25, + "grad_norm": 0.9282309508640144, + "learning_rate": 1.75728195949686e-05, + "loss": 0.1882, + "step": 4911 + }, + { + "epoch": 0.25, + "grad_norm": 1.3550172160894849, + "learning_rate": 1.7571743876697377e-05, + "loss": 0.2003, + "step": 4912 + }, + { + "epoch": 0.25, + "grad_norm": 1.0095560289307193, + "learning_rate": 1.7570667953042167e-05, + "loss": 0.1877, + "step": 4913 + }, + { + "epoch": 0.25, + "grad_norm": 1.1898247035889686, + "learning_rate": 1.7569591824032168e-05, + "loss": 0.2217, + "step": 4914 + }, + { + "epoch": 0.25, + "grad_norm": 1.5094226280385097, + "learning_rate": 1.7568515489696558e-05, + "loss": 0.2195, + "step": 4915 + }, + { + "epoch": 0.25, + "grad_norm": 2.0270852292948054, + "learning_rate": 1.7567438950064542e-05, + "loss": 0.1776, + "step": 4916 + }, + { + "epoch": 0.25, + "grad_norm": 0.8945831319463787, + "learning_rate": 1.7566362205165313e-05, + "loss": 0.2195, + "step": 4917 + }, + { + "epoch": 0.25, + "grad_norm": 1.0540510808229446, + "learning_rate": 1.7565285255028083e-05, + "loss": 0.2182, + "step": 4918 + }, + { + "epoch": 0.25, + "grad_norm": 0.9530058829379024, + "learning_rate": 1.756420809968206e-05, + "loss": 0.2165, + "step": 4919 + }, + { + "epoch": 0.25, + "grad_norm": 0.9591411145842712, + "learning_rate": 1.756313073915647e-05, + "loss": 0.2227, + "step": 4920 + }, + { + "epoch": 0.25, + "grad_norm": 0.8538213378369253, + "learning_rate": 1.756205317348053e-05, + "loss": 0.1935, + "step": 4921 + }, + { + "epoch": 0.25, + "grad_norm": 1.000411790089436, + "learning_rate": 1.756097540268347e-05, + "loss": 0.2327, + "step": 4922 + }, + { + "epoch": 0.25, + "grad_norm": 1.0420033351873208, + "learning_rate": 1.7559897426794528e-05, + "loss": 0.2128, + "step": 4923 + }, + { + "epoch": 0.25, + "grad_norm": 0.9623079713245224, + "learning_rate": 1.7558819245842938e-05, + "loss": 0.1922, + "step": 4924 + }, + { + "epoch": 0.25, + "grad_norm": 1.1098096767015824, + "learning_rate": 1.7557740859857953e-05, + "loss": 0.2266, + "step": 4925 + }, + { + "epoch": 0.25, + "grad_norm": 1.3491601834510454, + "learning_rate": 1.7556662268868817e-05, + "loss": 0.2058, + "step": 4926 + }, + { + "epoch": 0.25, + "grad_norm": 0.9653203134414965, + "learning_rate": 1.7555583472904788e-05, + "loss": 0.2003, + "step": 4927 + }, + { + "epoch": 0.25, + "grad_norm": 0.864274668092387, + "learning_rate": 1.7554504471995134e-05, + "loss": 0.2023, + "step": 4928 + }, + { + "epoch": 0.25, + "grad_norm": 1.0880717825194903, + "learning_rate": 1.7553425266169118e-05, + "loss": 0.2149, + "step": 4929 + }, + { + "epoch": 0.25, + "grad_norm": 1.0489082281570887, + "learning_rate": 1.7552345855456017e-05, + "loss": 0.2129, + "step": 4930 + }, + { + "epoch": 0.25, + "grad_norm": 1.052512902454979, + "learning_rate": 1.7551266239885104e-05, + "loss": 0.2266, + "step": 4931 + }, + { + "epoch": 0.25, + "grad_norm": 1.6579917251985647, + "learning_rate": 1.755018641948567e-05, + "loss": 0.1981, + "step": 4932 + }, + { + "epoch": 0.25, + "grad_norm": 1.7821592754300142, + "learning_rate": 1.7549106394287004e-05, + "loss": 0.2269, + "step": 4933 + }, + { + "epoch": 0.25, + "grad_norm": 0.8870274340321501, + "learning_rate": 1.75480261643184e-05, + "loss": 0.2051, + "step": 4934 + }, + { + "epoch": 0.25, + "grad_norm": 0.8643188466997564, + "learning_rate": 1.7546945729609162e-05, + "loss": 0.222, + "step": 4935 + }, + { + "epoch": 0.25, + "grad_norm": 0.8508180991596589, + "learning_rate": 1.7545865090188594e-05, + "loss": 0.2269, + "step": 4936 + }, + { + "epoch": 0.25, + "grad_norm": 1.0361811340997602, + "learning_rate": 1.7544784246086007e-05, + "loss": 0.1965, + "step": 4937 + }, + { + "epoch": 0.25, + "grad_norm": 0.7944009776152027, + "learning_rate": 1.7543703197330722e-05, + "loss": 0.2294, + "step": 4938 + }, + { + "epoch": 0.25, + "grad_norm": 1.2904691970827866, + "learning_rate": 1.754262194395206e-05, + "loss": 0.1905, + "step": 4939 + }, + { + "epoch": 0.25, + "grad_norm": 1.2425741683662073, + "learning_rate": 1.7541540485979357e-05, + "loss": 0.2298, + "step": 4940 + }, + { + "epoch": 0.25, + "grad_norm": 1.0170380196630249, + "learning_rate": 1.754045882344194e-05, + "loss": 0.2172, + "step": 4941 + }, + { + "epoch": 0.25, + "grad_norm": 0.8377036824445884, + "learning_rate": 1.753937695636915e-05, + "loss": 0.2257, + "step": 4942 + }, + { + "epoch": 0.25, + "grad_norm": 1.4429064752446406, + "learning_rate": 1.7538294884790333e-05, + "loss": 0.2265, + "step": 4943 + }, + { + "epoch": 0.25, + "grad_norm": 1.002250270190205, + "learning_rate": 1.7537212608734842e-05, + "loss": 0.2341, + "step": 4944 + }, + { + "epoch": 0.25, + "grad_norm": 1.0738514608014265, + "learning_rate": 1.7536130128232035e-05, + "loss": 0.2029, + "step": 4945 + }, + { + "epoch": 0.25, + "grad_norm": 0.8699095271611501, + "learning_rate": 1.7535047443311274e-05, + "loss": 0.1972, + "step": 4946 + }, + { + "epoch": 0.25, + "grad_norm": 1.115828821103785, + "learning_rate": 1.7533964554001923e-05, + "loss": 0.2171, + "step": 4947 + }, + { + "epoch": 0.25, + "grad_norm": 0.8999017703628961, + "learning_rate": 1.753288146033336e-05, + "loss": 0.2086, + "step": 4948 + }, + { + "epoch": 0.25, + "grad_norm": 0.9623098645662769, + "learning_rate": 1.753179816233496e-05, + "loss": 0.1941, + "step": 4949 + }, + { + "epoch": 0.25, + "grad_norm": 1.148386681921377, + "learning_rate": 1.7530714660036112e-05, + "loss": 0.2169, + "step": 4950 + }, + { + "epoch": 0.25, + "grad_norm": 0.9251444910619677, + "learning_rate": 1.7529630953466202e-05, + "loss": 0.1907, + "step": 4951 + }, + { + "epoch": 0.25, + "grad_norm": 0.7047593827282227, + "learning_rate": 1.7528547042654626e-05, + "loss": 0.1885, + "step": 4952 + }, + { + "epoch": 0.25, + "grad_norm": 0.8934978764290634, + "learning_rate": 1.7527462927630786e-05, + "loss": 0.211, + "step": 4953 + }, + { + "epoch": 0.25, + "grad_norm": 1.0067227280512174, + "learning_rate": 1.752637860842409e-05, + "loss": 0.1983, + "step": 4954 + }, + { + "epoch": 0.25, + "grad_norm": 1.594869450914085, + "learning_rate": 1.752529408506395e-05, + "loss": 0.2195, + "step": 4955 + }, + { + "epoch": 0.25, + "grad_norm": 0.9634578527813551, + "learning_rate": 1.7524209357579782e-05, + "loss": 0.2178, + "step": 4956 + }, + { + "epoch": 0.25, + "grad_norm": 1.2163076294046304, + "learning_rate": 1.752312442600101e-05, + "loss": 0.2299, + "step": 4957 + }, + { + "epoch": 0.25, + "grad_norm": 0.827651648508648, + "learning_rate": 1.7522039290357066e-05, + "loss": 0.2053, + "step": 4958 + }, + { + "epoch": 0.25, + "grad_norm": 0.8964013881252514, + "learning_rate": 1.7520953950677374e-05, + "loss": 0.2302, + "step": 4959 + }, + { + "epoch": 0.25, + "grad_norm": 0.9926659228293512, + "learning_rate": 1.751986840699139e-05, + "loss": 0.1978, + "step": 4960 + }, + { + "epoch": 0.25, + "grad_norm": 1.4285563481945713, + "learning_rate": 1.7518782659328545e-05, + "loss": 0.2139, + "step": 4961 + }, + { + "epoch": 0.25, + "grad_norm": 0.8479583255530937, + "learning_rate": 1.7517696707718297e-05, + "loss": 0.204, + "step": 4962 + }, + { + "epoch": 0.25, + "grad_norm": 0.7944980238304211, + "learning_rate": 1.7516610552190104e-05, + "loss": 0.2052, + "step": 4963 + }, + { + "epoch": 0.25, + "grad_norm": 1.1398659985067, + "learning_rate": 1.751552419277342e-05, + "loss": 0.2133, + "step": 4964 + }, + { + "epoch": 0.25, + "grad_norm": 0.9269866998395468, + "learning_rate": 1.751443762949772e-05, + "loss": 0.2054, + "step": 4965 + }, + { + "epoch": 0.25, + "grad_norm": 1.006432875016119, + "learning_rate": 1.7513350862392478e-05, + "loss": 0.2559, + "step": 4966 + }, + { + "epoch": 0.25, + "grad_norm": 0.8283899901874466, + "learning_rate": 1.7512263891487165e-05, + "loss": 0.2087, + "step": 4967 + }, + { + "epoch": 0.25, + "grad_norm": 0.9388658415746001, + "learning_rate": 1.7511176716811275e-05, + "loss": 0.2167, + "step": 4968 + }, + { + "epoch": 0.25, + "grad_norm": 1.138438684960691, + "learning_rate": 1.7510089338394287e-05, + "loss": 0.1984, + "step": 4969 + }, + { + "epoch": 0.25, + "grad_norm": 1.1752742054523158, + "learning_rate": 1.7509001756265704e-05, + "loss": 0.2174, + "step": 4970 + }, + { + "epoch": 0.25, + "grad_norm": 0.9476066784239713, + "learning_rate": 1.7507913970455024e-05, + "loss": 0.2176, + "step": 4971 + }, + { + "epoch": 0.25, + "grad_norm": 1.472689980809985, + "learning_rate": 1.750682598099175e-05, + "loss": 0.2131, + "step": 4972 + }, + { + "epoch": 0.25, + "grad_norm": 1.0265826255610757, + "learning_rate": 1.7505737787905404e-05, + "loss": 0.2237, + "step": 4973 + }, + { + "epoch": 0.25, + "grad_norm": 1.5419271677885265, + "learning_rate": 1.7504649391225493e-05, + "loss": 0.2308, + "step": 4974 + }, + { + "epoch": 0.25, + "grad_norm": 1.8149367596422321, + "learning_rate": 1.7503560790981545e-05, + "loss": 0.1969, + "step": 4975 + }, + { + "epoch": 0.25, + "grad_norm": 1.104977220763604, + "learning_rate": 1.750247198720308e-05, + "loss": 0.2062, + "step": 4976 + }, + { + "epoch": 0.25, + "grad_norm": 0.9566919598399071, + "learning_rate": 1.750138297991965e-05, + "loss": 0.1797, + "step": 4977 + }, + { + "epoch": 0.25, + "grad_norm": 1.1870863493849544, + "learning_rate": 1.7500293769160773e-05, + "loss": 0.2253, + "step": 4978 + }, + { + "epoch": 0.25, + "grad_norm": 0.976047243062067, + "learning_rate": 1.749920435495601e-05, + "loss": 0.2292, + "step": 4979 + }, + { + "epoch": 0.25, + "grad_norm": 0.8263157669810269, + "learning_rate": 1.7498114737334902e-05, + "loss": 0.1972, + "step": 4980 + }, + { + "epoch": 0.25, + "grad_norm": 0.9403072898460152, + "learning_rate": 1.749702491632701e-05, + "loss": 0.22, + "step": 4981 + }, + { + "epoch": 0.25, + "grad_norm": 1.2141379406623052, + "learning_rate": 1.749593489196189e-05, + "loss": 0.1972, + "step": 4982 + }, + { + "epoch": 0.25, + "grad_norm": 1.7395791963743559, + "learning_rate": 1.7494844664269117e-05, + "loss": 0.2383, + "step": 4983 + }, + { + "epoch": 0.25, + "grad_norm": 0.8786426046599091, + "learning_rate": 1.749375423327826e-05, + "loss": 0.2054, + "step": 4984 + }, + { + "epoch": 0.25, + "grad_norm": 0.8585034424431195, + "learning_rate": 1.7492663599018893e-05, + "loss": 0.2157, + "step": 4985 + }, + { + "epoch": 0.25, + "grad_norm": 0.9268853902764058, + "learning_rate": 1.7491572761520604e-05, + "loss": 0.2231, + "step": 4986 + }, + { + "epoch": 0.25, + "grad_norm": 1.0354260982717227, + "learning_rate": 1.749048172081298e-05, + "loss": 0.2163, + "step": 4987 + }, + { + "epoch": 0.25, + "grad_norm": 0.8511031855858545, + "learning_rate": 1.7489390476925616e-05, + "loss": 0.2072, + "step": 4988 + }, + { + "epoch": 0.25, + "grad_norm": 0.8786659576418835, + "learning_rate": 1.7488299029888117e-05, + "loss": 0.2133, + "step": 4989 + }, + { + "epoch": 0.25, + "grad_norm": 0.812193258729004, + "learning_rate": 1.7487207379730078e-05, + "loss": 0.2303, + "step": 4990 + }, + { + "epoch": 0.25, + "grad_norm": 2.377958020015114, + "learning_rate": 1.7486115526481117e-05, + "loss": 0.2166, + "step": 4991 + }, + { + "epoch": 0.25, + "grad_norm": 0.8504237116518911, + "learning_rate": 1.748502347017085e-05, + "loss": 0.2049, + "step": 4992 + }, + { + "epoch": 0.25, + "grad_norm": 0.9399276635507209, + "learning_rate": 1.74839312108289e-05, + "loss": 0.1958, + "step": 4993 + }, + { + "epoch": 0.25, + "grad_norm": 0.834340680227393, + "learning_rate": 1.748283874848489e-05, + "loss": 0.2146, + "step": 4994 + }, + { + "epoch": 0.25, + "grad_norm": 1.1110556630181991, + "learning_rate": 1.748174608316846e-05, + "loss": 0.2286, + "step": 4995 + }, + { + "epoch": 0.25, + "grad_norm": 1.208705629511072, + "learning_rate": 1.748065321490924e-05, + "loss": 0.1975, + "step": 4996 + }, + { + "epoch": 0.25, + "grad_norm": 1.2830729263319827, + "learning_rate": 1.7479560143736885e-05, + "loss": 0.1994, + "step": 4997 + }, + { + "epoch": 0.25, + "grad_norm": 0.9408077019695862, + "learning_rate": 1.7478466869681035e-05, + "loss": 0.198, + "step": 4998 + }, + { + "epoch": 0.25, + "grad_norm": 1.5477988939109983, + "learning_rate": 1.7477373392771352e-05, + "loss": 0.2293, + "step": 4999 + }, + { + "epoch": 0.25, + "grad_norm": 1.1463803370648873, + "learning_rate": 1.747627971303749e-05, + "loss": 0.2188, + "step": 5000 + }, + { + "epoch": 0.25, + "grad_norm": 0.8189903246416517, + "learning_rate": 1.7475185830509124e-05, + "loss": 0.2073, + "step": 5001 + }, + { + "epoch": 0.25, + "grad_norm": 1.2662754100903848, + "learning_rate": 1.7474091745215912e-05, + "loss": 0.2507, + "step": 5002 + }, + { + "epoch": 0.25, + "grad_norm": 1.1058798998476678, + "learning_rate": 1.7472997457187543e-05, + "loss": 0.2354, + "step": 5003 + }, + { + "epoch": 0.25, + "grad_norm": 0.9845891354226091, + "learning_rate": 1.74719029664537e-05, + "loss": 0.2061, + "step": 5004 + }, + { + "epoch": 0.25, + "grad_norm": 0.9242568026610173, + "learning_rate": 1.747080827304406e-05, + "loss": 0.2387, + "step": 5005 + }, + { + "epoch": 0.25, + "grad_norm": 0.9957505132812781, + "learning_rate": 1.746971337698833e-05, + "loss": 0.2267, + "step": 5006 + }, + { + "epoch": 0.25, + "grad_norm": 1.130839216768605, + "learning_rate": 1.74686182783162e-05, + "loss": 0.2294, + "step": 5007 + }, + { + "epoch": 0.25, + "grad_norm": 1.0601687365795447, + "learning_rate": 1.7467522977057375e-05, + "loss": 0.2127, + "step": 5008 + }, + { + "epoch": 0.25, + "grad_norm": 1.4708274030519979, + "learning_rate": 1.746642747324157e-05, + "loss": 0.1969, + "step": 5009 + }, + { + "epoch": 0.25, + "grad_norm": 0.9259996950306866, + "learning_rate": 1.74653317668985e-05, + "loss": 0.2041, + "step": 5010 + }, + { + "epoch": 0.25, + "grad_norm": 0.98602895533177, + "learning_rate": 1.7464235858057878e-05, + "loss": 0.204, + "step": 5011 + }, + { + "epoch": 0.25, + "grad_norm": 1.1042160602475162, + "learning_rate": 1.7463139746749443e-05, + "loss": 0.2173, + "step": 5012 + }, + { + "epoch": 0.25, + "grad_norm": 1.328890259856893, + "learning_rate": 1.7462043433002915e-05, + "loss": 0.2021, + "step": 5013 + }, + { + "epoch": 0.25, + "grad_norm": 1.0417663851102328, + "learning_rate": 1.7460946916848042e-05, + "loss": 0.2108, + "step": 5014 + }, + { + "epoch": 0.26, + "grad_norm": 0.9022596817041325, + "learning_rate": 1.7459850198314562e-05, + "loss": 0.1964, + "step": 5015 + }, + { + "epoch": 0.26, + "grad_norm": 1.3093436909694542, + "learning_rate": 1.7458753277432223e-05, + "loss": 0.2168, + "step": 5016 + }, + { + "epoch": 0.26, + "grad_norm": 4.579107620746813, + "learning_rate": 1.745765615423078e-05, + "loss": 0.195, + "step": 5017 + }, + { + "epoch": 0.26, + "grad_norm": 0.9487815613788572, + "learning_rate": 1.7456558828739993e-05, + "loss": 0.2148, + "step": 5018 + }, + { + "epoch": 0.26, + "grad_norm": 1.2607977547964462, + "learning_rate": 1.7455461300989627e-05, + "loss": 0.2157, + "step": 5019 + }, + { + "epoch": 0.26, + "grad_norm": 0.9884044516273045, + "learning_rate": 1.7454363571009452e-05, + "loss": 0.2017, + "step": 5020 + }, + { + "epoch": 0.26, + "grad_norm": 0.9234874314136033, + "learning_rate": 1.7453265638829246e-05, + "loss": 0.2083, + "step": 5021 + }, + { + "epoch": 0.26, + "grad_norm": 0.9101887407052885, + "learning_rate": 1.745216750447878e-05, + "loss": 0.2083, + "step": 5022 + }, + { + "epoch": 0.26, + "grad_norm": 0.9324602885160264, + "learning_rate": 1.7451069167987858e-05, + "loss": 0.2164, + "step": 5023 + }, + { + "epoch": 0.26, + "grad_norm": 1.327783331629871, + "learning_rate": 1.7449970629386265e-05, + "loss": 0.1904, + "step": 5024 + }, + { + "epoch": 0.26, + "grad_norm": 0.9224667945636565, + "learning_rate": 1.7448871888703792e-05, + "loss": 0.1895, + "step": 5025 + }, + { + "epoch": 0.26, + "grad_norm": 0.8781524280150395, + "learning_rate": 1.744777294597025e-05, + "loss": 0.2284, + "step": 5026 + }, + { + "epoch": 0.26, + "grad_norm": 1.106374290900814, + "learning_rate": 1.744667380121545e-05, + "loss": 0.2332, + "step": 5027 + }, + { + "epoch": 0.26, + "grad_norm": 1.2153769878955571, + "learning_rate": 1.7445574454469202e-05, + "loss": 0.2294, + "step": 5028 + }, + { + "epoch": 0.26, + "grad_norm": 1.241271721583726, + "learning_rate": 1.744447490576132e-05, + "loss": 0.2285, + "step": 5029 + }, + { + "epoch": 0.26, + "grad_norm": 1.0396512825817412, + "learning_rate": 1.744337515512164e-05, + "loss": 0.2046, + "step": 5030 + }, + { + "epoch": 0.26, + "grad_norm": 0.990253885865504, + "learning_rate": 1.744227520257999e-05, + "loss": 0.1984, + "step": 5031 + }, + { + "epoch": 0.26, + "grad_norm": 0.9219416918548322, + "learning_rate": 1.7441175048166203e-05, + "loss": 0.2099, + "step": 5032 + }, + { + "epoch": 0.26, + "grad_norm": 0.9096825504551227, + "learning_rate": 1.7440074691910123e-05, + "loss": 0.2179, + "step": 5033 + }, + { + "epoch": 0.26, + "grad_norm": 0.9631200723736666, + "learning_rate": 1.7438974133841596e-05, + "loss": 0.2316, + "step": 5034 + }, + { + "epoch": 0.26, + "grad_norm": 0.951258868171326, + "learning_rate": 1.7437873373990478e-05, + "loss": 0.2176, + "step": 5035 + }, + { + "epoch": 0.26, + "grad_norm": 0.9007417553081281, + "learning_rate": 1.7436772412386622e-05, + "loss": 0.2053, + "step": 5036 + }, + { + "epoch": 0.26, + "grad_norm": 1.0672720680830408, + "learning_rate": 1.7435671249059895e-05, + "loss": 0.2219, + "step": 5037 + }, + { + "epoch": 0.26, + "grad_norm": 0.913837085638295, + "learning_rate": 1.743456988404017e-05, + "loss": 0.2218, + "step": 5038 + }, + { + "epoch": 0.26, + "grad_norm": 0.8086054513570482, + "learning_rate": 1.743346831735731e-05, + "loss": 0.1971, + "step": 5039 + }, + { + "epoch": 0.26, + "grad_norm": 0.9734111808136677, + "learning_rate": 1.7432366549041203e-05, + "loss": 0.2252, + "step": 5040 + }, + { + "epoch": 0.26, + "grad_norm": 0.9860366282496003, + "learning_rate": 1.7431264579121734e-05, + "loss": 0.1978, + "step": 5041 + }, + { + "epoch": 0.26, + "grad_norm": 0.9523717046703011, + "learning_rate": 1.7430162407628796e-05, + "loss": 0.1968, + "step": 5042 + }, + { + "epoch": 0.26, + "grad_norm": 0.8510264207270812, + "learning_rate": 1.742906003459228e-05, + "loss": 0.228, + "step": 5043 + }, + { + "epoch": 0.26, + "grad_norm": 1.0463789344345444, + "learning_rate": 1.7427957460042092e-05, + "loss": 0.1962, + "step": 5044 + }, + { + "epoch": 0.26, + "grad_norm": 1.0830758748704274, + "learning_rate": 1.742685468400814e-05, + "loss": 0.2348, + "step": 5045 + }, + { + "epoch": 0.26, + "grad_norm": 1.1041907730218539, + "learning_rate": 1.7425751706520337e-05, + "loss": 0.2205, + "step": 5046 + }, + { + "epoch": 0.26, + "grad_norm": 1.1479770893064867, + "learning_rate": 1.7424648527608594e-05, + "loss": 0.2145, + "step": 5047 + }, + { + "epoch": 0.26, + "grad_norm": 0.9543567978429918, + "learning_rate": 1.742354514730284e-05, + "loss": 0.2198, + "step": 5048 + }, + { + "epoch": 0.26, + "grad_norm": 1.2461996880851625, + "learning_rate": 1.742244156563301e-05, + "loss": 0.2192, + "step": 5049 + }, + { + "epoch": 0.26, + "grad_norm": 1.0231145847760585, + "learning_rate": 1.742133778262903e-05, + "loss": 0.2013, + "step": 5050 + }, + { + "epoch": 0.26, + "grad_norm": 0.8834962163343502, + "learning_rate": 1.7420233798320848e-05, + "loss": 0.1999, + "step": 5051 + }, + { + "epoch": 0.26, + "grad_norm": 1.2485676337162055, + "learning_rate": 1.74191296127384e-05, + "loss": 0.2233, + "step": 5052 + }, + { + "epoch": 0.26, + "grad_norm": 1.239142632740132, + "learning_rate": 1.7418025225911642e-05, + "loss": 0.2113, + "step": 5053 + }, + { + "epoch": 0.26, + "grad_norm": 0.9004485805006647, + "learning_rate": 1.7416920637870535e-05, + "loss": 0.192, + "step": 5054 + }, + { + "epoch": 0.26, + "grad_norm": 0.938876208690321, + "learning_rate": 1.7415815848645032e-05, + "loss": 0.2166, + "step": 5055 + }, + { + "epoch": 0.26, + "grad_norm": 1.0012870727151062, + "learning_rate": 1.741471085826511e-05, + "loss": 0.2053, + "step": 5056 + }, + { + "epoch": 0.26, + "grad_norm": 1.9550687135535754, + "learning_rate": 1.7413605666760733e-05, + "loss": 0.2296, + "step": 5057 + }, + { + "epoch": 0.26, + "grad_norm": 1.0829094222653786, + "learning_rate": 1.7412500274161885e-05, + "loss": 0.2173, + "step": 5058 + }, + { + "epoch": 0.26, + "grad_norm": 1.1394625432475365, + "learning_rate": 1.741139468049855e-05, + "loss": 0.1889, + "step": 5059 + }, + { + "epoch": 0.26, + "grad_norm": 0.934634491288444, + "learning_rate": 1.7410288885800716e-05, + "loss": 0.2201, + "step": 5060 + }, + { + "epoch": 0.26, + "grad_norm": 1.0153039446794108, + "learning_rate": 1.7409182890098372e-05, + "loss": 0.195, + "step": 5061 + }, + { + "epoch": 0.26, + "grad_norm": 1.6814140355594438, + "learning_rate": 1.7408076693421528e-05, + "loss": 0.2589, + "step": 5062 + }, + { + "epoch": 0.26, + "grad_norm": 1.06895135418714, + "learning_rate": 1.7406970295800188e-05, + "loss": 0.2224, + "step": 5063 + }, + { + "epoch": 0.26, + "grad_norm": 0.964438213306339, + "learning_rate": 1.7405863697264357e-05, + "loss": 0.2029, + "step": 5064 + }, + { + "epoch": 0.26, + "grad_norm": 1.388743643425397, + "learning_rate": 1.7404756897844054e-05, + "loss": 0.2133, + "step": 5065 + }, + { + "epoch": 0.26, + "grad_norm": 1.285559298704333, + "learning_rate": 1.7403649897569302e-05, + "loss": 0.2066, + "step": 5066 + }, + { + "epoch": 0.26, + "grad_norm": 0.8019958506858845, + "learning_rate": 1.740254269647013e-05, + "loss": 0.1981, + "step": 5067 + }, + { + "epoch": 0.26, + "grad_norm": 1.224315249511999, + "learning_rate": 1.7401435294576566e-05, + "loss": 0.2235, + "step": 5068 + }, + { + "epoch": 0.26, + "grad_norm": 1.2892782160471126, + "learning_rate": 1.7400327691918657e-05, + "loss": 0.2211, + "step": 5069 + }, + { + "epoch": 0.26, + "grad_norm": 1.1369492211634333, + "learning_rate": 1.7399219888526438e-05, + "loss": 0.207, + "step": 5070 + }, + { + "epoch": 0.26, + "grad_norm": 0.8704337063333867, + "learning_rate": 1.7398111884429966e-05, + "loss": 0.1868, + "step": 5071 + }, + { + "epoch": 0.26, + "grad_norm": 0.9232480538530867, + "learning_rate": 1.7397003679659285e-05, + "loss": 0.1932, + "step": 5072 + }, + { + "epoch": 0.26, + "grad_norm": 0.90226151158112, + "learning_rate": 1.7395895274244464e-05, + "loss": 0.2037, + "step": 5073 + }, + { + "epoch": 0.26, + "grad_norm": 1.0136440681877585, + "learning_rate": 1.7394786668215564e-05, + "loss": 0.1827, + "step": 5074 + }, + { + "epoch": 0.26, + "grad_norm": 1.1129209039716363, + "learning_rate": 1.739367786160266e-05, + "loss": 0.207, + "step": 5075 + }, + { + "epoch": 0.26, + "grad_norm": 1.204496096714119, + "learning_rate": 1.7392568854435828e-05, + "loss": 0.2223, + "step": 5076 + }, + { + "epoch": 0.26, + "grad_norm": 0.9851629720093069, + "learning_rate": 1.7391459646745145e-05, + "loss": 0.2275, + "step": 5077 + }, + { + "epoch": 0.26, + "grad_norm": 0.8517560781459264, + "learning_rate": 1.7390350238560706e-05, + "loss": 0.1947, + "step": 5078 + }, + { + "epoch": 0.26, + "grad_norm": 0.9295862908238195, + "learning_rate": 1.7389240629912594e-05, + "loss": 0.2254, + "step": 5079 + }, + { + "epoch": 0.26, + "grad_norm": 0.9968261336024243, + "learning_rate": 1.7388130820830914e-05, + "loss": 0.2028, + "step": 5080 + }, + { + "epoch": 0.26, + "grad_norm": 1.0777716957127221, + "learning_rate": 1.738702081134577e-05, + "loss": 0.2021, + "step": 5081 + }, + { + "epoch": 0.26, + "grad_norm": 0.8685885199262032, + "learning_rate": 1.738591060148727e-05, + "loss": 0.2068, + "step": 5082 + }, + { + "epoch": 0.26, + "grad_norm": 1.37429320393285, + "learning_rate": 1.738480019128553e-05, + "loss": 0.225, + "step": 5083 + }, + { + "epoch": 0.26, + "grad_norm": 1.068890387233058, + "learning_rate": 1.7383689580770662e-05, + "loss": 0.2211, + "step": 5084 + }, + { + "epoch": 0.26, + "grad_norm": 1.0787377651617818, + "learning_rate": 1.73825787699728e-05, + "loss": 0.212, + "step": 5085 + }, + { + "epoch": 0.26, + "grad_norm": 0.9387024177777231, + "learning_rate": 1.738146775892207e-05, + "loss": 0.2207, + "step": 5086 + }, + { + "epoch": 0.26, + "grad_norm": 0.8724959929584131, + "learning_rate": 1.738035654764861e-05, + "loss": 0.2258, + "step": 5087 + }, + { + "epoch": 0.26, + "grad_norm": 1.0341884160681272, + "learning_rate": 1.7379245136182563e-05, + "loss": 0.1936, + "step": 5088 + }, + { + "epoch": 0.26, + "grad_norm": 0.9305601735593984, + "learning_rate": 1.7378133524554076e-05, + "loss": 0.2028, + "step": 5089 + }, + { + "epoch": 0.26, + "grad_norm": 0.9216192771869709, + "learning_rate": 1.73770217127933e-05, + "loss": 0.2057, + "step": 5090 + }, + { + "epoch": 0.26, + "grad_norm": 0.8275065447729806, + "learning_rate": 1.737590970093039e-05, + "loss": 0.2191, + "step": 5091 + }, + { + "epoch": 0.26, + "grad_norm": 1.8141071722166489, + "learning_rate": 1.737479748899552e-05, + "loss": 0.2119, + "step": 5092 + }, + { + "epoch": 0.26, + "grad_norm": 0.9472226924117711, + "learning_rate": 1.7373685077018844e-05, + "loss": 0.2188, + "step": 5093 + }, + { + "epoch": 0.26, + "grad_norm": 1.047574692637895, + "learning_rate": 1.7372572465030545e-05, + "loss": 0.2104, + "step": 5094 + }, + { + "epoch": 0.26, + "grad_norm": 0.9203578125292801, + "learning_rate": 1.7371459653060806e-05, + "loss": 0.212, + "step": 5095 + }, + { + "epoch": 0.26, + "grad_norm": 0.8055320516963432, + "learning_rate": 1.7370346641139805e-05, + "loss": 0.2044, + "step": 5096 + }, + { + "epoch": 0.26, + "grad_norm": 0.842063707877496, + "learning_rate": 1.7369233429297734e-05, + "loss": 0.2212, + "step": 5097 + }, + { + "epoch": 0.26, + "grad_norm": 1.021204666571421, + "learning_rate": 1.7368120017564792e-05, + "loss": 0.2016, + "step": 5098 + }, + { + "epoch": 0.26, + "grad_norm": 1.310478716770612, + "learning_rate": 1.7367006405971177e-05, + "loss": 0.2464, + "step": 5099 + }, + { + "epoch": 0.26, + "grad_norm": 1.4498907688492093, + "learning_rate": 1.7365892594547097e-05, + "loss": 0.202, + "step": 5100 + }, + { + "epoch": 0.26, + "grad_norm": 1.196400429856508, + "learning_rate": 1.7364778583322765e-05, + "loss": 0.2247, + "step": 5101 + }, + { + "epoch": 0.26, + "grad_norm": 0.923844439015725, + "learning_rate": 1.7363664372328398e-05, + "loss": 0.2332, + "step": 5102 + }, + { + "epoch": 0.26, + "grad_norm": 0.9109385255003308, + "learning_rate": 1.736254996159422e-05, + "loss": 0.207, + "step": 5103 + }, + { + "epoch": 0.26, + "grad_norm": 1.0074463288140154, + "learning_rate": 1.7361435351150456e-05, + "loss": 0.2194, + "step": 5104 + }, + { + "epoch": 0.26, + "grad_norm": 0.9833506313364749, + "learning_rate": 1.7360320541027342e-05, + "loss": 0.2277, + "step": 5105 + }, + { + "epoch": 0.26, + "grad_norm": 0.7589870505508941, + "learning_rate": 1.7359205531255123e-05, + "loss": 0.2, + "step": 5106 + }, + { + "epoch": 0.26, + "grad_norm": 0.9118865622399176, + "learning_rate": 1.735809032186403e-05, + "loss": 0.2307, + "step": 5107 + }, + { + "epoch": 0.26, + "grad_norm": 0.9536475657355715, + "learning_rate": 1.7356974912884327e-05, + "loss": 0.2089, + "step": 5108 + }, + { + "epoch": 0.26, + "grad_norm": 0.8118782600313672, + "learning_rate": 1.7355859304346262e-05, + "loss": 0.2264, + "step": 5109 + }, + { + "epoch": 0.26, + "grad_norm": 0.9823121461102297, + "learning_rate": 1.7354743496280103e-05, + "loss": 0.2034, + "step": 5110 + }, + { + "epoch": 0.26, + "grad_norm": 0.903102188316509, + "learning_rate": 1.7353627488716106e-05, + "loss": 0.2166, + "step": 5111 + }, + { + "epoch": 0.26, + "grad_norm": 1.0695530528902415, + "learning_rate": 1.7352511281684548e-05, + "loss": 0.2328, + "step": 5112 + }, + { + "epoch": 0.26, + "grad_norm": 1.0200358766120496, + "learning_rate": 1.7351394875215707e-05, + "loss": 0.1933, + "step": 5113 + }, + { + "epoch": 0.26, + "grad_norm": 1.1098375204015731, + "learning_rate": 1.7350278269339867e-05, + "loss": 0.2238, + "step": 5114 + }, + { + "epoch": 0.26, + "grad_norm": 0.9829062711668459, + "learning_rate": 1.7349161464087312e-05, + "loss": 0.2239, + "step": 5115 + }, + { + "epoch": 0.26, + "grad_norm": 0.9539037518616844, + "learning_rate": 1.7348044459488334e-05, + "loss": 0.1879, + "step": 5116 + }, + { + "epoch": 0.26, + "grad_norm": 0.9057009263172614, + "learning_rate": 1.734692725557324e-05, + "loss": 0.2127, + "step": 5117 + }, + { + "epoch": 0.26, + "grad_norm": 0.9057268038085381, + "learning_rate": 1.734580985237233e-05, + "loss": 0.2221, + "step": 5118 + }, + { + "epoch": 0.26, + "grad_norm": 0.8635195105524243, + "learning_rate": 1.7344692249915907e-05, + "loss": 0.2007, + "step": 5119 + }, + { + "epoch": 0.26, + "grad_norm": 1.5332147512028922, + "learning_rate": 1.7343574448234294e-05, + "loss": 0.1949, + "step": 5120 + }, + { + "epoch": 0.26, + "grad_norm": 0.92037678511703, + "learning_rate": 1.7342456447357813e-05, + "loss": 0.2245, + "step": 5121 + }, + { + "epoch": 0.26, + "grad_norm": 1.3309735135939107, + "learning_rate": 1.7341338247316785e-05, + "loss": 0.2805, + "step": 5122 + }, + { + "epoch": 0.26, + "grad_norm": 1.0574651253476115, + "learning_rate": 1.734021984814154e-05, + "loss": 0.2256, + "step": 5123 + }, + { + "epoch": 0.26, + "grad_norm": 1.0460865418770287, + "learning_rate": 1.7339101249862418e-05, + "loss": 0.1988, + "step": 5124 + }, + { + "epoch": 0.26, + "grad_norm": 1.1198594990219086, + "learning_rate": 1.7337982452509757e-05, + "loss": 0.2306, + "step": 5125 + }, + { + "epoch": 0.26, + "grad_norm": 0.8641486909806005, + "learning_rate": 1.7336863456113912e-05, + "loss": 0.1677, + "step": 5126 + }, + { + "epoch": 0.26, + "grad_norm": 0.9883756135069262, + "learning_rate": 1.7335744260705233e-05, + "loss": 0.1941, + "step": 5127 + }, + { + "epoch": 0.26, + "grad_norm": 1.038637478516568, + "learning_rate": 1.733462486631407e-05, + "loss": 0.2101, + "step": 5128 + }, + { + "epoch": 0.26, + "grad_norm": 0.7833925304111683, + "learning_rate": 1.73335052729708e-05, + "loss": 0.1937, + "step": 5129 + }, + { + "epoch": 0.26, + "grad_norm": 1.1097990773913913, + "learning_rate": 1.733238548070578e-05, + "loss": 0.2329, + "step": 5130 + }, + { + "epoch": 0.26, + "grad_norm": 0.9441953595842283, + "learning_rate": 1.7331265489549392e-05, + "loss": 0.2259, + "step": 5131 + }, + { + "epoch": 0.26, + "grad_norm": 1.0517679100815562, + "learning_rate": 1.7330145299532014e-05, + "loss": 0.1908, + "step": 5132 + }, + { + "epoch": 0.26, + "grad_norm": 0.865174450000809, + "learning_rate": 1.7329024910684033e-05, + "loss": 0.2338, + "step": 5133 + }, + { + "epoch": 0.26, + "grad_norm": 1.4011916507400322, + "learning_rate": 1.7327904323035833e-05, + "loss": 0.1923, + "step": 5134 + }, + { + "epoch": 0.26, + "grad_norm": 1.733663183504703, + "learning_rate": 1.7326783536617817e-05, + "loss": 0.2025, + "step": 5135 + }, + { + "epoch": 0.26, + "grad_norm": 1.2850760579337643, + "learning_rate": 1.7325662551460382e-05, + "loss": 0.2243, + "step": 5136 + }, + { + "epoch": 0.26, + "grad_norm": 0.8465786613373185, + "learning_rate": 1.7324541367593938e-05, + "loss": 0.2175, + "step": 5137 + }, + { + "epoch": 0.26, + "grad_norm": 0.8575404475842421, + "learning_rate": 1.7323419985048895e-05, + "loss": 0.2039, + "step": 5138 + }, + { + "epoch": 0.26, + "grad_norm": 1.410440921873463, + "learning_rate": 1.732229840385567e-05, + "loss": 0.1937, + "step": 5139 + }, + { + "epoch": 0.26, + "grad_norm": 1.7958882699551637, + "learning_rate": 1.732117662404469e-05, + "loss": 0.2388, + "step": 5140 + }, + { + "epoch": 0.26, + "grad_norm": 1.3727292538167888, + "learning_rate": 1.7320054645646376e-05, + "loss": 0.2214, + "step": 5141 + }, + { + "epoch": 0.26, + "grad_norm": 1.3762093404062987, + "learning_rate": 1.7318932468691172e-05, + "loss": 0.2023, + "step": 5142 + }, + { + "epoch": 0.26, + "grad_norm": 0.9535075619962287, + "learning_rate": 1.7317810093209507e-05, + "loss": 0.1909, + "step": 5143 + }, + { + "epoch": 0.26, + "grad_norm": 1.0330727306732685, + "learning_rate": 1.731668751923183e-05, + "loss": 0.2154, + "step": 5144 + }, + { + "epoch": 0.26, + "grad_norm": 0.8244231717387553, + "learning_rate": 1.7315564746788592e-05, + "loss": 0.2179, + "step": 5145 + }, + { + "epoch": 0.26, + "grad_norm": 0.9174505172121481, + "learning_rate": 1.731444177591025e-05, + "loss": 0.2175, + "step": 5146 + }, + { + "epoch": 0.26, + "grad_norm": 0.9969529480674448, + "learning_rate": 1.7313318606627258e-05, + "loss": 0.2279, + "step": 5147 + }, + { + "epoch": 0.26, + "grad_norm": 1.2100766546934472, + "learning_rate": 1.7312195238970088e-05, + "loss": 0.1959, + "step": 5148 + }, + { + "epoch": 0.26, + "grad_norm": 1.412580848356862, + "learning_rate": 1.7311071672969206e-05, + "loss": 0.2057, + "step": 5149 + }, + { + "epoch": 0.26, + "grad_norm": 0.9606173224523024, + "learning_rate": 1.7309947908655096e-05, + "loss": 0.2338, + "step": 5150 + }, + { + "epoch": 0.26, + "grad_norm": 1.0224410815863882, + "learning_rate": 1.7308823946058237e-05, + "loss": 0.2161, + "step": 5151 + }, + { + "epoch": 0.26, + "grad_norm": 1.16150172069741, + "learning_rate": 1.7307699785209108e-05, + "loss": 0.1782, + "step": 5152 + }, + { + "epoch": 0.26, + "grad_norm": 0.9738594797923933, + "learning_rate": 1.7306575426138213e-05, + "loss": 0.229, + "step": 5153 + }, + { + "epoch": 0.26, + "grad_norm": 0.9858951663213912, + "learning_rate": 1.730545086887605e-05, + "loss": 0.2131, + "step": 5154 + }, + { + "epoch": 0.26, + "grad_norm": 0.8323012000824099, + "learning_rate": 1.730432611345312e-05, + "loss": 0.2032, + "step": 5155 + }, + { + "epoch": 0.26, + "grad_norm": 0.8765788794254222, + "learning_rate": 1.730320115989993e-05, + "loss": 0.2255, + "step": 5156 + }, + { + "epoch": 0.26, + "grad_norm": 0.8654955411348836, + "learning_rate": 1.7302076008246993e-05, + "loss": 0.1953, + "step": 5157 + }, + { + "epoch": 0.26, + "grad_norm": 1.1368959010757456, + "learning_rate": 1.7300950658524836e-05, + "loss": 0.2067, + "step": 5158 + }, + { + "epoch": 0.26, + "grad_norm": 1.0193148376193706, + "learning_rate": 1.729982511076398e-05, + "loss": 0.2171, + "step": 5159 + }, + { + "epoch": 0.26, + "grad_norm": 3.084242175875511, + "learning_rate": 1.7298699364994952e-05, + "loss": 0.2046, + "step": 5160 + }, + { + "epoch": 0.26, + "grad_norm": 0.9197173344240579, + "learning_rate": 1.7297573421248294e-05, + "loss": 0.2357, + "step": 5161 + }, + { + "epoch": 0.26, + "grad_norm": 1.1499815341386719, + "learning_rate": 1.729644727955454e-05, + "loss": 0.192, + "step": 5162 + }, + { + "epoch": 0.26, + "grad_norm": 1.2309580105851474, + "learning_rate": 1.7295320939944247e-05, + "loss": 0.2381, + "step": 5163 + }, + { + "epoch": 0.26, + "grad_norm": 1.1276580222219936, + "learning_rate": 1.729419440244796e-05, + "loss": 0.2211, + "step": 5164 + }, + { + "epoch": 0.26, + "grad_norm": 2.034431671433726, + "learning_rate": 1.729306766709624e-05, + "loss": 0.202, + "step": 5165 + }, + { + "epoch": 0.26, + "grad_norm": 0.8705311980286389, + "learning_rate": 1.7291940733919645e-05, + "loss": 0.2149, + "step": 5166 + }, + { + "epoch": 0.26, + "grad_norm": 0.8392682674202115, + "learning_rate": 1.7290813602948748e-05, + "loss": 0.2088, + "step": 5167 + }, + { + "epoch": 0.26, + "grad_norm": 0.8684905385029499, + "learning_rate": 1.7289686274214116e-05, + "loss": 0.2112, + "step": 5168 + }, + { + "epoch": 0.26, + "grad_norm": 1.1264314185559987, + "learning_rate": 1.7288558747746335e-05, + "loss": 0.2012, + "step": 5169 + }, + { + "epoch": 0.26, + "grad_norm": 0.978250214493829, + "learning_rate": 1.7287431023575988e-05, + "loss": 0.2155, + "step": 5170 + }, + { + "epoch": 0.26, + "grad_norm": 0.9360368977001948, + "learning_rate": 1.728630310173366e-05, + "loss": 0.2361, + "step": 5171 + }, + { + "epoch": 0.26, + "grad_norm": 0.8958072123216545, + "learning_rate": 1.7285174982249947e-05, + "loss": 0.2179, + "step": 5172 + }, + { + "epoch": 0.26, + "grad_norm": 0.941632424104645, + "learning_rate": 1.7284046665155456e-05, + "loss": 0.2368, + "step": 5173 + }, + { + "epoch": 0.26, + "grad_norm": 0.8263123681636818, + "learning_rate": 1.7282918150480786e-05, + "loss": 0.204, + "step": 5174 + }, + { + "epoch": 0.26, + "grad_norm": 0.914557956965905, + "learning_rate": 1.728178943825655e-05, + "loss": 0.1889, + "step": 5175 + }, + { + "epoch": 0.26, + "grad_norm": 0.9954073324336042, + "learning_rate": 1.7280660528513362e-05, + "loss": 0.2202, + "step": 5176 + }, + { + "epoch": 0.26, + "grad_norm": 0.7576450292859586, + "learning_rate": 1.727953142128185e-05, + "loss": 0.176, + "step": 5177 + }, + { + "epoch": 0.26, + "grad_norm": 0.8326638995643959, + "learning_rate": 1.727840211659263e-05, + "loss": 0.2173, + "step": 5178 + }, + { + "epoch": 0.26, + "grad_norm": 0.9662341961029348, + "learning_rate": 1.727727261447635e-05, + "loss": 0.208, + "step": 5179 + }, + { + "epoch": 0.26, + "grad_norm": 1.1586971973050766, + "learning_rate": 1.7276142914963635e-05, + "loss": 0.2183, + "step": 5180 + }, + { + "epoch": 0.26, + "grad_norm": 1.2003375851518439, + "learning_rate": 1.727501301808513e-05, + "loss": 0.2245, + "step": 5181 + }, + { + "epoch": 0.26, + "grad_norm": 0.9655590656767455, + "learning_rate": 1.7273882923871492e-05, + "loss": 0.1938, + "step": 5182 + }, + { + "epoch": 0.26, + "grad_norm": 1.3799769940471969, + "learning_rate": 1.7272752632353365e-05, + "loss": 0.223, + "step": 5183 + }, + { + "epoch": 0.26, + "grad_norm": 0.8849271828718694, + "learning_rate": 1.727162214356141e-05, + "loss": 0.2208, + "step": 5184 + }, + { + "epoch": 0.26, + "grad_norm": 0.8298189179543192, + "learning_rate": 1.72704914575263e-05, + "loss": 0.1903, + "step": 5185 + }, + { + "epoch": 0.26, + "grad_norm": 0.9025161258137357, + "learning_rate": 1.7269360574278694e-05, + "loss": 0.2043, + "step": 5186 + }, + { + "epoch": 0.26, + "grad_norm": 0.9525245007094877, + "learning_rate": 1.7268229493849273e-05, + "loss": 0.1838, + "step": 5187 + }, + { + "epoch": 0.26, + "grad_norm": 0.7302099244104475, + "learning_rate": 1.7267098216268715e-05, + "loss": 0.2172, + "step": 5188 + }, + { + "epoch": 0.26, + "grad_norm": 1.539391710700766, + "learning_rate": 1.726596674156771e-05, + "loss": 0.249, + "step": 5189 + }, + { + "epoch": 0.26, + "grad_norm": 1.207634354231271, + "learning_rate": 1.7264835069776945e-05, + "loss": 0.2087, + "step": 5190 + }, + { + "epoch": 0.26, + "grad_norm": 1.7730743543155159, + "learning_rate": 1.726370320092712e-05, + "loss": 0.2335, + "step": 5191 + }, + { + "epoch": 0.26, + "grad_norm": 1.1122396560369499, + "learning_rate": 1.7262571135048934e-05, + "loss": 0.2164, + "step": 5192 + }, + { + "epoch": 0.26, + "grad_norm": 1.3805955610238752, + "learning_rate": 1.7261438872173096e-05, + "loss": 0.2053, + "step": 5193 + }, + { + "epoch": 0.26, + "grad_norm": 1.0468968152705946, + "learning_rate": 1.7260306412330317e-05, + "loss": 0.207, + "step": 5194 + }, + { + "epoch": 0.26, + "grad_norm": 0.8783581393917873, + "learning_rate": 1.725917375555132e-05, + "loss": 0.2215, + "step": 5195 + }, + { + "epoch": 0.26, + "grad_norm": 1.3218838017025807, + "learning_rate": 1.7258040901866824e-05, + "loss": 0.209, + "step": 5196 + }, + { + "epoch": 0.26, + "grad_norm": 1.3075623769540963, + "learning_rate": 1.725690785130756e-05, + "loss": 0.2252, + "step": 5197 + }, + { + "epoch": 0.26, + "grad_norm": 1.2522311782873228, + "learning_rate": 1.7255774603904253e-05, + "loss": 0.1953, + "step": 5198 + }, + { + "epoch": 0.26, + "grad_norm": 0.9995002246188047, + "learning_rate": 1.7254641159687657e-05, + "loss": 0.2169, + "step": 5199 + }, + { + "epoch": 0.26, + "grad_norm": 0.8811425044279627, + "learning_rate": 1.725350751868851e-05, + "loss": 0.1869, + "step": 5200 + }, + { + "epoch": 0.26, + "grad_norm": 1.0446346367689991, + "learning_rate": 1.725237368093756e-05, + "loss": 0.1975, + "step": 5201 + }, + { + "epoch": 0.26, + "grad_norm": 0.8818232613937096, + "learning_rate": 1.7251239646465562e-05, + "loss": 0.2087, + "step": 5202 + }, + { + "epoch": 0.26, + "grad_norm": 1.070969961207307, + "learning_rate": 1.7250105415303283e-05, + "loss": 0.2288, + "step": 5203 + }, + { + "epoch": 0.26, + "grad_norm": 0.8885154254095409, + "learning_rate": 1.7248970987481484e-05, + "loss": 0.2268, + "step": 5204 + }, + { + "epoch": 0.26, + "grad_norm": 1.0101696219659841, + "learning_rate": 1.7247836363030935e-05, + "loss": 0.1933, + "step": 5205 + }, + { + "epoch": 0.26, + "grad_norm": 1.0928642196121847, + "learning_rate": 1.724670154198242e-05, + "loss": 0.2081, + "step": 5206 + }, + { + "epoch": 0.26, + "grad_norm": 0.8613315400435708, + "learning_rate": 1.7245566524366713e-05, + "loss": 0.2135, + "step": 5207 + }, + { + "epoch": 0.26, + "grad_norm": 0.8440440917985735, + "learning_rate": 1.7244431310214604e-05, + "loss": 0.234, + "step": 5208 + }, + { + "epoch": 0.26, + "grad_norm": 1.2084843767372881, + "learning_rate": 1.724329589955689e-05, + "loss": 0.1991, + "step": 5209 + }, + { + "epoch": 0.26, + "grad_norm": 1.2607093167904153, + "learning_rate": 1.7242160292424362e-05, + "loss": 0.2204, + "step": 5210 + }, + { + "epoch": 0.26, + "grad_norm": 0.9823639926148594, + "learning_rate": 1.724102448884783e-05, + "loss": 0.2363, + "step": 5211 + }, + { + "epoch": 0.27, + "grad_norm": 0.9200310958626141, + "learning_rate": 1.7239888488858097e-05, + "loss": 0.2204, + "step": 5212 + }, + { + "epoch": 0.27, + "grad_norm": 0.950300322829358, + "learning_rate": 1.723875229248598e-05, + "loss": 0.2235, + "step": 5213 + }, + { + "epoch": 0.27, + "grad_norm": 1.0896260359504228, + "learning_rate": 1.72376158997623e-05, + "loss": 0.2371, + "step": 5214 + }, + { + "epoch": 0.27, + "grad_norm": 0.9389995805140761, + "learning_rate": 1.7236479310717878e-05, + "loss": 0.2168, + "step": 5215 + }, + { + "epoch": 0.27, + "grad_norm": 0.970088000932802, + "learning_rate": 1.723534252538355e-05, + "loss": 0.2205, + "step": 5216 + }, + { + "epoch": 0.27, + "grad_norm": 1.1210916950770902, + "learning_rate": 1.7234205543790143e-05, + "loss": 0.2066, + "step": 5217 + }, + { + "epoch": 0.27, + "grad_norm": 0.8368801598300282, + "learning_rate": 1.7233068365968505e-05, + "loss": 0.2272, + "step": 5218 + }, + { + "epoch": 0.27, + "grad_norm": 1.007554450271422, + "learning_rate": 1.723193099194948e-05, + "loss": 0.2177, + "step": 5219 + }, + { + "epoch": 0.27, + "grad_norm": 1.127711066292918, + "learning_rate": 1.7230793421763914e-05, + "loss": 0.1948, + "step": 5220 + }, + { + "epoch": 0.27, + "grad_norm": 1.4235497287302739, + "learning_rate": 1.722965565544267e-05, + "loss": 0.1932, + "step": 5221 + }, + { + "epoch": 0.27, + "grad_norm": 1.263286240902716, + "learning_rate": 1.722851769301661e-05, + "loss": 0.2192, + "step": 5222 + }, + { + "epoch": 0.27, + "grad_norm": 0.8378216432441894, + "learning_rate": 1.7227379534516594e-05, + "loss": 0.1854, + "step": 5223 + }, + { + "epoch": 0.27, + "grad_norm": 0.87419967006041, + "learning_rate": 1.7226241179973505e-05, + "loss": 0.2109, + "step": 5224 + }, + { + "epoch": 0.27, + "grad_norm": 0.7866001766190502, + "learning_rate": 1.7225102629418217e-05, + "loss": 0.2234, + "step": 5225 + }, + { + "epoch": 0.27, + "grad_norm": 1.030357376854233, + "learning_rate": 1.7223963882881606e-05, + "loss": 0.1997, + "step": 5226 + }, + { + "epoch": 0.27, + "grad_norm": 0.9039436771158188, + "learning_rate": 1.722282494039457e-05, + "loss": 0.2306, + "step": 5227 + }, + { + "epoch": 0.27, + "grad_norm": 0.8311084196052098, + "learning_rate": 1.7221685801988003e-05, + "loss": 0.1968, + "step": 5228 + }, + { + "epoch": 0.27, + "grad_norm": 1.0155689933704966, + "learning_rate": 1.7220546467692797e-05, + "loss": 0.1919, + "step": 5229 + }, + { + "epoch": 0.27, + "grad_norm": 0.7621332743514103, + "learning_rate": 1.721940693753986e-05, + "loss": 0.1836, + "step": 5230 + }, + { + "epoch": 0.27, + "grad_norm": 0.9408075793675481, + "learning_rate": 1.7218267211560103e-05, + "loss": 0.2248, + "step": 5231 + }, + { + "epoch": 0.27, + "grad_norm": 0.978169976254573, + "learning_rate": 1.7217127289784437e-05, + "loss": 0.2243, + "step": 5232 + }, + { + "epoch": 0.27, + "grad_norm": 0.9131227974824523, + "learning_rate": 1.7215987172243788e-05, + "loss": 0.2016, + "step": 5233 + }, + { + "epoch": 0.27, + "grad_norm": 1.0586303575143619, + "learning_rate": 1.721484685896908e-05, + "loss": 0.2054, + "step": 5234 + }, + { + "epoch": 0.27, + "grad_norm": 0.7416757017851802, + "learning_rate": 1.7213706349991243e-05, + "loss": 0.2054, + "step": 5235 + }, + { + "epoch": 0.27, + "grad_norm": 1.2013216628849288, + "learning_rate": 1.721256564534122e-05, + "loss": 0.2013, + "step": 5236 + }, + { + "epoch": 0.27, + "grad_norm": 1.4472292421550423, + "learning_rate": 1.7211424745049935e-05, + "loss": 0.2044, + "step": 5237 + }, + { + "epoch": 0.27, + "grad_norm": 1.1853024744443268, + "learning_rate": 1.7210283649148355e-05, + "loss": 0.2098, + "step": 5238 + }, + { + "epoch": 0.27, + "grad_norm": 0.941182147562369, + "learning_rate": 1.720914235766742e-05, + "loss": 0.2263, + "step": 5239 + }, + { + "epoch": 0.27, + "grad_norm": 1.3061098361084038, + "learning_rate": 1.7208000870638094e-05, + "loss": 0.2085, + "step": 5240 + }, + { + "epoch": 0.27, + "grad_norm": 0.8182517884818539, + "learning_rate": 1.7206859188091334e-05, + "loss": 0.2026, + "step": 5241 + }, + { + "epoch": 0.27, + "grad_norm": 0.8225484731142085, + "learning_rate": 1.7205717310058115e-05, + "loss": 0.2006, + "step": 5242 + }, + { + "epoch": 0.27, + "grad_norm": 0.9473367325916228, + "learning_rate": 1.7204575236569403e-05, + "loss": 0.2237, + "step": 5243 + }, + { + "epoch": 0.27, + "grad_norm": 1.4132935672607803, + "learning_rate": 1.7203432967656185e-05, + "loss": 0.165, + "step": 5244 + }, + { + "epoch": 0.27, + "grad_norm": 0.9167655284706728, + "learning_rate": 1.7202290503349436e-05, + "loss": 0.1945, + "step": 5245 + }, + { + "epoch": 0.27, + "grad_norm": 0.9433089720769943, + "learning_rate": 1.7201147843680156e-05, + "loss": 0.2056, + "step": 5246 + }, + { + "epoch": 0.27, + "grad_norm": 1.5687627971666505, + "learning_rate": 1.7200004988679332e-05, + "loss": 0.2077, + "step": 5247 + }, + { + "epoch": 0.27, + "grad_norm": 1.0573084994768132, + "learning_rate": 1.7198861938377965e-05, + "loss": 0.2293, + "step": 5248 + }, + { + "epoch": 0.27, + "grad_norm": 0.7877683252059432, + "learning_rate": 1.719771869280706e-05, + "loss": 0.1943, + "step": 5249 + }, + { + "epoch": 0.27, + "grad_norm": 1.0970146564562546, + "learning_rate": 1.719657525199763e-05, + "loss": 0.2058, + "step": 5250 + }, + { + "epoch": 0.27, + "grad_norm": 0.9594541757438699, + "learning_rate": 1.7195431615980692e-05, + "loss": 0.2311, + "step": 5251 + }, + { + "epoch": 0.27, + "grad_norm": 0.9958431420570283, + "learning_rate": 1.719428778478726e-05, + "loss": 0.221, + "step": 5252 + }, + { + "epoch": 0.27, + "grad_norm": 1.1668962676306558, + "learning_rate": 1.719314375844837e-05, + "loss": 0.2285, + "step": 5253 + }, + { + "epoch": 0.27, + "grad_norm": 1.0370614060296535, + "learning_rate": 1.719199953699505e-05, + "loss": 0.2159, + "step": 5254 + }, + { + "epoch": 0.27, + "grad_norm": 0.8308609951209482, + "learning_rate": 1.7190855120458333e-05, + "loss": 0.2, + "step": 5255 + }, + { + "epoch": 0.27, + "grad_norm": 1.0656123715589458, + "learning_rate": 1.7189710508869266e-05, + "loss": 0.2023, + "step": 5256 + }, + { + "epoch": 0.27, + "grad_norm": 0.883717672221715, + "learning_rate": 1.7188565702258893e-05, + "loss": 0.235, + "step": 5257 + }, + { + "epoch": 0.27, + "grad_norm": 0.8786535948719226, + "learning_rate": 1.7187420700658273e-05, + "loss": 0.1991, + "step": 5258 + }, + { + "epoch": 0.27, + "grad_norm": 1.0064167074518728, + "learning_rate": 1.718627550409846e-05, + "loss": 0.2273, + "step": 5259 + }, + { + "epoch": 0.27, + "grad_norm": 1.2174073349081103, + "learning_rate": 1.7185130112610518e-05, + "loss": 0.2033, + "step": 5260 + }, + { + "epoch": 0.27, + "grad_norm": 1.0550775089638038, + "learning_rate": 1.7183984526225517e-05, + "loss": 0.2257, + "step": 5261 + }, + { + "epoch": 0.27, + "grad_norm": 0.843216437626411, + "learning_rate": 1.7182838744974525e-05, + "loss": 0.2314, + "step": 5262 + }, + { + "epoch": 0.27, + "grad_norm": 0.9792723396409431, + "learning_rate": 1.7181692768888632e-05, + "loss": 0.1869, + "step": 5263 + }, + { + "epoch": 0.27, + "grad_norm": 0.8365306985513885, + "learning_rate": 1.7180546597998913e-05, + "loss": 0.2035, + "step": 5264 + }, + { + "epoch": 0.27, + "grad_norm": 1.1671518444866484, + "learning_rate": 1.7179400232336462e-05, + "loss": 0.2152, + "step": 5265 + }, + { + "epoch": 0.27, + "grad_norm": 1.0216996902622235, + "learning_rate": 1.7178253671932378e-05, + "loss": 0.1933, + "step": 5266 + }, + { + "epoch": 0.27, + "grad_norm": 0.9408021541356948, + "learning_rate": 1.7177106916817754e-05, + "loss": 0.202, + "step": 5267 + }, + { + "epoch": 0.27, + "grad_norm": 1.076468275502716, + "learning_rate": 1.7175959967023703e-05, + "loss": 0.2212, + "step": 5268 + }, + { + "epoch": 0.27, + "grad_norm": 0.8578660342363781, + "learning_rate": 1.717481282258133e-05, + "loss": 0.1909, + "step": 5269 + }, + { + "epoch": 0.27, + "grad_norm": 1.398141662645059, + "learning_rate": 1.7173665483521757e-05, + "loss": 0.2328, + "step": 5270 + }, + { + "epoch": 0.27, + "grad_norm": 1.1515910090053505, + "learning_rate": 1.7172517949876098e-05, + "loss": 0.2135, + "step": 5271 + }, + { + "epoch": 0.27, + "grad_norm": 0.8627620038988313, + "learning_rate": 1.7171370221675486e-05, + "loss": 0.1891, + "step": 5272 + }, + { + "epoch": 0.27, + "grad_norm": 1.196025450833706, + "learning_rate": 1.7170222298951053e-05, + "loss": 0.2216, + "step": 5273 + }, + { + "epoch": 0.27, + "grad_norm": 0.8852782147065706, + "learning_rate": 1.7169074181733934e-05, + "loss": 0.1954, + "step": 5274 + }, + { + "epoch": 0.27, + "grad_norm": 1.1495828840508941, + "learning_rate": 1.7167925870055273e-05, + "loss": 0.1853, + "step": 5275 + }, + { + "epoch": 0.27, + "grad_norm": 1.0454671880927513, + "learning_rate": 1.716677736394622e-05, + "loss": 0.206, + "step": 5276 + }, + { + "epoch": 0.27, + "grad_norm": 1.1007704616492229, + "learning_rate": 1.7165628663437923e-05, + "loss": 0.1847, + "step": 5277 + }, + { + "epoch": 0.27, + "grad_norm": 0.9550909862275008, + "learning_rate": 1.7164479768561546e-05, + "loss": 0.1888, + "step": 5278 + }, + { + "epoch": 0.27, + "grad_norm": 1.2416770226952552, + "learning_rate": 1.7163330679348248e-05, + "loss": 0.1962, + "step": 5279 + }, + { + "epoch": 0.27, + "grad_norm": 0.9840509198815576, + "learning_rate": 1.7162181395829204e-05, + "loss": 0.2141, + "step": 5280 + }, + { + "epoch": 0.27, + "grad_norm": 1.1730068768818405, + "learning_rate": 1.7161031918035584e-05, + "loss": 0.2345, + "step": 5281 + }, + { + "epoch": 0.27, + "grad_norm": 1.0294990181578525, + "learning_rate": 1.715988224599857e-05, + "loss": 0.2259, + "step": 5282 + }, + { + "epoch": 0.27, + "grad_norm": 0.8718280425482781, + "learning_rate": 1.7158732379749342e-05, + "loss": 0.2206, + "step": 5283 + }, + { + "epoch": 0.27, + "grad_norm": 0.7973950150881813, + "learning_rate": 1.71575823193191e-05, + "loss": 0.2011, + "step": 5284 + }, + { + "epoch": 0.27, + "grad_norm": 0.8958789502079094, + "learning_rate": 1.7156432064739024e-05, + "loss": 0.235, + "step": 5285 + }, + { + "epoch": 0.27, + "grad_norm": 0.9867998258952296, + "learning_rate": 1.7155281616040333e-05, + "loss": 0.2324, + "step": 5286 + }, + { + "epoch": 0.27, + "grad_norm": 0.9559868928584956, + "learning_rate": 1.715413097325422e-05, + "loss": 0.2009, + "step": 5287 + }, + { + "epoch": 0.27, + "grad_norm": 0.9716198690195488, + "learning_rate": 1.71529801364119e-05, + "loss": 0.1964, + "step": 5288 + }, + { + "epoch": 0.27, + "grad_norm": 2.1129669967352243, + "learning_rate": 1.715182910554459e-05, + "loss": 0.2745, + "step": 5289 + }, + { + "epoch": 0.27, + "grad_norm": 0.9123072409362243, + "learning_rate": 1.7150677880683515e-05, + "loss": 0.2095, + "step": 5290 + }, + { + "epoch": 0.27, + "grad_norm": 1.0062131124197695, + "learning_rate": 1.7149526461859897e-05, + "loss": 0.2296, + "step": 5291 + }, + { + "epoch": 0.27, + "grad_norm": 1.0380383036645535, + "learning_rate": 1.7148374849104965e-05, + "loss": 0.2074, + "step": 5292 + }, + { + "epoch": 0.27, + "grad_norm": 1.5639900438854735, + "learning_rate": 1.7147223042449968e-05, + "loss": 0.1836, + "step": 5293 + }, + { + "epoch": 0.27, + "grad_norm": 0.7888321865748255, + "learning_rate": 1.7146071041926138e-05, + "loss": 0.1985, + "step": 5294 + }, + { + "epoch": 0.27, + "grad_norm": 1.433020224907867, + "learning_rate": 1.714491884756473e-05, + "loss": 0.252, + "step": 5295 + }, + { + "epoch": 0.27, + "grad_norm": 0.9519082354925571, + "learning_rate": 1.7143766459396993e-05, + "loss": 0.2113, + "step": 5296 + }, + { + "epoch": 0.27, + "grad_norm": 1.0719575839448903, + "learning_rate": 1.7142613877454186e-05, + "loss": 0.2011, + "step": 5297 + }, + { + "epoch": 0.27, + "grad_norm": 1.0089385606249057, + "learning_rate": 1.714146110176758e-05, + "loss": 0.2314, + "step": 5298 + }, + { + "epoch": 0.27, + "grad_norm": 1.2971638756917503, + "learning_rate": 1.714030813236843e-05, + "loss": 0.2287, + "step": 5299 + }, + { + "epoch": 0.27, + "grad_norm": 1.3336062167999805, + "learning_rate": 1.7139154969288026e-05, + "loss": 0.2174, + "step": 5300 + }, + { + "epoch": 0.27, + "grad_norm": 1.0890198977625634, + "learning_rate": 1.7138001612557636e-05, + "loss": 0.2095, + "step": 5301 + }, + { + "epoch": 0.27, + "grad_norm": 1.2127147798939326, + "learning_rate": 1.7136848062208552e-05, + "loss": 0.2234, + "step": 5302 + }, + { + "epoch": 0.27, + "grad_norm": 3.4830948947069063, + "learning_rate": 1.7135694318272057e-05, + "loss": 0.2158, + "step": 5303 + }, + { + "epoch": 0.27, + "grad_norm": 1.1718966869611827, + "learning_rate": 1.7134540380779453e-05, + "loss": 0.2205, + "step": 5304 + }, + { + "epoch": 0.27, + "grad_norm": 1.302483243747173, + "learning_rate": 1.713338624976204e-05, + "loss": 0.209, + "step": 5305 + }, + { + "epoch": 0.27, + "grad_norm": 0.9086727219113531, + "learning_rate": 1.713223192525112e-05, + "loss": 0.191, + "step": 5306 + }, + { + "epoch": 0.27, + "grad_norm": 1.11751268899965, + "learning_rate": 1.7131077407278008e-05, + "loss": 0.2278, + "step": 5307 + }, + { + "epoch": 0.27, + "grad_norm": 1.0514724800840336, + "learning_rate": 1.7129922695874016e-05, + "loss": 0.2207, + "step": 5308 + }, + { + "epoch": 0.27, + "grad_norm": 0.9040266788733141, + "learning_rate": 1.712876779107047e-05, + "loss": 0.2056, + "step": 5309 + }, + { + "epoch": 0.27, + "grad_norm": 0.9642309087364644, + "learning_rate": 1.7127612692898695e-05, + "loss": 0.1947, + "step": 5310 + }, + { + "epoch": 0.27, + "grad_norm": 0.9401415784326851, + "learning_rate": 1.7126457401390023e-05, + "loss": 0.2014, + "step": 5311 + }, + { + "epoch": 0.27, + "grad_norm": 1.027856827944775, + "learning_rate": 1.712530191657579e-05, + "loss": 0.2217, + "step": 5312 + }, + { + "epoch": 0.27, + "grad_norm": 0.926767366522984, + "learning_rate": 1.712414623848734e-05, + "loss": 0.1958, + "step": 5313 + }, + { + "epoch": 0.27, + "grad_norm": 1.0775741073707663, + "learning_rate": 1.712299036715602e-05, + "loss": 0.234, + "step": 5314 + }, + { + "epoch": 0.27, + "grad_norm": 1.0525832503522692, + "learning_rate": 1.712183430261319e-05, + "loss": 0.1927, + "step": 5315 + }, + { + "epoch": 0.27, + "grad_norm": 0.8435504630743897, + "learning_rate": 1.71206780448902e-05, + "loss": 0.1961, + "step": 5316 + }, + { + "epoch": 0.27, + "grad_norm": 0.8021204498469657, + "learning_rate": 1.711952159401841e-05, + "loss": 0.1926, + "step": 5317 + }, + { + "epoch": 0.27, + "grad_norm": 1.0147291827406735, + "learning_rate": 1.71183649500292e-05, + "loss": 0.1947, + "step": 5318 + }, + { + "epoch": 0.27, + "grad_norm": 1.0304586584190933, + "learning_rate": 1.711720811295394e-05, + "loss": 0.2174, + "step": 5319 + }, + { + "epoch": 0.27, + "grad_norm": 0.9570126657180933, + "learning_rate": 1.7116051082824003e-05, + "loss": 0.2029, + "step": 5320 + }, + { + "epoch": 0.27, + "grad_norm": 0.9625640745359515, + "learning_rate": 1.711489385967078e-05, + "loss": 0.1857, + "step": 5321 + }, + { + "epoch": 0.27, + "grad_norm": 0.9279472961440479, + "learning_rate": 1.7113736443525662e-05, + "loss": 0.2104, + "step": 5322 + }, + { + "epoch": 0.27, + "grad_norm": 0.9756088976113342, + "learning_rate": 1.7112578834420036e-05, + "loss": 0.2136, + "step": 5323 + }, + { + "epoch": 0.27, + "grad_norm": 1.0709776914176408, + "learning_rate": 1.7111421032385313e-05, + "loss": 0.2043, + "step": 5324 + }, + { + "epoch": 0.27, + "grad_norm": 1.225056753679554, + "learning_rate": 1.711026303745289e-05, + "loss": 0.222, + "step": 5325 + }, + { + "epoch": 0.27, + "grad_norm": 0.9022739276514431, + "learning_rate": 1.710910484965418e-05, + "loss": 0.2045, + "step": 5326 + }, + { + "epoch": 0.27, + "grad_norm": 0.7728402557707814, + "learning_rate": 1.71079464690206e-05, + "loss": 0.2314, + "step": 5327 + }, + { + "epoch": 0.27, + "grad_norm": 0.8080405879850907, + "learning_rate": 1.7106787895583573e-05, + "loss": 0.232, + "step": 5328 + }, + { + "epoch": 0.27, + "grad_norm": 0.7797907753458592, + "learning_rate": 1.710562912937452e-05, + "loss": 0.2095, + "step": 5329 + }, + { + "epoch": 0.27, + "grad_norm": 1.5049046662803585, + "learning_rate": 1.710447017042488e-05, + "loss": 0.2189, + "step": 5330 + }, + { + "epoch": 0.27, + "grad_norm": 0.8657038274319842, + "learning_rate": 1.710331101876608e-05, + "loss": 0.2169, + "step": 5331 + }, + { + "epoch": 0.27, + "grad_norm": 0.9621461414357568, + "learning_rate": 1.7102151674429567e-05, + "loss": 0.211, + "step": 5332 + }, + { + "epoch": 0.27, + "grad_norm": 1.3494794196811704, + "learning_rate": 1.7100992137446792e-05, + "loss": 0.1997, + "step": 5333 + }, + { + "epoch": 0.27, + "grad_norm": 1.0959570082489183, + "learning_rate": 1.7099832407849203e-05, + "loss": 0.2073, + "step": 5334 + }, + { + "epoch": 0.27, + "grad_norm": 1.0047881510760754, + "learning_rate": 1.709867248566826e-05, + "loss": 0.2302, + "step": 5335 + }, + { + "epoch": 0.27, + "grad_norm": 1.292243114304556, + "learning_rate": 1.7097512370935422e-05, + "loss": 0.256, + "step": 5336 + }, + { + "epoch": 0.27, + "grad_norm": 0.8680614701378598, + "learning_rate": 1.7096352063682163e-05, + "loss": 0.2063, + "step": 5337 + }, + { + "epoch": 0.27, + "grad_norm": 1.4413192657041078, + "learning_rate": 1.709519156393995e-05, + "loss": 0.2173, + "step": 5338 + }, + { + "epoch": 0.27, + "grad_norm": 0.9953817989447833, + "learning_rate": 1.709403087174027e-05, + "loss": 0.2115, + "step": 5339 + }, + { + "epoch": 0.27, + "grad_norm": 1.0349009846998212, + "learning_rate": 1.70928699871146e-05, + "loss": 0.2315, + "step": 5340 + }, + { + "epoch": 0.27, + "grad_norm": 0.977550830932778, + "learning_rate": 1.709170891009443e-05, + "loss": 0.1951, + "step": 5341 + }, + { + "epoch": 0.27, + "grad_norm": 0.9936609207937576, + "learning_rate": 1.7090547640711256e-05, + "loss": 0.2179, + "step": 5342 + }, + { + "epoch": 0.27, + "grad_norm": 1.0861275402229364, + "learning_rate": 1.7089386178996576e-05, + "loss": 0.2168, + "step": 5343 + }, + { + "epoch": 0.27, + "grad_norm": 0.7740154237094764, + "learning_rate": 1.70882245249819e-05, + "loss": 0.2033, + "step": 5344 + }, + { + "epoch": 0.27, + "grad_norm": 1.137396135861429, + "learning_rate": 1.7087062678698726e-05, + "loss": 0.2073, + "step": 5345 + }, + { + "epoch": 0.27, + "grad_norm": 0.9734578826182253, + "learning_rate": 1.7085900640178582e-05, + "loss": 0.2231, + "step": 5346 + }, + { + "epoch": 0.27, + "grad_norm": 2.023946483249804, + "learning_rate": 1.7084738409452982e-05, + "loss": 0.2204, + "step": 5347 + }, + { + "epoch": 0.27, + "grad_norm": 0.9838807350978188, + "learning_rate": 1.7083575986553448e-05, + "loss": 0.2084, + "step": 5348 + }, + { + "epoch": 0.27, + "grad_norm": 1.038334702873617, + "learning_rate": 1.708241337151152e-05, + "loss": 0.2181, + "step": 5349 + }, + { + "epoch": 0.27, + "grad_norm": 0.9705124443322362, + "learning_rate": 1.708125056435873e-05, + "loss": 0.2053, + "step": 5350 + }, + { + "epoch": 0.27, + "grad_norm": 1.0641275795654905, + "learning_rate": 1.7080087565126613e-05, + "loss": 0.2133, + "step": 5351 + }, + { + "epoch": 0.27, + "grad_norm": 0.8789516998052838, + "learning_rate": 1.707892437384673e-05, + "loss": 0.2174, + "step": 5352 + }, + { + "epoch": 0.27, + "grad_norm": 0.9768442549284431, + "learning_rate": 1.7077760990550617e-05, + "loss": 0.2115, + "step": 5353 + }, + { + "epoch": 0.27, + "grad_norm": 1.0576896748411604, + "learning_rate": 1.7076597415269836e-05, + "loss": 0.2261, + "step": 5354 + }, + { + "epoch": 0.27, + "grad_norm": 2.04442745089693, + "learning_rate": 1.7075433648035952e-05, + "loss": 0.2209, + "step": 5355 + }, + { + "epoch": 0.27, + "grad_norm": 1.9242450796659158, + "learning_rate": 1.707426968888053e-05, + "loss": 0.2035, + "step": 5356 + }, + { + "epoch": 0.27, + "grad_norm": 0.8041864219716768, + "learning_rate": 1.7073105537835145e-05, + "loss": 0.2223, + "step": 5357 + }, + { + "epoch": 0.27, + "grad_norm": 1.0618635761951816, + "learning_rate": 1.7071941194931372e-05, + "loss": 0.1791, + "step": 5358 + }, + { + "epoch": 0.27, + "grad_norm": 0.8489978326658181, + "learning_rate": 1.7070776660200797e-05, + "loss": 0.2106, + "step": 5359 + }, + { + "epoch": 0.27, + "grad_norm": 1.0697877918430914, + "learning_rate": 1.7069611933675006e-05, + "loss": 0.2105, + "step": 5360 + }, + { + "epoch": 0.27, + "grad_norm": 1.06294677154257, + "learning_rate": 1.7068447015385587e-05, + "loss": 0.236, + "step": 5361 + }, + { + "epoch": 0.27, + "grad_norm": 1.2730267833270381, + "learning_rate": 1.706728190536415e-05, + "loss": 0.2269, + "step": 5362 + }, + { + "epoch": 0.27, + "grad_norm": 1.1527764705868002, + "learning_rate": 1.7066116603642285e-05, + "loss": 0.2155, + "step": 5363 + }, + { + "epoch": 0.27, + "grad_norm": 0.9380764122116448, + "learning_rate": 1.706495111025161e-05, + "loss": 0.2209, + "step": 5364 + }, + { + "epoch": 0.27, + "grad_norm": 2.816402075469522, + "learning_rate": 1.706378542522374e-05, + "loss": 0.2285, + "step": 5365 + }, + { + "epoch": 0.27, + "grad_norm": 1.097092753302864, + "learning_rate": 1.706261954859029e-05, + "loss": 0.223, + "step": 5366 + }, + { + "epoch": 0.27, + "grad_norm": 1.2072634018517636, + "learning_rate": 1.7061453480382885e-05, + "loss": 0.2088, + "step": 5367 + }, + { + "epoch": 0.27, + "grad_norm": 1.1616017699973211, + "learning_rate": 1.7060287220633158e-05, + "loss": 0.1903, + "step": 5368 + }, + { + "epoch": 0.27, + "grad_norm": 1.0131673326230828, + "learning_rate": 1.7059120769372737e-05, + "loss": 0.204, + "step": 5369 + }, + { + "epoch": 0.27, + "grad_norm": 1.0226302459685408, + "learning_rate": 1.7057954126633268e-05, + "loss": 0.1971, + "step": 5370 + }, + { + "epoch": 0.27, + "grad_norm": 1.1991289041476931, + "learning_rate": 1.7056787292446396e-05, + "loss": 0.2156, + "step": 5371 + }, + { + "epoch": 0.27, + "grad_norm": 0.9740162925818302, + "learning_rate": 1.7055620266843776e-05, + "loss": 0.2157, + "step": 5372 + }, + { + "epoch": 0.27, + "grad_norm": 1.3082793580892642, + "learning_rate": 1.705445304985705e-05, + "loss": 0.1786, + "step": 5373 + }, + { + "epoch": 0.27, + "grad_norm": 0.9009398830787593, + "learning_rate": 1.7053285641517886e-05, + "loss": 0.1898, + "step": 5374 + }, + { + "epoch": 0.27, + "grad_norm": 1.333310758399028, + "learning_rate": 1.7052118041857954e-05, + "loss": 0.2291, + "step": 5375 + }, + { + "epoch": 0.27, + "grad_norm": 0.8219067225406175, + "learning_rate": 1.7050950250908923e-05, + "loss": 0.2086, + "step": 5376 + }, + { + "epoch": 0.27, + "grad_norm": 1.0504821356893528, + "learning_rate": 1.7049782268702464e-05, + "loss": 0.2185, + "step": 5377 + }, + { + "epoch": 0.27, + "grad_norm": 1.5062762094220585, + "learning_rate": 1.7048614095270264e-05, + "loss": 0.2142, + "step": 5378 + }, + { + "epoch": 0.27, + "grad_norm": 1.1517461171713312, + "learning_rate": 1.704744573064401e-05, + "loss": 0.2252, + "step": 5379 + }, + { + "epoch": 0.27, + "grad_norm": 1.0666768851989106, + "learning_rate": 1.704627717485539e-05, + "loss": 0.2154, + "step": 5380 + }, + { + "epoch": 0.27, + "grad_norm": 0.9800546805869343, + "learning_rate": 1.7045108427936104e-05, + "loss": 0.2039, + "step": 5381 + }, + { + "epoch": 0.27, + "grad_norm": 1.285558315197764, + "learning_rate": 1.7043939489917858e-05, + "loss": 0.217, + "step": 5382 + }, + { + "epoch": 0.27, + "grad_norm": 0.9672045321469797, + "learning_rate": 1.7042770360832353e-05, + "loss": 0.2049, + "step": 5383 + }, + { + "epoch": 0.27, + "grad_norm": 1.3148918349554373, + "learning_rate": 1.7041601040711303e-05, + "loss": 0.2531, + "step": 5384 + }, + { + "epoch": 0.27, + "grad_norm": 0.9287482626051508, + "learning_rate": 1.7040431529586427e-05, + "loss": 0.1955, + "step": 5385 + }, + { + "epoch": 0.27, + "grad_norm": 1.0439263751970198, + "learning_rate": 1.7039261827489452e-05, + "loss": 0.2057, + "step": 5386 + }, + { + "epoch": 0.27, + "grad_norm": 1.0966870805891058, + "learning_rate": 1.7038091934452098e-05, + "loss": 0.2182, + "step": 5387 + }, + { + "epoch": 0.27, + "grad_norm": 1.0429284976399116, + "learning_rate": 1.7036921850506104e-05, + "loss": 0.2444, + "step": 5388 + }, + { + "epoch": 0.27, + "grad_norm": 1.9220008646181936, + "learning_rate": 1.7035751575683208e-05, + "loss": 0.2106, + "step": 5389 + }, + { + "epoch": 0.27, + "grad_norm": 1.2418202057946766, + "learning_rate": 1.7034581110015156e-05, + "loss": 0.2111, + "step": 5390 + }, + { + "epoch": 0.27, + "grad_norm": 0.9337888990910338, + "learning_rate": 1.7033410453533687e-05, + "loss": 0.2124, + "step": 5391 + }, + { + "epoch": 0.27, + "grad_norm": 1.2735895782979454, + "learning_rate": 1.7032239606270567e-05, + "loss": 0.1977, + "step": 5392 + }, + { + "epoch": 0.27, + "grad_norm": 0.8751710465726789, + "learning_rate": 1.7031068568257548e-05, + "loss": 0.1978, + "step": 5393 + }, + { + "epoch": 0.27, + "grad_norm": 0.8455923178009026, + "learning_rate": 1.7029897339526404e-05, + "loss": 0.2147, + "step": 5394 + }, + { + "epoch": 0.27, + "grad_norm": 0.826334770638359, + "learning_rate": 1.702872592010889e-05, + "loss": 0.2223, + "step": 5395 + }, + { + "epoch": 0.27, + "grad_norm": 1.0356782106426614, + "learning_rate": 1.702755431003679e-05, + "loss": 0.216, + "step": 5396 + }, + { + "epoch": 0.27, + "grad_norm": 1.0489926641495397, + "learning_rate": 1.7026382509341885e-05, + "loss": 0.2214, + "step": 5397 + }, + { + "epoch": 0.27, + "grad_norm": 0.9830309908645947, + "learning_rate": 1.7025210518055954e-05, + "loss": 0.2126, + "step": 5398 + }, + { + "epoch": 0.27, + "grad_norm": 0.8860550610226454, + "learning_rate": 1.7024038336210794e-05, + "loss": 0.1982, + "step": 5399 + }, + { + "epoch": 0.27, + "grad_norm": 1.0477678845976919, + "learning_rate": 1.7022865963838195e-05, + "loss": 0.1764, + "step": 5400 + }, + { + "epoch": 0.27, + "grad_norm": 0.9756090995833591, + "learning_rate": 1.7021693400969962e-05, + "loss": 0.2074, + "step": 5401 + }, + { + "epoch": 0.27, + "grad_norm": 1.0863202596740222, + "learning_rate": 1.7020520647637894e-05, + "loss": 0.2004, + "step": 5402 + }, + { + "epoch": 0.27, + "grad_norm": 2.6093505669717603, + "learning_rate": 1.701934770387381e-05, + "loss": 0.2044, + "step": 5403 + }, + { + "epoch": 0.27, + "grad_norm": 1.1912297537469665, + "learning_rate": 1.7018174569709523e-05, + "loss": 0.214, + "step": 5404 + }, + { + "epoch": 0.27, + "grad_norm": 1.0552055408422767, + "learning_rate": 1.7017001245176857e-05, + "loss": 0.2056, + "step": 5405 + }, + { + "epoch": 0.27, + "grad_norm": 0.9568556007681108, + "learning_rate": 1.7015827730307637e-05, + "loss": 0.2009, + "step": 5406 + }, + { + "epoch": 0.27, + "grad_norm": 1.0428933384888612, + "learning_rate": 1.701465402513369e-05, + "loss": 0.1951, + "step": 5407 + }, + { + "epoch": 0.28, + "grad_norm": 1.0154243359082187, + "learning_rate": 1.7013480129686857e-05, + "loss": 0.2184, + "step": 5408 + }, + { + "epoch": 0.28, + "grad_norm": 1.1022197933165605, + "learning_rate": 1.701230604399898e-05, + "loss": 0.197, + "step": 5409 + }, + { + "epoch": 0.28, + "grad_norm": 1.0560707879020101, + "learning_rate": 1.7011131768101906e-05, + "loss": 0.2162, + "step": 5410 + }, + { + "epoch": 0.28, + "grad_norm": 1.2588170514748378, + "learning_rate": 1.7009957302027484e-05, + "loss": 0.2096, + "step": 5411 + }, + { + "epoch": 0.28, + "grad_norm": 1.0979029434665841, + "learning_rate": 1.7008782645807578e-05, + "loss": 0.2112, + "step": 5412 + }, + { + "epoch": 0.28, + "grad_norm": 0.9524557231896512, + "learning_rate": 1.7007607799474045e-05, + "loss": 0.2133, + "step": 5413 + }, + { + "epoch": 0.28, + "grad_norm": 1.1895621186357845, + "learning_rate": 1.7006432763058753e-05, + "loss": 0.2, + "step": 5414 + }, + { + "epoch": 0.28, + "grad_norm": 1.190375294075288, + "learning_rate": 1.7005257536593577e-05, + "loss": 0.2149, + "step": 5415 + }, + { + "epoch": 0.28, + "grad_norm": 0.9149426868161031, + "learning_rate": 1.7004082120110396e-05, + "loss": 0.2138, + "step": 5416 + }, + { + "epoch": 0.28, + "grad_norm": 0.885553028586094, + "learning_rate": 1.7002906513641094e-05, + "loss": 0.2094, + "step": 5417 + }, + { + "epoch": 0.28, + "grad_norm": 1.1377344849438038, + "learning_rate": 1.7001730717217554e-05, + "loss": 0.2188, + "step": 5418 + }, + { + "epoch": 0.28, + "grad_norm": 1.0142042105802078, + "learning_rate": 1.700055473087167e-05, + "loss": 0.2463, + "step": 5419 + }, + { + "epoch": 0.28, + "grad_norm": 0.9577845607969336, + "learning_rate": 1.699937855463535e-05, + "loss": 0.2003, + "step": 5420 + }, + { + "epoch": 0.28, + "grad_norm": 0.7987401617507317, + "learning_rate": 1.699820218854049e-05, + "loss": 0.196, + "step": 5421 + }, + { + "epoch": 0.28, + "grad_norm": 0.9466080898054804, + "learning_rate": 1.6997025632618996e-05, + "loss": 0.2141, + "step": 5422 + }, + { + "epoch": 0.28, + "grad_norm": 0.9591314964338855, + "learning_rate": 1.6995848886902794e-05, + "loss": 0.2099, + "step": 5423 + }, + { + "epoch": 0.28, + "grad_norm": 1.0143917225888177, + "learning_rate": 1.699467195142379e-05, + "loss": 0.2189, + "step": 5424 + }, + { + "epoch": 0.28, + "grad_norm": 1.1287154569913822, + "learning_rate": 1.6993494826213917e-05, + "loss": 0.2119, + "step": 5425 + }, + { + "epoch": 0.28, + "grad_norm": 0.9108997932383712, + "learning_rate": 1.6992317511305103e-05, + "loss": 0.2298, + "step": 5426 + }, + { + "epoch": 0.28, + "grad_norm": 1.0312293374855317, + "learning_rate": 1.6991140006729277e-05, + "loss": 0.2044, + "step": 5427 + }, + { + "epoch": 0.28, + "grad_norm": 1.2243294060862493, + "learning_rate": 1.6989962312518384e-05, + "loss": 0.2113, + "step": 5428 + }, + { + "epoch": 0.28, + "grad_norm": 1.3494977850309595, + "learning_rate": 1.698878442870437e-05, + "loss": 0.2011, + "step": 5429 + }, + { + "epoch": 0.28, + "grad_norm": 1.6045698540700097, + "learning_rate": 1.6987606355319184e-05, + "loss": 0.2081, + "step": 5430 + }, + { + "epoch": 0.28, + "grad_norm": 0.9385287988505853, + "learning_rate": 1.698642809239478e-05, + "loss": 0.189, + "step": 5431 + }, + { + "epoch": 0.28, + "grad_norm": 1.064849914586746, + "learning_rate": 1.698524963996312e-05, + "loss": 0.238, + "step": 5432 + }, + { + "epoch": 0.28, + "grad_norm": 0.9401539962006684, + "learning_rate": 1.698407099805617e-05, + "loss": 0.1883, + "step": 5433 + }, + { + "epoch": 0.28, + "grad_norm": 1.2014313002921773, + "learning_rate": 1.69828921667059e-05, + "loss": 0.1861, + "step": 5434 + }, + { + "epoch": 0.28, + "grad_norm": 1.0598940147011735, + "learning_rate": 1.6981713145944284e-05, + "loss": 0.2166, + "step": 5435 + }, + { + "epoch": 0.28, + "grad_norm": 1.1006504747535542, + "learning_rate": 1.6980533935803306e-05, + "loss": 0.2054, + "step": 5436 + }, + { + "epoch": 0.28, + "grad_norm": 0.8626033245045626, + "learning_rate": 1.6979354536314946e-05, + "loss": 0.2281, + "step": 5437 + }, + { + "epoch": 0.28, + "grad_norm": 0.8596006588035439, + "learning_rate": 1.6978174947511206e-05, + "loss": 0.2064, + "step": 5438 + }, + { + "epoch": 0.28, + "grad_norm": 1.4517008204949062, + "learning_rate": 1.6976995169424072e-05, + "loss": 0.2104, + "step": 5439 + }, + { + "epoch": 0.28, + "grad_norm": 0.8330081037185146, + "learning_rate": 1.6975815202085556e-05, + "loss": 0.191, + "step": 5440 + }, + { + "epoch": 0.28, + "grad_norm": 1.2590492794567847, + "learning_rate": 1.6974635045527652e-05, + "loss": 0.2134, + "step": 5441 + }, + { + "epoch": 0.28, + "grad_norm": 1.002732843409242, + "learning_rate": 1.6973454699782382e-05, + "loss": 0.2081, + "step": 5442 + }, + { + "epoch": 0.28, + "grad_norm": 0.7695496820937182, + "learning_rate": 1.697227416488176e-05, + "loss": 0.224, + "step": 5443 + }, + { + "epoch": 0.28, + "grad_norm": 0.7855777505010354, + "learning_rate": 1.6971093440857808e-05, + "loss": 0.1951, + "step": 5444 + }, + { + "epoch": 0.28, + "grad_norm": 0.9924584904825682, + "learning_rate": 1.6969912527742547e-05, + "loss": 0.2015, + "step": 5445 + }, + { + "epoch": 0.28, + "grad_norm": 1.1286658064526591, + "learning_rate": 1.696873142556802e-05, + "loss": 0.2269, + "step": 5446 + }, + { + "epoch": 0.28, + "grad_norm": 1.1905921151254606, + "learning_rate": 1.6967550134366256e-05, + "loss": 0.2266, + "step": 5447 + }, + { + "epoch": 0.28, + "grad_norm": 0.9938986283838219, + "learning_rate": 1.6966368654169305e-05, + "loss": 0.2068, + "step": 5448 + }, + { + "epoch": 0.28, + "grad_norm": 0.9412002207458522, + "learning_rate": 1.696518698500921e-05, + "loss": 0.1996, + "step": 5449 + }, + { + "epoch": 0.28, + "grad_norm": 0.8128848611817997, + "learning_rate": 1.696400512691802e-05, + "loss": 0.2133, + "step": 5450 + }, + { + "epoch": 0.28, + "grad_norm": 0.7118661473447155, + "learning_rate": 1.6962823079927803e-05, + "loss": 0.2049, + "step": 5451 + }, + { + "epoch": 0.28, + "grad_norm": 0.9659168027327373, + "learning_rate": 1.696164084407062e-05, + "loss": 0.1916, + "step": 5452 + }, + { + "epoch": 0.28, + "grad_norm": 1.07456804180084, + "learning_rate": 1.6960458419378528e-05, + "loss": 0.2201, + "step": 5453 + }, + { + "epoch": 0.28, + "grad_norm": 1.1286968393726051, + "learning_rate": 1.695927580588361e-05, + "loss": 0.2281, + "step": 5454 + }, + { + "epoch": 0.28, + "grad_norm": 0.9546368608833864, + "learning_rate": 1.6958093003617942e-05, + "loss": 0.2243, + "step": 5455 + }, + { + "epoch": 0.28, + "grad_norm": 0.8286795365597308, + "learning_rate": 1.6956910012613612e-05, + "loss": 0.211, + "step": 5456 + }, + { + "epoch": 0.28, + "grad_norm": 3.1624253033639174, + "learning_rate": 1.6955726832902705e-05, + "loss": 0.1959, + "step": 5457 + }, + { + "epoch": 0.28, + "grad_norm": 0.8258952586684022, + "learning_rate": 1.6954543464517313e-05, + "loss": 0.2002, + "step": 5458 + }, + { + "epoch": 0.28, + "grad_norm": 0.9861277252982845, + "learning_rate": 1.6953359907489538e-05, + "loss": 0.1835, + "step": 5459 + }, + { + "epoch": 0.28, + "grad_norm": 0.869053520172057, + "learning_rate": 1.695217616185148e-05, + "loss": 0.2033, + "step": 5460 + }, + { + "epoch": 0.28, + "grad_norm": 1.466873269888362, + "learning_rate": 1.6950992227635252e-05, + "loss": 0.214, + "step": 5461 + }, + { + "epoch": 0.28, + "grad_norm": 0.9216917177291614, + "learning_rate": 1.6949808104872965e-05, + "loss": 0.208, + "step": 5462 + }, + { + "epoch": 0.28, + "grad_norm": 1.1146734573826524, + "learning_rate": 1.6948623793596744e-05, + "loss": 0.2057, + "step": 5463 + }, + { + "epoch": 0.28, + "grad_norm": 0.9795441070290263, + "learning_rate": 1.694743929383871e-05, + "loss": 0.2031, + "step": 5464 + }, + { + "epoch": 0.28, + "grad_norm": 0.9643666484708967, + "learning_rate": 1.6946254605630995e-05, + "loss": 0.2045, + "step": 5465 + }, + { + "epoch": 0.28, + "grad_norm": 0.9659569712091012, + "learning_rate": 1.6945069729005726e-05, + "loss": 0.1829, + "step": 5466 + }, + { + "epoch": 0.28, + "grad_norm": 0.8573246043762399, + "learning_rate": 1.6943884663995055e-05, + "loss": 0.2012, + "step": 5467 + }, + { + "epoch": 0.28, + "grad_norm": 1.1065850167937314, + "learning_rate": 1.6942699410631114e-05, + "loss": 0.1991, + "step": 5468 + }, + { + "epoch": 0.28, + "grad_norm": 2.7600803308859483, + "learning_rate": 1.6941513968946063e-05, + "loss": 0.2072, + "step": 5469 + }, + { + "epoch": 0.28, + "grad_norm": 0.9756031387165406, + "learning_rate": 1.6940328338972053e-05, + "loss": 0.2149, + "step": 5470 + }, + { + "epoch": 0.28, + "grad_norm": 0.7830774903264235, + "learning_rate": 1.6939142520741243e-05, + "loss": 0.1987, + "step": 5471 + }, + { + "epoch": 0.28, + "grad_norm": 1.047546367636695, + "learning_rate": 1.6937956514285797e-05, + "loss": 0.2123, + "step": 5472 + }, + { + "epoch": 0.28, + "grad_norm": 1.2211101494147778, + "learning_rate": 1.6936770319637896e-05, + "loss": 0.1906, + "step": 5473 + }, + { + "epoch": 0.28, + "grad_norm": 0.9542329622300272, + "learning_rate": 1.6935583936829706e-05, + "loss": 0.2119, + "step": 5474 + }, + { + "epoch": 0.28, + "grad_norm": 0.99799970671572, + "learning_rate": 1.693439736589341e-05, + "loss": 0.2135, + "step": 5475 + }, + { + "epoch": 0.28, + "grad_norm": 1.2219047082819476, + "learning_rate": 1.693321060686119e-05, + "loss": 0.2051, + "step": 5476 + }, + { + "epoch": 0.28, + "grad_norm": 1.0655680430773613, + "learning_rate": 1.6932023659765248e-05, + "loss": 0.2079, + "step": 5477 + }, + { + "epoch": 0.28, + "grad_norm": 0.8407756911500547, + "learning_rate": 1.6930836524637766e-05, + "loss": 0.2046, + "step": 5478 + }, + { + "epoch": 0.28, + "grad_norm": 1.0019352759531008, + "learning_rate": 1.6929649201510953e-05, + "loss": 0.2061, + "step": 5479 + }, + { + "epoch": 0.28, + "grad_norm": 1.0336266052542888, + "learning_rate": 1.692846169041702e-05, + "loss": 0.2073, + "step": 5480 + }, + { + "epoch": 0.28, + "grad_norm": 1.1592968084273145, + "learning_rate": 1.6927273991388164e-05, + "loss": 0.2137, + "step": 5481 + }, + { + "epoch": 0.28, + "grad_norm": 1.0472714014176197, + "learning_rate": 1.6926086104456613e-05, + "loss": 0.1932, + "step": 5482 + }, + { + "epoch": 0.28, + "grad_norm": 0.8884190104412825, + "learning_rate": 1.6924898029654585e-05, + "loss": 0.1962, + "step": 5483 + }, + { + "epoch": 0.28, + "grad_norm": 1.440890522067889, + "learning_rate": 1.692370976701431e-05, + "loss": 0.2311, + "step": 5484 + }, + { + "epoch": 0.28, + "grad_norm": 0.9975412180086698, + "learning_rate": 1.6922521316568014e-05, + "loss": 0.1937, + "step": 5485 + }, + { + "epoch": 0.28, + "grad_norm": 0.9162614283067528, + "learning_rate": 1.6921332678347936e-05, + "loss": 0.2477, + "step": 5486 + }, + { + "epoch": 0.28, + "grad_norm": 0.885433998276788, + "learning_rate": 1.6920143852386316e-05, + "loss": 0.1839, + "step": 5487 + }, + { + "epoch": 0.28, + "grad_norm": 0.9263433712536561, + "learning_rate": 1.6918954838715408e-05, + "loss": 0.2161, + "step": 5488 + }, + { + "epoch": 0.28, + "grad_norm": 1.053505012946194, + "learning_rate": 1.6917765637367455e-05, + "loss": 0.2018, + "step": 5489 + }, + { + "epoch": 0.28, + "grad_norm": 1.3169890952877474, + "learning_rate": 1.691657624837472e-05, + "loss": 0.2152, + "step": 5490 + }, + { + "epoch": 0.28, + "grad_norm": 0.7815313992450333, + "learning_rate": 1.6915386671769463e-05, + "loss": 0.1914, + "step": 5491 + }, + { + "epoch": 0.28, + "grad_norm": 1.070490158653429, + "learning_rate": 1.6914196907583952e-05, + "loss": 0.231, + "step": 5492 + }, + { + "epoch": 0.28, + "grad_norm": 0.8897247721716537, + "learning_rate": 1.6913006955850462e-05, + "loss": 0.2147, + "step": 5493 + }, + { + "epoch": 0.28, + "grad_norm": 1.2170042600343527, + "learning_rate": 1.6911816816601266e-05, + "loss": 0.2403, + "step": 5494 + }, + { + "epoch": 0.28, + "grad_norm": 1.202057324771667, + "learning_rate": 1.691062648986865e-05, + "loss": 0.21, + "step": 5495 + }, + { + "epoch": 0.28, + "grad_norm": 1.0040241770319058, + "learning_rate": 1.69094359756849e-05, + "loss": 0.2084, + "step": 5496 + }, + { + "epoch": 0.28, + "grad_norm": 0.8620668771810918, + "learning_rate": 1.6908245274082306e-05, + "loss": 0.2105, + "step": 5497 + }, + { + "epoch": 0.28, + "grad_norm": 1.2031020687777687, + "learning_rate": 1.690705438509317e-05, + "loss": 0.2091, + "step": 5498 + }, + { + "epoch": 0.28, + "grad_norm": 2.2348820456964273, + "learning_rate": 1.6905863308749793e-05, + "loss": 0.1794, + "step": 5499 + }, + { + "epoch": 0.28, + "grad_norm": 1.0425667322586452, + "learning_rate": 1.6904672045084485e-05, + "loss": 0.2207, + "step": 5500 + }, + { + "epoch": 0.28, + "grad_norm": 1.0625047037067803, + "learning_rate": 1.6903480594129557e-05, + "loss": 0.1962, + "step": 5501 + }, + { + "epoch": 0.28, + "grad_norm": 1.1736637841443045, + "learning_rate": 1.6902288955917328e-05, + "loss": 0.2155, + "step": 5502 + }, + { + "epoch": 0.28, + "grad_norm": 4.101980767830775, + "learning_rate": 1.690109713048012e-05, + "loss": 0.2163, + "step": 5503 + }, + { + "epoch": 0.28, + "grad_norm": 5.937151736434946, + "learning_rate": 1.6899905117850266e-05, + "loss": 0.1959, + "step": 5504 + }, + { + "epoch": 0.28, + "grad_norm": 0.8079687624922969, + "learning_rate": 1.6898712918060093e-05, + "loss": 0.1853, + "step": 5505 + }, + { + "epoch": 0.28, + "grad_norm": 0.8451325110889567, + "learning_rate": 1.6897520531141944e-05, + "loss": 0.1939, + "step": 5506 + }, + { + "epoch": 0.28, + "grad_norm": 0.9312220398678531, + "learning_rate": 1.6896327957128162e-05, + "loss": 0.2163, + "step": 5507 + }, + { + "epoch": 0.28, + "grad_norm": 0.9727108910026903, + "learning_rate": 1.689513519605109e-05, + "loss": 0.1939, + "step": 5508 + }, + { + "epoch": 0.28, + "grad_norm": 0.7698826201117085, + "learning_rate": 1.689394224794309e-05, + "loss": 0.2068, + "step": 5509 + }, + { + "epoch": 0.28, + "grad_norm": 0.7688375248720871, + "learning_rate": 1.689274911283652e-05, + "loss": 0.196, + "step": 5510 + }, + { + "epoch": 0.28, + "grad_norm": 1.0006142145056547, + "learning_rate": 1.6891555790763735e-05, + "loss": 0.2128, + "step": 5511 + }, + { + "epoch": 0.28, + "grad_norm": 0.9641337809883561, + "learning_rate": 1.6890362281757117e-05, + "loss": 0.2134, + "step": 5512 + }, + { + "epoch": 0.28, + "grad_norm": 0.8013917063334224, + "learning_rate": 1.6889168585849027e-05, + "loss": 0.2173, + "step": 5513 + }, + { + "epoch": 0.28, + "grad_norm": 0.8104111308085543, + "learning_rate": 1.688797470307185e-05, + "loss": 0.1928, + "step": 5514 + }, + { + "epoch": 0.28, + "grad_norm": 0.8837868269865045, + "learning_rate": 1.6886780633457975e-05, + "loss": 0.2148, + "step": 5515 + }, + { + "epoch": 0.28, + "grad_norm": 0.9583015710960666, + "learning_rate": 1.688558637703978e-05, + "loss": 0.1941, + "step": 5516 + }, + { + "epoch": 0.28, + "grad_norm": 0.8237325286578135, + "learning_rate": 1.688439193384967e-05, + "loss": 0.2069, + "step": 5517 + }, + { + "epoch": 0.28, + "grad_norm": 1.285974324365605, + "learning_rate": 1.688319730392004e-05, + "loss": 0.2282, + "step": 5518 + }, + { + "epoch": 0.28, + "grad_norm": 1.1747843825560265, + "learning_rate": 1.6882002487283293e-05, + "loss": 0.2305, + "step": 5519 + }, + { + "epoch": 0.28, + "grad_norm": 2.165339327087881, + "learning_rate": 1.688080748397184e-05, + "loss": 0.2201, + "step": 5520 + }, + { + "epoch": 0.28, + "grad_norm": 1.0824986772041116, + "learning_rate": 1.6879612294018092e-05, + "loss": 0.2254, + "step": 5521 + }, + { + "epoch": 0.28, + "grad_norm": 1.5266727806258882, + "learning_rate": 1.687841691745448e-05, + "loss": 0.2098, + "step": 5522 + }, + { + "epoch": 0.28, + "grad_norm": 1.0719337185200175, + "learning_rate": 1.6877221354313413e-05, + "loss": 0.1957, + "step": 5523 + }, + { + "epoch": 0.28, + "grad_norm": 0.9076648477056829, + "learning_rate": 1.6876025604627335e-05, + "loss": 0.1939, + "step": 5524 + }, + { + "epoch": 0.28, + "grad_norm": 1.7657803013254962, + "learning_rate": 1.6874829668428667e-05, + "loss": 0.2222, + "step": 5525 + }, + { + "epoch": 0.28, + "grad_norm": 1.3297842157111655, + "learning_rate": 1.6873633545749858e-05, + "loss": 0.2118, + "step": 5526 + }, + { + "epoch": 0.28, + "grad_norm": 0.8951088432162874, + "learning_rate": 1.6872437236623352e-05, + "loss": 0.2007, + "step": 5527 + }, + { + "epoch": 0.28, + "grad_norm": 1.2104995388774757, + "learning_rate": 1.68712407410816e-05, + "loss": 0.1915, + "step": 5528 + }, + { + "epoch": 0.28, + "grad_norm": 1.1709013879049428, + "learning_rate": 1.6870044059157052e-05, + "loss": 0.2232, + "step": 5529 + }, + { + "epoch": 0.28, + "grad_norm": 1.236545303278789, + "learning_rate": 1.686884719088217e-05, + "loss": 0.2066, + "step": 5530 + }, + { + "epoch": 0.28, + "grad_norm": 1.0617137566736374, + "learning_rate": 1.6867650136289425e-05, + "loss": 0.2365, + "step": 5531 + }, + { + "epoch": 0.28, + "grad_norm": 1.3100906770551048, + "learning_rate": 1.686645289541128e-05, + "loss": 0.2079, + "step": 5532 + }, + { + "epoch": 0.28, + "grad_norm": 0.9947827430940916, + "learning_rate": 1.686525546828021e-05, + "loss": 0.1977, + "step": 5533 + }, + { + "epoch": 0.28, + "grad_norm": 2.6173318374449783, + "learning_rate": 1.6864057854928696e-05, + "loss": 0.216, + "step": 5534 + }, + { + "epoch": 0.28, + "grad_norm": 0.8658319621042003, + "learning_rate": 1.686286005538923e-05, + "loss": 0.1902, + "step": 5535 + }, + { + "epoch": 0.28, + "grad_norm": 0.9118008242410937, + "learning_rate": 1.6861662069694292e-05, + "loss": 0.2222, + "step": 5536 + }, + { + "epoch": 0.28, + "grad_norm": 4.287722536221362, + "learning_rate": 1.686046389787639e-05, + "loss": 0.209, + "step": 5537 + }, + { + "epoch": 0.28, + "grad_norm": 2.6445443639482815, + "learning_rate": 1.6859265539968014e-05, + "loss": 0.2136, + "step": 5538 + }, + { + "epoch": 0.28, + "grad_norm": 1.2204081239684939, + "learning_rate": 1.6858066996001673e-05, + "loss": 0.2172, + "step": 5539 + }, + { + "epoch": 0.28, + "grad_norm": 1.0577270671622425, + "learning_rate": 1.6856868266009874e-05, + "loss": 0.1842, + "step": 5540 + }, + { + "epoch": 0.28, + "grad_norm": 1.24709712484204, + "learning_rate": 1.6855669350025138e-05, + "loss": 0.2265, + "step": 5541 + }, + { + "epoch": 0.28, + "grad_norm": 1.0953891982266504, + "learning_rate": 1.6854470248079983e-05, + "loss": 0.2333, + "step": 5542 + }, + { + "epoch": 0.28, + "grad_norm": 1.9000461617024391, + "learning_rate": 1.685327096020694e-05, + "loss": 0.221, + "step": 5543 + }, + { + "epoch": 0.28, + "grad_norm": 0.9692217209212479, + "learning_rate": 1.685207148643853e-05, + "loss": 0.1821, + "step": 5544 + }, + { + "epoch": 0.28, + "grad_norm": 0.9836872882390156, + "learning_rate": 1.6850871826807297e-05, + "loss": 0.2122, + "step": 5545 + }, + { + "epoch": 0.28, + "grad_norm": 1.1278818571254685, + "learning_rate": 1.6849671981345775e-05, + "loss": 0.1943, + "step": 5546 + }, + { + "epoch": 0.28, + "grad_norm": 1.1074602416317132, + "learning_rate": 1.6848471950086517e-05, + "loss": 0.224, + "step": 5547 + }, + { + "epoch": 0.28, + "grad_norm": 1.005589817604628, + "learning_rate": 1.684727173306207e-05, + "loss": 0.2247, + "step": 5548 + }, + { + "epoch": 0.28, + "grad_norm": 0.9020066072063216, + "learning_rate": 1.684607133030499e-05, + "loss": 0.2225, + "step": 5549 + }, + { + "epoch": 0.28, + "grad_norm": 0.7223123330717156, + "learning_rate": 1.684487074184784e-05, + "loss": 0.1739, + "step": 5550 + }, + { + "epoch": 0.28, + "grad_norm": 0.7908254662329712, + "learning_rate": 1.6843669967723183e-05, + "loss": 0.2086, + "step": 5551 + }, + { + "epoch": 0.28, + "grad_norm": 0.9050190208821911, + "learning_rate": 1.6842469007963592e-05, + "loss": 0.1978, + "step": 5552 + }, + { + "epoch": 0.28, + "grad_norm": 1.1064952641542272, + "learning_rate": 1.6841267862601644e-05, + "loss": 0.199, + "step": 5553 + }, + { + "epoch": 0.28, + "grad_norm": 1.3459381134914081, + "learning_rate": 1.6840066531669915e-05, + "loss": 0.2242, + "step": 5554 + }, + { + "epoch": 0.28, + "grad_norm": 0.8388784954524903, + "learning_rate": 1.6838865015200995e-05, + "loss": 0.21, + "step": 5555 + }, + { + "epoch": 0.28, + "grad_norm": 0.9451542750441775, + "learning_rate": 1.683766331322748e-05, + "loss": 0.2214, + "step": 5556 + }, + { + "epoch": 0.28, + "grad_norm": 0.8709892311875053, + "learning_rate": 1.683646142578196e-05, + "loss": 0.2011, + "step": 5557 + }, + { + "epoch": 0.28, + "grad_norm": 0.9928480478934592, + "learning_rate": 1.6835259352897035e-05, + "loss": 0.2044, + "step": 5558 + }, + { + "epoch": 0.28, + "grad_norm": 0.9502941509055366, + "learning_rate": 1.6834057094605314e-05, + "loss": 0.2281, + "step": 5559 + }, + { + "epoch": 0.28, + "grad_norm": 1.0910863752995716, + "learning_rate": 1.683285465093941e-05, + "loss": 0.1846, + "step": 5560 + }, + { + "epoch": 0.28, + "grad_norm": 2.5288457696683095, + "learning_rate": 1.683165202193194e-05, + "loss": 0.1984, + "step": 5561 + }, + { + "epoch": 0.28, + "grad_norm": 1.8747400245648658, + "learning_rate": 1.683044920761552e-05, + "loss": 0.2311, + "step": 5562 + }, + { + "epoch": 0.28, + "grad_norm": 1.0847339869897128, + "learning_rate": 1.682924620802278e-05, + "loss": 0.1875, + "step": 5563 + }, + { + "epoch": 0.28, + "grad_norm": 1.024929122243032, + "learning_rate": 1.682804302318635e-05, + "loss": 0.1864, + "step": 5564 + }, + { + "epoch": 0.28, + "grad_norm": 1.6855645827959116, + "learning_rate": 1.6826839653138872e-05, + "loss": 0.1951, + "step": 5565 + }, + { + "epoch": 0.28, + "grad_norm": 0.9463126278497938, + "learning_rate": 1.6825636097912976e-05, + "loss": 0.2164, + "step": 5566 + }, + { + "epoch": 0.28, + "grad_norm": 0.7997011520507737, + "learning_rate": 1.682443235754132e-05, + "loss": 0.1905, + "step": 5567 + }, + { + "epoch": 0.28, + "grad_norm": 1.3388113262501908, + "learning_rate": 1.682322843205655e-05, + "loss": 0.2205, + "step": 5568 + }, + { + "epoch": 0.28, + "grad_norm": 0.9707915728471659, + "learning_rate": 1.6822024321491323e-05, + "loss": 0.2247, + "step": 5569 + }, + { + "epoch": 0.28, + "grad_norm": 0.8366771355218784, + "learning_rate": 1.6820820025878298e-05, + "loss": 0.196, + "step": 5570 + }, + { + "epoch": 0.28, + "grad_norm": 1.1658522644977178, + "learning_rate": 1.6819615545250146e-05, + "loss": 0.2072, + "step": 5571 + }, + { + "epoch": 0.28, + "grad_norm": 1.0906354617526341, + "learning_rate": 1.681841087963954e-05, + "loss": 0.2709, + "step": 5572 + }, + { + "epoch": 0.28, + "grad_norm": 0.7832304082448883, + "learning_rate": 1.681720602907915e-05, + "loss": 0.1903, + "step": 5573 + }, + { + "epoch": 0.28, + "grad_norm": 0.9031727797125559, + "learning_rate": 1.6816000993601668e-05, + "loss": 0.1975, + "step": 5574 + }, + { + "epoch": 0.28, + "grad_norm": 0.8269271957462649, + "learning_rate": 1.6814795773239766e-05, + "loss": 0.1893, + "step": 5575 + }, + { + "epoch": 0.28, + "grad_norm": 2.409115031127353, + "learning_rate": 1.681359036802615e-05, + "loss": 0.2361, + "step": 5576 + }, + { + "epoch": 0.28, + "grad_norm": 1.0714923444580238, + "learning_rate": 1.681238477799351e-05, + "loss": 0.2124, + "step": 5577 + }, + { + "epoch": 0.28, + "grad_norm": 1.4448729012174302, + "learning_rate": 1.6811179003174546e-05, + "loss": 0.2037, + "step": 5578 + }, + { + "epoch": 0.28, + "grad_norm": 0.9301528109817786, + "learning_rate": 1.6809973043601962e-05, + "loss": 0.1971, + "step": 5579 + }, + { + "epoch": 0.28, + "grad_norm": 1.1091802621706934, + "learning_rate": 1.680876689930848e-05, + "loss": 0.2285, + "step": 5580 + }, + { + "epoch": 0.28, + "grad_norm": 0.7086801941850318, + "learning_rate": 1.680756057032681e-05, + "loss": 0.1964, + "step": 5581 + }, + { + "epoch": 0.28, + "grad_norm": 1.0176127550275247, + "learning_rate": 1.680635405668968e-05, + "loss": 0.2213, + "step": 5582 + }, + { + "epoch": 0.28, + "grad_norm": 1.4627825962559355, + "learning_rate": 1.6805147358429806e-05, + "loss": 0.1863, + "step": 5583 + }, + { + "epoch": 0.28, + "grad_norm": 0.7935095943088147, + "learning_rate": 1.6803940475579926e-05, + "loss": 0.1861, + "step": 5584 + }, + { + "epoch": 0.28, + "grad_norm": 1.2494953657874224, + "learning_rate": 1.680273340817278e-05, + "loss": 0.2112, + "step": 5585 + }, + { + "epoch": 0.28, + "grad_norm": 1.154399835221216, + "learning_rate": 1.68015261562411e-05, + "loss": 0.2179, + "step": 5586 + }, + { + "epoch": 0.28, + "grad_norm": 0.9786789665166755, + "learning_rate": 1.6800318719817647e-05, + "loss": 0.2403, + "step": 5587 + }, + { + "epoch": 0.28, + "grad_norm": 0.7028235365975033, + "learning_rate": 1.679911109893516e-05, + "loss": 0.1846, + "step": 5588 + }, + { + "epoch": 0.28, + "grad_norm": 3.1778933740553295, + "learning_rate": 1.67979032936264e-05, + "loss": 0.2242, + "step": 5589 + }, + { + "epoch": 0.28, + "grad_norm": 1.398224553047776, + "learning_rate": 1.679669530392413e-05, + "loss": 0.2028, + "step": 5590 + }, + { + "epoch": 0.28, + "grad_norm": 1.3966578774852827, + "learning_rate": 1.679548712986111e-05, + "loss": 0.2242, + "step": 5591 + }, + { + "epoch": 0.28, + "grad_norm": 0.9520399610008522, + "learning_rate": 1.6794278771470127e-05, + "loss": 0.1944, + "step": 5592 + }, + { + "epoch": 0.28, + "grad_norm": 0.8628424629593858, + "learning_rate": 1.6793070228783946e-05, + "loss": 0.1972, + "step": 5593 + }, + { + "epoch": 0.28, + "grad_norm": 0.8946564228844422, + "learning_rate": 1.679186150183535e-05, + "loss": 0.2161, + "step": 5594 + }, + { + "epoch": 0.28, + "grad_norm": 1.660295366849027, + "learning_rate": 1.6790652590657125e-05, + "loss": 0.2274, + "step": 5595 + }, + { + "epoch": 0.28, + "grad_norm": 1.1028149430210776, + "learning_rate": 1.678944349528207e-05, + "loss": 0.2041, + "step": 5596 + }, + { + "epoch": 0.28, + "grad_norm": 0.9057071652801462, + "learning_rate": 1.6788234215742974e-05, + "loss": 0.224, + "step": 5597 + }, + { + "epoch": 0.28, + "grad_norm": 0.8486970283199143, + "learning_rate": 1.6787024752072642e-05, + "loss": 0.2148, + "step": 5598 + }, + { + "epoch": 0.28, + "grad_norm": 1.1979543365735743, + "learning_rate": 1.678581510430388e-05, + "loss": 0.2268, + "step": 5599 + }, + { + "epoch": 0.28, + "grad_norm": 1.0996428666993319, + "learning_rate": 1.6784605272469502e-05, + "loss": 0.2101, + "step": 5600 + }, + { + "epoch": 0.28, + "grad_norm": 1.1156512400914844, + "learning_rate": 1.6783395256602318e-05, + "loss": 0.2249, + "step": 5601 + }, + { + "epoch": 0.28, + "grad_norm": 1.0102994579614044, + "learning_rate": 1.6782185056735157e-05, + "loss": 0.2209, + "step": 5602 + }, + { + "epoch": 0.28, + "grad_norm": 3.1022746944347066, + "learning_rate": 1.6780974672900845e-05, + "loss": 0.1736, + "step": 5603 + }, + { + "epoch": 0.28, + "grad_norm": 0.8559003285821831, + "learning_rate": 1.677976410513221e-05, + "loss": 0.2073, + "step": 5604 + }, + { + "epoch": 0.29, + "grad_norm": 1.316426081816249, + "learning_rate": 1.6778553353462092e-05, + "loss": 0.2115, + "step": 5605 + }, + { + "epoch": 0.29, + "grad_norm": 1.3715287544859591, + "learning_rate": 1.677734241792333e-05, + "loss": 0.2162, + "step": 5606 + }, + { + "epoch": 0.29, + "grad_norm": 0.8883864765063251, + "learning_rate": 1.677613129854877e-05, + "loss": 0.203, + "step": 5607 + }, + { + "epoch": 0.29, + "grad_norm": 0.8644309330656957, + "learning_rate": 1.6774919995371272e-05, + "loss": 0.1833, + "step": 5608 + }, + { + "epoch": 0.29, + "grad_norm": 0.8490153399647028, + "learning_rate": 1.6773708508423683e-05, + "loss": 0.2016, + "step": 5609 + }, + { + "epoch": 0.29, + "grad_norm": 0.9282538393726152, + "learning_rate": 1.6772496837738866e-05, + "loss": 0.2365, + "step": 5610 + }, + { + "epoch": 0.29, + "grad_norm": 1.0473637461227412, + "learning_rate": 1.6771284983349693e-05, + "loss": 0.2206, + "step": 5611 + }, + { + "epoch": 0.29, + "grad_norm": 1.0559113462080079, + "learning_rate": 1.6770072945289034e-05, + "loss": 0.2596, + "step": 5612 + }, + { + "epoch": 0.29, + "grad_norm": 0.7196028723600773, + "learning_rate": 1.676886072358976e-05, + "loss": 0.2199, + "step": 5613 + }, + { + "epoch": 0.29, + "grad_norm": 1.091513271080775, + "learning_rate": 1.676764831828476e-05, + "loss": 0.2019, + "step": 5614 + }, + { + "epoch": 0.29, + "grad_norm": 0.9225670399473616, + "learning_rate": 1.6766435729406913e-05, + "loss": 0.2347, + "step": 5615 + }, + { + "epoch": 0.29, + "grad_norm": 0.99503266690628, + "learning_rate": 1.6765222956989117e-05, + "loss": 0.2061, + "step": 5616 + }, + { + "epoch": 0.29, + "grad_norm": 0.878108183400604, + "learning_rate": 1.6764010001064268e-05, + "loss": 0.1947, + "step": 5617 + }, + { + "epoch": 0.29, + "grad_norm": 0.881611187195488, + "learning_rate": 1.6762796861665262e-05, + "loss": 0.2059, + "step": 5618 + }, + { + "epoch": 0.29, + "grad_norm": 0.8924377250902428, + "learning_rate": 1.6761583538825013e-05, + "loss": 0.2228, + "step": 5619 + }, + { + "epoch": 0.29, + "grad_norm": 0.8815121230345735, + "learning_rate": 1.6760370032576424e-05, + "loss": 0.2137, + "step": 5620 + }, + { + "epoch": 0.29, + "grad_norm": 0.8511565240034018, + "learning_rate": 1.6759156342952422e-05, + "loss": 0.2009, + "step": 5621 + }, + { + "epoch": 0.29, + "grad_norm": 1.1423106856019447, + "learning_rate": 1.6757942469985917e-05, + "loss": 0.2177, + "step": 5622 + }, + { + "epoch": 0.29, + "grad_norm": 4.370018827440741, + "learning_rate": 1.6756728413709843e-05, + "loss": 0.2134, + "step": 5623 + }, + { + "epoch": 0.29, + "grad_norm": 1.3487268044836134, + "learning_rate": 1.6755514174157127e-05, + "loss": 0.2188, + "step": 5624 + }, + { + "epoch": 0.29, + "grad_norm": 1.090300940977795, + "learning_rate": 1.675429975136071e-05, + "loss": 0.2023, + "step": 5625 + }, + { + "epoch": 0.29, + "grad_norm": 0.9414580517820581, + "learning_rate": 1.675308514535353e-05, + "loss": 0.2219, + "step": 5626 + }, + { + "epoch": 0.29, + "grad_norm": 0.8682789778569923, + "learning_rate": 1.6751870356168534e-05, + "loss": 0.1966, + "step": 5627 + }, + { + "epoch": 0.29, + "grad_norm": 0.8623789355882996, + "learning_rate": 1.6750655383838674e-05, + "loss": 0.2179, + "step": 5628 + }, + { + "epoch": 0.29, + "grad_norm": 0.9338422907144169, + "learning_rate": 1.6749440228396903e-05, + "loss": 0.2091, + "step": 5629 + }, + { + "epoch": 0.29, + "grad_norm": 1.1181363031884917, + "learning_rate": 1.6748224889876188e-05, + "loss": 0.2387, + "step": 5630 + }, + { + "epoch": 0.29, + "grad_norm": 1.623438042143102, + "learning_rate": 1.674700936830949e-05, + "loss": 0.2126, + "step": 5631 + }, + { + "epoch": 0.29, + "grad_norm": 0.910801099431657, + "learning_rate": 1.6745793663729785e-05, + "loss": 0.238, + "step": 5632 + }, + { + "epoch": 0.29, + "grad_norm": 0.9199691919612899, + "learning_rate": 1.674457777617004e-05, + "loss": 0.1937, + "step": 5633 + }, + { + "epoch": 0.29, + "grad_norm": 0.9524353356354959, + "learning_rate": 1.6743361705663246e-05, + "loss": 0.2159, + "step": 5634 + }, + { + "epoch": 0.29, + "grad_norm": 0.8468532219803875, + "learning_rate": 1.6742145452242383e-05, + "loss": 0.2097, + "step": 5635 + }, + { + "epoch": 0.29, + "grad_norm": 0.8347163633932896, + "learning_rate": 1.6740929015940442e-05, + "loss": 0.209, + "step": 5636 + }, + { + "epoch": 0.29, + "grad_norm": 1.0916796899646724, + "learning_rate": 1.6739712396790424e-05, + "loss": 0.2056, + "step": 5637 + }, + { + "epoch": 0.29, + "grad_norm": 0.9631455506402163, + "learning_rate": 1.673849559482533e-05, + "loss": 0.1959, + "step": 5638 + }, + { + "epoch": 0.29, + "grad_norm": 0.9242865697867108, + "learning_rate": 1.6737278610078153e-05, + "loss": 0.1952, + "step": 5639 + }, + { + "epoch": 0.29, + "grad_norm": 1.074821995609312, + "learning_rate": 1.6736061442581922e-05, + "loss": 0.2034, + "step": 5640 + }, + { + "epoch": 0.29, + "grad_norm": 0.9970315145305882, + "learning_rate": 1.673484409236964e-05, + "loss": 0.1959, + "step": 5641 + }, + { + "epoch": 0.29, + "grad_norm": 0.8036034716113344, + "learning_rate": 1.673362655947433e-05, + "loss": 0.2198, + "step": 5642 + }, + { + "epoch": 0.29, + "grad_norm": 0.9439196709366106, + "learning_rate": 1.673240884392902e-05, + "loss": 0.1962, + "step": 5643 + }, + { + "epoch": 0.29, + "grad_norm": 0.881953076253939, + "learning_rate": 1.6731190945766742e-05, + "loss": 0.2086, + "step": 5644 + }, + { + "epoch": 0.29, + "grad_norm": 1.0536395032248527, + "learning_rate": 1.672997286502053e-05, + "loss": 0.2236, + "step": 5645 + }, + { + "epoch": 0.29, + "grad_norm": 1.3389673197902134, + "learning_rate": 1.672875460172342e-05, + "loss": 0.2045, + "step": 5646 + }, + { + "epoch": 0.29, + "grad_norm": 0.9475155150541045, + "learning_rate": 1.6727536155908466e-05, + "loss": 0.2071, + "step": 5647 + }, + { + "epoch": 0.29, + "grad_norm": 0.9946118684590498, + "learning_rate": 1.672631752760871e-05, + "loss": 0.2139, + "step": 5648 + }, + { + "epoch": 0.29, + "grad_norm": 1.1607991460269982, + "learning_rate": 1.6725098716857212e-05, + "loss": 0.2155, + "step": 5649 + }, + { + "epoch": 0.29, + "grad_norm": 0.8941861618108317, + "learning_rate": 1.672387972368703e-05, + "loss": 0.2113, + "step": 5650 + }, + { + "epoch": 0.29, + "grad_norm": 1.086806213120535, + "learning_rate": 1.6722660548131235e-05, + "loss": 0.2116, + "step": 5651 + }, + { + "epoch": 0.29, + "grad_norm": 0.7956815228190831, + "learning_rate": 1.6721441190222893e-05, + "loss": 0.1886, + "step": 5652 + }, + { + "epoch": 0.29, + "grad_norm": 1.0590701485177547, + "learning_rate": 1.6720221649995076e-05, + "loss": 0.2069, + "step": 5653 + }, + { + "epoch": 0.29, + "grad_norm": 0.7979807415368692, + "learning_rate": 1.6719001927480867e-05, + "loss": 0.2057, + "step": 5654 + }, + { + "epoch": 0.29, + "grad_norm": 1.0005418175143537, + "learning_rate": 1.6717782022713353e-05, + "loss": 0.2211, + "step": 5655 + }, + { + "epoch": 0.29, + "grad_norm": 0.9628323917247162, + "learning_rate": 1.671656193572562e-05, + "loss": 0.2178, + "step": 5656 + }, + { + "epoch": 0.29, + "grad_norm": 1.5734424180049371, + "learning_rate": 1.671534166655077e-05, + "loss": 0.1901, + "step": 5657 + }, + { + "epoch": 0.29, + "grad_norm": 1.55494116484686, + "learning_rate": 1.6714121215221894e-05, + "loss": 0.229, + "step": 5658 + }, + { + "epoch": 0.29, + "grad_norm": 0.9325780349684512, + "learning_rate": 1.67129005817721e-05, + "loss": 0.2171, + "step": 5659 + }, + { + "epoch": 0.29, + "grad_norm": 2.2796034374258602, + "learning_rate": 1.67116797662345e-05, + "loss": 0.2123, + "step": 5660 + }, + { + "epoch": 0.29, + "grad_norm": 1.0749231903118135, + "learning_rate": 1.6710458768642207e-05, + "loss": 0.2215, + "step": 5661 + }, + { + "epoch": 0.29, + "grad_norm": 1.1093769231076407, + "learning_rate": 1.670923758902834e-05, + "loss": 0.2224, + "step": 5662 + }, + { + "epoch": 0.29, + "grad_norm": 1.3186374701895731, + "learning_rate": 1.6708016227426026e-05, + "loss": 0.1889, + "step": 5663 + }, + { + "epoch": 0.29, + "grad_norm": 1.067109564591355, + "learning_rate": 1.6706794683868392e-05, + "loss": 0.2334, + "step": 5664 + }, + { + "epoch": 0.29, + "grad_norm": 1.2771290744730772, + "learning_rate": 1.6705572958388576e-05, + "loss": 0.2373, + "step": 5665 + }, + { + "epoch": 0.29, + "grad_norm": 0.994192818216985, + "learning_rate": 1.6704351051019713e-05, + "loss": 0.2172, + "step": 5666 + }, + { + "epoch": 0.29, + "grad_norm": 0.8792652678481304, + "learning_rate": 1.6703128961794947e-05, + "loss": 0.1987, + "step": 5667 + }, + { + "epoch": 0.29, + "grad_norm": 1.3800397889807778, + "learning_rate": 1.670190669074743e-05, + "loss": 0.2263, + "step": 5668 + }, + { + "epoch": 0.29, + "grad_norm": 0.840686344309017, + "learning_rate": 1.670068423791032e-05, + "loss": 0.1914, + "step": 5669 + }, + { + "epoch": 0.29, + "grad_norm": 0.8809991020409779, + "learning_rate": 1.6699461603316765e-05, + "loss": 0.1922, + "step": 5670 + }, + { + "epoch": 0.29, + "grad_norm": 1.1474302610352636, + "learning_rate": 1.669823878699994e-05, + "loss": 0.2132, + "step": 5671 + }, + { + "epoch": 0.29, + "grad_norm": 0.9949574631829177, + "learning_rate": 1.669701578899301e-05, + "loss": 0.2054, + "step": 5672 + }, + { + "epoch": 0.29, + "grad_norm": 2.378673043957621, + "learning_rate": 1.6695792609329148e-05, + "loss": 0.1907, + "step": 5673 + }, + { + "epoch": 0.29, + "grad_norm": 0.9551872363706768, + "learning_rate": 1.669456924804153e-05, + "loss": 0.1831, + "step": 5674 + }, + { + "epoch": 0.29, + "grad_norm": 0.921282116685391, + "learning_rate": 1.6693345705163343e-05, + "loss": 0.1964, + "step": 5675 + }, + { + "epoch": 0.29, + "grad_norm": 1.415101477622893, + "learning_rate": 1.669212198072778e-05, + "loss": 0.2267, + "step": 5676 + }, + { + "epoch": 0.29, + "grad_norm": 1.2433915586599227, + "learning_rate": 1.669089807476803e-05, + "loss": 0.2407, + "step": 5677 + }, + { + "epoch": 0.29, + "grad_norm": 1.1426110298544896, + "learning_rate": 1.668967398731729e-05, + "loss": 0.2146, + "step": 5678 + }, + { + "epoch": 0.29, + "grad_norm": 0.8677298754657188, + "learning_rate": 1.6688449718408763e-05, + "loss": 0.2143, + "step": 5679 + }, + { + "epoch": 0.29, + "grad_norm": 0.845779757673886, + "learning_rate": 1.6687225268075665e-05, + "loss": 0.2377, + "step": 5680 + }, + { + "epoch": 0.29, + "grad_norm": 0.7281009544445249, + "learning_rate": 1.6686000636351197e-05, + "loss": 0.1995, + "step": 5681 + }, + { + "epoch": 0.29, + "grad_norm": 0.8926328124730796, + "learning_rate": 1.6684775823268592e-05, + "loss": 0.1922, + "step": 5682 + }, + { + "epoch": 0.29, + "grad_norm": 0.923995341210268, + "learning_rate": 1.668355082886106e-05, + "loss": 0.1937, + "step": 5683 + }, + { + "epoch": 0.29, + "grad_norm": 1.391058018212231, + "learning_rate": 1.6682325653161833e-05, + "loss": 0.2005, + "step": 5684 + }, + { + "epoch": 0.29, + "grad_norm": 0.9626507421652647, + "learning_rate": 1.668110029620415e-05, + "loss": 0.2245, + "step": 5685 + }, + { + "epoch": 0.29, + "grad_norm": 1.0274059668926714, + "learning_rate": 1.6679874758021238e-05, + "loss": 0.184, + "step": 5686 + }, + { + "epoch": 0.29, + "grad_norm": 1.0835223734839512, + "learning_rate": 1.6678649038646353e-05, + "loss": 0.2178, + "step": 5687 + }, + { + "epoch": 0.29, + "grad_norm": 1.0583879888167973, + "learning_rate": 1.667742313811273e-05, + "loss": 0.2051, + "step": 5688 + }, + { + "epoch": 0.29, + "grad_norm": 0.826642326401598, + "learning_rate": 1.667619705645363e-05, + "loss": 0.2006, + "step": 5689 + }, + { + "epoch": 0.29, + "grad_norm": 0.8127180877352143, + "learning_rate": 1.667497079370231e-05, + "loss": 0.2119, + "step": 5690 + }, + { + "epoch": 0.29, + "grad_norm": 1.4911742939755106, + "learning_rate": 1.6673744349892027e-05, + "loss": 0.2051, + "step": 5691 + }, + { + "epoch": 0.29, + "grad_norm": 0.8507874783495339, + "learning_rate": 1.6672517725056052e-05, + "loss": 0.1919, + "step": 5692 + }, + { + "epoch": 0.29, + "grad_norm": 1.2710015881210284, + "learning_rate": 1.6671290919227656e-05, + "loss": 0.2389, + "step": 5693 + }, + { + "epoch": 0.29, + "grad_norm": 0.9906515157794544, + "learning_rate": 1.667006393244012e-05, + "loss": 0.2384, + "step": 5694 + }, + { + "epoch": 0.29, + "grad_norm": 0.9793754164793437, + "learning_rate": 1.666883676472672e-05, + "loss": 0.2152, + "step": 5695 + }, + { + "epoch": 0.29, + "grad_norm": 0.8065468973363352, + "learning_rate": 1.666760941612075e-05, + "loss": 0.2056, + "step": 5696 + }, + { + "epoch": 0.29, + "grad_norm": 0.9731836300048301, + "learning_rate": 1.666638188665549e-05, + "loss": 0.2213, + "step": 5697 + }, + { + "epoch": 0.29, + "grad_norm": 2.226280971426743, + "learning_rate": 1.6665154176364252e-05, + "loss": 0.2196, + "step": 5698 + }, + { + "epoch": 0.29, + "grad_norm": 0.9443958794853763, + "learning_rate": 1.666392628528033e-05, + "loss": 0.1855, + "step": 5699 + }, + { + "epoch": 0.29, + "grad_norm": 1.1573762089894577, + "learning_rate": 1.666269821343703e-05, + "loss": 0.201, + "step": 5700 + }, + { + "epoch": 0.29, + "grad_norm": 0.9768856011391813, + "learning_rate": 1.666146996086766e-05, + "loss": 0.2271, + "step": 5701 + }, + { + "epoch": 0.29, + "grad_norm": 0.8118714858889647, + "learning_rate": 1.6660241527605546e-05, + "loss": 0.1818, + "step": 5702 + }, + { + "epoch": 0.29, + "grad_norm": 0.8388414321070643, + "learning_rate": 1.6659012913684005e-05, + "loss": 0.2118, + "step": 5703 + }, + { + "epoch": 0.29, + "grad_norm": 0.8937549781822656, + "learning_rate": 1.665778411913636e-05, + "loss": 0.2302, + "step": 5704 + }, + { + "epoch": 0.29, + "grad_norm": 1.0990565637143503, + "learning_rate": 1.6656555143995946e-05, + "loss": 0.211, + "step": 5705 + }, + { + "epoch": 0.29, + "grad_norm": 1.0010147981590252, + "learning_rate": 1.66553259882961e-05, + "loss": 0.2307, + "step": 5706 + }, + { + "epoch": 0.29, + "grad_norm": 0.8837306750998283, + "learning_rate": 1.6654096652070157e-05, + "loss": 0.1806, + "step": 5707 + }, + { + "epoch": 0.29, + "grad_norm": 1.8684906796406004, + "learning_rate": 1.665286713535147e-05, + "loss": 0.2114, + "step": 5708 + }, + { + "epoch": 0.29, + "grad_norm": 0.9092706245114834, + "learning_rate": 1.6651637438173382e-05, + "loss": 0.1945, + "step": 5709 + }, + { + "epoch": 0.29, + "grad_norm": 1.0686983902228964, + "learning_rate": 1.665040756056926e-05, + "loss": 0.23, + "step": 5710 + }, + { + "epoch": 0.29, + "grad_norm": 1.1221867067651907, + "learning_rate": 1.6649177502572447e-05, + "loss": 0.2209, + "step": 5711 + }, + { + "epoch": 0.29, + "grad_norm": 1.0704699851234865, + "learning_rate": 1.6647947264216328e-05, + "loss": 0.1899, + "step": 5712 + }, + { + "epoch": 0.29, + "grad_norm": 0.991710562833805, + "learning_rate": 1.664671684553426e-05, + "loss": 0.1844, + "step": 5713 + }, + { + "epoch": 0.29, + "grad_norm": 0.8965807307602568, + "learning_rate": 1.6645486246559622e-05, + "loss": 0.1953, + "step": 5714 + }, + { + "epoch": 0.29, + "grad_norm": 1.0461114963512261, + "learning_rate": 1.6644255467325793e-05, + "loss": 0.2339, + "step": 5715 + }, + { + "epoch": 0.29, + "grad_norm": 0.8634193165469928, + "learning_rate": 1.6643024507866158e-05, + "loss": 0.1947, + "step": 5716 + }, + { + "epoch": 0.29, + "grad_norm": 0.8478908905893784, + "learning_rate": 1.664179336821411e-05, + "loss": 0.2118, + "step": 5717 + }, + { + "epoch": 0.29, + "grad_norm": 0.9551220717526766, + "learning_rate": 1.6640562048403044e-05, + "loss": 0.198, + "step": 5718 + }, + { + "epoch": 0.29, + "grad_norm": 1.1053131099679867, + "learning_rate": 1.6639330548466356e-05, + "loss": 0.2285, + "step": 5719 + }, + { + "epoch": 0.29, + "grad_norm": 0.8436838705449303, + "learning_rate": 1.6638098868437453e-05, + "loss": 0.1869, + "step": 5720 + }, + { + "epoch": 0.29, + "grad_norm": 2.41614560693824, + "learning_rate": 1.663686700834974e-05, + "loss": 0.2166, + "step": 5721 + }, + { + "epoch": 0.29, + "grad_norm": 1.451927439591581, + "learning_rate": 1.6635634968236637e-05, + "loss": 0.2365, + "step": 5722 + }, + { + "epoch": 0.29, + "grad_norm": 0.993119411046374, + "learning_rate": 1.663440274813156e-05, + "loss": 0.1974, + "step": 5723 + }, + { + "epoch": 0.29, + "grad_norm": 3.097312693258662, + "learning_rate": 1.6633170348067935e-05, + "loss": 0.1944, + "step": 5724 + }, + { + "epoch": 0.29, + "grad_norm": 1.1593390713884788, + "learning_rate": 1.663193776807919e-05, + "loss": 0.2147, + "step": 5725 + }, + { + "epoch": 0.29, + "grad_norm": 1.159035057231313, + "learning_rate": 1.6630705008198757e-05, + "loss": 0.2045, + "step": 5726 + }, + { + "epoch": 0.29, + "grad_norm": 0.9332523049846795, + "learning_rate": 1.6629472068460077e-05, + "loss": 0.1989, + "step": 5727 + }, + { + "epoch": 0.29, + "grad_norm": 1.2142552198529883, + "learning_rate": 1.662823894889659e-05, + "loss": 0.2172, + "step": 5728 + }, + { + "epoch": 0.29, + "grad_norm": 1.49516565416373, + "learning_rate": 1.6627005649541746e-05, + "loss": 0.2127, + "step": 5729 + }, + { + "epoch": 0.29, + "grad_norm": 1.4027560988903203, + "learning_rate": 1.6625772170429005e-05, + "loss": 0.1918, + "step": 5730 + }, + { + "epoch": 0.29, + "grad_norm": 3.202650465639793, + "learning_rate": 1.6624538511591817e-05, + "loss": 0.2219, + "step": 5731 + }, + { + "epoch": 0.29, + "grad_norm": 1.2748809823636216, + "learning_rate": 1.6623304673063647e-05, + "loss": 0.2256, + "step": 5732 + }, + { + "epoch": 0.29, + "grad_norm": 1.0268563268112219, + "learning_rate": 1.6622070654877966e-05, + "loss": 0.2014, + "step": 5733 + }, + { + "epoch": 0.29, + "grad_norm": 1.1245407843284232, + "learning_rate": 1.6620836457068242e-05, + "loss": 0.2189, + "step": 5734 + }, + { + "epoch": 0.29, + "grad_norm": 0.9168931745402805, + "learning_rate": 1.6619602079667956e-05, + "loss": 0.2112, + "step": 5735 + }, + { + "epoch": 0.29, + "grad_norm": 0.8845526859215986, + "learning_rate": 1.661836752271059e-05, + "loss": 0.2323, + "step": 5736 + }, + { + "epoch": 0.29, + "grad_norm": 1.0410697350692784, + "learning_rate": 1.6617132786229634e-05, + "loss": 0.1992, + "step": 5737 + }, + { + "epoch": 0.29, + "grad_norm": 2.0815951639272545, + "learning_rate": 1.661589787025857e-05, + "loss": 0.2061, + "step": 5738 + }, + { + "epoch": 0.29, + "grad_norm": 1.133386163528876, + "learning_rate": 1.6614662774830908e-05, + "loss": 0.2232, + "step": 5739 + }, + { + "epoch": 0.29, + "grad_norm": 1.3203570744289945, + "learning_rate": 1.6613427499980143e-05, + "loss": 0.1992, + "step": 5740 + }, + { + "epoch": 0.29, + "grad_norm": 0.8843085131207743, + "learning_rate": 1.6612192045739787e-05, + "loss": 0.2039, + "step": 5741 + }, + { + "epoch": 0.29, + "grad_norm": 0.8025452416194425, + "learning_rate": 1.6610956412143346e-05, + "loss": 0.1985, + "step": 5742 + }, + { + "epoch": 0.29, + "grad_norm": 1.489947977642656, + "learning_rate": 1.6609720599224337e-05, + "loss": 0.2007, + "step": 5743 + }, + { + "epoch": 0.29, + "grad_norm": 0.9234053910717891, + "learning_rate": 1.6608484607016283e-05, + "loss": 0.2143, + "step": 5744 + }, + { + "epoch": 0.29, + "grad_norm": 0.9372514810803722, + "learning_rate": 1.6607248435552714e-05, + "loss": 0.2024, + "step": 5745 + }, + { + "epoch": 0.29, + "grad_norm": 1.107377154971928, + "learning_rate": 1.6606012084867158e-05, + "loss": 0.2152, + "step": 5746 + }, + { + "epoch": 0.29, + "grad_norm": 0.922792485971133, + "learning_rate": 1.660477555499315e-05, + "loss": 0.2083, + "step": 5747 + }, + { + "epoch": 0.29, + "grad_norm": 0.9716374962394986, + "learning_rate": 1.660353884596423e-05, + "loss": 0.1889, + "step": 5748 + }, + { + "epoch": 0.29, + "grad_norm": 1.03480208202746, + "learning_rate": 1.6602301957813945e-05, + "loss": 0.23, + "step": 5749 + }, + { + "epoch": 0.29, + "grad_norm": 1.2601246611472028, + "learning_rate": 1.6601064890575852e-05, + "loss": 0.2107, + "step": 5750 + }, + { + "epoch": 0.29, + "grad_norm": 0.9725700733302011, + "learning_rate": 1.6599827644283496e-05, + "loss": 0.1852, + "step": 5751 + }, + { + "epoch": 0.29, + "grad_norm": 0.8676443914009231, + "learning_rate": 1.6598590218970448e-05, + "loss": 0.2128, + "step": 5752 + }, + { + "epoch": 0.29, + "grad_norm": 0.9559585801515682, + "learning_rate": 1.6597352614670265e-05, + "loss": 0.1967, + "step": 5753 + }, + { + "epoch": 0.29, + "grad_norm": 0.9523708018419746, + "learning_rate": 1.6596114831416516e-05, + "loss": 0.1927, + "step": 5754 + }, + { + "epoch": 0.29, + "grad_norm": 1.5704200689560264, + "learning_rate": 1.6594876869242785e-05, + "loss": 0.2105, + "step": 5755 + }, + { + "epoch": 0.29, + "grad_norm": 1.2809466782681533, + "learning_rate": 1.659363872818264e-05, + "loss": 0.1994, + "step": 5756 + }, + { + "epoch": 0.29, + "grad_norm": 0.7852549758135308, + "learning_rate": 1.6592400408269678e-05, + "loss": 0.1898, + "step": 5757 + }, + { + "epoch": 0.29, + "grad_norm": 1.468007426452819, + "learning_rate": 1.659116190953748e-05, + "loss": 0.1984, + "step": 5758 + }, + { + "epoch": 0.29, + "grad_norm": 1.0512057713606473, + "learning_rate": 1.6589923232019646e-05, + "loss": 0.236, + "step": 5759 + }, + { + "epoch": 0.29, + "grad_norm": 0.8335140202496528, + "learning_rate": 1.6588684375749767e-05, + "loss": 0.2051, + "step": 5760 + }, + { + "epoch": 0.29, + "grad_norm": 0.8435524615441553, + "learning_rate": 1.6587445340761456e-05, + "loss": 0.1985, + "step": 5761 + }, + { + "epoch": 0.29, + "grad_norm": 1.136818437779603, + "learning_rate": 1.658620612708832e-05, + "loss": 0.203, + "step": 5762 + }, + { + "epoch": 0.29, + "grad_norm": 1.5529887857017122, + "learning_rate": 1.6584966734763966e-05, + "loss": 0.1925, + "step": 5763 + }, + { + "epoch": 0.29, + "grad_norm": 1.0151767977894686, + "learning_rate": 1.6583727163822016e-05, + "loss": 0.2128, + "step": 5764 + }, + { + "epoch": 0.29, + "grad_norm": 0.8870410825868733, + "learning_rate": 1.6582487414296097e-05, + "loss": 0.2046, + "step": 5765 + }, + { + "epoch": 0.29, + "grad_norm": 1.3136083487300818, + "learning_rate": 1.6581247486219837e-05, + "loss": 0.2081, + "step": 5766 + }, + { + "epoch": 0.29, + "grad_norm": 1.075196880227764, + "learning_rate": 1.6580007379626868e-05, + "loss": 0.2084, + "step": 5767 + }, + { + "epoch": 0.29, + "grad_norm": 0.928292447081866, + "learning_rate": 1.6578767094550826e-05, + "loss": 0.2121, + "step": 5768 + }, + { + "epoch": 0.29, + "grad_norm": 1.1057184105594184, + "learning_rate": 1.6577526631025352e-05, + "loss": 0.2214, + "step": 5769 + }, + { + "epoch": 0.29, + "grad_norm": 1.084890084007609, + "learning_rate": 1.65762859890841e-05, + "loss": 0.2286, + "step": 5770 + }, + { + "epoch": 0.29, + "grad_norm": 1.0065522632622652, + "learning_rate": 1.6575045168760716e-05, + "loss": 0.1941, + "step": 5771 + }, + { + "epoch": 0.29, + "grad_norm": 1.0942944625729116, + "learning_rate": 1.6573804170088866e-05, + "loss": 0.2055, + "step": 5772 + }, + { + "epoch": 0.29, + "grad_norm": 1.2163497184581087, + "learning_rate": 1.65725629931022e-05, + "loss": 0.2141, + "step": 5773 + }, + { + "epoch": 0.29, + "grad_norm": 1.2837384138894168, + "learning_rate": 1.65713216378344e-05, + "loss": 0.2065, + "step": 5774 + }, + { + "epoch": 0.29, + "grad_norm": 1.1628408812538042, + "learning_rate": 1.6570080104319122e-05, + "loss": 0.2181, + "step": 5775 + }, + { + "epoch": 0.29, + "grad_norm": 1.0705948505531206, + "learning_rate": 1.656883839259005e-05, + "loss": 0.2109, + "step": 5776 + }, + { + "epoch": 0.29, + "grad_norm": 1.6840407787429668, + "learning_rate": 1.656759650268087e-05, + "loss": 0.2008, + "step": 5777 + }, + { + "epoch": 0.29, + "grad_norm": 1.2944722190769227, + "learning_rate": 1.6566354434625262e-05, + "loss": 0.2134, + "step": 5778 + }, + { + "epoch": 0.29, + "grad_norm": 1.7357663578551865, + "learning_rate": 1.656511218845692e-05, + "loss": 0.1994, + "step": 5779 + }, + { + "epoch": 0.29, + "grad_norm": 1.3509628119207966, + "learning_rate": 1.6563869764209538e-05, + "loss": 0.1859, + "step": 5780 + }, + { + "epoch": 0.29, + "grad_norm": 1.133614725165837, + "learning_rate": 1.656262716191682e-05, + "loss": 0.2061, + "step": 5781 + }, + { + "epoch": 0.29, + "grad_norm": 1.0005438686506853, + "learning_rate": 1.6561384381612463e-05, + "loss": 0.1961, + "step": 5782 + }, + { + "epoch": 0.29, + "grad_norm": 1.2009133423409168, + "learning_rate": 1.656014142333019e-05, + "loss": 0.2023, + "step": 5783 + }, + { + "epoch": 0.29, + "grad_norm": 1.1461548411705642, + "learning_rate": 1.6558898287103708e-05, + "loss": 0.2175, + "step": 5784 + }, + { + "epoch": 0.29, + "grad_norm": 1.102960060107423, + "learning_rate": 1.6557654972966743e-05, + "loss": 0.2301, + "step": 5785 + }, + { + "epoch": 0.29, + "grad_norm": 0.8208184710818014, + "learning_rate": 1.6556411480953012e-05, + "loss": 0.2057, + "step": 5786 + }, + { + "epoch": 0.29, + "grad_norm": 0.9678370278571667, + "learning_rate": 1.655516781109625e-05, + "loss": 0.2112, + "step": 5787 + }, + { + "epoch": 0.29, + "grad_norm": 0.7878003614739206, + "learning_rate": 1.6553923963430193e-05, + "loss": 0.1782, + "step": 5788 + }, + { + "epoch": 0.29, + "grad_norm": 1.0807032380326957, + "learning_rate": 1.655267993798858e-05, + "loss": 0.1907, + "step": 5789 + }, + { + "epoch": 0.29, + "grad_norm": 1.0574042335885612, + "learning_rate": 1.655143573480515e-05, + "loss": 0.2407, + "step": 5790 + }, + { + "epoch": 0.29, + "grad_norm": 0.9989306136577616, + "learning_rate": 1.6550191353913657e-05, + "loss": 0.212, + "step": 5791 + }, + { + "epoch": 0.29, + "grad_norm": 0.9239187621127245, + "learning_rate": 1.654894679534785e-05, + "loss": 0.1757, + "step": 5792 + }, + { + "epoch": 0.29, + "grad_norm": 0.9822578991598856, + "learning_rate": 1.6547702059141497e-05, + "loss": 0.1973, + "step": 5793 + }, + { + "epoch": 0.29, + "grad_norm": 1.0885857577961602, + "learning_rate": 1.6546457145328354e-05, + "loss": 0.2046, + "step": 5794 + }, + { + "epoch": 0.29, + "grad_norm": 1.1196107951161263, + "learning_rate": 1.654521205394219e-05, + "loss": 0.1986, + "step": 5795 + }, + { + "epoch": 0.29, + "grad_norm": 1.0144618703541814, + "learning_rate": 1.654396678501678e-05, + "loss": 0.1774, + "step": 5796 + }, + { + "epoch": 0.29, + "grad_norm": 0.9975357074513678, + "learning_rate": 1.65427213385859e-05, + "loss": 0.1916, + "step": 5797 + }, + { + "epoch": 0.29, + "grad_norm": 1.027741848788133, + "learning_rate": 1.6541475714683337e-05, + "loss": 0.197, + "step": 5798 + }, + { + "epoch": 0.29, + "grad_norm": 0.7796913200996538, + "learning_rate": 1.6540229913342875e-05, + "loss": 0.1971, + "step": 5799 + }, + { + "epoch": 0.29, + "grad_norm": 0.7289629178138828, + "learning_rate": 1.6538983934598304e-05, + "loss": 0.201, + "step": 5800 + }, + { + "epoch": 0.29, + "grad_norm": 1.2443187942554474, + "learning_rate": 1.653773777848343e-05, + "loss": 0.2208, + "step": 5801 + }, + { + "epoch": 0.3, + "grad_norm": 1.013422787334009, + "learning_rate": 1.6536491445032044e-05, + "loss": 0.1913, + "step": 5802 + }, + { + "epoch": 0.3, + "grad_norm": 5.358326711355817, + "learning_rate": 1.6535244934277962e-05, + "loss": 0.2023, + "step": 5803 + }, + { + "epoch": 0.3, + "grad_norm": 1.5021160626864412, + "learning_rate": 1.653399824625499e-05, + "loss": 0.2047, + "step": 5804 + }, + { + "epoch": 0.3, + "grad_norm": 0.8694947879982603, + "learning_rate": 1.653275138099695e-05, + "loss": 0.24, + "step": 5805 + }, + { + "epoch": 0.3, + "grad_norm": 1.0982253887611164, + "learning_rate": 1.6531504338537653e-05, + "loss": 0.2039, + "step": 5806 + }, + { + "epoch": 0.3, + "grad_norm": 0.9651108431730049, + "learning_rate": 1.6530257118910936e-05, + "loss": 0.1938, + "step": 5807 + }, + { + "epoch": 0.3, + "grad_norm": 1.2459276680152085, + "learning_rate": 1.6529009722150626e-05, + "loss": 0.2199, + "step": 5808 + }, + { + "epoch": 0.3, + "grad_norm": 1.0178850451064618, + "learning_rate": 1.652776214829056e-05, + "loss": 0.2025, + "step": 5809 + }, + { + "epoch": 0.3, + "grad_norm": 1.1226569227170007, + "learning_rate": 1.6526514397364575e-05, + "loss": 0.2068, + "step": 5810 + }, + { + "epoch": 0.3, + "grad_norm": 1.5468864133936497, + "learning_rate": 1.652526646940652e-05, + "loss": 0.203, + "step": 5811 + }, + { + "epoch": 0.3, + "grad_norm": 1.046251258894582, + "learning_rate": 1.6524018364450243e-05, + "loss": 0.1911, + "step": 5812 + }, + { + "epoch": 0.3, + "grad_norm": 0.9299981519638938, + "learning_rate": 1.6522770082529596e-05, + "loss": 0.2273, + "step": 5813 + }, + { + "epoch": 0.3, + "grad_norm": 0.867663303689732, + "learning_rate": 1.6521521623678445e-05, + "loss": 0.2123, + "step": 5814 + }, + { + "epoch": 0.3, + "grad_norm": 0.8664470400619952, + "learning_rate": 1.6520272987930652e-05, + "loss": 0.215, + "step": 5815 + }, + { + "epoch": 0.3, + "grad_norm": 0.9126660340051115, + "learning_rate": 1.6519024175320083e-05, + "loss": 0.2117, + "step": 5816 + }, + { + "epoch": 0.3, + "grad_norm": 1.0413653605044608, + "learning_rate": 1.651777518588062e-05, + "loss": 0.2017, + "step": 5817 + }, + { + "epoch": 0.3, + "grad_norm": 1.3946594861416581, + "learning_rate": 1.6516526019646134e-05, + "loss": 0.1966, + "step": 5818 + }, + { + "epoch": 0.3, + "grad_norm": 1.3855714810992754, + "learning_rate": 1.651527667665051e-05, + "loss": 0.2036, + "step": 5819 + }, + { + "epoch": 0.3, + "grad_norm": 0.8732515849837948, + "learning_rate": 1.6514027156927645e-05, + "loss": 0.2048, + "step": 5820 + }, + { + "epoch": 0.3, + "grad_norm": 2.534939302966109, + "learning_rate": 1.6512777460511416e-05, + "loss": 0.2151, + "step": 5821 + }, + { + "epoch": 0.3, + "grad_norm": 0.8623181831359924, + "learning_rate": 1.6511527587435736e-05, + "loss": 0.1974, + "step": 5822 + }, + { + "epoch": 0.3, + "grad_norm": 1.0794220973605955, + "learning_rate": 1.6510277537734503e-05, + "loss": 0.2174, + "step": 5823 + }, + { + "epoch": 0.3, + "grad_norm": 1.5431023442504375, + "learning_rate": 1.6509027311441622e-05, + "loss": 0.2106, + "step": 5824 + }, + { + "epoch": 0.3, + "grad_norm": 1.0567470028378723, + "learning_rate": 1.6507776908591008e-05, + "loss": 0.2049, + "step": 5825 + }, + { + "epoch": 0.3, + "grad_norm": 0.9985409094547987, + "learning_rate": 1.6506526329216577e-05, + "loss": 0.226, + "step": 5826 + }, + { + "epoch": 0.3, + "grad_norm": 0.9380862280445232, + "learning_rate": 1.6505275573352256e-05, + "loss": 0.2239, + "step": 5827 + }, + { + "epoch": 0.3, + "grad_norm": 1.02642850762763, + "learning_rate": 1.6504024641031962e-05, + "loss": 0.22, + "step": 5828 + }, + { + "epoch": 0.3, + "grad_norm": 1.5253979388506016, + "learning_rate": 1.6502773532289636e-05, + "loss": 0.2042, + "step": 5829 + }, + { + "epoch": 0.3, + "grad_norm": 1.3658204535216976, + "learning_rate": 1.650152224715921e-05, + "loss": 0.1972, + "step": 5830 + }, + { + "epoch": 0.3, + "grad_norm": 1.7792321176824304, + "learning_rate": 1.6500270785674622e-05, + "loss": 0.1887, + "step": 5831 + }, + { + "epoch": 0.3, + "grad_norm": 1.0631302674580914, + "learning_rate": 1.6499019147869826e-05, + "loss": 0.1867, + "step": 5832 + }, + { + "epoch": 0.3, + "grad_norm": 0.9993820993819318, + "learning_rate": 1.649776733377877e-05, + "loss": 0.2004, + "step": 5833 + }, + { + "epoch": 0.3, + "grad_norm": 1.1175435765314348, + "learning_rate": 1.6496515343435402e-05, + "loss": 0.215, + "step": 5834 + }, + { + "epoch": 0.3, + "grad_norm": 0.7984194198142733, + "learning_rate": 1.6495263176873693e-05, + "loss": 0.1992, + "step": 5835 + }, + { + "epoch": 0.3, + "grad_norm": 0.8747750342396716, + "learning_rate": 1.6494010834127606e-05, + "loss": 0.21, + "step": 5836 + }, + { + "epoch": 0.3, + "grad_norm": 1.0164685258600892, + "learning_rate": 1.6492758315231105e-05, + "loss": 0.2239, + "step": 5837 + }, + { + "epoch": 0.3, + "grad_norm": 3.2855641459344684, + "learning_rate": 1.6491505620218164e-05, + "loss": 0.2095, + "step": 5838 + }, + { + "epoch": 0.3, + "grad_norm": 1.0643574580669881, + "learning_rate": 1.649025274912277e-05, + "loss": 0.2103, + "step": 5839 + }, + { + "epoch": 0.3, + "grad_norm": 0.9935202396359508, + "learning_rate": 1.6488999701978905e-05, + "loss": 0.2034, + "step": 5840 + }, + { + "epoch": 0.3, + "grad_norm": 0.8713947124605383, + "learning_rate": 1.6487746478820553e-05, + "loss": 0.2141, + "step": 5841 + }, + { + "epoch": 0.3, + "grad_norm": 0.8350889530294335, + "learning_rate": 1.6486493079681717e-05, + "loss": 0.1887, + "step": 5842 + }, + { + "epoch": 0.3, + "grad_norm": 0.9041295959036052, + "learning_rate": 1.6485239504596388e-05, + "loss": 0.1954, + "step": 5843 + }, + { + "epoch": 0.3, + "grad_norm": 0.9224791675877131, + "learning_rate": 1.6483985753598568e-05, + "loss": 0.1839, + "step": 5844 + }, + { + "epoch": 0.3, + "grad_norm": 1.2302875736385745, + "learning_rate": 1.6482731826722268e-05, + "loss": 0.2014, + "step": 5845 + }, + { + "epoch": 0.3, + "grad_norm": 0.9097035251363624, + "learning_rate": 1.6481477724001505e-05, + "loss": 0.2199, + "step": 5846 + }, + { + "epoch": 0.3, + "grad_norm": 0.9888881253592569, + "learning_rate": 1.648022344547029e-05, + "loss": 0.2227, + "step": 5847 + }, + { + "epoch": 0.3, + "grad_norm": 0.9414438514171718, + "learning_rate": 1.647896899116265e-05, + "loss": 0.2229, + "step": 5848 + }, + { + "epoch": 0.3, + "grad_norm": 1.0139577064085958, + "learning_rate": 1.647771436111261e-05, + "loss": 0.2129, + "step": 5849 + }, + { + "epoch": 0.3, + "grad_norm": 0.8495467191827688, + "learning_rate": 1.64764595553542e-05, + "loss": 0.1753, + "step": 5850 + }, + { + "epoch": 0.3, + "grad_norm": 1.1991959064883873, + "learning_rate": 1.647520457392146e-05, + "loss": 0.2039, + "step": 5851 + }, + { + "epoch": 0.3, + "grad_norm": 0.8550431663988788, + "learning_rate": 1.647394941684843e-05, + "loss": 0.221, + "step": 5852 + }, + { + "epoch": 0.3, + "grad_norm": 0.8606637731862601, + "learning_rate": 1.6472694084169155e-05, + "loss": 0.1815, + "step": 5853 + }, + { + "epoch": 0.3, + "grad_norm": 1.0369770786766133, + "learning_rate": 1.6471438575917688e-05, + "loss": 0.2189, + "step": 5854 + }, + { + "epoch": 0.3, + "grad_norm": 1.0865897305584387, + "learning_rate": 1.6470182892128085e-05, + "loss": 0.2183, + "step": 5855 + }, + { + "epoch": 0.3, + "grad_norm": 0.9732329093878587, + "learning_rate": 1.6468927032834407e-05, + "loss": 0.2007, + "step": 5856 + }, + { + "epoch": 0.3, + "grad_norm": 0.9412361804662748, + "learning_rate": 1.6467670998070715e-05, + "loss": 0.2058, + "step": 5857 + }, + { + "epoch": 0.3, + "grad_norm": 1.2031680501967463, + "learning_rate": 1.6466414787871084e-05, + "loss": 0.1962, + "step": 5858 + }, + { + "epoch": 0.3, + "grad_norm": 0.9115571475165246, + "learning_rate": 1.6465158402269585e-05, + "loss": 0.2157, + "step": 5859 + }, + { + "epoch": 0.3, + "grad_norm": 0.7158282006426111, + "learning_rate": 1.64639018413003e-05, + "loss": 0.1991, + "step": 5860 + }, + { + "epoch": 0.3, + "grad_norm": 0.8445846725413154, + "learning_rate": 1.6462645104997313e-05, + "loss": 0.1883, + "step": 5861 + }, + { + "epoch": 0.3, + "grad_norm": 0.8719816706104393, + "learning_rate": 1.646138819339471e-05, + "loss": 0.2028, + "step": 5862 + }, + { + "epoch": 0.3, + "grad_norm": 0.9688970258918401, + "learning_rate": 1.646013110652659e-05, + "loss": 0.2212, + "step": 5863 + }, + { + "epoch": 0.3, + "grad_norm": 0.888364909503381, + "learning_rate": 1.645887384442705e-05, + "loss": 0.2056, + "step": 5864 + }, + { + "epoch": 0.3, + "grad_norm": 0.8982603781397506, + "learning_rate": 1.645761640713019e-05, + "loss": 0.2103, + "step": 5865 + }, + { + "epoch": 0.3, + "grad_norm": 2.3715133981948604, + "learning_rate": 1.645635879467012e-05, + "loss": 0.2104, + "step": 5866 + }, + { + "epoch": 0.3, + "grad_norm": 1.002168238431376, + "learning_rate": 1.6455101007080955e-05, + "loss": 0.2034, + "step": 5867 + }, + { + "epoch": 0.3, + "grad_norm": 0.9555106086590374, + "learning_rate": 1.645384304439681e-05, + "loss": 0.2587, + "step": 5868 + }, + { + "epoch": 0.3, + "grad_norm": 1.1883950255284266, + "learning_rate": 1.6452584906651807e-05, + "loss": 0.2111, + "step": 5869 + }, + { + "epoch": 0.3, + "grad_norm": 1.0928930058177924, + "learning_rate": 1.6451326593880072e-05, + "loss": 0.2152, + "step": 5870 + }, + { + "epoch": 0.3, + "grad_norm": 1.022071758104498, + "learning_rate": 1.6450068106115745e-05, + "loss": 0.2027, + "step": 5871 + }, + { + "epoch": 0.3, + "grad_norm": 0.9165368983625263, + "learning_rate": 1.644880944339295e-05, + "loss": 0.1926, + "step": 5872 + }, + { + "epoch": 0.3, + "grad_norm": 0.9855256169629955, + "learning_rate": 1.6447550605745836e-05, + "loss": 0.2126, + "step": 5873 + }, + { + "epoch": 0.3, + "grad_norm": 0.9811264576955663, + "learning_rate": 1.644629159320855e-05, + "loss": 0.2095, + "step": 5874 + }, + { + "epoch": 0.3, + "grad_norm": 0.8399754500955879, + "learning_rate": 1.644503240581524e-05, + "loss": 0.1823, + "step": 5875 + }, + { + "epoch": 0.3, + "grad_norm": 1.0565131500331548, + "learning_rate": 1.6443773043600058e-05, + "loss": 0.2225, + "step": 5876 + }, + { + "epoch": 0.3, + "grad_norm": 1.2131651665261163, + "learning_rate": 1.6442513506597175e-05, + "loss": 0.2317, + "step": 5877 + }, + { + "epoch": 0.3, + "grad_norm": 0.9404071637103668, + "learning_rate": 1.6441253794840745e-05, + "loss": 0.1882, + "step": 5878 + }, + { + "epoch": 0.3, + "grad_norm": 0.9782996232409031, + "learning_rate": 1.6439993908364942e-05, + "loss": 0.1891, + "step": 5879 + }, + { + "epoch": 0.3, + "grad_norm": 0.9048113744479341, + "learning_rate": 1.643873384720394e-05, + "loss": 0.2286, + "step": 5880 + }, + { + "epoch": 0.3, + "grad_norm": 1.0659766855293744, + "learning_rate": 1.643747361139192e-05, + "loss": 0.2317, + "step": 5881 + }, + { + "epoch": 0.3, + "grad_norm": 0.8068037627212562, + "learning_rate": 1.6436213200963065e-05, + "loss": 0.198, + "step": 5882 + }, + { + "epoch": 0.3, + "grad_norm": 0.877210694130185, + "learning_rate": 1.643495261595156e-05, + "loss": 0.211, + "step": 5883 + }, + { + "epoch": 0.3, + "grad_norm": 0.8623766726369708, + "learning_rate": 1.6433691856391608e-05, + "loss": 0.2107, + "step": 5884 + }, + { + "epoch": 0.3, + "grad_norm": 1.0274789198072334, + "learning_rate": 1.6432430922317396e-05, + "loss": 0.2024, + "step": 5885 + }, + { + "epoch": 0.3, + "grad_norm": 1.3082314211100265, + "learning_rate": 1.6431169813763134e-05, + "loss": 0.2281, + "step": 5886 + }, + { + "epoch": 0.3, + "grad_norm": 0.9357881374870074, + "learning_rate": 1.6429908530763027e-05, + "loss": 0.2018, + "step": 5887 + }, + { + "epoch": 0.3, + "grad_norm": 0.8417090098117563, + "learning_rate": 1.6428647073351287e-05, + "loss": 0.2225, + "step": 5888 + }, + { + "epoch": 0.3, + "grad_norm": 0.8117492998568088, + "learning_rate": 1.6427385441562135e-05, + "loss": 0.1911, + "step": 5889 + }, + { + "epoch": 0.3, + "grad_norm": 0.9145794378876565, + "learning_rate": 1.6426123635429787e-05, + "loss": 0.2137, + "step": 5890 + }, + { + "epoch": 0.3, + "grad_norm": 1.2038725188113313, + "learning_rate": 1.6424861654988477e-05, + "loss": 0.2157, + "step": 5891 + }, + { + "epoch": 0.3, + "grad_norm": 0.9292827245859995, + "learning_rate": 1.6423599500272424e-05, + "loss": 0.1931, + "step": 5892 + }, + { + "epoch": 0.3, + "grad_norm": 0.9946593686418843, + "learning_rate": 1.6422337171315878e-05, + "loss": 0.1904, + "step": 5893 + }, + { + "epoch": 0.3, + "grad_norm": 1.3859397288247326, + "learning_rate": 1.642107466815307e-05, + "loss": 0.2022, + "step": 5894 + }, + { + "epoch": 0.3, + "grad_norm": 0.8991319027869209, + "learning_rate": 1.6419811990818252e-05, + "loss": 0.2132, + "step": 5895 + }, + { + "epoch": 0.3, + "grad_norm": 1.0301125345203759, + "learning_rate": 1.6418549139345667e-05, + "loss": 0.2348, + "step": 5896 + }, + { + "epoch": 0.3, + "grad_norm": 1.0126074489129862, + "learning_rate": 1.641728611376958e-05, + "loss": 0.215, + "step": 5897 + }, + { + "epoch": 0.3, + "grad_norm": 0.8804663931110788, + "learning_rate": 1.641602291412424e-05, + "loss": 0.1978, + "step": 5898 + }, + { + "epoch": 0.3, + "grad_norm": 0.9066843573868038, + "learning_rate": 1.641475954044392e-05, + "loss": 0.214, + "step": 5899 + }, + { + "epoch": 0.3, + "grad_norm": 0.8398951594937323, + "learning_rate": 1.641349599276288e-05, + "loss": 0.1948, + "step": 5900 + }, + { + "epoch": 0.3, + "grad_norm": 0.88342216108323, + "learning_rate": 1.64122322711154e-05, + "loss": 0.2018, + "step": 5901 + }, + { + "epoch": 0.3, + "grad_norm": 0.8609414098283307, + "learning_rate": 1.6410968375535762e-05, + "loss": 0.2192, + "step": 5902 + }, + { + "epoch": 0.3, + "grad_norm": 0.7894700819728515, + "learning_rate": 1.640970430605824e-05, + "loss": 0.2002, + "step": 5903 + }, + { + "epoch": 0.3, + "grad_norm": 0.948260892075998, + "learning_rate": 1.640844006271713e-05, + "loss": 0.2033, + "step": 5904 + }, + { + "epoch": 0.3, + "grad_norm": 0.9018493655463135, + "learning_rate": 1.640717564554672e-05, + "loss": 0.2039, + "step": 5905 + }, + { + "epoch": 0.3, + "grad_norm": 0.8166650211310612, + "learning_rate": 1.6405911054581307e-05, + "loss": 0.1891, + "step": 5906 + }, + { + "epoch": 0.3, + "grad_norm": 0.9446127345677011, + "learning_rate": 1.6404646289855194e-05, + "loss": 0.2209, + "step": 5907 + }, + { + "epoch": 0.3, + "grad_norm": 0.8743844759657704, + "learning_rate": 1.640338135140269e-05, + "loss": 0.1924, + "step": 5908 + }, + { + "epoch": 0.3, + "grad_norm": 1.4321428005760437, + "learning_rate": 1.640211623925811e-05, + "loss": 0.2296, + "step": 5909 + }, + { + "epoch": 0.3, + "grad_norm": 0.9025069904442802, + "learning_rate": 1.640085095345576e-05, + "loss": 0.2115, + "step": 5910 + }, + { + "epoch": 0.3, + "grad_norm": 0.8714708162046169, + "learning_rate": 1.6399585494029968e-05, + "loss": 0.1909, + "step": 5911 + }, + { + "epoch": 0.3, + "grad_norm": 1.1490013763920408, + "learning_rate": 1.639831986101506e-05, + "loss": 0.2036, + "step": 5912 + }, + { + "epoch": 0.3, + "grad_norm": 0.9508681180299058, + "learning_rate": 1.639705405444536e-05, + "loss": 0.1905, + "step": 5913 + }, + { + "epoch": 0.3, + "grad_norm": 1.235841414052421, + "learning_rate": 1.6395788074355212e-05, + "loss": 0.1929, + "step": 5914 + }, + { + "epoch": 0.3, + "grad_norm": 1.1104003104638973, + "learning_rate": 1.639452192077895e-05, + "loss": 0.1946, + "step": 5915 + }, + { + "epoch": 0.3, + "grad_norm": 0.9560855958680096, + "learning_rate": 1.6393255593750917e-05, + "loss": 0.2202, + "step": 5916 + }, + { + "epoch": 0.3, + "grad_norm": 0.8003227650607686, + "learning_rate": 1.6391989093305468e-05, + "loss": 0.1872, + "step": 5917 + }, + { + "epoch": 0.3, + "grad_norm": 1.093372947587186, + "learning_rate": 1.6390722419476952e-05, + "loss": 0.215, + "step": 5918 + }, + { + "epoch": 0.3, + "grad_norm": 0.8234618930446632, + "learning_rate": 1.638945557229973e-05, + "loss": 0.2064, + "step": 5919 + }, + { + "epoch": 0.3, + "grad_norm": 1.0258032410734896, + "learning_rate": 1.6388188551808166e-05, + "loss": 0.1988, + "step": 5920 + }, + { + "epoch": 0.3, + "grad_norm": 1.1774833686716462, + "learning_rate": 1.6386921358036624e-05, + "loss": 0.1894, + "step": 5921 + }, + { + "epoch": 0.3, + "grad_norm": 0.8458940053240332, + "learning_rate": 1.638565399101948e-05, + "loss": 0.211, + "step": 5922 + }, + { + "epoch": 0.3, + "grad_norm": 1.0143614561029222, + "learning_rate": 1.6384386450791114e-05, + "loss": 0.2019, + "step": 5923 + }, + { + "epoch": 0.3, + "grad_norm": 0.9030318476331984, + "learning_rate": 1.6383118737385903e-05, + "loss": 0.1939, + "step": 5924 + }, + { + "epoch": 0.3, + "grad_norm": 0.9946142914549149, + "learning_rate": 1.6381850850838232e-05, + "loss": 0.1889, + "step": 5925 + }, + { + "epoch": 0.3, + "grad_norm": 0.9871772763534351, + "learning_rate": 1.63805827911825e-05, + "loss": 0.1854, + "step": 5926 + }, + { + "epoch": 0.3, + "grad_norm": 0.9040610810639874, + "learning_rate": 1.63793145584531e-05, + "loss": 0.1925, + "step": 5927 + }, + { + "epoch": 0.3, + "grad_norm": 0.9290544683412078, + "learning_rate": 1.637804615268443e-05, + "loss": 0.2287, + "step": 5928 + }, + { + "epoch": 0.3, + "grad_norm": 0.9562484980139982, + "learning_rate": 1.63767775739109e-05, + "loss": 0.2036, + "step": 5929 + }, + { + "epoch": 0.3, + "grad_norm": 0.8777655403386205, + "learning_rate": 1.6375508822166917e-05, + "loss": 0.2296, + "step": 5930 + }, + { + "epoch": 0.3, + "grad_norm": 0.8735047939100099, + "learning_rate": 1.63742398974869e-05, + "loss": 0.1851, + "step": 5931 + }, + { + "epoch": 0.3, + "grad_norm": 0.935282896007759, + "learning_rate": 1.6372970799905262e-05, + "loss": 0.2209, + "step": 5932 + }, + { + "epoch": 0.3, + "grad_norm": 1.0901327351521781, + "learning_rate": 1.6371701529456433e-05, + "loss": 0.2119, + "step": 5933 + }, + { + "epoch": 0.3, + "grad_norm": 1.1528605527009306, + "learning_rate": 1.6370432086174837e-05, + "loss": 0.1906, + "step": 5934 + }, + { + "epoch": 0.3, + "grad_norm": 0.8206100964092897, + "learning_rate": 1.6369162470094915e-05, + "loss": 0.2065, + "step": 5935 + }, + { + "epoch": 0.3, + "grad_norm": 0.8480007652060755, + "learning_rate": 1.63678926812511e-05, + "loss": 0.1984, + "step": 5936 + }, + { + "epoch": 0.3, + "grad_norm": 1.2839298260001515, + "learning_rate": 1.6366622719677834e-05, + "loss": 0.2135, + "step": 5937 + }, + { + "epoch": 0.3, + "grad_norm": 0.9534265566922148, + "learning_rate": 1.6365352585409572e-05, + "loss": 0.2263, + "step": 5938 + }, + { + "epoch": 0.3, + "grad_norm": 0.962950305698948, + "learning_rate": 1.636408227848076e-05, + "loss": 0.2058, + "step": 5939 + }, + { + "epoch": 0.3, + "grad_norm": 0.9968989531366338, + "learning_rate": 1.6362811798925852e-05, + "loss": 0.2056, + "step": 5940 + }, + { + "epoch": 0.3, + "grad_norm": 0.9642269498234763, + "learning_rate": 1.636154114677932e-05, + "loss": 0.2174, + "step": 5941 + }, + { + "epoch": 0.3, + "grad_norm": 0.8536369510964444, + "learning_rate": 1.636027032207562e-05, + "loss": 0.1977, + "step": 5942 + }, + { + "epoch": 0.3, + "grad_norm": 2.701056712938503, + "learning_rate": 1.6358999324849235e-05, + "loss": 0.2213, + "step": 5943 + }, + { + "epoch": 0.3, + "grad_norm": 0.7754497041155105, + "learning_rate": 1.635772815513463e-05, + "loss": 0.1935, + "step": 5944 + }, + { + "epoch": 0.3, + "grad_norm": 0.9182076271424672, + "learning_rate": 1.635645681296629e-05, + "loss": 0.1837, + "step": 5945 + }, + { + "epoch": 0.3, + "grad_norm": 0.9513656758902006, + "learning_rate": 1.63551852983787e-05, + "loss": 0.2034, + "step": 5946 + }, + { + "epoch": 0.3, + "grad_norm": 0.8099373859425659, + "learning_rate": 1.635391361140635e-05, + "loss": 0.2035, + "step": 5947 + }, + { + "epoch": 0.3, + "grad_norm": 1.2689296573815851, + "learning_rate": 1.6352641752083734e-05, + "loss": 0.2376, + "step": 5948 + }, + { + "epoch": 0.3, + "grad_norm": 1.3730141177994373, + "learning_rate": 1.6351369720445353e-05, + "loss": 0.2268, + "step": 5949 + }, + { + "epoch": 0.3, + "grad_norm": 0.8013748927283166, + "learning_rate": 1.6350097516525705e-05, + "loss": 0.2, + "step": 5950 + }, + { + "epoch": 0.3, + "grad_norm": 1.116575591861395, + "learning_rate": 1.634882514035931e-05, + "loss": 0.1904, + "step": 5951 + }, + { + "epoch": 0.3, + "grad_norm": 1.0059605669840697, + "learning_rate": 1.6347552591980672e-05, + "loss": 0.2203, + "step": 5952 + }, + { + "epoch": 0.3, + "grad_norm": 0.9044501685247852, + "learning_rate": 1.634627987142431e-05, + "loss": 0.2197, + "step": 5953 + }, + { + "epoch": 0.3, + "grad_norm": 1.1633662000849947, + "learning_rate": 1.6345006978724748e-05, + "loss": 0.2302, + "step": 5954 + }, + { + "epoch": 0.3, + "grad_norm": 1.0048216985971365, + "learning_rate": 1.6343733913916516e-05, + "loss": 0.199, + "step": 5955 + }, + { + "epoch": 0.3, + "grad_norm": 0.7956734026992229, + "learning_rate": 1.634246067703414e-05, + "loss": 0.2101, + "step": 5956 + }, + { + "epoch": 0.3, + "grad_norm": 1.0101755028948003, + "learning_rate": 1.6341187268112162e-05, + "loss": 0.2084, + "step": 5957 + }, + { + "epoch": 0.3, + "grad_norm": 0.9222892642766556, + "learning_rate": 1.633991368718512e-05, + "loss": 0.2193, + "step": 5958 + }, + { + "epoch": 0.3, + "grad_norm": 0.9677295069555725, + "learning_rate": 1.6338639934287563e-05, + "loss": 0.2085, + "step": 5959 + }, + { + "epoch": 0.3, + "grad_norm": 0.9158364680199044, + "learning_rate": 1.6337366009454037e-05, + "loss": 0.1924, + "step": 5960 + }, + { + "epoch": 0.3, + "grad_norm": 0.944337670541037, + "learning_rate": 1.6336091912719102e-05, + "loss": 0.2005, + "step": 5961 + }, + { + "epoch": 0.3, + "grad_norm": 0.8085176627955762, + "learning_rate": 1.6334817644117316e-05, + "loss": 0.2191, + "step": 5962 + }, + { + "epoch": 0.3, + "grad_norm": 1.4056936577366979, + "learning_rate": 1.633354320368324e-05, + "loss": 0.1941, + "step": 5963 + }, + { + "epoch": 0.3, + "grad_norm": 0.8614378964196189, + "learning_rate": 1.6332268591451454e-05, + "loss": 0.2031, + "step": 5964 + }, + { + "epoch": 0.3, + "grad_norm": 0.934286179464845, + "learning_rate": 1.633099380745652e-05, + "loss": 0.1998, + "step": 5965 + }, + { + "epoch": 0.3, + "grad_norm": 0.9754800276459915, + "learning_rate": 1.6329718851733024e-05, + "loss": 0.2147, + "step": 5966 + }, + { + "epoch": 0.3, + "grad_norm": 1.1766704671644548, + "learning_rate": 1.6328443724315544e-05, + "loss": 0.2143, + "step": 5967 + }, + { + "epoch": 0.3, + "grad_norm": 1.0562421455698277, + "learning_rate": 1.6327168425238672e-05, + "loss": 0.213, + "step": 5968 + }, + { + "epoch": 0.3, + "grad_norm": 0.9481384551774131, + "learning_rate": 1.6325892954536997e-05, + "loss": 0.1783, + "step": 5969 + }, + { + "epoch": 0.3, + "grad_norm": 1.1198994360550176, + "learning_rate": 1.6324617312245123e-05, + "loss": 0.1809, + "step": 5970 + }, + { + "epoch": 0.3, + "grad_norm": 0.865446262601342, + "learning_rate": 1.632334149839764e-05, + "loss": 0.167, + "step": 5971 + }, + { + "epoch": 0.3, + "grad_norm": 0.8359926893079918, + "learning_rate": 1.632206551302917e-05, + "loss": 0.2142, + "step": 5972 + }, + { + "epoch": 0.3, + "grad_norm": 1.1003436066740737, + "learning_rate": 1.632078935617431e-05, + "loss": 0.1987, + "step": 5973 + }, + { + "epoch": 0.3, + "grad_norm": 1.1175808214952012, + "learning_rate": 1.6319513027867683e-05, + "loss": 0.2127, + "step": 5974 + }, + { + "epoch": 0.3, + "grad_norm": 1.09109343841727, + "learning_rate": 1.631823652814391e-05, + "loss": 0.2195, + "step": 5975 + }, + { + "epoch": 0.3, + "grad_norm": 1.5938819437245713, + "learning_rate": 1.631695985703761e-05, + "loss": 0.1932, + "step": 5976 + }, + { + "epoch": 0.3, + "grad_norm": 1.1572532065360304, + "learning_rate": 1.631568301458342e-05, + "loss": 0.2291, + "step": 5977 + }, + { + "epoch": 0.3, + "grad_norm": 0.7935200691091274, + "learning_rate": 1.6314406000815975e-05, + "loss": 0.1829, + "step": 5978 + }, + { + "epoch": 0.3, + "grad_norm": 1.1058627197706676, + "learning_rate": 1.6313128815769904e-05, + "loss": 0.2027, + "step": 5979 + }, + { + "epoch": 0.3, + "grad_norm": 0.8970087572102301, + "learning_rate": 1.631185145947986e-05, + "loss": 0.2225, + "step": 5980 + }, + { + "epoch": 0.3, + "grad_norm": 0.8393393193043988, + "learning_rate": 1.631057393198049e-05, + "loss": 0.2061, + "step": 5981 + }, + { + "epoch": 0.3, + "grad_norm": 0.9851589297321419, + "learning_rate": 1.6309296233306446e-05, + "loss": 0.2127, + "step": 5982 + }, + { + "epoch": 0.3, + "grad_norm": 1.3751830177191615, + "learning_rate": 1.6308018363492385e-05, + "loss": 0.2221, + "step": 5983 + }, + { + "epoch": 0.3, + "grad_norm": 0.8238273156343242, + "learning_rate": 1.630674032257297e-05, + "loss": 0.1916, + "step": 5984 + }, + { + "epoch": 0.3, + "grad_norm": 1.0898527791933963, + "learning_rate": 1.6305462110582863e-05, + "loss": 0.2429, + "step": 5985 + }, + { + "epoch": 0.3, + "grad_norm": 1.302930626821737, + "learning_rate": 1.6304183727556747e-05, + "loss": 0.2166, + "step": 5986 + }, + { + "epoch": 0.3, + "grad_norm": 3.39479291669915, + "learning_rate": 1.6302905173529285e-05, + "loss": 0.2118, + "step": 5987 + }, + { + "epoch": 0.3, + "grad_norm": 1.4736219557226864, + "learning_rate": 1.6301626448535168e-05, + "loss": 0.2022, + "step": 5988 + }, + { + "epoch": 0.3, + "grad_norm": 0.9127817268931459, + "learning_rate": 1.6300347552609074e-05, + "loss": 0.1959, + "step": 5989 + }, + { + "epoch": 0.3, + "grad_norm": 1.008387597169717, + "learning_rate": 1.62990684857857e-05, + "loss": 0.227, + "step": 5990 + }, + { + "epoch": 0.3, + "grad_norm": 0.8037508248908429, + "learning_rate": 1.6297789248099736e-05, + "loss": 0.21, + "step": 5991 + }, + { + "epoch": 0.3, + "grad_norm": 1.087320987869786, + "learning_rate": 1.6296509839585885e-05, + "loss": 0.2035, + "step": 5992 + }, + { + "epoch": 0.3, + "grad_norm": 0.8294639592735905, + "learning_rate": 1.6295230260278847e-05, + "loss": 0.2123, + "step": 5993 + }, + { + "epoch": 0.3, + "grad_norm": 0.7337293869270486, + "learning_rate": 1.6293950510213335e-05, + "loss": 0.2082, + "step": 5994 + }, + { + "epoch": 0.3, + "grad_norm": 0.8497313029047259, + "learning_rate": 1.6292670589424057e-05, + "loss": 0.2148, + "step": 5995 + }, + { + "epoch": 0.3, + "grad_norm": 1.6162721450291733, + "learning_rate": 1.6291390497945737e-05, + "loss": 0.2115, + "step": 5996 + }, + { + "epoch": 0.3, + "grad_norm": 0.8330036247733196, + "learning_rate": 1.6290110235813094e-05, + "loss": 0.1815, + "step": 5997 + }, + { + "epoch": 0.31, + "grad_norm": 1.1318812757855268, + "learning_rate": 1.6288829803060853e-05, + "loss": 0.2048, + "step": 5998 + }, + { + "epoch": 0.31, + "grad_norm": 1.82097750858233, + "learning_rate": 1.6287549199723745e-05, + "loss": 0.2189, + "step": 5999 + }, + { + "epoch": 0.31, + "grad_norm": 0.8236429171653487, + "learning_rate": 1.6286268425836517e-05, + "loss": 0.2, + "step": 6000 + }, + { + "epoch": 0.31, + "grad_norm": 1.5845029614639, + "learning_rate": 1.62849874814339e-05, + "loss": 0.1795, + "step": 6001 + }, + { + "epoch": 0.31, + "grad_norm": 1.4522997519291503, + "learning_rate": 1.6283706366550646e-05, + "loss": 0.2155, + "step": 6002 + }, + { + "epoch": 0.31, + "grad_norm": 0.9965723060845276, + "learning_rate": 1.6282425081221498e-05, + "loss": 0.1913, + "step": 6003 + }, + { + "epoch": 0.31, + "grad_norm": 1.0240651787267885, + "learning_rate": 1.6281143625481214e-05, + "loss": 0.1995, + "step": 6004 + }, + { + "epoch": 0.31, + "grad_norm": 0.7303063496135936, + "learning_rate": 1.627986199936456e-05, + "loss": 0.2153, + "step": 6005 + }, + { + "epoch": 0.31, + "grad_norm": 0.9488526266709439, + "learning_rate": 1.6278580202906287e-05, + "loss": 0.2074, + "step": 6006 + }, + { + "epoch": 0.31, + "grad_norm": 0.8874494026802576, + "learning_rate": 1.6277298236141177e-05, + "loss": 0.234, + "step": 6007 + }, + { + "epoch": 0.31, + "grad_norm": 0.9477817405838913, + "learning_rate": 1.6276016099103995e-05, + "loss": 0.2199, + "step": 6008 + }, + { + "epoch": 0.31, + "grad_norm": 1.1014385615064697, + "learning_rate": 1.6274733791829522e-05, + "loss": 0.2199, + "step": 6009 + }, + { + "epoch": 0.31, + "grad_norm": 1.0774681703386346, + "learning_rate": 1.627345131435254e-05, + "loss": 0.2538, + "step": 6010 + }, + { + "epoch": 0.31, + "grad_norm": 1.2211872301986042, + "learning_rate": 1.6272168666707838e-05, + "loss": 0.2199, + "step": 6011 + }, + { + "epoch": 0.31, + "grad_norm": 0.7944085611231179, + "learning_rate": 1.627088584893021e-05, + "loss": 0.1968, + "step": 6012 + }, + { + "epoch": 0.31, + "grad_norm": 1.1626975384193183, + "learning_rate": 1.6269602861054442e-05, + "loss": 0.2084, + "step": 6013 + }, + { + "epoch": 0.31, + "grad_norm": 0.9266890007568821, + "learning_rate": 1.6268319703115348e-05, + "loss": 0.1952, + "step": 6014 + }, + { + "epoch": 0.31, + "grad_norm": 1.0407450828399152, + "learning_rate": 1.6267036375147728e-05, + "loss": 0.2085, + "step": 6015 + }, + { + "epoch": 0.31, + "grad_norm": 0.9540943475088866, + "learning_rate": 1.6265752877186386e-05, + "loss": 0.1963, + "step": 6016 + }, + { + "epoch": 0.31, + "grad_norm": 0.9119042826028682, + "learning_rate": 1.626446920926615e-05, + "loss": 0.2015, + "step": 6017 + }, + { + "epoch": 0.31, + "grad_norm": 1.3939619942230101, + "learning_rate": 1.626318537142183e-05, + "loss": 0.1915, + "step": 6018 + }, + { + "epoch": 0.31, + "grad_norm": 1.4344270785747297, + "learning_rate": 1.6261901363688257e-05, + "loss": 0.2255, + "step": 6019 + }, + { + "epoch": 0.31, + "grad_norm": 1.079480238045351, + "learning_rate": 1.626061718610025e-05, + "loss": 0.2304, + "step": 6020 + }, + { + "epoch": 0.31, + "grad_norm": 1.4324024219544331, + "learning_rate": 1.625933283869265e-05, + "loss": 0.1926, + "step": 6021 + }, + { + "epoch": 0.31, + "grad_norm": 1.7344142079737424, + "learning_rate": 1.6258048321500294e-05, + "loss": 0.1956, + "step": 6022 + }, + { + "epoch": 0.31, + "grad_norm": 1.1070084317838647, + "learning_rate": 1.6256763634558024e-05, + "loss": 0.2114, + "step": 6023 + }, + { + "epoch": 0.31, + "grad_norm": 0.9764029510305736, + "learning_rate": 1.625547877790069e-05, + "loss": 0.1983, + "step": 6024 + }, + { + "epoch": 0.31, + "grad_norm": 0.9096497335946024, + "learning_rate": 1.6254193751563137e-05, + "loss": 0.2244, + "step": 6025 + }, + { + "epoch": 0.31, + "grad_norm": 0.9647081068711773, + "learning_rate": 1.6252908555580223e-05, + "loss": 0.2021, + "step": 6026 + }, + { + "epoch": 0.31, + "grad_norm": 1.003658864196987, + "learning_rate": 1.6251623189986815e-05, + "loss": 0.2282, + "step": 6027 + }, + { + "epoch": 0.31, + "grad_norm": 1.0027742910910584, + "learning_rate": 1.6250337654817774e-05, + "loss": 0.2084, + "step": 6028 + }, + { + "epoch": 0.31, + "grad_norm": 1.0873519198431463, + "learning_rate": 1.624905195010797e-05, + "loss": 0.2172, + "step": 6029 + }, + { + "epoch": 0.31, + "grad_norm": 1.0149186797500493, + "learning_rate": 1.6247766075892283e-05, + "loss": 0.2034, + "step": 6030 + }, + { + "epoch": 0.31, + "grad_norm": 0.9063023009686626, + "learning_rate": 1.624648003220558e-05, + "loss": 0.1857, + "step": 6031 + }, + { + "epoch": 0.31, + "grad_norm": 0.7650021001433587, + "learning_rate": 1.624519381908276e-05, + "loss": 0.2074, + "step": 6032 + }, + { + "epoch": 0.31, + "grad_norm": 1.3530888182017393, + "learning_rate": 1.6243907436558705e-05, + "loss": 0.2268, + "step": 6033 + }, + { + "epoch": 0.31, + "grad_norm": 0.911985352429917, + "learning_rate": 1.624262088466831e-05, + "loss": 0.2375, + "step": 6034 + }, + { + "epoch": 0.31, + "grad_norm": 1.6303058569455007, + "learning_rate": 1.6241334163446465e-05, + "loss": 0.2074, + "step": 6035 + }, + { + "epoch": 0.31, + "grad_norm": 1.459715495070587, + "learning_rate": 1.6240047272928082e-05, + "loss": 0.2149, + "step": 6036 + }, + { + "epoch": 0.31, + "grad_norm": 0.8229690605766699, + "learning_rate": 1.6238760213148064e-05, + "loss": 0.1893, + "step": 6037 + }, + { + "epoch": 0.31, + "grad_norm": 1.6615636863545942, + "learning_rate": 1.6237472984141322e-05, + "loss": 0.2066, + "step": 6038 + }, + { + "epoch": 0.31, + "grad_norm": 0.8825843487781814, + "learning_rate": 1.623618558594277e-05, + "loss": 0.1996, + "step": 6039 + }, + { + "epoch": 0.31, + "grad_norm": 0.8986305451461882, + "learning_rate": 1.6234898018587336e-05, + "loss": 0.193, + "step": 6040 + }, + { + "epoch": 0.31, + "grad_norm": 0.8603017400911558, + "learning_rate": 1.623361028210994e-05, + "loss": 0.2074, + "step": 6041 + }, + { + "epoch": 0.31, + "grad_norm": 1.0757625435404399, + "learning_rate": 1.6232322376545516e-05, + "loss": 0.1834, + "step": 6042 + }, + { + "epoch": 0.31, + "grad_norm": 0.9255175015178886, + "learning_rate": 1.623103430192899e-05, + "loss": 0.1739, + "step": 6043 + }, + { + "epoch": 0.31, + "grad_norm": 1.0095454531080827, + "learning_rate": 1.6229746058295312e-05, + "loss": 0.2135, + "step": 6044 + }, + { + "epoch": 0.31, + "grad_norm": 1.7730502584757073, + "learning_rate": 1.6228457645679414e-05, + "loss": 0.2339, + "step": 6045 + }, + { + "epoch": 0.31, + "grad_norm": 0.9101283209796647, + "learning_rate": 1.6227169064116255e-05, + "loss": 0.2184, + "step": 6046 + }, + { + "epoch": 0.31, + "grad_norm": 1.4470438726261499, + "learning_rate": 1.622588031364078e-05, + "loss": 0.2147, + "step": 6047 + }, + { + "epoch": 0.31, + "grad_norm": 1.6151800077895164, + "learning_rate": 1.6224591394287954e-05, + "loss": 0.2114, + "step": 6048 + }, + { + "epoch": 0.31, + "grad_norm": 1.141700862481167, + "learning_rate": 1.6223302306092733e-05, + "loss": 0.2097, + "step": 6049 + }, + { + "epoch": 0.31, + "grad_norm": 3.6258373665253605, + "learning_rate": 1.6222013049090086e-05, + "loss": 0.2116, + "step": 6050 + }, + { + "epoch": 0.31, + "grad_norm": 1.0270473789404706, + "learning_rate": 1.6220723623314983e-05, + "loss": 0.2229, + "step": 6051 + }, + { + "epoch": 0.31, + "grad_norm": 0.9431739891813424, + "learning_rate": 1.6219434028802402e-05, + "loss": 0.2079, + "step": 6052 + }, + { + "epoch": 0.31, + "grad_norm": 1.0931954007419473, + "learning_rate": 1.621814426558732e-05, + "loss": 0.225, + "step": 6053 + }, + { + "epoch": 0.31, + "grad_norm": 1.5694243876987546, + "learning_rate": 1.6216854333704725e-05, + "loss": 0.2058, + "step": 6054 + }, + { + "epoch": 0.31, + "grad_norm": 1.858906512456257, + "learning_rate": 1.6215564233189606e-05, + "loss": 0.2106, + "step": 6055 + }, + { + "epoch": 0.31, + "grad_norm": 0.8471245246956891, + "learning_rate": 1.621427396407695e-05, + "loss": 0.2, + "step": 6056 + }, + { + "epoch": 0.31, + "grad_norm": 1.0547280866759583, + "learning_rate": 1.6212983526401767e-05, + "loss": 0.2011, + "step": 6057 + }, + { + "epoch": 0.31, + "grad_norm": 1.119564287962234, + "learning_rate": 1.6211692920199054e-05, + "loss": 0.1756, + "step": 6058 + }, + { + "epoch": 0.31, + "grad_norm": 1.1099677356408273, + "learning_rate": 1.621040214550382e-05, + "loss": 0.177, + "step": 6059 + }, + { + "epoch": 0.31, + "grad_norm": 1.0913921470754946, + "learning_rate": 1.6209111202351076e-05, + "loss": 0.22, + "step": 6060 + }, + { + "epoch": 0.31, + "grad_norm": 1.0658146194201514, + "learning_rate": 1.620782009077584e-05, + "loss": 0.2421, + "step": 6061 + }, + { + "epoch": 0.31, + "grad_norm": 2.5260663580908203, + "learning_rate": 1.6206528810813135e-05, + "loss": 0.1937, + "step": 6062 + }, + { + "epoch": 0.31, + "grad_norm": 0.9794394801934251, + "learning_rate": 1.6205237362497982e-05, + "loss": 0.2261, + "step": 6063 + }, + { + "epoch": 0.31, + "grad_norm": 1.5247938545986877, + "learning_rate": 1.6203945745865418e-05, + "loss": 0.2084, + "step": 6064 + }, + { + "epoch": 0.31, + "grad_norm": 1.242093586082393, + "learning_rate": 1.6202653960950474e-05, + "loss": 0.2221, + "step": 6065 + }, + { + "epoch": 0.31, + "grad_norm": 1.6609521159811682, + "learning_rate": 1.6201362007788193e-05, + "loss": 0.1981, + "step": 6066 + }, + { + "epoch": 0.31, + "grad_norm": 0.9717974454332525, + "learning_rate": 1.6200069886413612e-05, + "loss": 0.2035, + "step": 6067 + }, + { + "epoch": 0.31, + "grad_norm": 1.040278624958183, + "learning_rate": 1.6198777596861792e-05, + "loss": 0.2517, + "step": 6068 + }, + { + "epoch": 0.31, + "grad_norm": 1.060313397045809, + "learning_rate": 1.6197485139167775e-05, + "loss": 0.2244, + "step": 6069 + }, + { + "epoch": 0.31, + "grad_norm": 1.4185748386978991, + "learning_rate": 1.619619251336663e-05, + "loss": 0.1927, + "step": 6070 + }, + { + "epoch": 0.31, + "grad_norm": 1.1902479620651243, + "learning_rate": 1.6194899719493404e-05, + "loss": 0.2443, + "step": 6071 + }, + { + "epoch": 0.31, + "grad_norm": 1.0869353007806424, + "learning_rate": 1.619360675758318e-05, + "loss": 0.1993, + "step": 6072 + }, + { + "epoch": 0.31, + "grad_norm": 1.4174221590710505, + "learning_rate": 1.619231362767102e-05, + "loss": 0.1729, + "step": 6073 + }, + { + "epoch": 0.31, + "grad_norm": 1.0794084394279622, + "learning_rate": 1.6191020329792003e-05, + "loss": 0.1664, + "step": 6074 + }, + { + "epoch": 0.31, + "grad_norm": 0.7769533876934619, + "learning_rate": 1.6189726863981212e-05, + "loss": 0.1774, + "step": 6075 + }, + { + "epoch": 0.31, + "grad_norm": 1.0305866506804409, + "learning_rate": 1.618843323027373e-05, + "loss": 0.2087, + "step": 6076 + }, + { + "epoch": 0.31, + "grad_norm": 0.8876921764605629, + "learning_rate": 1.6187139428704645e-05, + "loss": 0.2175, + "step": 6077 + }, + { + "epoch": 0.31, + "grad_norm": 1.0822377892878023, + "learning_rate": 1.6185845459309053e-05, + "loss": 0.1939, + "step": 6078 + }, + { + "epoch": 0.31, + "grad_norm": 0.9176729470876167, + "learning_rate": 1.6184551322122056e-05, + "loss": 0.1904, + "step": 6079 + }, + { + "epoch": 0.31, + "grad_norm": 1.1716604783757618, + "learning_rate": 1.6183257017178754e-05, + "loss": 0.1996, + "step": 6080 + }, + { + "epoch": 0.31, + "grad_norm": 1.2684319856473547, + "learning_rate": 1.6181962544514257e-05, + "loss": 0.1879, + "step": 6081 + }, + { + "epoch": 0.31, + "grad_norm": 0.8193278183743143, + "learning_rate": 1.6180667904163675e-05, + "loss": 0.1868, + "step": 6082 + }, + { + "epoch": 0.31, + "grad_norm": 1.5171040957386068, + "learning_rate": 1.617937309616213e-05, + "loss": 0.219, + "step": 6083 + }, + { + "epoch": 0.31, + "grad_norm": 1.33064610781682, + "learning_rate": 1.6178078120544735e-05, + "loss": 0.205, + "step": 6084 + }, + { + "epoch": 0.31, + "grad_norm": 1.6137351930166788, + "learning_rate": 1.6176782977346626e-05, + "loss": 0.1999, + "step": 6085 + }, + { + "epoch": 0.31, + "grad_norm": 1.5597204402629181, + "learning_rate": 1.6175487666602928e-05, + "loss": 0.215, + "step": 6086 + }, + { + "epoch": 0.31, + "grad_norm": 1.1470285061130765, + "learning_rate": 1.617419218834878e-05, + "loss": 0.2121, + "step": 6087 + }, + { + "epoch": 0.31, + "grad_norm": 1.2800202663819764, + "learning_rate": 1.617289654261932e-05, + "loss": 0.2111, + "step": 6088 + }, + { + "epoch": 0.31, + "grad_norm": 1.545652925213808, + "learning_rate": 1.6171600729449693e-05, + "loss": 0.2338, + "step": 6089 + }, + { + "epoch": 0.31, + "grad_norm": 1.1290088033332193, + "learning_rate": 1.617030474887505e-05, + "loss": 0.1965, + "step": 6090 + }, + { + "epoch": 0.31, + "grad_norm": 0.969297537706917, + "learning_rate": 1.616900860093054e-05, + "loss": 0.2065, + "step": 6091 + }, + { + "epoch": 0.31, + "grad_norm": 1.190717952530125, + "learning_rate": 1.616771228565132e-05, + "loss": 0.195, + "step": 6092 + }, + { + "epoch": 0.31, + "grad_norm": 1.1595524076758887, + "learning_rate": 1.616641580307256e-05, + "loss": 0.2136, + "step": 6093 + }, + { + "epoch": 0.31, + "grad_norm": 1.6403050277216027, + "learning_rate": 1.616511915322942e-05, + "loss": 0.1861, + "step": 6094 + }, + { + "epoch": 0.31, + "grad_norm": 1.6437870280410332, + "learning_rate": 1.6163822336157076e-05, + "loss": 0.21, + "step": 6095 + }, + { + "epoch": 0.31, + "grad_norm": 1.8419726499255795, + "learning_rate": 1.6162525351890702e-05, + "loss": 0.207, + "step": 6096 + }, + { + "epoch": 0.31, + "grad_norm": 1.1212531960451904, + "learning_rate": 1.6161228200465485e-05, + "loss": 0.1988, + "step": 6097 + }, + { + "epoch": 0.31, + "grad_norm": 1.051091374373422, + "learning_rate": 1.61599308819166e-05, + "loss": 0.2186, + "step": 6098 + }, + { + "epoch": 0.31, + "grad_norm": 0.9688417139595524, + "learning_rate": 1.6158633396279243e-05, + "loss": 0.243, + "step": 6099 + }, + { + "epoch": 0.31, + "grad_norm": 1.2062547757471067, + "learning_rate": 1.6157335743588606e-05, + "loss": 0.2107, + "step": 6100 + }, + { + "epoch": 0.31, + "grad_norm": 1.0249173195322316, + "learning_rate": 1.6156037923879893e-05, + "loss": 0.2127, + "step": 6101 + }, + { + "epoch": 0.31, + "grad_norm": 1.3317006096755801, + "learning_rate": 1.61547399371883e-05, + "loss": 0.2066, + "step": 6102 + }, + { + "epoch": 0.31, + "grad_norm": 1.0908772213653302, + "learning_rate": 1.6153441783549043e-05, + "loss": 0.2107, + "step": 6103 + }, + { + "epoch": 0.31, + "grad_norm": 2.061902894350475, + "learning_rate": 1.6152143462997325e-05, + "loss": 0.2165, + "step": 6104 + }, + { + "epoch": 0.31, + "grad_norm": 1.4954267996419317, + "learning_rate": 1.615084497556837e-05, + "loss": 0.2195, + "step": 6105 + }, + { + "epoch": 0.31, + "grad_norm": 1.5649819771869884, + "learning_rate": 1.61495463212974e-05, + "loss": 0.2019, + "step": 6106 + }, + { + "epoch": 0.31, + "grad_norm": 1.2063094496662974, + "learning_rate": 1.6148247500219635e-05, + "loss": 0.2021, + "step": 6107 + }, + { + "epoch": 0.31, + "grad_norm": 0.995772199176662, + "learning_rate": 1.614694851237031e-05, + "loss": 0.1937, + "step": 6108 + }, + { + "epoch": 0.31, + "grad_norm": 2.9327743983007237, + "learning_rate": 1.614564935778466e-05, + "loss": 0.2104, + "step": 6109 + }, + { + "epoch": 0.31, + "grad_norm": 1.4621831264462208, + "learning_rate": 1.6144350036497925e-05, + "loss": 0.1952, + "step": 6110 + }, + { + "epoch": 0.31, + "grad_norm": 1.882534748644571, + "learning_rate": 1.6143050548545348e-05, + "loss": 0.1974, + "step": 6111 + }, + { + "epoch": 0.31, + "grad_norm": 2.1978580185486694, + "learning_rate": 1.6141750893962176e-05, + "loss": 0.1929, + "step": 6112 + }, + { + "epoch": 0.31, + "grad_norm": 2.1465567091981104, + "learning_rate": 1.6140451072783666e-05, + "loss": 0.2049, + "step": 6113 + }, + { + "epoch": 0.31, + "grad_norm": 1.2664074830992735, + "learning_rate": 1.613915108504507e-05, + "loss": 0.1923, + "step": 6114 + }, + { + "epoch": 0.31, + "grad_norm": 1.0980669945835926, + "learning_rate": 1.613785093078166e-05, + "loss": 0.1896, + "step": 6115 + }, + { + "epoch": 0.31, + "grad_norm": 1.631937385976018, + "learning_rate": 1.6136550610028694e-05, + "loss": 0.1885, + "step": 6116 + }, + { + "epoch": 0.31, + "grad_norm": 1.5889117710056704, + "learning_rate": 1.6135250122821445e-05, + "loss": 0.1946, + "step": 6117 + }, + { + "epoch": 0.31, + "grad_norm": 1.5134914654859153, + "learning_rate": 1.6133949469195194e-05, + "loss": 0.2162, + "step": 6118 + }, + { + "epoch": 0.31, + "grad_norm": 1.2476327546290005, + "learning_rate": 1.6132648649185214e-05, + "loss": 0.2031, + "step": 6119 + }, + { + "epoch": 0.31, + "grad_norm": 1.8453200099035407, + "learning_rate": 1.6131347662826793e-05, + "loss": 0.1868, + "step": 6120 + }, + { + "epoch": 0.31, + "grad_norm": 1.319105677189608, + "learning_rate": 1.613004651015522e-05, + "loss": 0.2438, + "step": 6121 + }, + { + "epoch": 0.31, + "grad_norm": 2.0640604006619943, + "learning_rate": 1.612874519120579e-05, + "loss": 0.2214, + "step": 6122 + }, + { + "epoch": 0.31, + "grad_norm": 1.4071140029611668, + "learning_rate": 1.61274437060138e-05, + "loss": 0.2094, + "step": 6123 + }, + { + "epoch": 0.31, + "grad_norm": 1.4588025197496317, + "learning_rate": 1.6126142054614556e-05, + "loss": 0.2015, + "step": 6124 + }, + { + "epoch": 0.31, + "grad_norm": 2.1702119223872676, + "learning_rate": 1.612484023704336e-05, + "loss": 0.1994, + "step": 6125 + }, + { + "epoch": 0.31, + "grad_norm": 1.3620761809710489, + "learning_rate": 1.6123538253335526e-05, + "loss": 0.1852, + "step": 6126 + }, + { + "epoch": 0.31, + "grad_norm": 2.4131413570897076, + "learning_rate": 1.612223610352637e-05, + "loss": 0.2007, + "step": 6127 + }, + { + "epoch": 0.31, + "grad_norm": 1.835054207929354, + "learning_rate": 1.612093378765122e-05, + "loss": 0.1974, + "step": 6128 + }, + { + "epoch": 0.31, + "grad_norm": 1.6071319397296138, + "learning_rate": 1.611963130574539e-05, + "loss": 0.2054, + "step": 6129 + }, + { + "epoch": 0.31, + "grad_norm": 1.7980661677245373, + "learning_rate": 1.6118328657844216e-05, + "loss": 0.2101, + "step": 6130 + }, + { + "epoch": 0.31, + "grad_norm": 1.5776819747938176, + "learning_rate": 1.6117025843983032e-05, + "loss": 0.2358, + "step": 6131 + }, + { + "epoch": 0.31, + "grad_norm": 2.877337042985936, + "learning_rate": 1.6115722864197178e-05, + "loss": 0.2195, + "step": 6132 + }, + { + "epoch": 0.31, + "grad_norm": 2.5470479776051027, + "learning_rate": 1.6114419718521994e-05, + "loss": 0.2126, + "step": 6133 + }, + { + "epoch": 0.31, + "grad_norm": 3.749177645710888, + "learning_rate": 1.6113116406992833e-05, + "loss": 0.1907, + "step": 6134 + }, + { + "epoch": 0.31, + "grad_norm": 1.4421813486978317, + "learning_rate": 1.611181292964504e-05, + "loss": 0.207, + "step": 6135 + }, + { + "epoch": 0.31, + "grad_norm": 1.520289919949835, + "learning_rate": 1.611050928651398e-05, + "loss": 0.2052, + "step": 6136 + }, + { + "epoch": 0.31, + "grad_norm": 1.293791051110462, + "learning_rate": 1.6109205477635006e-05, + "loss": 0.2182, + "step": 6137 + }, + { + "epoch": 0.31, + "grad_norm": 5.513386645098428, + "learning_rate": 1.610790150304349e-05, + "loss": 0.2319, + "step": 6138 + }, + { + "epoch": 0.31, + "grad_norm": 3.0876460560077863, + "learning_rate": 1.61065973627748e-05, + "loss": 0.2333, + "step": 6139 + }, + { + "epoch": 0.31, + "grad_norm": 2.1635359350738135, + "learning_rate": 1.6105293056864314e-05, + "loss": 0.1949, + "step": 6140 + }, + { + "epoch": 0.31, + "grad_norm": 1.9290528759839747, + "learning_rate": 1.6103988585347408e-05, + "loss": 0.2172, + "step": 6141 + }, + { + "epoch": 0.31, + "grad_norm": 1.5256634967346925, + "learning_rate": 1.6102683948259467e-05, + "loss": 0.1897, + "step": 6142 + }, + { + "epoch": 0.31, + "grad_norm": 1.5949036318919134, + "learning_rate": 1.610137914563588e-05, + "loss": 0.2081, + "step": 6143 + }, + { + "epoch": 0.31, + "grad_norm": 1.454851335074028, + "learning_rate": 1.6100074177512038e-05, + "loss": 0.199, + "step": 6144 + }, + { + "epoch": 0.31, + "grad_norm": 1.7598947260802447, + "learning_rate": 1.6098769043923338e-05, + "loss": 0.2103, + "step": 6145 + }, + { + "epoch": 0.31, + "grad_norm": 1.4634098370805022, + "learning_rate": 1.6097463744905187e-05, + "loss": 0.1987, + "step": 6146 + }, + { + "epoch": 0.31, + "grad_norm": 1.2310446246519797, + "learning_rate": 1.6096158280492984e-05, + "loss": 0.1591, + "step": 6147 + }, + { + "epoch": 0.31, + "grad_norm": 1.3524333215013153, + "learning_rate": 1.6094852650722145e-05, + "loss": 0.2018, + "step": 6148 + }, + { + "epoch": 0.31, + "grad_norm": 1.6759301777793807, + "learning_rate": 1.6093546855628085e-05, + "loss": 0.1788, + "step": 6149 + }, + { + "epoch": 0.31, + "grad_norm": 1.2458571119083635, + "learning_rate": 1.609224089524622e-05, + "loss": 0.2098, + "step": 6150 + }, + { + "epoch": 0.31, + "grad_norm": 1.5002867548310084, + "learning_rate": 1.6090934769611976e-05, + "loss": 0.2265, + "step": 6151 + }, + { + "epoch": 0.31, + "grad_norm": 1.1122774148111543, + "learning_rate": 1.6089628478760785e-05, + "loss": 0.2059, + "step": 6152 + }, + { + "epoch": 0.31, + "grad_norm": 1.1362389543734666, + "learning_rate": 1.6088322022728076e-05, + "loss": 0.2541, + "step": 6153 + }, + { + "epoch": 0.31, + "grad_norm": 1.268390507330005, + "learning_rate": 1.608701540154929e-05, + "loss": 0.1765, + "step": 6154 + }, + { + "epoch": 0.31, + "grad_norm": 1.6803598721739206, + "learning_rate": 1.6085708615259863e-05, + "loss": 0.1931, + "step": 6155 + }, + { + "epoch": 0.31, + "grad_norm": 3.066069526341508, + "learning_rate": 1.608440166389525e-05, + "loss": 0.1848, + "step": 6156 + }, + { + "epoch": 0.31, + "grad_norm": 1.1775493264874153, + "learning_rate": 1.6083094547490895e-05, + "loss": 0.1911, + "step": 6157 + }, + { + "epoch": 0.31, + "grad_norm": 1.316939659878754, + "learning_rate": 1.6081787266082258e-05, + "loss": 0.204, + "step": 6158 + }, + { + "epoch": 0.31, + "grad_norm": 1.1886676203084303, + "learning_rate": 1.60804798197048e-05, + "loss": 0.193, + "step": 6159 + }, + { + "epoch": 0.31, + "grad_norm": 1.0711579452613742, + "learning_rate": 1.6079172208393985e-05, + "loss": 0.1887, + "step": 6160 + }, + { + "epoch": 0.31, + "grad_norm": 1.3376326440975657, + "learning_rate": 1.607786443218528e-05, + "loss": 0.1986, + "step": 6161 + }, + { + "epoch": 0.31, + "grad_norm": 1.1083942748161777, + "learning_rate": 1.6076556491114152e-05, + "loss": 0.2099, + "step": 6162 + }, + { + "epoch": 0.31, + "grad_norm": 1.614406990008451, + "learning_rate": 1.6075248385216093e-05, + "loss": 0.2101, + "step": 6163 + }, + { + "epoch": 0.31, + "grad_norm": 1.0930356453004402, + "learning_rate": 1.6073940114526574e-05, + "loss": 0.2174, + "step": 6164 + }, + { + "epoch": 0.31, + "grad_norm": 1.0492328588433775, + "learning_rate": 1.607263167908109e-05, + "loss": 0.2359, + "step": 6165 + }, + { + "epoch": 0.31, + "grad_norm": 1.2424165398353795, + "learning_rate": 1.6071323078915128e-05, + "loss": 0.2419, + "step": 6166 + }, + { + "epoch": 0.31, + "grad_norm": 0.8259912298969864, + "learning_rate": 1.6070014314064183e-05, + "loss": 0.2406, + "step": 6167 + }, + { + "epoch": 0.31, + "grad_norm": 0.8474954794357589, + "learning_rate": 1.6068705384563757e-05, + "loss": 0.2092, + "step": 6168 + }, + { + "epoch": 0.31, + "grad_norm": 1.0906208809203402, + "learning_rate": 1.6067396290449356e-05, + "loss": 0.2131, + "step": 6169 + }, + { + "epoch": 0.31, + "grad_norm": 1.6127793598314828, + "learning_rate": 1.6066087031756485e-05, + "loss": 0.2096, + "step": 6170 + }, + { + "epoch": 0.31, + "grad_norm": 0.9349034239176398, + "learning_rate": 1.6064777608520666e-05, + "loss": 0.2152, + "step": 6171 + }, + { + "epoch": 0.31, + "grad_norm": 1.393936008162501, + "learning_rate": 1.606346802077741e-05, + "loss": 0.2057, + "step": 6172 + }, + { + "epoch": 0.31, + "grad_norm": 1.204797087355292, + "learning_rate": 1.6062158268562237e-05, + "loss": 0.207, + "step": 6173 + }, + { + "epoch": 0.31, + "grad_norm": 1.1014356914347432, + "learning_rate": 1.6060848351910685e-05, + "loss": 0.2191, + "step": 6174 + }, + { + "epoch": 0.31, + "grad_norm": 0.9548581883876565, + "learning_rate": 1.6059538270858275e-05, + "loss": 0.1874, + "step": 6175 + }, + { + "epoch": 0.31, + "grad_norm": 1.1023120629875385, + "learning_rate": 1.6058228025440548e-05, + "loss": 0.2297, + "step": 6176 + }, + { + "epoch": 0.31, + "grad_norm": 0.9500603896039294, + "learning_rate": 1.6056917615693045e-05, + "loss": 0.2156, + "step": 6177 + }, + { + "epoch": 0.31, + "grad_norm": 1.0595746338846443, + "learning_rate": 1.605560704165131e-05, + "loss": 0.1836, + "step": 6178 + }, + { + "epoch": 0.31, + "grad_norm": 1.601444715454401, + "learning_rate": 1.6054296303350886e-05, + "loss": 0.2015, + "step": 6179 + }, + { + "epoch": 0.31, + "grad_norm": 1.2146261540538896, + "learning_rate": 1.605298540082734e-05, + "loss": 0.2129, + "step": 6180 + }, + { + "epoch": 0.31, + "grad_norm": 0.873388952037026, + "learning_rate": 1.605167433411622e-05, + "loss": 0.2025, + "step": 6181 + }, + { + "epoch": 0.31, + "grad_norm": 1.726024719124285, + "learning_rate": 1.6050363103253093e-05, + "loss": 0.1764, + "step": 6182 + }, + { + "epoch": 0.31, + "grad_norm": 1.0938330909494385, + "learning_rate": 1.6049051708273526e-05, + "loss": 0.2067, + "step": 6183 + }, + { + "epoch": 0.31, + "grad_norm": 1.079586704593921, + "learning_rate": 1.6047740149213085e-05, + "loss": 0.1896, + "step": 6184 + }, + { + "epoch": 0.31, + "grad_norm": 1.2714408408105322, + "learning_rate": 1.6046428426107354e-05, + "loss": 0.2357, + "step": 6185 + }, + { + "epoch": 0.31, + "grad_norm": 1.2726345863274997, + "learning_rate": 1.604511653899191e-05, + "loss": 0.2032, + "step": 6186 + }, + { + "epoch": 0.31, + "grad_norm": 1.1207757688502018, + "learning_rate": 1.604380448790234e-05, + "loss": 0.2256, + "step": 6187 + }, + { + "epoch": 0.31, + "grad_norm": 1.093894661508314, + "learning_rate": 1.604249227287423e-05, + "loss": 0.2159, + "step": 6188 + }, + { + "epoch": 0.31, + "grad_norm": 1.197879422109431, + "learning_rate": 1.6041179893943178e-05, + "loss": 0.2238, + "step": 6189 + }, + { + "epoch": 0.31, + "grad_norm": 1.019478303665953, + "learning_rate": 1.6039867351144778e-05, + "loss": 0.19, + "step": 6190 + }, + { + "epoch": 0.31, + "grad_norm": 0.892825698089129, + "learning_rate": 1.6038554644514634e-05, + "loss": 0.2223, + "step": 6191 + }, + { + "epoch": 0.31, + "grad_norm": 1.018159470859247, + "learning_rate": 1.6037241774088355e-05, + "loss": 0.1961, + "step": 6192 + }, + { + "epoch": 0.31, + "grad_norm": 1.5166257860517596, + "learning_rate": 1.6035928739901555e-05, + "loss": 0.2292, + "step": 6193 + }, + { + "epoch": 0.31, + "grad_norm": 0.8754496142517274, + "learning_rate": 1.6034615541989845e-05, + "loss": 0.2046, + "step": 6194 + }, + { + "epoch": 0.32, + "grad_norm": 1.2530659916913465, + "learning_rate": 1.603330218038885e-05, + "loss": 0.2013, + "step": 6195 + }, + { + "epoch": 0.32, + "grad_norm": 1.2702145828391984, + "learning_rate": 1.6031988655134186e-05, + "loss": 0.1842, + "step": 6196 + }, + { + "epoch": 0.32, + "grad_norm": 1.0243935600013727, + "learning_rate": 1.6030674966261496e-05, + "loss": 0.2063, + "step": 6197 + }, + { + "epoch": 0.32, + "grad_norm": 1.0114038595335235, + "learning_rate": 1.60293611138064e-05, + "loss": 0.2125, + "step": 6198 + }, + { + "epoch": 0.32, + "grad_norm": 0.9342042489144418, + "learning_rate": 1.6028047097804548e-05, + "loss": 0.1981, + "step": 6199 + }, + { + "epoch": 0.32, + "grad_norm": 1.0782026950345764, + "learning_rate": 1.6026732918291577e-05, + "loss": 0.171, + "step": 6200 + }, + { + "epoch": 0.32, + "grad_norm": 0.8232405471508174, + "learning_rate": 1.6025418575303135e-05, + "loss": 0.2016, + "step": 6201 + }, + { + "epoch": 0.32, + "grad_norm": 0.8404034639112508, + "learning_rate": 1.6024104068874877e-05, + "loss": 0.1885, + "step": 6202 + }, + { + "epoch": 0.32, + "grad_norm": 0.9614631039331818, + "learning_rate": 1.6022789399042454e-05, + "loss": 0.2043, + "step": 6203 + }, + { + "epoch": 0.32, + "grad_norm": 1.6367109191337368, + "learning_rate": 1.6021474565841527e-05, + "loss": 0.2171, + "step": 6204 + }, + { + "epoch": 0.32, + "grad_norm": 1.109163775941658, + "learning_rate": 1.602015956930776e-05, + "loss": 0.1892, + "step": 6205 + }, + { + "epoch": 0.32, + "grad_norm": 0.9729785517016886, + "learning_rate": 1.601884440947683e-05, + "loss": 0.201, + "step": 6206 + }, + { + "epoch": 0.32, + "grad_norm": 0.9578508782413219, + "learning_rate": 1.60175290863844e-05, + "loss": 0.1853, + "step": 6207 + }, + { + "epoch": 0.32, + "grad_norm": 1.2746326590873274, + "learning_rate": 1.601621360006616e-05, + "loss": 0.2266, + "step": 6208 + }, + { + "epoch": 0.32, + "grad_norm": 0.8786848186524946, + "learning_rate": 1.601489795055778e-05, + "loss": 0.213, + "step": 6209 + }, + { + "epoch": 0.32, + "grad_norm": 0.9956355351416327, + "learning_rate": 1.6013582137894957e-05, + "loss": 0.1952, + "step": 6210 + }, + { + "epoch": 0.32, + "grad_norm": 0.8905803437082221, + "learning_rate": 1.6012266162113378e-05, + "loss": 0.1862, + "step": 6211 + }, + { + "epoch": 0.32, + "grad_norm": 1.276417244394672, + "learning_rate": 1.601095002324874e-05, + "loss": 0.2076, + "step": 6212 + }, + { + "epoch": 0.32, + "grad_norm": 1.0567990624078263, + "learning_rate": 1.6009633721336745e-05, + "loss": 0.2151, + "step": 6213 + }, + { + "epoch": 0.32, + "grad_norm": 1.103289688097003, + "learning_rate": 1.6008317256413092e-05, + "loss": 0.2407, + "step": 6214 + }, + { + "epoch": 0.32, + "grad_norm": 1.0858152536697987, + "learning_rate": 1.6007000628513498e-05, + "loss": 0.176, + "step": 6215 + }, + { + "epoch": 0.32, + "grad_norm": 1.024115694386055, + "learning_rate": 1.600568383767367e-05, + "loss": 0.1804, + "step": 6216 + }, + { + "epoch": 0.32, + "grad_norm": 1.7915259534803418, + "learning_rate": 1.600436688392933e-05, + "loss": 0.2151, + "step": 6217 + }, + { + "epoch": 0.32, + "grad_norm": 0.9311081543873564, + "learning_rate": 1.6003049767316196e-05, + "loss": 0.2032, + "step": 6218 + }, + { + "epoch": 0.32, + "grad_norm": 3.0398814378944885, + "learning_rate": 1.6001732487870002e-05, + "loss": 0.2193, + "step": 6219 + }, + { + "epoch": 0.32, + "grad_norm": 0.7722538840376368, + "learning_rate": 1.6000415045626474e-05, + "loss": 0.1869, + "step": 6220 + }, + { + "epoch": 0.32, + "grad_norm": 0.7243802802542608, + "learning_rate": 1.599909744062135e-05, + "loss": 0.1802, + "step": 6221 + }, + { + "epoch": 0.32, + "grad_norm": 0.9532863679444862, + "learning_rate": 1.5997779672890367e-05, + "loss": 0.1937, + "step": 6222 + }, + { + "epoch": 0.32, + "grad_norm": 0.7103301266208392, + "learning_rate": 1.5996461742469273e-05, + "loss": 0.1711, + "step": 6223 + }, + { + "epoch": 0.32, + "grad_norm": 0.9093473889476353, + "learning_rate": 1.5995143649393814e-05, + "loss": 0.191, + "step": 6224 + }, + { + "epoch": 0.32, + "grad_norm": 0.9615416909069292, + "learning_rate": 1.5993825393699746e-05, + "loss": 0.2005, + "step": 6225 + }, + { + "epoch": 0.32, + "grad_norm": 0.9817798027230897, + "learning_rate": 1.5992506975422826e-05, + "loss": 0.2155, + "step": 6226 + }, + { + "epoch": 0.32, + "grad_norm": 1.0178129605623831, + "learning_rate": 1.5991188394598817e-05, + "loss": 0.2074, + "step": 6227 + }, + { + "epoch": 0.32, + "grad_norm": 0.9295853846888892, + "learning_rate": 1.598986965126348e-05, + "loss": 0.1846, + "step": 6228 + }, + { + "epoch": 0.32, + "grad_norm": 2.4018557858414185, + "learning_rate": 1.5988550745452593e-05, + "loss": 0.1982, + "step": 6229 + }, + { + "epoch": 0.32, + "grad_norm": 0.9804221302826391, + "learning_rate": 1.598723167720193e-05, + "loss": 0.2109, + "step": 6230 + }, + { + "epoch": 0.32, + "grad_norm": 1.7358355239165495, + "learning_rate": 1.598591244654727e-05, + "loss": 0.1874, + "step": 6231 + }, + { + "epoch": 0.32, + "grad_norm": 0.9962790646475322, + "learning_rate": 1.5984593053524395e-05, + "loss": 0.1918, + "step": 6232 + }, + { + "epoch": 0.32, + "grad_norm": 1.035123937100931, + "learning_rate": 1.5983273498169095e-05, + "loss": 0.1729, + "step": 6233 + }, + { + "epoch": 0.32, + "grad_norm": 1.7762721872372287, + "learning_rate": 1.5981953780517166e-05, + "loss": 0.2003, + "step": 6234 + }, + { + "epoch": 0.32, + "grad_norm": 0.922755801851708, + "learning_rate": 1.59806339006044e-05, + "loss": 0.1896, + "step": 6235 + }, + { + "epoch": 0.32, + "grad_norm": 1.3481005397591015, + "learning_rate": 1.59793138584666e-05, + "loss": 0.1951, + "step": 6236 + }, + { + "epoch": 0.32, + "grad_norm": 0.7834068245640107, + "learning_rate": 1.597799365413958e-05, + "loss": 0.1814, + "step": 6237 + }, + { + "epoch": 0.32, + "grad_norm": 3.1028864518367505, + "learning_rate": 1.597667328765914e-05, + "loss": 0.2181, + "step": 6238 + }, + { + "epoch": 0.32, + "grad_norm": 0.9488861711209522, + "learning_rate": 1.5975352759061103e-05, + "loss": 0.2215, + "step": 6239 + }, + { + "epoch": 0.32, + "grad_norm": 1.3137009303969078, + "learning_rate": 1.597403206838128e-05, + "loss": 0.1913, + "step": 6240 + }, + { + "epoch": 0.32, + "grad_norm": 1.0603030370685838, + "learning_rate": 1.5972711215655504e-05, + "loss": 0.2342, + "step": 6241 + }, + { + "epoch": 0.32, + "grad_norm": 1.005721988712769, + "learning_rate": 1.5971390200919597e-05, + "loss": 0.1838, + "step": 6242 + }, + { + "epoch": 0.32, + "grad_norm": 1.9540413565559143, + "learning_rate": 1.5970069024209394e-05, + "loss": 0.2288, + "step": 6243 + }, + { + "epoch": 0.32, + "grad_norm": 1.056861645845795, + "learning_rate": 1.596874768556073e-05, + "loss": 0.2376, + "step": 6244 + }, + { + "epoch": 0.32, + "grad_norm": 1.8608096734735928, + "learning_rate": 1.5967426185009448e-05, + "loss": 0.1994, + "step": 6245 + }, + { + "epoch": 0.32, + "grad_norm": 0.9993820475578914, + "learning_rate": 1.5966104522591397e-05, + "loss": 0.2048, + "step": 6246 + }, + { + "epoch": 0.32, + "grad_norm": 1.052166738593178, + "learning_rate": 1.5964782698342423e-05, + "loss": 0.2037, + "step": 6247 + }, + { + "epoch": 0.32, + "grad_norm": 2.4623813432862196, + "learning_rate": 1.596346071229838e-05, + "loss": 0.2014, + "step": 6248 + }, + { + "epoch": 0.32, + "grad_norm": 1.4257348201093032, + "learning_rate": 1.5962138564495128e-05, + "loss": 0.2111, + "step": 6249 + }, + { + "epoch": 0.32, + "grad_norm": 0.8422300324358466, + "learning_rate": 1.596081625496853e-05, + "loss": 0.2061, + "step": 6250 + }, + { + "epoch": 0.32, + "grad_norm": 0.8232571074717809, + "learning_rate": 1.5959493783754456e-05, + "loss": 0.2089, + "step": 6251 + }, + { + "epoch": 0.32, + "grad_norm": 1.7018511761221633, + "learning_rate": 1.595817115088878e-05, + "loss": 0.1917, + "step": 6252 + }, + { + "epoch": 0.32, + "grad_norm": 0.9579800641887645, + "learning_rate": 1.5956848356407365e-05, + "loss": 0.2103, + "step": 6253 + }, + { + "epoch": 0.32, + "grad_norm": 1.010412464498688, + "learning_rate": 1.595552540034611e-05, + "loss": 0.2362, + "step": 6254 + }, + { + "epoch": 0.32, + "grad_norm": 0.8006817815454528, + "learning_rate": 1.595420228274089e-05, + "loss": 0.1742, + "step": 6255 + }, + { + "epoch": 0.32, + "grad_norm": 0.9325631107213238, + "learning_rate": 1.59528790036276e-05, + "loss": 0.1993, + "step": 6256 + }, + { + "epoch": 0.32, + "grad_norm": 1.926514554437551, + "learning_rate": 1.5951555563042128e-05, + "loss": 0.2306, + "step": 6257 + }, + { + "epoch": 0.32, + "grad_norm": 1.7324924450988743, + "learning_rate": 1.5950231961020373e-05, + "loss": 0.214, + "step": 6258 + }, + { + "epoch": 0.32, + "grad_norm": 1.0247474294843106, + "learning_rate": 1.5948908197598242e-05, + "loss": 0.1911, + "step": 6259 + }, + { + "epoch": 0.32, + "grad_norm": 1.1524455317274755, + "learning_rate": 1.594758427281164e-05, + "loss": 0.2054, + "step": 6260 + }, + { + "epoch": 0.32, + "grad_norm": 1.3710386575529927, + "learning_rate": 1.5946260186696477e-05, + "loss": 0.236, + "step": 6261 + }, + { + "epoch": 0.32, + "grad_norm": 1.177146195681321, + "learning_rate": 1.5944935939288675e-05, + "loss": 0.1931, + "step": 6262 + }, + { + "epoch": 0.32, + "grad_norm": 1.0671464671711801, + "learning_rate": 1.5943611530624147e-05, + "loss": 0.2121, + "step": 6263 + }, + { + "epoch": 0.32, + "grad_norm": 0.9365291413240289, + "learning_rate": 1.594228696073882e-05, + "loss": 0.201, + "step": 6264 + }, + { + "epoch": 0.32, + "grad_norm": 0.9882062230623252, + "learning_rate": 1.5940962229668625e-05, + "loss": 0.2131, + "step": 6265 + }, + { + "epoch": 0.32, + "grad_norm": 1.0502799091230823, + "learning_rate": 1.5939637337449493e-05, + "loss": 0.2051, + "step": 6266 + }, + { + "epoch": 0.32, + "grad_norm": 0.86354473763294, + "learning_rate": 1.5938312284117367e-05, + "loss": 0.1884, + "step": 6267 + }, + { + "epoch": 0.32, + "grad_norm": 1.3278074739442671, + "learning_rate": 1.593698706970818e-05, + "loss": 0.2101, + "step": 6268 + }, + { + "epoch": 0.32, + "grad_norm": 1.4480295763019384, + "learning_rate": 1.593566169425788e-05, + "loss": 0.2023, + "step": 6269 + }, + { + "epoch": 0.32, + "grad_norm": 1.2014894821170918, + "learning_rate": 1.5934336157802427e-05, + "loss": 0.2066, + "step": 6270 + }, + { + "epoch": 0.32, + "grad_norm": 1.47506376769695, + "learning_rate": 1.593301046037777e-05, + "loss": 0.2199, + "step": 6271 + }, + { + "epoch": 0.32, + "grad_norm": 1.1430677599552572, + "learning_rate": 1.5931684602019866e-05, + "loss": 0.2121, + "step": 6272 + }, + { + "epoch": 0.32, + "grad_norm": 0.856863370411823, + "learning_rate": 1.5930358582764686e-05, + "loss": 0.2003, + "step": 6273 + }, + { + "epoch": 0.32, + "grad_norm": 0.8649431457227784, + "learning_rate": 1.5929032402648194e-05, + "loss": 0.1998, + "step": 6274 + }, + { + "epoch": 0.32, + "grad_norm": 0.8653885888780428, + "learning_rate": 1.5927706061706363e-05, + "loss": 0.2049, + "step": 6275 + }, + { + "epoch": 0.32, + "grad_norm": 0.8381520198334308, + "learning_rate": 1.592637955997517e-05, + "loss": 0.1964, + "step": 6276 + }, + { + "epoch": 0.32, + "grad_norm": 1.1500336757509857, + "learning_rate": 1.5925052897490595e-05, + "loss": 0.1976, + "step": 6277 + }, + { + "epoch": 0.32, + "grad_norm": 1.0400983335658338, + "learning_rate": 1.5923726074288627e-05, + "loss": 0.2028, + "step": 6278 + }, + { + "epoch": 0.32, + "grad_norm": 0.7803330730926793, + "learning_rate": 1.5922399090405255e-05, + "loss": 0.1951, + "step": 6279 + }, + { + "epoch": 0.32, + "grad_norm": 1.2882797730496849, + "learning_rate": 1.592107194587647e-05, + "loss": 0.2098, + "step": 6280 + }, + { + "epoch": 0.32, + "grad_norm": 0.949332819796093, + "learning_rate": 1.591974464073828e-05, + "loss": 0.1939, + "step": 6281 + }, + { + "epoch": 0.32, + "grad_norm": 0.8669114548502495, + "learning_rate": 1.591841717502668e-05, + "loss": 0.2039, + "step": 6282 + }, + { + "epoch": 0.32, + "grad_norm": 0.8760982488429814, + "learning_rate": 1.5917089548777678e-05, + "loss": 0.1997, + "step": 6283 + }, + { + "epoch": 0.32, + "grad_norm": 3.425809164271548, + "learning_rate": 1.591576176202729e-05, + "loss": 0.2227, + "step": 6284 + }, + { + "epoch": 0.32, + "grad_norm": 1.0285643242186238, + "learning_rate": 1.5914433814811524e-05, + "loss": 0.2049, + "step": 6285 + }, + { + "epoch": 0.32, + "grad_norm": 0.9789355523220069, + "learning_rate": 1.591310570716641e-05, + "loss": 0.2057, + "step": 6286 + }, + { + "epoch": 0.32, + "grad_norm": 0.9412502930882807, + "learning_rate": 1.5911777439127975e-05, + "loss": 0.1977, + "step": 6287 + }, + { + "epoch": 0.32, + "grad_norm": 1.021924695120764, + "learning_rate": 1.591044901073224e-05, + "loss": 0.2136, + "step": 6288 + }, + { + "epoch": 0.32, + "grad_norm": 1.0255290361753848, + "learning_rate": 1.590912042201524e-05, + "loss": 0.196, + "step": 6289 + }, + { + "epoch": 0.32, + "grad_norm": 0.9600707351927388, + "learning_rate": 1.5907791673013016e-05, + "loss": 0.1881, + "step": 6290 + }, + { + "epoch": 0.32, + "grad_norm": 1.2638585822881137, + "learning_rate": 1.5906462763761606e-05, + "loss": 0.1805, + "step": 6291 + }, + { + "epoch": 0.32, + "grad_norm": 1.0002196629413556, + "learning_rate": 1.5905133694297065e-05, + "loss": 0.2008, + "step": 6292 + }, + { + "epoch": 0.32, + "grad_norm": 1.0106544911549504, + "learning_rate": 1.5903804464655437e-05, + "loss": 0.2092, + "step": 6293 + }, + { + "epoch": 0.32, + "grad_norm": 1.314786481203474, + "learning_rate": 1.590247507487278e-05, + "loss": 0.2297, + "step": 6294 + }, + { + "epoch": 0.32, + "grad_norm": 2.112341901630231, + "learning_rate": 1.5901145524985155e-05, + "loss": 0.1985, + "step": 6295 + }, + { + "epoch": 0.32, + "grad_norm": 1.1707892567465332, + "learning_rate": 1.5899815815028618e-05, + "loss": 0.2032, + "step": 6296 + }, + { + "epoch": 0.32, + "grad_norm": 1.0298589833768144, + "learning_rate": 1.589848594503925e-05, + "loss": 0.2026, + "step": 6297 + }, + { + "epoch": 0.32, + "grad_norm": 1.1082543126402602, + "learning_rate": 1.5897155915053113e-05, + "loss": 0.2433, + "step": 6298 + }, + { + "epoch": 0.32, + "grad_norm": 1.6329212771296873, + "learning_rate": 1.5895825725106295e-05, + "loss": 0.1874, + "step": 6299 + }, + { + "epoch": 0.32, + "grad_norm": 1.1679904879141962, + "learning_rate": 1.5894495375234865e-05, + "loss": 0.2348, + "step": 6300 + }, + { + "epoch": 0.32, + "grad_norm": 0.7321960858669198, + "learning_rate": 1.589316486547492e-05, + "loss": 0.1893, + "step": 6301 + }, + { + "epoch": 0.32, + "grad_norm": 0.8250329944711491, + "learning_rate": 1.589183419586254e-05, + "loss": 0.2105, + "step": 6302 + }, + { + "epoch": 0.32, + "grad_norm": 0.8952598550183528, + "learning_rate": 1.5890503366433827e-05, + "loss": 0.217, + "step": 6303 + }, + { + "epoch": 0.32, + "grad_norm": 1.930619450747476, + "learning_rate": 1.5889172377224877e-05, + "loss": 0.208, + "step": 6304 + }, + { + "epoch": 0.32, + "grad_norm": 0.805703441367176, + "learning_rate": 1.5887841228271794e-05, + "loss": 0.1803, + "step": 6305 + }, + { + "epoch": 0.32, + "grad_norm": 1.1449471204861255, + "learning_rate": 1.5886509919610683e-05, + "loss": 0.2274, + "step": 6306 + }, + { + "epoch": 0.32, + "grad_norm": 1.0149187593723974, + "learning_rate": 1.588517845127766e-05, + "loss": 0.2062, + "step": 6307 + }, + { + "epoch": 0.32, + "grad_norm": 1.517302821901913, + "learning_rate": 1.5883846823308843e-05, + "loss": 0.1797, + "step": 6308 + }, + { + "epoch": 0.32, + "grad_norm": 0.81017351323962, + "learning_rate": 1.5882515035740345e-05, + "loss": 0.2159, + "step": 6309 + }, + { + "epoch": 0.32, + "grad_norm": 0.8318977084874352, + "learning_rate": 1.5881183088608293e-05, + "loss": 0.2093, + "step": 6310 + }, + { + "epoch": 0.32, + "grad_norm": 1.681851519727326, + "learning_rate": 1.587985098194882e-05, + "loss": 0.1841, + "step": 6311 + }, + { + "epoch": 0.32, + "grad_norm": 0.8934676155716955, + "learning_rate": 1.5878518715798053e-05, + "loss": 0.1861, + "step": 6312 + }, + { + "epoch": 0.32, + "grad_norm": 0.9779274252743558, + "learning_rate": 1.587718629019214e-05, + "loss": 0.213, + "step": 6313 + }, + { + "epoch": 0.32, + "grad_norm": 0.9371299745261433, + "learning_rate": 1.5875853705167214e-05, + "loss": 0.193, + "step": 6314 + }, + { + "epoch": 0.32, + "grad_norm": 0.8759217719403662, + "learning_rate": 1.5874520960759423e-05, + "loss": 0.1968, + "step": 6315 + }, + { + "epoch": 0.32, + "grad_norm": 1.1231242328686373, + "learning_rate": 1.5873188057004924e-05, + "loss": 0.1896, + "step": 6316 + }, + { + "epoch": 0.32, + "grad_norm": 1.0473174398740164, + "learning_rate": 1.5871854993939862e-05, + "loss": 0.2339, + "step": 6317 + }, + { + "epoch": 0.32, + "grad_norm": 1.1763710546308124, + "learning_rate": 1.5870521771600402e-05, + "loss": 0.2014, + "step": 6318 + }, + { + "epoch": 0.32, + "grad_norm": 1.0078407380626875, + "learning_rate": 1.5869188390022708e-05, + "loss": 0.2139, + "step": 6319 + }, + { + "epoch": 0.32, + "grad_norm": 1.1132752101430765, + "learning_rate": 1.586785484924295e-05, + "loss": 0.213, + "step": 6320 + }, + { + "epoch": 0.32, + "grad_norm": 1.1250644268142815, + "learning_rate": 1.5866521149297294e-05, + "loss": 0.2078, + "step": 6321 + }, + { + "epoch": 0.32, + "grad_norm": 1.0358756499365598, + "learning_rate": 1.586518729022192e-05, + "loss": 0.2045, + "step": 6322 + }, + { + "epoch": 0.32, + "grad_norm": 1.4276322383882027, + "learning_rate": 1.5863853272053017e-05, + "loss": 0.2078, + "step": 6323 + }, + { + "epoch": 0.32, + "grad_norm": 1.1332020736682589, + "learning_rate": 1.5862519094826753e-05, + "loss": 0.1952, + "step": 6324 + }, + { + "epoch": 0.32, + "grad_norm": 1.3461541846759482, + "learning_rate": 1.5861184758579332e-05, + "loss": 0.1962, + "step": 6325 + }, + { + "epoch": 0.32, + "grad_norm": 0.9436324996098256, + "learning_rate": 1.5859850263346945e-05, + "loss": 0.1897, + "step": 6326 + }, + { + "epoch": 0.32, + "grad_norm": 1.08030280388521, + "learning_rate": 1.5858515609165786e-05, + "loss": 0.2137, + "step": 6327 + }, + { + "epoch": 0.32, + "grad_norm": 1.2932692522911984, + "learning_rate": 1.5857180796072064e-05, + "loss": 0.1772, + "step": 6328 + }, + { + "epoch": 0.32, + "grad_norm": 1.0491984582849074, + "learning_rate": 1.5855845824101978e-05, + "loss": 0.1888, + "step": 6329 + }, + { + "epoch": 0.32, + "grad_norm": 1.023037106386847, + "learning_rate": 1.5854510693291744e-05, + "loss": 0.1993, + "step": 6330 + }, + { + "epoch": 0.32, + "grad_norm": 0.8958489251993921, + "learning_rate": 1.5853175403677577e-05, + "loss": 0.216, + "step": 6331 + }, + { + "epoch": 0.32, + "grad_norm": 0.9890468500643292, + "learning_rate": 1.5851839955295697e-05, + "loss": 0.2089, + "step": 6332 + }, + { + "epoch": 0.32, + "grad_norm": 1.1823366094267416, + "learning_rate": 1.5850504348182327e-05, + "loss": 0.2098, + "step": 6333 + }, + { + "epoch": 0.32, + "grad_norm": 1.427086580965463, + "learning_rate": 1.5849168582373698e-05, + "loss": 0.2417, + "step": 6334 + }, + { + "epoch": 0.32, + "grad_norm": 0.9748347836779382, + "learning_rate": 1.584783265790604e-05, + "loss": 0.2211, + "step": 6335 + }, + { + "epoch": 0.32, + "grad_norm": 0.8419382988560267, + "learning_rate": 1.584649657481559e-05, + "loss": 0.2043, + "step": 6336 + }, + { + "epoch": 0.32, + "grad_norm": 1.0180216149981338, + "learning_rate": 1.5845160333138593e-05, + "loss": 0.2057, + "step": 6337 + }, + { + "epoch": 0.32, + "grad_norm": 2.438738728038019, + "learning_rate": 1.584382393291129e-05, + "loss": 0.2175, + "step": 6338 + }, + { + "epoch": 0.32, + "grad_norm": 0.8929441096966545, + "learning_rate": 1.584248737416993e-05, + "loss": 0.1956, + "step": 6339 + }, + { + "epoch": 0.32, + "grad_norm": 3.4119312182239425, + "learning_rate": 1.584115065695077e-05, + "loss": 0.2038, + "step": 6340 + }, + { + "epoch": 0.32, + "grad_norm": 0.9668934126268923, + "learning_rate": 1.583981378129007e-05, + "loss": 0.2106, + "step": 6341 + }, + { + "epoch": 0.32, + "grad_norm": 1.194773506228996, + "learning_rate": 1.5838476747224096e-05, + "loss": 0.2029, + "step": 6342 + }, + { + "epoch": 0.32, + "grad_norm": 1.4189946860982916, + "learning_rate": 1.5837139554789106e-05, + "loss": 0.1852, + "step": 6343 + }, + { + "epoch": 0.32, + "grad_norm": 1.7200228788960104, + "learning_rate": 1.5835802204021374e-05, + "loss": 0.1945, + "step": 6344 + }, + { + "epoch": 0.32, + "grad_norm": 1.049067356353654, + "learning_rate": 1.583446469495718e-05, + "loss": 0.2132, + "step": 6345 + }, + { + "epoch": 0.32, + "grad_norm": 0.8484716174067365, + "learning_rate": 1.5833127027632803e-05, + "loss": 0.1767, + "step": 6346 + }, + { + "epoch": 0.32, + "grad_norm": 1.2644673501039674, + "learning_rate": 1.5831789202084524e-05, + "loss": 0.196, + "step": 6347 + }, + { + "epoch": 0.32, + "grad_norm": 1.0563048846551186, + "learning_rate": 1.5830451218348634e-05, + "loss": 0.1971, + "step": 6348 + }, + { + "epoch": 0.32, + "grad_norm": 0.8555301237648472, + "learning_rate": 1.5829113076461426e-05, + "loss": 0.1986, + "step": 6349 + }, + { + "epoch": 0.32, + "grad_norm": 1.0299193138241485, + "learning_rate": 1.5827774776459195e-05, + "loss": 0.1942, + "step": 6350 + }, + { + "epoch": 0.32, + "grad_norm": 1.4125485894893952, + "learning_rate": 1.5826436318378248e-05, + "loss": 0.1968, + "step": 6351 + }, + { + "epoch": 0.32, + "grad_norm": 0.9133171869368052, + "learning_rate": 1.5825097702254885e-05, + "loss": 0.2082, + "step": 6352 + }, + { + "epoch": 0.32, + "grad_norm": 1.330384605741934, + "learning_rate": 1.5823758928125416e-05, + "loss": 0.2016, + "step": 6353 + }, + { + "epoch": 0.32, + "grad_norm": 1.4065688789992765, + "learning_rate": 1.5822419996026156e-05, + "loss": 0.1982, + "step": 6354 + }, + { + "epoch": 0.32, + "grad_norm": 1.0543744183417312, + "learning_rate": 1.5821080905993425e-05, + "loss": 0.2171, + "step": 6355 + }, + { + "epoch": 0.32, + "grad_norm": 1.010192278983979, + "learning_rate": 1.5819741658063548e-05, + "loss": 0.209, + "step": 6356 + }, + { + "epoch": 0.32, + "grad_norm": 1.1998783585968111, + "learning_rate": 1.5818402252272852e-05, + "loss": 0.2112, + "step": 6357 + }, + { + "epoch": 0.32, + "grad_norm": 1.3962202709631413, + "learning_rate": 1.581706268865766e-05, + "loss": 0.1958, + "step": 6358 + }, + { + "epoch": 0.32, + "grad_norm": 1.154206370575021, + "learning_rate": 1.5815722967254317e-05, + "loss": 0.2085, + "step": 6359 + }, + { + "epoch": 0.32, + "grad_norm": 1.9722024390165005, + "learning_rate": 1.5814383088099164e-05, + "loss": 0.2087, + "step": 6360 + }, + { + "epoch": 0.32, + "grad_norm": 0.9929739020758829, + "learning_rate": 1.5813043051228536e-05, + "loss": 0.2216, + "step": 6361 + }, + { + "epoch": 0.32, + "grad_norm": 1.016156213865893, + "learning_rate": 1.581170285667879e-05, + "loss": 0.1903, + "step": 6362 + }, + { + "epoch": 0.32, + "grad_norm": 2.5700458159966972, + "learning_rate": 1.5810362504486274e-05, + "loss": 0.2197, + "step": 6363 + }, + { + "epoch": 0.32, + "grad_norm": 1.1682437605649398, + "learning_rate": 1.5809021994687346e-05, + "loss": 0.1977, + "step": 6364 + }, + { + "epoch": 0.32, + "grad_norm": 1.2753786330700614, + "learning_rate": 1.5807681327318372e-05, + "loss": 0.2343, + "step": 6365 + }, + { + "epoch": 0.32, + "grad_norm": 1.130208997823953, + "learning_rate": 1.580634050241571e-05, + "loss": 0.1902, + "step": 6366 + }, + { + "epoch": 0.32, + "grad_norm": 1.1777548703995049, + "learning_rate": 1.5804999520015735e-05, + "loss": 0.1963, + "step": 6367 + }, + { + "epoch": 0.32, + "grad_norm": 0.9107676559244275, + "learning_rate": 1.5803658380154822e-05, + "loss": 0.1923, + "step": 6368 + }, + { + "epoch": 0.32, + "grad_norm": 1.0898334399860248, + "learning_rate": 1.5802317082869346e-05, + "loss": 0.1831, + "step": 6369 + }, + { + "epoch": 0.32, + "grad_norm": 1.3729466617948922, + "learning_rate": 1.5800975628195692e-05, + "loss": 0.1964, + "step": 6370 + }, + { + "epoch": 0.32, + "grad_norm": 0.9014294103357848, + "learning_rate": 1.5799634016170245e-05, + "loss": 0.1943, + "step": 6371 + }, + { + "epoch": 0.32, + "grad_norm": 1.0305709033064328, + "learning_rate": 1.57982922468294e-05, + "loss": 0.1945, + "step": 6372 + }, + { + "epoch": 0.32, + "grad_norm": 0.955380919591037, + "learning_rate": 1.5796950320209548e-05, + "loss": 0.2081, + "step": 6373 + }, + { + "epoch": 0.32, + "grad_norm": 0.9686561104117304, + "learning_rate": 1.5795608236347092e-05, + "loss": 0.1725, + "step": 6374 + }, + { + "epoch": 0.32, + "grad_norm": 0.9817649288066675, + "learning_rate": 1.5794265995278438e-05, + "loss": 0.2223, + "step": 6375 + }, + { + "epoch": 0.32, + "grad_norm": 0.9605643115340182, + "learning_rate": 1.5792923597039988e-05, + "loss": 0.1965, + "step": 6376 + }, + { + "epoch": 0.32, + "grad_norm": 0.9827142089808804, + "learning_rate": 1.579158104166816e-05, + "loss": 0.2116, + "step": 6377 + }, + { + "epoch": 0.32, + "grad_norm": 1.9111504054520023, + "learning_rate": 1.579023832919937e-05, + "loss": 0.1855, + "step": 6378 + }, + { + "epoch": 0.32, + "grad_norm": 1.2261708996905023, + "learning_rate": 1.5788895459670036e-05, + "loss": 0.2072, + "step": 6379 + }, + { + "epoch": 0.32, + "grad_norm": 1.2240954515580862, + "learning_rate": 1.5787552433116586e-05, + "loss": 0.2059, + "step": 6380 + }, + { + "epoch": 0.32, + "grad_norm": 0.9802222648434512, + "learning_rate": 1.5786209249575452e-05, + "loss": 0.2064, + "step": 6381 + }, + { + "epoch": 0.32, + "grad_norm": 1.4678277710491507, + "learning_rate": 1.5784865909083064e-05, + "loss": 0.2009, + "step": 6382 + }, + { + "epoch": 0.32, + "grad_norm": 2.0693945382145387, + "learning_rate": 1.5783522411675857e-05, + "loss": 0.2025, + "step": 6383 + }, + { + "epoch": 0.32, + "grad_norm": 1.5394552546625184, + "learning_rate": 1.5782178757390284e-05, + "loss": 0.1903, + "step": 6384 + }, + { + "epoch": 0.32, + "grad_norm": 1.0743984781457123, + "learning_rate": 1.5780834946262782e-05, + "loss": 0.2184, + "step": 6385 + }, + { + "epoch": 0.32, + "grad_norm": 1.585305608844427, + "learning_rate": 1.5779490978329806e-05, + "loss": 0.2141, + "step": 6386 + }, + { + "epoch": 0.32, + "grad_norm": 0.9386347211553212, + "learning_rate": 1.5778146853627813e-05, + "loss": 0.1865, + "step": 6387 + }, + { + "epoch": 0.32, + "grad_norm": 1.0530458992126261, + "learning_rate": 1.577680257219326e-05, + "loss": 0.1897, + "step": 6388 + }, + { + "epoch": 0.32, + "grad_norm": 1.2499711672577036, + "learning_rate": 1.577545813406261e-05, + "loss": 0.2078, + "step": 6389 + }, + { + "epoch": 0.32, + "grad_norm": 1.127326725810852, + "learning_rate": 1.5774113539272332e-05, + "loss": 0.2238, + "step": 6390 + }, + { + "epoch": 0.32, + "grad_norm": 1.3929687223303266, + "learning_rate": 1.57727687878589e-05, + "loss": 0.1949, + "step": 6391 + }, + { + "epoch": 0.33, + "grad_norm": 0.9969779103098971, + "learning_rate": 1.5771423879858783e-05, + "loss": 0.1882, + "step": 6392 + }, + { + "epoch": 0.33, + "grad_norm": 1.41306573541083, + "learning_rate": 1.5770078815308473e-05, + "loss": 0.1869, + "step": 6393 + }, + { + "epoch": 0.33, + "grad_norm": 0.838933203170488, + "learning_rate": 1.5768733594244445e-05, + "loss": 0.2058, + "step": 6394 + }, + { + "epoch": 0.33, + "grad_norm": 1.0659843745048814, + "learning_rate": 1.5767388216703196e-05, + "loss": 0.2124, + "step": 6395 + }, + { + "epoch": 0.33, + "grad_norm": 1.152946095323606, + "learning_rate": 1.5766042682721213e-05, + "loss": 0.2095, + "step": 6396 + }, + { + "epoch": 0.33, + "grad_norm": 2.0228491280513095, + "learning_rate": 1.5764696992335e-05, + "loss": 0.2322, + "step": 6397 + }, + { + "epoch": 0.33, + "grad_norm": 1.3934301752230933, + "learning_rate": 1.5763351145581052e-05, + "loss": 0.2031, + "step": 6398 + }, + { + "epoch": 0.33, + "grad_norm": 1.0083370881264966, + "learning_rate": 1.576200514249588e-05, + "loss": 0.217, + "step": 6399 + }, + { + "epoch": 0.33, + "grad_norm": 1.4430508995785853, + "learning_rate": 1.576065898311599e-05, + "loss": 0.1991, + "step": 6400 + }, + { + "epoch": 0.33, + "grad_norm": 1.6340672012097428, + "learning_rate": 1.5759312667477907e-05, + "loss": 0.1983, + "step": 6401 + }, + { + "epoch": 0.33, + "grad_norm": 1.6808778569379283, + "learning_rate": 1.575796619561814e-05, + "loss": 0.1861, + "step": 6402 + }, + { + "epoch": 0.33, + "grad_norm": 1.2748513907509549, + "learning_rate": 1.5756619567573213e-05, + "loss": 0.2164, + "step": 6403 + }, + { + "epoch": 0.33, + "grad_norm": 0.936128174627036, + "learning_rate": 1.5755272783379656e-05, + "loss": 0.1953, + "step": 6404 + }, + { + "epoch": 0.33, + "grad_norm": 1.2888785381934886, + "learning_rate": 1.5753925843073997e-05, + "loss": 0.2246, + "step": 6405 + }, + { + "epoch": 0.33, + "grad_norm": 1.078888056580051, + "learning_rate": 1.5752578746692776e-05, + "loss": 0.1997, + "step": 6406 + }, + { + "epoch": 0.33, + "grad_norm": 0.9884413373743257, + "learning_rate": 1.5751231494272535e-05, + "loss": 0.1972, + "step": 6407 + }, + { + "epoch": 0.33, + "grad_norm": 1.882073537819897, + "learning_rate": 1.574988408584981e-05, + "loss": 0.2124, + "step": 6408 + }, + { + "epoch": 0.33, + "grad_norm": 0.8856297252603791, + "learning_rate": 1.574853652146116e-05, + "loss": 0.2084, + "step": 6409 + }, + { + "epoch": 0.33, + "grad_norm": 0.9096131648444687, + "learning_rate": 1.5747188801143126e-05, + "loss": 0.2104, + "step": 6410 + }, + { + "epoch": 0.33, + "grad_norm": 1.1483305633397578, + "learning_rate": 1.574584092493227e-05, + "loss": 0.2229, + "step": 6411 + }, + { + "epoch": 0.33, + "grad_norm": 1.2905589927471228, + "learning_rate": 1.574449289286516e-05, + "loss": 0.1895, + "step": 6412 + }, + { + "epoch": 0.33, + "grad_norm": 1.055202980082456, + "learning_rate": 1.5743144704978358e-05, + "loss": 0.2162, + "step": 6413 + }, + { + "epoch": 0.33, + "grad_norm": 1.571357528956249, + "learning_rate": 1.5741796361308424e-05, + "loss": 0.2185, + "step": 6414 + }, + { + "epoch": 0.33, + "grad_norm": 1.27518127262942, + "learning_rate": 1.5740447861891946e-05, + "loss": 0.2178, + "step": 6415 + }, + { + "epoch": 0.33, + "grad_norm": 1.1005877561884827, + "learning_rate": 1.573909920676549e-05, + "loss": 0.2353, + "step": 6416 + }, + { + "epoch": 0.33, + "grad_norm": 0.8713251975400061, + "learning_rate": 1.5737750395965646e-05, + "loss": 0.1908, + "step": 6417 + }, + { + "epoch": 0.33, + "grad_norm": 1.2112967064517417, + "learning_rate": 1.5736401429529e-05, + "loss": 0.1913, + "step": 6418 + }, + { + "epoch": 0.33, + "grad_norm": 1.9991676495662414, + "learning_rate": 1.573505230749214e-05, + "loss": 0.1855, + "step": 6419 + }, + { + "epoch": 0.33, + "grad_norm": 0.8813185706705601, + "learning_rate": 1.573370302989166e-05, + "loss": 0.211, + "step": 6420 + }, + { + "epoch": 0.33, + "grad_norm": 1.2830617593144382, + "learning_rate": 1.5732353596764168e-05, + "loss": 0.19, + "step": 6421 + }, + { + "epoch": 0.33, + "grad_norm": 1.3583092644558936, + "learning_rate": 1.5731004008146255e-05, + "loss": 0.1827, + "step": 6422 + }, + { + "epoch": 0.33, + "grad_norm": 1.0762212789850942, + "learning_rate": 1.5729654264074536e-05, + "loss": 0.1934, + "step": 6423 + }, + { + "epoch": 0.33, + "grad_norm": 1.2099248654148724, + "learning_rate": 1.572830436458562e-05, + "loss": 0.1947, + "step": 6424 + }, + { + "epoch": 0.33, + "grad_norm": 1.592646821803759, + "learning_rate": 1.5726954309716128e-05, + "loss": 0.1849, + "step": 6425 + }, + { + "epoch": 0.33, + "grad_norm": 1.0505585309130414, + "learning_rate": 1.5725604099502673e-05, + "loss": 0.2032, + "step": 6426 + }, + { + "epoch": 0.33, + "grad_norm": 1.5026472365739723, + "learning_rate": 1.572425373398189e-05, + "loss": 0.1994, + "step": 6427 + }, + { + "epoch": 0.33, + "grad_norm": 0.8287932239533964, + "learning_rate": 1.5722903213190393e-05, + "loss": 0.1654, + "step": 6428 + }, + { + "epoch": 0.33, + "grad_norm": 1.0151895468230154, + "learning_rate": 1.572155253716483e-05, + "loss": 0.1894, + "step": 6429 + }, + { + "epoch": 0.33, + "grad_norm": 1.813161979459579, + "learning_rate": 1.5720201705941827e-05, + "loss": 0.2153, + "step": 6430 + }, + { + "epoch": 0.33, + "grad_norm": 1.4623926629210309, + "learning_rate": 1.571885071955803e-05, + "loss": 0.1959, + "step": 6431 + }, + { + "epoch": 0.33, + "grad_norm": 2.197393529509465, + "learning_rate": 1.5717499578050083e-05, + "loss": 0.2121, + "step": 6432 + }, + { + "epoch": 0.33, + "grad_norm": 0.8507227862656661, + "learning_rate": 1.571614828145464e-05, + "loss": 0.1791, + "step": 6433 + }, + { + "epoch": 0.33, + "grad_norm": 0.9143042226597085, + "learning_rate": 1.5714796829808352e-05, + "loss": 0.2176, + "step": 6434 + }, + { + "epoch": 0.33, + "grad_norm": 1.0246240592941134, + "learning_rate": 1.5713445223147876e-05, + "loss": 0.2225, + "step": 6435 + }, + { + "epoch": 0.33, + "grad_norm": 1.2215280314440582, + "learning_rate": 1.5712093461509878e-05, + "loss": 0.2066, + "step": 6436 + }, + { + "epoch": 0.33, + "grad_norm": 1.947008583860387, + "learning_rate": 1.5710741544931017e-05, + "loss": 0.2278, + "step": 6437 + }, + { + "epoch": 0.33, + "grad_norm": 1.1323169093199634, + "learning_rate": 1.5709389473447974e-05, + "loss": 0.2134, + "step": 6438 + }, + { + "epoch": 0.33, + "grad_norm": 0.8026527983512476, + "learning_rate": 1.5708037247097418e-05, + "loss": 0.2012, + "step": 6439 + }, + { + "epoch": 0.33, + "grad_norm": 1.1969855479924654, + "learning_rate": 1.5706684865916025e-05, + "loss": 0.2282, + "step": 6440 + }, + { + "epoch": 0.33, + "grad_norm": 0.9403857301981011, + "learning_rate": 1.570533232994049e-05, + "loss": 0.1967, + "step": 6441 + }, + { + "epoch": 0.33, + "grad_norm": 2.305053539772037, + "learning_rate": 1.570397963920749e-05, + "loss": 0.1965, + "step": 6442 + }, + { + "epoch": 0.33, + "grad_norm": 1.1363241248573346, + "learning_rate": 1.5702626793753717e-05, + "loss": 0.2044, + "step": 6443 + }, + { + "epoch": 0.33, + "grad_norm": 0.8737256816832991, + "learning_rate": 1.5701273793615876e-05, + "loss": 0.2048, + "step": 6444 + }, + { + "epoch": 0.33, + "grad_norm": 1.6215538187724758, + "learning_rate": 1.5699920638830656e-05, + "loss": 0.218, + "step": 6445 + }, + { + "epoch": 0.33, + "grad_norm": 1.0878235244057186, + "learning_rate": 1.5698567329434768e-05, + "loss": 0.222, + "step": 6446 + }, + { + "epoch": 0.33, + "grad_norm": 1.0513041461655643, + "learning_rate": 1.569721386546492e-05, + "loss": 0.2166, + "step": 6447 + }, + { + "epoch": 0.33, + "grad_norm": 0.9622985569927587, + "learning_rate": 1.5695860246957826e-05, + "loss": 0.1979, + "step": 6448 + }, + { + "epoch": 0.33, + "grad_norm": 1.106124143025721, + "learning_rate": 1.5694506473950198e-05, + "loss": 0.2146, + "step": 6449 + }, + { + "epoch": 0.33, + "grad_norm": 0.9022371205892392, + "learning_rate": 1.5693152546478762e-05, + "loss": 0.1904, + "step": 6450 + }, + { + "epoch": 0.33, + "grad_norm": 0.8059048062766315, + "learning_rate": 1.569179846458024e-05, + "loss": 0.1933, + "step": 6451 + }, + { + "epoch": 0.33, + "grad_norm": 1.8776693830331572, + "learning_rate": 1.5690444228291366e-05, + "loss": 0.2228, + "step": 6452 + }, + { + "epoch": 0.33, + "grad_norm": 1.0309478566891377, + "learning_rate": 1.568908983764887e-05, + "loss": 0.1922, + "step": 6453 + }, + { + "epoch": 0.33, + "grad_norm": 1.050309678890914, + "learning_rate": 1.568773529268949e-05, + "loss": 0.2045, + "step": 6454 + }, + { + "epoch": 0.33, + "grad_norm": 1.0229060671888315, + "learning_rate": 1.5686380593449966e-05, + "loss": 0.2047, + "step": 6455 + }, + { + "epoch": 0.33, + "grad_norm": 1.001874802951346, + "learning_rate": 1.5685025739967056e-05, + "loss": 0.229, + "step": 6456 + }, + { + "epoch": 0.33, + "grad_norm": 0.9513827785300544, + "learning_rate": 1.568367073227749e-05, + "loss": 0.1857, + "step": 6457 + }, + { + "epoch": 0.33, + "grad_norm": 0.8963176883806334, + "learning_rate": 1.5682315570418043e-05, + "loss": 0.1848, + "step": 6458 + }, + { + "epoch": 0.33, + "grad_norm": 0.9336871285878903, + "learning_rate": 1.5680960254425467e-05, + "loss": 0.2199, + "step": 6459 + }, + { + "epoch": 0.33, + "grad_norm": 0.9301503405854965, + "learning_rate": 1.5679604784336516e-05, + "loss": 0.1947, + "step": 6460 + }, + { + "epoch": 0.33, + "grad_norm": 0.8430696671678639, + "learning_rate": 1.567824916018797e-05, + "loss": 0.2281, + "step": 6461 + }, + { + "epoch": 0.33, + "grad_norm": 1.0779499877725116, + "learning_rate": 1.567689338201659e-05, + "loss": 0.2232, + "step": 6462 + }, + { + "epoch": 0.33, + "grad_norm": 0.9323416339337731, + "learning_rate": 1.5675537449859158e-05, + "loss": 0.2095, + "step": 6463 + }, + { + "epoch": 0.33, + "grad_norm": 1.1019950763615698, + "learning_rate": 1.5674181363752452e-05, + "loss": 0.1894, + "step": 6464 + }, + { + "epoch": 0.33, + "grad_norm": 1.2348306315579887, + "learning_rate": 1.5672825123733257e-05, + "loss": 0.1921, + "step": 6465 + }, + { + "epoch": 0.33, + "grad_norm": 1.0231332496881906, + "learning_rate": 1.567146872983836e-05, + "loss": 0.1972, + "step": 6466 + }, + { + "epoch": 0.33, + "grad_norm": 2.220781301600539, + "learning_rate": 1.5670112182104552e-05, + "loss": 0.1978, + "step": 6467 + }, + { + "epoch": 0.33, + "grad_norm": 1.1006520137959355, + "learning_rate": 1.566875548056863e-05, + "loss": 0.2557, + "step": 6468 + }, + { + "epoch": 0.33, + "grad_norm": 1.0133967400662975, + "learning_rate": 1.5667398625267402e-05, + "loss": 0.1999, + "step": 6469 + }, + { + "epoch": 0.33, + "grad_norm": 0.9847310910093158, + "learning_rate": 1.566604161623766e-05, + "loss": 0.1787, + "step": 6470 + }, + { + "epoch": 0.33, + "grad_norm": 1.3193309942941929, + "learning_rate": 1.5664684453516218e-05, + "loss": 0.2028, + "step": 6471 + }, + { + "epoch": 0.33, + "grad_norm": 1.0438169210754187, + "learning_rate": 1.5663327137139893e-05, + "loss": 0.1959, + "step": 6472 + }, + { + "epoch": 0.33, + "grad_norm": 0.9198281649456738, + "learning_rate": 1.56619696671455e-05, + "loss": 0.2137, + "step": 6473 + }, + { + "epoch": 0.33, + "grad_norm": 1.3123289776496023, + "learning_rate": 1.5660612043569864e-05, + "loss": 0.2134, + "step": 6474 + }, + { + "epoch": 0.33, + "grad_norm": 0.9629029433242554, + "learning_rate": 1.56592542664498e-05, + "loss": 0.1986, + "step": 6475 + }, + { + "epoch": 0.33, + "grad_norm": 1.0251036113371328, + "learning_rate": 1.5657896335822147e-05, + "loss": 0.1771, + "step": 6476 + }, + { + "epoch": 0.33, + "grad_norm": 0.8869554671856225, + "learning_rate": 1.5656538251723734e-05, + "loss": 0.1902, + "step": 6477 + }, + { + "epoch": 0.33, + "grad_norm": 1.3756739947044674, + "learning_rate": 1.5655180014191404e-05, + "loss": 0.2119, + "step": 6478 + }, + { + "epoch": 0.33, + "grad_norm": 0.9633315984036738, + "learning_rate": 1.5653821623261998e-05, + "loss": 0.2, + "step": 6479 + }, + { + "epoch": 0.33, + "grad_norm": 9.251828938015827, + "learning_rate": 1.565246307897236e-05, + "loss": 0.203, + "step": 6480 + }, + { + "epoch": 0.33, + "grad_norm": 1.14068294685859, + "learning_rate": 1.565110438135934e-05, + "loss": 0.191, + "step": 6481 + }, + { + "epoch": 0.33, + "grad_norm": 1.5452138562508575, + "learning_rate": 1.5649745530459794e-05, + "loss": 0.1995, + "step": 6482 + }, + { + "epoch": 0.33, + "grad_norm": 1.0536420677133722, + "learning_rate": 1.5648386526310582e-05, + "loss": 0.1993, + "step": 6483 + }, + { + "epoch": 0.33, + "grad_norm": 1.1356216475216754, + "learning_rate": 1.564702736894857e-05, + "loss": 0.2186, + "step": 6484 + }, + { + "epoch": 0.33, + "grad_norm": 0.8532781011175651, + "learning_rate": 1.5645668058410617e-05, + "loss": 0.2082, + "step": 6485 + }, + { + "epoch": 0.33, + "grad_norm": 0.8455231181313126, + "learning_rate": 1.56443085947336e-05, + "loss": 0.1931, + "step": 6486 + }, + { + "epoch": 0.33, + "grad_norm": 1.2629784905997012, + "learning_rate": 1.5642948977954395e-05, + "loss": 0.2528, + "step": 6487 + }, + { + "epoch": 0.33, + "grad_norm": 1.2047458771536588, + "learning_rate": 1.564158920810988e-05, + "loss": 0.2098, + "step": 6488 + }, + { + "epoch": 0.33, + "grad_norm": 1.154183894969296, + "learning_rate": 1.5640229285236938e-05, + "loss": 0.2142, + "step": 6489 + }, + { + "epoch": 0.33, + "grad_norm": 1.4439578060813603, + "learning_rate": 1.563886920937246e-05, + "loss": 0.1939, + "step": 6490 + }, + { + "epoch": 0.33, + "grad_norm": 1.1269334348934161, + "learning_rate": 1.5637508980553335e-05, + "loss": 0.202, + "step": 6491 + }, + { + "epoch": 0.33, + "grad_norm": 0.8474971196663091, + "learning_rate": 1.563614859881646e-05, + "loss": 0.1944, + "step": 6492 + }, + { + "epoch": 0.33, + "grad_norm": 0.775445743159406, + "learning_rate": 1.5634788064198736e-05, + "loss": 0.1838, + "step": 6493 + }, + { + "epoch": 0.33, + "grad_norm": 0.8829878157374699, + "learning_rate": 1.5633427376737072e-05, + "loss": 0.2099, + "step": 6494 + }, + { + "epoch": 0.33, + "grad_norm": 1.388770427746395, + "learning_rate": 1.5632066536468367e-05, + "loss": 0.1908, + "step": 6495 + }, + { + "epoch": 0.33, + "grad_norm": 1.0099467639292705, + "learning_rate": 1.5630705543429542e-05, + "loss": 0.191, + "step": 6496 + }, + { + "epoch": 0.33, + "grad_norm": 0.9404162366537719, + "learning_rate": 1.5629344397657506e-05, + "loss": 0.194, + "step": 6497 + }, + { + "epoch": 0.33, + "grad_norm": 0.9446683305039197, + "learning_rate": 1.562798309918919e-05, + "loss": 0.1796, + "step": 6498 + }, + { + "epoch": 0.33, + "grad_norm": 0.9309802969044433, + "learning_rate": 1.5626621648061514e-05, + "loss": 0.202, + "step": 6499 + }, + { + "epoch": 0.33, + "grad_norm": 1.7207205399944414, + "learning_rate": 1.5625260044311405e-05, + "loss": 0.2094, + "step": 6500 + }, + { + "epoch": 0.33, + "grad_norm": 1.8054028028686318, + "learning_rate": 1.5623898287975806e-05, + "loss": 0.204, + "step": 6501 + }, + { + "epoch": 0.33, + "grad_norm": 1.4060494249650275, + "learning_rate": 1.562253637909164e-05, + "loss": 0.194, + "step": 6502 + }, + { + "epoch": 0.33, + "grad_norm": 0.8105266496711662, + "learning_rate": 1.5621174317695862e-05, + "loss": 0.2025, + "step": 6503 + }, + { + "epoch": 0.33, + "grad_norm": 0.8944536557109091, + "learning_rate": 1.561981210382541e-05, + "loss": 0.2159, + "step": 6504 + }, + { + "epoch": 0.33, + "grad_norm": 0.7844477751552985, + "learning_rate": 1.5618449737517242e-05, + "loss": 0.2015, + "step": 6505 + }, + { + "epoch": 0.33, + "grad_norm": 0.8709928078499433, + "learning_rate": 1.5617087218808307e-05, + "loss": 0.1814, + "step": 6506 + }, + { + "epoch": 0.33, + "grad_norm": 1.3657649463539294, + "learning_rate": 1.5615724547735562e-05, + "loss": 0.2111, + "step": 6507 + }, + { + "epoch": 0.33, + "grad_norm": 1.3173488373145754, + "learning_rate": 1.561436172433597e-05, + "loss": 0.1981, + "step": 6508 + }, + { + "epoch": 0.33, + "grad_norm": 1.078990856080944, + "learning_rate": 1.56129987486465e-05, + "loss": 0.2151, + "step": 6509 + }, + { + "epoch": 0.33, + "grad_norm": 0.8717188418025383, + "learning_rate": 1.5611635620704128e-05, + "loss": 0.1983, + "step": 6510 + }, + { + "epoch": 0.33, + "grad_norm": 0.8053767630345547, + "learning_rate": 1.5610272340545814e-05, + "loss": 0.1947, + "step": 6511 + }, + { + "epoch": 0.33, + "grad_norm": 0.842561014891099, + "learning_rate": 1.560890890820855e-05, + "loss": 0.1883, + "step": 6512 + }, + { + "epoch": 0.33, + "grad_norm": 1.1991401747191601, + "learning_rate": 1.5607545323729313e-05, + "loss": 0.1756, + "step": 6513 + }, + { + "epoch": 0.33, + "grad_norm": 1.059056198298021, + "learning_rate": 1.5606181587145097e-05, + "loss": 0.2366, + "step": 6514 + }, + { + "epoch": 0.33, + "grad_norm": 1.7460856086533205, + "learning_rate": 1.5604817698492886e-05, + "loss": 0.2369, + "step": 6515 + }, + { + "epoch": 0.33, + "grad_norm": 1.0681687279055996, + "learning_rate": 1.560345365780968e-05, + "loss": 0.2114, + "step": 6516 + }, + { + "epoch": 0.33, + "grad_norm": 1.240767782666872, + "learning_rate": 1.5602089465132474e-05, + "loss": 0.197, + "step": 6517 + }, + { + "epoch": 0.33, + "grad_norm": 1.6283348417848393, + "learning_rate": 1.5600725120498273e-05, + "loss": 0.1867, + "step": 6518 + }, + { + "epoch": 0.33, + "grad_norm": 0.8753049390892731, + "learning_rate": 1.5599360623944092e-05, + "loss": 0.198, + "step": 6519 + }, + { + "epoch": 0.33, + "grad_norm": 0.8253790374894722, + "learning_rate": 1.5597995975506936e-05, + "loss": 0.2095, + "step": 6520 + }, + { + "epoch": 0.33, + "grad_norm": 1.679152258436176, + "learning_rate": 1.5596631175223823e-05, + "loss": 0.2221, + "step": 6521 + }, + { + "epoch": 0.33, + "grad_norm": 1.363989379907467, + "learning_rate": 1.559526622313177e-05, + "loss": 0.1862, + "step": 6522 + }, + { + "epoch": 0.33, + "grad_norm": 5.99462758090583, + "learning_rate": 1.559390111926781e-05, + "loss": 0.2021, + "step": 6523 + }, + { + "epoch": 0.33, + "grad_norm": 1.105152738508639, + "learning_rate": 1.559253586366896e-05, + "loss": 0.2166, + "step": 6524 + }, + { + "epoch": 0.33, + "grad_norm": 0.8798114773050834, + "learning_rate": 1.5591170456372264e-05, + "loss": 0.2005, + "step": 6525 + }, + { + "epoch": 0.33, + "grad_norm": 1.2415787362049853, + "learning_rate": 1.5589804897414757e-05, + "loss": 0.2013, + "step": 6526 + }, + { + "epoch": 0.33, + "grad_norm": 1.1061386725225641, + "learning_rate": 1.5588439186833467e-05, + "loss": 0.2422, + "step": 6527 + }, + { + "epoch": 0.33, + "grad_norm": 0.7700214560139843, + "learning_rate": 1.5587073324665457e-05, + "loss": 0.197, + "step": 6528 + }, + { + "epoch": 0.33, + "grad_norm": 1.3002606779597166, + "learning_rate": 1.558570731094776e-05, + "loss": 0.2082, + "step": 6529 + }, + { + "epoch": 0.33, + "grad_norm": 1.120351330109009, + "learning_rate": 1.558434114571744e-05, + "loss": 0.202, + "step": 6530 + }, + { + "epoch": 0.33, + "grad_norm": 1.4396268572009103, + "learning_rate": 1.558297482901155e-05, + "loss": 0.2189, + "step": 6531 + }, + { + "epoch": 0.33, + "grad_norm": 1.0460960319227817, + "learning_rate": 1.5581608360867154e-05, + "loss": 0.217, + "step": 6532 + }, + { + "epoch": 0.33, + "grad_norm": 1.0874166588094762, + "learning_rate": 1.5580241741321317e-05, + "loss": 0.2019, + "step": 6533 + }, + { + "epoch": 0.33, + "grad_norm": 1.1894921553008606, + "learning_rate": 1.5578874970411105e-05, + "loss": 0.1977, + "step": 6534 + }, + { + "epoch": 0.33, + "grad_norm": 0.9151050298336578, + "learning_rate": 1.5577508048173596e-05, + "loss": 0.1988, + "step": 6535 + }, + { + "epoch": 0.33, + "grad_norm": 1.8028004085361633, + "learning_rate": 1.5576140974645868e-05, + "loss": 0.2192, + "step": 6536 + }, + { + "epoch": 0.33, + "grad_norm": 1.0884779279303818, + "learning_rate": 1.5574773749865e-05, + "loss": 0.2029, + "step": 6537 + }, + { + "epoch": 0.33, + "grad_norm": 0.8143331503528025, + "learning_rate": 1.5573406373868077e-05, + "loss": 0.2073, + "step": 6538 + }, + { + "epoch": 0.33, + "grad_norm": 1.0414387222930812, + "learning_rate": 1.5572038846692193e-05, + "loss": 0.1941, + "step": 6539 + }, + { + "epoch": 0.33, + "grad_norm": 1.4755430457912222, + "learning_rate": 1.557067116837444e-05, + "loss": 0.1945, + "step": 6540 + }, + { + "epoch": 0.33, + "grad_norm": 1.063393639956194, + "learning_rate": 1.5569303338951914e-05, + "loss": 0.2216, + "step": 6541 + }, + { + "epoch": 0.33, + "grad_norm": 3.354543793682284, + "learning_rate": 1.5567935358461724e-05, + "loss": 0.22, + "step": 6542 + }, + { + "epoch": 0.33, + "grad_norm": 1.1240307573927577, + "learning_rate": 1.5566567226940974e-05, + "loss": 0.2162, + "step": 6543 + }, + { + "epoch": 0.33, + "grad_norm": 0.8625214215267929, + "learning_rate": 1.556519894442677e-05, + "loss": 0.207, + "step": 6544 + }, + { + "epoch": 0.33, + "grad_norm": 0.9190977071427817, + "learning_rate": 1.5563830510956234e-05, + "loss": 0.1947, + "step": 6545 + }, + { + "epoch": 0.33, + "grad_norm": 1.7671693867274025, + "learning_rate": 1.556246192656648e-05, + "loss": 0.1928, + "step": 6546 + }, + { + "epoch": 0.33, + "grad_norm": 0.9212945113465504, + "learning_rate": 1.556109319129463e-05, + "loss": 0.2034, + "step": 6547 + }, + { + "epoch": 0.33, + "grad_norm": 0.7883832295181457, + "learning_rate": 1.5559724305177814e-05, + "loss": 0.217, + "step": 6548 + }, + { + "epoch": 0.33, + "grad_norm": 1.5995930916405159, + "learning_rate": 1.5558355268253166e-05, + "loss": 0.1975, + "step": 6549 + }, + { + "epoch": 0.33, + "grad_norm": 0.8519660979050935, + "learning_rate": 1.555698608055781e-05, + "loss": 0.195, + "step": 6550 + }, + { + "epoch": 0.33, + "grad_norm": 1.1385807258455753, + "learning_rate": 1.5555616742128897e-05, + "loss": 0.1953, + "step": 6551 + }, + { + "epoch": 0.33, + "grad_norm": 0.8376325527210272, + "learning_rate": 1.5554247253003567e-05, + "loss": 0.1755, + "step": 6552 + }, + { + "epoch": 0.33, + "grad_norm": 0.790951262306196, + "learning_rate": 1.5552877613218964e-05, + "loss": 0.1992, + "step": 6553 + }, + { + "epoch": 0.33, + "grad_norm": 1.0898402865456223, + "learning_rate": 1.555150782281224e-05, + "loss": 0.1971, + "step": 6554 + }, + { + "epoch": 0.33, + "grad_norm": 1.0067994486256289, + "learning_rate": 1.555013788182056e-05, + "loss": 0.2048, + "step": 6555 + }, + { + "epoch": 0.33, + "grad_norm": 1.205884701347545, + "learning_rate": 1.554876779028107e-05, + "loss": 0.2196, + "step": 6556 + }, + { + "epoch": 0.33, + "grad_norm": 1.322602602161279, + "learning_rate": 1.5547397548230943e-05, + "loss": 0.2039, + "step": 6557 + }, + { + "epoch": 0.33, + "grad_norm": 0.7720006927722426, + "learning_rate": 1.554602715570735e-05, + "loss": 0.186, + "step": 6558 + }, + { + "epoch": 0.33, + "grad_norm": 1.0980304504859055, + "learning_rate": 1.554465661274745e-05, + "loss": 0.2072, + "step": 6559 + }, + { + "epoch": 0.33, + "grad_norm": 1.0093278518246684, + "learning_rate": 1.5543285919388426e-05, + "loss": 0.2197, + "step": 6560 + }, + { + "epoch": 0.33, + "grad_norm": 1.2019674945560757, + "learning_rate": 1.554191507566746e-05, + "loss": 0.1965, + "step": 6561 + }, + { + "epoch": 0.33, + "grad_norm": 0.9670683442046764, + "learning_rate": 1.5540544081621736e-05, + "loss": 0.2288, + "step": 6562 + }, + { + "epoch": 0.33, + "grad_norm": 0.949605601109131, + "learning_rate": 1.5539172937288437e-05, + "loss": 0.1993, + "step": 6563 + }, + { + "epoch": 0.33, + "grad_norm": 0.9442795776353967, + "learning_rate": 1.5537801642704763e-05, + "loss": 0.2031, + "step": 6564 + }, + { + "epoch": 0.33, + "grad_norm": 1.0005769850834774, + "learning_rate": 1.5536430197907904e-05, + "loss": 0.2164, + "step": 6565 + }, + { + "epoch": 0.33, + "grad_norm": 0.9143813378018446, + "learning_rate": 1.5535058602935065e-05, + "loss": 0.1804, + "step": 6566 + }, + { + "epoch": 0.33, + "grad_norm": 0.8805621406735882, + "learning_rate": 1.5533686857823447e-05, + "loss": 0.2052, + "step": 6567 + }, + { + "epoch": 0.33, + "grad_norm": 0.9730532725900689, + "learning_rate": 1.5532314962610263e-05, + "loss": 0.2081, + "step": 6568 + }, + { + "epoch": 0.33, + "grad_norm": 1.1364802182104736, + "learning_rate": 1.553094291733272e-05, + "loss": 0.2015, + "step": 6569 + }, + { + "epoch": 0.33, + "grad_norm": 0.95822533473699, + "learning_rate": 1.552957072202804e-05, + "loss": 0.2093, + "step": 6570 + }, + { + "epoch": 0.33, + "grad_norm": 1.3920492379804228, + "learning_rate": 1.5528198376733444e-05, + "loss": 0.1986, + "step": 6571 + }, + { + "epoch": 0.33, + "grad_norm": 0.8671365676659566, + "learning_rate": 1.552682588148615e-05, + "loss": 0.2204, + "step": 6572 + }, + { + "epoch": 0.33, + "grad_norm": 0.8180428446337018, + "learning_rate": 1.5525453236323396e-05, + "loss": 0.1759, + "step": 6573 + }, + { + "epoch": 0.33, + "grad_norm": 1.0814408848789787, + "learning_rate": 1.5524080441282408e-05, + "loss": 0.1984, + "step": 6574 + }, + { + "epoch": 0.33, + "grad_norm": 0.8779769686721743, + "learning_rate": 1.5522707496400425e-05, + "loss": 0.1806, + "step": 6575 + }, + { + "epoch": 0.33, + "grad_norm": 0.9724116612781548, + "learning_rate": 1.5521334401714692e-05, + "loss": 0.1971, + "step": 6576 + }, + { + "epoch": 0.33, + "grad_norm": 0.916346755900272, + "learning_rate": 1.551996115726245e-05, + "loss": 0.1832, + "step": 6577 + }, + { + "epoch": 0.33, + "grad_norm": 1.2003326335004167, + "learning_rate": 1.5518587763080956e-05, + "loss": 0.2176, + "step": 6578 + }, + { + "epoch": 0.33, + "grad_norm": 1.1705118254184124, + "learning_rate": 1.551721421920745e-05, + "loss": 0.194, + "step": 6579 + }, + { + "epoch": 0.33, + "grad_norm": 1.0446029535231542, + "learning_rate": 1.55158405256792e-05, + "loss": 0.2208, + "step": 6580 + }, + { + "epoch": 0.33, + "grad_norm": 1.0562633998936752, + "learning_rate": 1.551446668253346e-05, + "loss": 0.2092, + "step": 6581 + }, + { + "epoch": 0.33, + "grad_norm": 0.8909198049009172, + "learning_rate": 1.5513092689807505e-05, + "loss": 0.2235, + "step": 6582 + }, + { + "epoch": 0.33, + "grad_norm": 1.1344678807289725, + "learning_rate": 1.5511718547538596e-05, + "loss": 0.1994, + "step": 6583 + }, + { + "epoch": 0.33, + "grad_norm": 1.1833341979307461, + "learning_rate": 1.551034425576401e-05, + "loss": 0.1932, + "step": 6584 + }, + { + "epoch": 0.33, + "grad_norm": 0.9294885641880941, + "learning_rate": 1.5508969814521026e-05, + "loss": 0.2624, + "step": 6585 + }, + { + "epoch": 0.33, + "grad_norm": 0.9262316728429739, + "learning_rate": 1.550759522384693e-05, + "loss": 0.2206, + "step": 6586 + }, + { + "epoch": 0.33, + "grad_norm": 0.793394516959776, + "learning_rate": 1.5506220483778994e-05, + "loss": 0.1744, + "step": 6587 + }, + { + "epoch": 0.34, + "grad_norm": 1.0311135703428027, + "learning_rate": 1.550484559435452e-05, + "loss": 0.2464, + "step": 6588 + }, + { + "epoch": 0.34, + "grad_norm": 1.1685635601265862, + "learning_rate": 1.5503470555610797e-05, + "loss": 0.2192, + "step": 6589 + }, + { + "epoch": 0.34, + "grad_norm": 1.2425139947073716, + "learning_rate": 1.5502095367585124e-05, + "loss": 0.2053, + "step": 6590 + }, + { + "epoch": 0.34, + "grad_norm": 1.1825964827556712, + "learning_rate": 1.5500720030314805e-05, + "loss": 0.2195, + "step": 6591 + }, + { + "epoch": 0.34, + "grad_norm": 1.0157324689975242, + "learning_rate": 1.5499344543837144e-05, + "loss": 0.2074, + "step": 6592 + }, + { + "epoch": 0.34, + "grad_norm": 1.1504316423024223, + "learning_rate": 1.549796890818945e-05, + "loss": 0.2221, + "step": 6593 + }, + { + "epoch": 0.34, + "grad_norm": 1.1223861108775626, + "learning_rate": 1.5496593123409042e-05, + "loss": 0.1861, + "step": 6594 + }, + { + "epoch": 0.34, + "grad_norm": 1.5017109310443328, + "learning_rate": 1.549521718953323e-05, + "loss": 0.1979, + "step": 6595 + }, + { + "epoch": 0.34, + "grad_norm": 0.9587162726326114, + "learning_rate": 1.549384110659935e-05, + "loss": 0.2071, + "step": 6596 + }, + { + "epoch": 0.34, + "grad_norm": 1.4159051199900652, + "learning_rate": 1.5492464874644713e-05, + "loss": 0.2177, + "step": 6597 + }, + { + "epoch": 0.34, + "grad_norm": 1.468995929268174, + "learning_rate": 1.5491088493706657e-05, + "loss": 0.2205, + "step": 6598 + }, + { + "epoch": 0.34, + "grad_norm": 1.1113322064440028, + "learning_rate": 1.548971196382252e-05, + "loss": 0.1878, + "step": 6599 + }, + { + "epoch": 0.34, + "grad_norm": 1.1024645553263197, + "learning_rate": 1.548833528502963e-05, + "loss": 0.2177, + "step": 6600 + }, + { + "epoch": 0.34, + "grad_norm": 0.9146618497170281, + "learning_rate": 1.5486958457365338e-05, + "loss": 0.1983, + "step": 6601 + }, + { + "epoch": 0.34, + "grad_norm": 1.004880063563026, + "learning_rate": 1.5485581480866985e-05, + "loss": 0.1895, + "step": 6602 + }, + { + "epoch": 0.34, + "grad_norm": 0.8620626063742314, + "learning_rate": 1.5484204355571927e-05, + "loss": 0.1999, + "step": 6603 + }, + { + "epoch": 0.34, + "grad_norm": 1.6230596374204962, + "learning_rate": 1.5482827081517516e-05, + "loss": 0.2073, + "step": 6604 + }, + { + "epoch": 0.34, + "grad_norm": 0.8772018169236586, + "learning_rate": 1.5481449658741112e-05, + "loss": 0.1892, + "step": 6605 + }, + { + "epoch": 0.34, + "grad_norm": 0.9995458502864405, + "learning_rate": 1.5480072087280075e-05, + "loss": 0.1928, + "step": 6606 + }, + { + "epoch": 0.34, + "grad_norm": 1.1101682764237155, + "learning_rate": 1.5478694367171772e-05, + "loss": 0.1975, + "step": 6607 + }, + { + "epoch": 0.34, + "grad_norm": 1.1474296693817685, + "learning_rate": 1.547731649845358e-05, + "loss": 0.2093, + "step": 6608 + }, + { + "epoch": 0.34, + "grad_norm": 1.0708314496289437, + "learning_rate": 1.5475938481162862e-05, + "loss": 0.215, + "step": 6609 + }, + { + "epoch": 0.34, + "grad_norm": 1.4647452705106907, + "learning_rate": 1.5474560315337007e-05, + "loss": 0.2148, + "step": 6610 + }, + { + "epoch": 0.34, + "grad_norm": 1.0394882698957975, + "learning_rate": 1.5473182001013394e-05, + "loss": 0.18, + "step": 6611 + }, + { + "epoch": 0.34, + "grad_norm": 1.177655440718952, + "learning_rate": 1.547180353822941e-05, + "loss": 0.204, + "step": 6612 + }, + { + "epoch": 0.34, + "grad_norm": 1.6098661456952317, + "learning_rate": 1.5470424927022442e-05, + "loss": 0.2157, + "step": 6613 + }, + { + "epoch": 0.34, + "grad_norm": 1.4013179662489288, + "learning_rate": 1.5469046167429895e-05, + "loss": 0.1768, + "step": 6614 + }, + { + "epoch": 0.34, + "grad_norm": 0.8876557891047067, + "learning_rate": 1.5467667259489157e-05, + "loss": 0.1968, + "step": 6615 + }, + { + "epoch": 0.34, + "grad_norm": 0.7364338213122036, + "learning_rate": 1.546628820323764e-05, + "loss": 0.1951, + "step": 6616 + }, + { + "epoch": 0.34, + "grad_norm": 0.8233505032421118, + "learning_rate": 1.5464908998712743e-05, + "loss": 0.2177, + "step": 6617 + }, + { + "epoch": 0.34, + "grad_norm": 0.7988196118879016, + "learning_rate": 1.5463529645951884e-05, + "loss": 0.1774, + "step": 6618 + }, + { + "epoch": 0.34, + "grad_norm": 1.7863804154493699, + "learning_rate": 1.5462150144992473e-05, + "loss": 0.2375, + "step": 6619 + }, + { + "epoch": 0.34, + "grad_norm": 1.1876939499505215, + "learning_rate": 1.546077049587193e-05, + "loss": 0.1965, + "step": 6620 + }, + { + "epoch": 0.34, + "grad_norm": 0.7738270979735855, + "learning_rate": 1.545939069862768e-05, + "loss": 0.2106, + "step": 6621 + }, + { + "epoch": 0.34, + "grad_norm": 0.8918711817915994, + "learning_rate": 1.545801075329715e-05, + "loss": 0.1929, + "step": 6622 + }, + { + "epoch": 0.34, + "grad_norm": 0.9377869351034032, + "learning_rate": 1.5456630659917768e-05, + "loss": 0.1894, + "step": 6623 + }, + { + "epoch": 0.34, + "grad_norm": 0.8214995234734142, + "learning_rate": 1.5455250418526976e-05, + "loss": 0.2025, + "step": 6624 + }, + { + "epoch": 0.34, + "grad_norm": 1.9899732290557761, + "learning_rate": 1.5453870029162202e-05, + "loss": 0.1985, + "step": 6625 + }, + { + "epoch": 0.34, + "grad_norm": 1.0791010808495334, + "learning_rate": 1.5452489491860897e-05, + "loss": 0.2171, + "step": 6626 + }, + { + "epoch": 0.34, + "grad_norm": 0.8035978835832673, + "learning_rate": 1.5451108806660508e-05, + "loss": 0.1964, + "step": 6627 + }, + { + "epoch": 0.34, + "grad_norm": 0.8693180246456279, + "learning_rate": 1.5449727973598487e-05, + "loss": 0.1976, + "step": 6628 + }, + { + "epoch": 0.34, + "grad_norm": 0.8965073835730478, + "learning_rate": 1.544834699271228e-05, + "loss": 0.2127, + "step": 6629 + }, + { + "epoch": 0.34, + "grad_norm": 1.1466026109131418, + "learning_rate": 1.5446965864039357e-05, + "loss": 0.1774, + "step": 6630 + }, + { + "epoch": 0.34, + "grad_norm": 0.9670915922502352, + "learning_rate": 1.544558458761718e-05, + "loss": 0.2116, + "step": 6631 + }, + { + "epoch": 0.34, + "grad_norm": 1.3603515418339192, + "learning_rate": 1.5444203163483212e-05, + "loss": 0.2025, + "step": 6632 + }, + { + "epoch": 0.34, + "grad_norm": 0.9222393157039742, + "learning_rate": 1.544282159167492e-05, + "loss": 0.2049, + "step": 6633 + }, + { + "epoch": 0.34, + "grad_norm": 0.9098251873140276, + "learning_rate": 1.5441439872229793e-05, + "loss": 0.1982, + "step": 6634 + }, + { + "epoch": 0.34, + "grad_norm": 1.0500483855202372, + "learning_rate": 1.5440058005185295e-05, + "loss": 0.219, + "step": 6635 + }, + { + "epoch": 0.34, + "grad_norm": 0.9415410885951533, + "learning_rate": 1.5438675990578923e-05, + "loss": 0.1917, + "step": 6636 + }, + { + "epoch": 0.34, + "grad_norm": 0.9378779126221987, + "learning_rate": 1.5437293828448153e-05, + "loss": 0.2228, + "step": 6637 + }, + { + "epoch": 0.34, + "grad_norm": 0.9004931303198556, + "learning_rate": 1.5435911518830485e-05, + "loss": 0.2029, + "step": 6638 + }, + { + "epoch": 0.34, + "grad_norm": 2.782645150359459, + "learning_rate": 1.5434529061763405e-05, + "loss": 0.1996, + "step": 6639 + }, + { + "epoch": 0.34, + "grad_norm": 1.004324869305781, + "learning_rate": 1.543314645728442e-05, + "loss": 0.2044, + "step": 6640 + }, + { + "epoch": 0.34, + "grad_norm": 1.6039110715845524, + "learning_rate": 1.543176370543103e-05, + "loss": 0.2098, + "step": 6641 + }, + { + "epoch": 0.34, + "grad_norm": 1.2148324779559043, + "learning_rate": 1.5430380806240744e-05, + "loss": 0.2267, + "step": 6642 + }, + { + "epoch": 0.34, + "grad_norm": 0.9685722004536197, + "learning_rate": 1.5428997759751073e-05, + "loss": 0.1986, + "step": 6643 + }, + { + "epoch": 0.34, + "grad_norm": 1.4287506123498237, + "learning_rate": 1.5427614565999527e-05, + "loss": 0.1698, + "step": 6644 + }, + { + "epoch": 0.34, + "grad_norm": 1.2560403408390468, + "learning_rate": 1.542623122502363e-05, + "loss": 0.2371, + "step": 6645 + }, + { + "epoch": 0.34, + "grad_norm": 1.493029626176336, + "learning_rate": 1.5424847736860907e-05, + "loss": 0.2216, + "step": 6646 + }, + { + "epoch": 0.34, + "grad_norm": 1.502436700329338, + "learning_rate": 1.5423464101548883e-05, + "loss": 0.217, + "step": 6647 + }, + { + "epoch": 0.34, + "grad_norm": 1.7559959460499155, + "learning_rate": 1.5422080319125085e-05, + "loss": 0.2062, + "step": 6648 + }, + { + "epoch": 0.34, + "grad_norm": 1.0631140913339923, + "learning_rate": 1.5420696389627057e-05, + "loss": 0.2386, + "step": 6649 + }, + { + "epoch": 0.34, + "grad_norm": 1.224313506950377, + "learning_rate": 1.5419312313092328e-05, + "loss": 0.1964, + "step": 6650 + }, + { + "epoch": 0.34, + "grad_norm": 1.3134515484754445, + "learning_rate": 1.541792808955845e-05, + "loss": 0.1774, + "step": 6651 + }, + { + "epoch": 0.34, + "grad_norm": 0.9334696811383187, + "learning_rate": 1.5416543719062967e-05, + "loss": 0.1961, + "step": 6652 + }, + { + "epoch": 0.34, + "grad_norm": 5.177362982763627, + "learning_rate": 1.541515920164343e-05, + "loss": 0.2205, + "step": 6653 + }, + { + "epoch": 0.34, + "grad_norm": 1.2813984057128454, + "learning_rate": 1.541377453733739e-05, + "loss": 0.1877, + "step": 6654 + }, + { + "epoch": 0.34, + "grad_norm": 0.8656007817415847, + "learning_rate": 1.541238972618241e-05, + "loss": 0.1998, + "step": 6655 + }, + { + "epoch": 0.34, + "grad_norm": 1.0367298136556913, + "learning_rate": 1.541100476821606e-05, + "loss": 0.2136, + "step": 6656 + }, + { + "epoch": 0.34, + "grad_norm": 0.9975140810289721, + "learning_rate": 1.5409619663475894e-05, + "loss": 0.2154, + "step": 6657 + }, + { + "epoch": 0.34, + "grad_norm": 1.2458834798559473, + "learning_rate": 1.540823441199949e-05, + "loss": 0.193, + "step": 6658 + }, + { + "epoch": 0.34, + "grad_norm": 1.5142136985614558, + "learning_rate": 1.540684901382442e-05, + "loss": 0.2157, + "step": 6659 + }, + { + "epoch": 0.34, + "grad_norm": 0.8924500791959433, + "learning_rate": 1.540546346898827e-05, + "loss": 0.2003, + "step": 6660 + }, + { + "epoch": 0.34, + "grad_norm": 0.9653697735047038, + "learning_rate": 1.5404077777528613e-05, + "loss": 0.2131, + "step": 6661 + }, + { + "epoch": 0.34, + "grad_norm": 0.8638110891762141, + "learning_rate": 1.5402691939483046e-05, + "loss": 0.1747, + "step": 6662 + }, + { + "epoch": 0.34, + "grad_norm": 0.9144960253692133, + "learning_rate": 1.540130595488915e-05, + "loss": 0.2166, + "step": 6663 + }, + { + "epoch": 0.34, + "grad_norm": 0.735671035910116, + "learning_rate": 1.539991982378453e-05, + "loss": 0.1921, + "step": 6664 + }, + { + "epoch": 0.34, + "grad_norm": 1.2657061752905912, + "learning_rate": 1.539853354620678e-05, + "loss": 0.2117, + "step": 6665 + }, + { + "epoch": 0.34, + "grad_norm": 1.283712998444193, + "learning_rate": 1.53971471221935e-05, + "loss": 0.179, + "step": 6666 + }, + { + "epoch": 0.34, + "grad_norm": 0.8989145391276644, + "learning_rate": 1.53957605517823e-05, + "loss": 0.1903, + "step": 6667 + }, + { + "epoch": 0.34, + "grad_norm": 1.0048519214602194, + "learning_rate": 1.539437383501079e-05, + "loss": 0.2057, + "step": 6668 + }, + { + "epoch": 0.34, + "grad_norm": 1.0132983115014504, + "learning_rate": 1.5392986971916583e-05, + "loss": 0.2205, + "step": 6669 + }, + { + "epoch": 0.34, + "grad_norm": 1.4896675971356934, + "learning_rate": 1.53915999625373e-05, + "loss": 0.1955, + "step": 6670 + }, + { + "epoch": 0.34, + "grad_norm": 1.5033619249115908, + "learning_rate": 1.539021280691057e-05, + "loss": 0.2164, + "step": 6671 + }, + { + "epoch": 0.34, + "grad_norm": 1.4593355835166457, + "learning_rate": 1.5388825505074006e-05, + "loss": 0.2004, + "step": 6672 + }, + { + "epoch": 0.34, + "grad_norm": 1.148201487459412, + "learning_rate": 1.538743805706525e-05, + "loss": 0.1927, + "step": 6673 + }, + { + "epoch": 0.34, + "grad_norm": 1.6333105220118933, + "learning_rate": 1.538605046292193e-05, + "loss": 0.2123, + "step": 6674 + }, + { + "epoch": 0.34, + "grad_norm": 0.9890080278145551, + "learning_rate": 1.5384662722681688e-05, + "loss": 0.1891, + "step": 6675 + }, + { + "epoch": 0.34, + "grad_norm": 0.9610666592755823, + "learning_rate": 1.5383274836382163e-05, + "loss": 0.1825, + "step": 6676 + }, + { + "epoch": 0.34, + "grad_norm": 1.0378885292526392, + "learning_rate": 1.5381886804061005e-05, + "loss": 0.2171, + "step": 6677 + }, + { + "epoch": 0.34, + "grad_norm": 1.1277837007123201, + "learning_rate": 1.5380498625755867e-05, + "loss": 0.2176, + "step": 6678 + }, + { + "epoch": 0.34, + "grad_norm": 1.0877352146653516, + "learning_rate": 1.5379110301504397e-05, + "loss": 0.2081, + "step": 6679 + }, + { + "epoch": 0.34, + "grad_norm": 1.2947756337756804, + "learning_rate": 1.5377721831344258e-05, + "loss": 0.2119, + "step": 6680 + }, + { + "epoch": 0.34, + "grad_norm": 1.117584689887842, + "learning_rate": 1.5376333215313106e-05, + "loss": 0.2149, + "step": 6681 + }, + { + "epoch": 0.34, + "grad_norm": 1.2078699493400893, + "learning_rate": 1.5374944453448617e-05, + "loss": 0.2057, + "step": 6682 + }, + { + "epoch": 0.34, + "grad_norm": 0.9548572667866763, + "learning_rate": 1.5373555545788456e-05, + "loss": 0.2049, + "step": 6683 + }, + { + "epoch": 0.34, + "grad_norm": 1.3772461895252333, + "learning_rate": 1.5372166492370297e-05, + "loss": 0.1893, + "step": 6684 + }, + { + "epoch": 0.34, + "grad_norm": 0.9630206874145618, + "learning_rate": 1.5370777293231814e-05, + "loss": 0.1958, + "step": 6685 + }, + { + "epoch": 0.34, + "grad_norm": 1.3183565853223211, + "learning_rate": 1.5369387948410695e-05, + "loss": 0.2091, + "step": 6686 + }, + { + "epoch": 0.34, + "grad_norm": 1.0363802125546604, + "learning_rate": 1.536799845794463e-05, + "loss": 0.241, + "step": 6687 + }, + { + "epoch": 0.34, + "grad_norm": 0.8656078936422779, + "learning_rate": 1.53666088218713e-05, + "loss": 0.1964, + "step": 6688 + }, + { + "epoch": 0.34, + "grad_norm": 0.9737685003439084, + "learning_rate": 1.5365219040228402e-05, + "loss": 0.1881, + "step": 6689 + }, + { + "epoch": 0.34, + "grad_norm": 1.0616642657522841, + "learning_rate": 1.5363829113053633e-05, + "loss": 0.237, + "step": 6690 + }, + { + "epoch": 0.34, + "grad_norm": 0.8708419272926546, + "learning_rate": 1.53624390403847e-05, + "loss": 0.1886, + "step": 6691 + }, + { + "epoch": 0.34, + "grad_norm": 1.1335578938991093, + "learning_rate": 1.5361048822259302e-05, + "loss": 0.2157, + "step": 6692 + }, + { + "epoch": 0.34, + "grad_norm": 1.326790601511181, + "learning_rate": 1.5359658458715158e-05, + "loss": 0.228, + "step": 6693 + }, + { + "epoch": 0.34, + "grad_norm": 0.9248727896429133, + "learning_rate": 1.5358267949789968e-05, + "loss": 0.2035, + "step": 6694 + }, + { + "epoch": 0.34, + "grad_norm": 1.2076276869173062, + "learning_rate": 1.535687729552146e-05, + "loss": 0.2129, + "step": 6695 + }, + { + "epoch": 0.34, + "grad_norm": 1.8942627148982174, + "learning_rate": 1.5355486495947353e-05, + "loss": 0.2178, + "step": 6696 + }, + { + "epoch": 0.34, + "grad_norm": 0.8807289461435734, + "learning_rate": 1.5354095551105374e-05, + "loss": 0.2192, + "step": 6697 + }, + { + "epoch": 0.34, + "grad_norm": 1.842682012974875, + "learning_rate": 1.5352704461033247e-05, + "loss": 0.203, + "step": 6698 + }, + { + "epoch": 0.34, + "grad_norm": 0.9057246496827717, + "learning_rate": 1.5351313225768713e-05, + "loss": 0.1882, + "step": 6699 + }, + { + "epoch": 0.34, + "grad_norm": 2.1617478367242047, + "learning_rate": 1.5349921845349504e-05, + "loss": 0.2036, + "step": 6700 + }, + { + "epoch": 0.34, + "grad_norm": 0.8422370104317792, + "learning_rate": 1.5348530319813365e-05, + "loss": 0.2045, + "step": 6701 + }, + { + "epoch": 0.34, + "grad_norm": 1.172859237684915, + "learning_rate": 1.5347138649198036e-05, + "loss": 0.1777, + "step": 6702 + }, + { + "epoch": 0.34, + "grad_norm": 0.851183073220529, + "learning_rate": 1.5345746833541268e-05, + "loss": 0.1951, + "step": 6703 + }, + { + "epoch": 0.34, + "grad_norm": 1.3893811227479087, + "learning_rate": 1.5344354872880817e-05, + "loss": 0.1876, + "step": 6704 + }, + { + "epoch": 0.34, + "grad_norm": 1.2961449575191275, + "learning_rate": 1.534296276725444e-05, + "loss": 0.2221, + "step": 6705 + }, + { + "epoch": 0.34, + "grad_norm": 1.471827649921739, + "learning_rate": 1.5341570516699893e-05, + "loss": 0.1939, + "step": 6706 + }, + { + "epoch": 0.34, + "grad_norm": 0.8421797836882626, + "learning_rate": 1.5340178121254944e-05, + "loss": 0.2059, + "step": 6707 + }, + { + "epoch": 0.34, + "grad_norm": 0.968441843112463, + "learning_rate": 1.5338785580957366e-05, + "loss": 0.2059, + "step": 6708 + }, + { + "epoch": 0.34, + "grad_norm": 1.1233535791260196, + "learning_rate": 1.5337392895844923e-05, + "loss": 0.2172, + "step": 6709 + }, + { + "epoch": 0.34, + "grad_norm": 0.8078361898244383, + "learning_rate": 1.53360000659554e-05, + "loss": 0.1763, + "step": 6710 + }, + { + "epoch": 0.34, + "grad_norm": 0.7165845447840065, + "learning_rate": 1.533460709132657e-05, + "loss": 0.1835, + "step": 6711 + }, + { + "epoch": 0.34, + "grad_norm": 3.5322930348415293, + "learning_rate": 1.5333213971996223e-05, + "loss": 0.2092, + "step": 6712 + }, + { + "epoch": 0.34, + "grad_norm": 0.9859837992060969, + "learning_rate": 1.5331820708002148e-05, + "loss": 0.2159, + "step": 6713 + }, + { + "epoch": 0.34, + "grad_norm": 0.9879252355628823, + "learning_rate": 1.533042729938213e-05, + "loss": 0.1894, + "step": 6714 + }, + { + "epoch": 0.34, + "grad_norm": 0.8812018427849039, + "learning_rate": 1.5329033746173975e-05, + "loss": 0.1833, + "step": 6715 + }, + { + "epoch": 0.34, + "grad_norm": 1.4601846109173515, + "learning_rate": 1.5327640048415476e-05, + "loss": 0.2237, + "step": 6716 + }, + { + "epoch": 0.34, + "grad_norm": 1.0581061376121739, + "learning_rate": 1.5326246206144443e-05, + "loss": 0.2092, + "step": 6717 + }, + { + "epoch": 0.34, + "grad_norm": 0.7203814696096753, + "learning_rate": 1.532485221939868e-05, + "loss": 0.1884, + "step": 6718 + }, + { + "epoch": 0.34, + "grad_norm": 1.2641468880148858, + "learning_rate": 1.5323458088216e-05, + "loss": 0.2215, + "step": 6719 + }, + { + "epoch": 0.34, + "grad_norm": 0.9041915947266804, + "learning_rate": 1.5322063812634213e-05, + "loss": 0.182, + "step": 6720 + }, + { + "epoch": 0.34, + "grad_norm": 0.9312729062386421, + "learning_rate": 1.532066939269115e-05, + "loss": 0.2172, + "step": 6721 + }, + { + "epoch": 0.34, + "grad_norm": 1.7275672748749746, + "learning_rate": 1.531927482842463e-05, + "loss": 0.1762, + "step": 6722 + }, + { + "epoch": 0.34, + "grad_norm": 0.9269472309163541, + "learning_rate": 1.531788011987248e-05, + "loss": 0.206, + "step": 6723 + }, + { + "epoch": 0.34, + "grad_norm": 1.1568597220771661, + "learning_rate": 1.5316485267072528e-05, + "loss": 0.1887, + "step": 6724 + }, + { + "epoch": 0.34, + "grad_norm": 0.9338998705897923, + "learning_rate": 1.5315090270062612e-05, + "loss": 0.203, + "step": 6725 + }, + { + "epoch": 0.34, + "grad_norm": 1.2715192485484121, + "learning_rate": 1.5313695128880578e-05, + "loss": 0.2145, + "step": 6726 + }, + { + "epoch": 0.34, + "grad_norm": 1.0559807576449627, + "learning_rate": 1.531229984356426e-05, + "loss": 0.1945, + "step": 6727 + }, + { + "epoch": 0.34, + "grad_norm": 1.1443412285252021, + "learning_rate": 1.5310904414151505e-05, + "loss": 0.2212, + "step": 6728 + }, + { + "epoch": 0.34, + "grad_norm": 1.2483375607136242, + "learning_rate": 1.530950884068017e-05, + "loss": 0.2071, + "step": 6729 + }, + { + "epoch": 0.34, + "grad_norm": 1.1469468803995153, + "learning_rate": 1.530811312318811e-05, + "loss": 0.2045, + "step": 6730 + }, + { + "epoch": 0.34, + "grad_norm": 1.2036637391446452, + "learning_rate": 1.530671726171318e-05, + "loss": 0.1866, + "step": 6731 + }, + { + "epoch": 0.34, + "grad_norm": 1.198487986397559, + "learning_rate": 1.530532125629325e-05, + "loss": 0.2133, + "step": 6732 + }, + { + "epoch": 0.34, + "grad_norm": 0.7976346078056735, + "learning_rate": 1.5303925106966176e-05, + "loss": 0.2204, + "step": 6733 + }, + { + "epoch": 0.34, + "grad_norm": 1.205968836433249, + "learning_rate": 1.5302528813769832e-05, + "loss": 0.2076, + "step": 6734 + }, + { + "epoch": 0.34, + "grad_norm": 0.9959892711878979, + "learning_rate": 1.5301132376742097e-05, + "loss": 0.2149, + "step": 6735 + }, + { + "epoch": 0.34, + "grad_norm": 1.0439639919923365, + "learning_rate": 1.5299735795920852e-05, + "loss": 0.2182, + "step": 6736 + }, + { + "epoch": 0.34, + "grad_norm": 1.3157354916778414, + "learning_rate": 1.5298339071343965e-05, + "loss": 0.1834, + "step": 6737 + }, + { + "epoch": 0.34, + "grad_norm": 2.008918421266012, + "learning_rate": 1.5296942203049336e-05, + "loss": 0.2016, + "step": 6738 + }, + { + "epoch": 0.34, + "grad_norm": 1.0213466637424429, + "learning_rate": 1.5295545191074854e-05, + "loss": 0.1827, + "step": 6739 + }, + { + "epoch": 0.34, + "grad_norm": 1.393285649452404, + "learning_rate": 1.5294148035458406e-05, + "loss": 0.1925, + "step": 6740 + }, + { + "epoch": 0.34, + "grad_norm": 1.611635542327183, + "learning_rate": 1.529275073623789e-05, + "loss": 0.2131, + "step": 6741 + }, + { + "epoch": 0.34, + "grad_norm": 1.9380453950042238, + "learning_rate": 1.5291353293451216e-05, + "loss": 0.2109, + "step": 6742 + }, + { + "epoch": 0.34, + "grad_norm": 1.780304518112081, + "learning_rate": 1.5289955707136282e-05, + "loss": 0.21, + "step": 6743 + }, + { + "epoch": 0.34, + "grad_norm": 1.276791397406679, + "learning_rate": 1.5288557977331006e-05, + "loss": 0.1979, + "step": 6744 + }, + { + "epoch": 0.34, + "grad_norm": 1.0230013122394754, + "learning_rate": 1.528716010407329e-05, + "loss": 0.2092, + "step": 6745 + }, + { + "epoch": 0.34, + "grad_norm": 0.911278338876281, + "learning_rate": 1.528576208740106e-05, + "loss": 0.1953, + "step": 6746 + }, + { + "epoch": 0.34, + "grad_norm": 1.8718440559042655, + "learning_rate": 1.5284363927352234e-05, + "loss": 0.2164, + "step": 6747 + }, + { + "epoch": 0.34, + "grad_norm": 0.960141783585586, + "learning_rate": 1.528296562396474e-05, + "loss": 0.2246, + "step": 6748 + }, + { + "epoch": 0.34, + "grad_norm": 1.0057641215159536, + "learning_rate": 1.5281567177276504e-05, + "loss": 0.2114, + "step": 6749 + }, + { + "epoch": 0.34, + "grad_norm": 1.1115177064216062, + "learning_rate": 1.5280168587325462e-05, + "loss": 0.2166, + "step": 6750 + }, + { + "epoch": 0.34, + "grad_norm": 0.9984220338545741, + "learning_rate": 1.5278769854149544e-05, + "loss": 0.2036, + "step": 6751 + }, + { + "epoch": 0.34, + "grad_norm": 1.0728686079952425, + "learning_rate": 1.5277370977786698e-05, + "loss": 0.2143, + "step": 6752 + }, + { + "epoch": 0.34, + "grad_norm": 1.00311623585292, + "learning_rate": 1.527597195827487e-05, + "loss": 0.2022, + "step": 6753 + }, + { + "epoch": 0.34, + "grad_norm": 1.4416304137365434, + "learning_rate": 1.5274572795652e-05, + "loss": 0.1861, + "step": 6754 + }, + { + "epoch": 0.34, + "grad_norm": 1.353646006666074, + "learning_rate": 1.5273173489956045e-05, + "loss": 0.1966, + "step": 6755 + }, + { + "epoch": 0.34, + "grad_norm": 1.6279188709189063, + "learning_rate": 1.5271774041224965e-05, + "loss": 0.2124, + "step": 6756 + }, + { + "epoch": 0.34, + "grad_norm": 1.2112152569319565, + "learning_rate": 1.5270374449496713e-05, + "loss": 0.1893, + "step": 6757 + }, + { + "epoch": 0.34, + "grad_norm": 1.9026300065868202, + "learning_rate": 1.526897471480926e-05, + "loss": 0.2145, + "step": 6758 + }, + { + "epoch": 0.34, + "grad_norm": 1.4535905698686216, + "learning_rate": 1.5267574837200567e-05, + "loss": 0.1969, + "step": 6759 + }, + { + "epoch": 0.34, + "grad_norm": 0.9155814290404279, + "learning_rate": 1.5266174816708607e-05, + "loss": 0.2077, + "step": 6760 + }, + { + "epoch": 0.34, + "grad_norm": 1.4571808090129743, + "learning_rate": 1.526477465337136e-05, + "loss": 0.2047, + "step": 6761 + }, + { + "epoch": 0.34, + "grad_norm": 1.700143986136528, + "learning_rate": 1.5263374347226804e-05, + "loss": 0.207, + "step": 6762 + }, + { + "epoch": 0.34, + "grad_norm": 1.3011385282165446, + "learning_rate": 1.526197389831292e-05, + "loss": 0.1692, + "step": 6763 + }, + { + "epoch": 0.34, + "grad_norm": 1.0324443289219334, + "learning_rate": 1.52605733066677e-05, + "loss": 0.2005, + "step": 6764 + }, + { + "epoch": 0.34, + "grad_norm": 0.9902437055657942, + "learning_rate": 1.5259172572329132e-05, + "loss": 0.181, + "step": 6765 + }, + { + "epoch": 0.34, + "grad_norm": 1.4277802318478219, + "learning_rate": 1.5257771695335207e-05, + "loss": 0.1872, + "step": 6766 + }, + { + "epoch": 0.34, + "grad_norm": 0.9587925594158275, + "learning_rate": 1.5256370675723928e-05, + "loss": 0.1822, + "step": 6767 + }, + { + "epoch": 0.34, + "grad_norm": 1.169017797107394, + "learning_rate": 1.52549695135333e-05, + "loss": 0.2135, + "step": 6768 + }, + { + "epoch": 0.34, + "grad_norm": 1.0043873111957573, + "learning_rate": 1.5253568208801324e-05, + "loss": 0.2032, + "step": 6769 + }, + { + "epoch": 0.34, + "grad_norm": 0.9227197727775629, + "learning_rate": 1.5252166761566018e-05, + "loss": 0.2208, + "step": 6770 + }, + { + "epoch": 0.34, + "grad_norm": 0.9284620558439093, + "learning_rate": 1.5250765171865391e-05, + "loss": 0.2122, + "step": 6771 + }, + { + "epoch": 0.34, + "grad_norm": 1.2444535096292564, + "learning_rate": 1.5249363439737458e-05, + "loss": 0.2359, + "step": 6772 + }, + { + "epoch": 0.34, + "grad_norm": 0.8917021534524678, + "learning_rate": 1.5247961565220251e-05, + "loss": 0.1886, + "step": 6773 + }, + { + "epoch": 0.34, + "grad_norm": 1.0070824235122227, + "learning_rate": 1.5246559548351786e-05, + "loss": 0.1946, + "step": 6774 + }, + { + "epoch": 0.34, + "grad_norm": 1.0143443272664339, + "learning_rate": 1.5245157389170099e-05, + "loss": 0.2004, + "step": 6775 + }, + { + "epoch": 0.34, + "grad_norm": 0.8925425331684878, + "learning_rate": 1.5243755087713221e-05, + "loss": 0.1978, + "step": 6776 + }, + { + "epoch": 0.34, + "grad_norm": 0.9067333501121372, + "learning_rate": 1.5242352644019188e-05, + "loss": 0.1804, + "step": 6777 + }, + { + "epoch": 0.34, + "grad_norm": 0.8315046848626775, + "learning_rate": 1.5240950058126047e-05, + "loss": 0.2057, + "step": 6778 + }, + { + "epoch": 0.34, + "grad_norm": 1.2926472057668967, + "learning_rate": 1.5239547330071838e-05, + "loss": 0.1967, + "step": 6779 + }, + { + "epoch": 0.34, + "grad_norm": 1.4130144050799631, + "learning_rate": 1.5238144459894612e-05, + "loss": 0.2164, + "step": 6780 + }, + { + "epoch": 0.34, + "grad_norm": 1.5404583219652528, + "learning_rate": 1.523674144763242e-05, + "loss": 0.2328, + "step": 6781 + }, + { + "epoch": 0.34, + "grad_norm": 1.4509180543256275, + "learning_rate": 1.5235338293323322e-05, + "loss": 0.1865, + "step": 6782 + }, + { + "epoch": 0.34, + "grad_norm": 0.9155452259245301, + "learning_rate": 1.5233934997005377e-05, + "loss": 0.1868, + "step": 6783 + }, + { + "epoch": 0.34, + "grad_norm": 0.9785165899029814, + "learning_rate": 1.523253155871665e-05, + "loss": 0.2161, + "step": 6784 + }, + { + "epoch": 0.35, + "grad_norm": 0.9681027179097117, + "learning_rate": 1.5231127978495208e-05, + "loss": 0.1961, + "step": 6785 + }, + { + "epoch": 0.35, + "grad_norm": 1.291598955342103, + "learning_rate": 1.5229724256379124e-05, + "loss": 0.1936, + "step": 6786 + }, + { + "epoch": 0.35, + "grad_norm": 1.2179833211574773, + "learning_rate": 1.5228320392406476e-05, + "loss": 0.1859, + "step": 6787 + }, + { + "epoch": 0.35, + "grad_norm": 1.132302330300989, + "learning_rate": 1.522691638661534e-05, + "loss": 0.1998, + "step": 6788 + }, + { + "epoch": 0.35, + "grad_norm": 0.9243465153919224, + "learning_rate": 1.5225512239043805e-05, + "loss": 0.1908, + "step": 6789 + }, + { + "epoch": 0.35, + "grad_norm": 0.794549770398676, + "learning_rate": 1.5224107949729952e-05, + "loss": 0.1878, + "step": 6790 + }, + { + "epoch": 0.35, + "grad_norm": 1.1940722120482383, + "learning_rate": 1.5222703518711876e-05, + "loss": 0.1877, + "step": 6791 + }, + { + "epoch": 0.35, + "grad_norm": 1.006766552572996, + "learning_rate": 1.5221298946027674e-05, + "loss": 0.1982, + "step": 6792 + }, + { + "epoch": 0.35, + "grad_norm": 1.2122944014485775, + "learning_rate": 1.5219894231715443e-05, + "loss": 0.1661, + "step": 6793 + }, + { + "epoch": 0.35, + "grad_norm": 0.7706146021851047, + "learning_rate": 1.521848937581328e-05, + "loss": 0.1752, + "step": 6794 + }, + { + "epoch": 0.35, + "grad_norm": 0.8239227910933536, + "learning_rate": 1.5217084378359306e-05, + "loss": 0.1822, + "step": 6795 + }, + { + "epoch": 0.35, + "grad_norm": 0.8731121198592614, + "learning_rate": 1.5215679239391621e-05, + "loss": 0.1922, + "step": 6796 + }, + { + "epoch": 0.35, + "grad_norm": 1.218467420504002, + "learning_rate": 1.5214273958948343e-05, + "loss": 0.198, + "step": 6797 + }, + { + "epoch": 0.35, + "grad_norm": 1.232086371764998, + "learning_rate": 1.5212868537067587e-05, + "loss": 0.1799, + "step": 6798 + }, + { + "epoch": 0.35, + "grad_norm": 0.8349998328027023, + "learning_rate": 1.5211462973787478e-05, + "loss": 0.1858, + "step": 6799 + }, + { + "epoch": 0.35, + "grad_norm": 0.9479616491015617, + "learning_rate": 1.5210057269146141e-05, + "loss": 0.2169, + "step": 6800 + }, + { + "epoch": 0.35, + "grad_norm": 1.021839551116278, + "learning_rate": 1.5208651423181709e-05, + "loss": 0.2123, + "step": 6801 + }, + { + "epoch": 0.35, + "grad_norm": 0.7427252710048137, + "learning_rate": 1.5207245435932312e-05, + "loss": 0.2014, + "step": 6802 + }, + { + "epoch": 0.35, + "grad_norm": 1.3946540105732994, + "learning_rate": 1.5205839307436088e-05, + "loss": 0.206, + "step": 6803 + }, + { + "epoch": 0.35, + "grad_norm": 0.9192566427112906, + "learning_rate": 1.5204433037731177e-05, + "loss": 0.1898, + "step": 6804 + }, + { + "epoch": 0.35, + "grad_norm": 1.0661683152184878, + "learning_rate": 1.5203026626855728e-05, + "loss": 0.217, + "step": 6805 + }, + { + "epoch": 0.35, + "grad_norm": 0.9281868878900686, + "learning_rate": 1.5201620074847888e-05, + "loss": 0.2235, + "step": 6806 + }, + { + "epoch": 0.35, + "grad_norm": 0.8851710721864421, + "learning_rate": 1.5200213381745807e-05, + "loss": 0.2138, + "step": 6807 + }, + { + "epoch": 0.35, + "grad_norm": 0.8892054097059979, + "learning_rate": 1.5198806547587648e-05, + "loss": 0.2257, + "step": 6808 + }, + { + "epoch": 0.35, + "grad_norm": 1.2053124528725399, + "learning_rate": 1.5197399572411566e-05, + "loss": 0.1975, + "step": 6809 + }, + { + "epoch": 0.35, + "grad_norm": 0.960929308363589, + "learning_rate": 1.5195992456255728e-05, + "loss": 0.1986, + "step": 6810 + }, + { + "epoch": 0.35, + "grad_norm": 1.0636077023307362, + "learning_rate": 1.51945851991583e-05, + "loss": 0.2068, + "step": 6811 + }, + { + "epoch": 0.35, + "grad_norm": 0.867963274482051, + "learning_rate": 1.5193177801157456e-05, + "loss": 0.1769, + "step": 6812 + }, + { + "epoch": 0.35, + "grad_norm": 0.8875259655818101, + "learning_rate": 1.5191770262291367e-05, + "loss": 0.1952, + "step": 6813 + }, + { + "epoch": 0.35, + "grad_norm": 0.9265142993267459, + "learning_rate": 1.5190362582598223e-05, + "loss": 0.1973, + "step": 6814 + }, + { + "epoch": 0.35, + "grad_norm": 1.0713731845568697, + "learning_rate": 1.5188954762116197e-05, + "loss": 0.1891, + "step": 6815 + }, + { + "epoch": 0.35, + "grad_norm": 1.170054639379894, + "learning_rate": 1.518754680088348e-05, + "loss": 0.1806, + "step": 6816 + }, + { + "epoch": 0.35, + "grad_norm": 1.5358593286525253, + "learning_rate": 1.5186138698938262e-05, + "loss": 0.2304, + "step": 6817 + }, + { + "epoch": 0.35, + "grad_norm": 1.2267745756341035, + "learning_rate": 1.5184730456318742e-05, + "loss": 0.21, + "step": 6818 + }, + { + "epoch": 0.35, + "grad_norm": 1.2369692943548545, + "learning_rate": 1.5183322073063113e-05, + "loss": 0.1932, + "step": 6819 + }, + { + "epoch": 0.35, + "grad_norm": 0.9842943782482564, + "learning_rate": 1.5181913549209582e-05, + "loss": 0.199, + "step": 6820 + }, + { + "epoch": 0.35, + "grad_norm": 1.1127812635949084, + "learning_rate": 1.5180504884796352e-05, + "loss": 0.1981, + "step": 6821 + }, + { + "epoch": 0.35, + "grad_norm": 1.15296947156494, + "learning_rate": 1.5179096079861633e-05, + "loss": 0.1961, + "step": 6822 + }, + { + "epoch": 0.35, + "grad_norm": 1.3997778340175386, + "learning_rate": 1.5177687134443644e-05, + "loss": 0.2149, + "step": 6823 + }, + { + "epoch": 0.35, + "grad_norm": 1.1547947538700556, + "learning_rate": 1.51762780485806e-05, + "loss": 0.2027, + "step": 6824 + }, + { + "epoch": 0.35, + "grad_norm": 1.2744762270890364, + "learning_rate": 1.5174868822310715e-05, + "loss": 0.1796, + "step": 6825 + }, + { + "epoch": 0.35, + "grad_norm": 0.8809148208865116, + "learning_rate": 1.5173459455672225e-05, + "loss": 0.2006, + "step": 6826 + }, + { + "epoch": 0.35, + "grad_norm": 0.7797320005770062, + "learning_rate": 1.5172049948703356e-05, + "loss": 0.2017, + "step": 6827 + }, + { + "epoch": 0.35, + "grad_norm": 1.074741508538611, + "learning_rate": 1.5170640301442339e-05, + "loss": 0.2007, + "step": 6828 + }, + { + "epoch": 0.35, + "grad_norm": 1.1055816149226974, + "learning_rate": 1.516923051392741e-05, + "loss": 0.2054, + "step": 6829 + }, + { + "epoch": 0.35, + "grad_norm": 0.8006123473485938, + "learning_rate": 1.516782058619681e-05, + "loss": 0.2051, + "step": 6830 + }, + { + "epoch": 0.35, + "grad_norm": 0.9676537160614846, + "learning_rate": 1.516641051828879e-05, + "loss": 0.177, + "step": 6831 + }, + { + "epoch": 0.35, + "grad_norm": 1.2592218417012093, + "learning_rate": 1.5165000310241592e-05, + "loss": 0.1988, + "step": 6832 + }, + { + "epoch": 0.35, + "grad_norm": 0.7618946631132122, + "learning_rate": 1.5163589962093466e-05, + "loss": 0.1668, + "step": 6833 + }, + { + "epoch": 0.35, + "grad_norm": 0.8695663377266801, + "learning_rate": 1.5162179473882668e-05, + "loss": 0.1817, + "step": 6834 + }, + { + "epoch": 0.35, + "grad_norm": 1.1313227587735268, + "learning_rate": 1.5160768845647464e-05, + "loss": 0.2208, + "step": 6835 + }, + { + "epoch": 0.35, + "grad_norm": 1.7788299794995026, + "learning_rate": 1.5159358077426114e-05, + "loss": 0.2113, + "step": 6836 + }, + { + "epoch": 0.35, + "grad_norm": 1.0772132694371461, + "learning_rate": 1.5157947169256886e-05, + "loss": 0.2016, + "step": 6837 + }, + { + "epoch": 0.35, + "grad_norm": 0.8650455266282737, + "learning_rate": 1.515653612117805e-05, + "loss": 0.1963, + "step": 6838 + }, + { + "epoch": 0.35, + "grad_norm": 1.0680136956495005, + "learning_rate": 1.5155124933227876e-05, + "loss": 0.2127, + "step": 6839 + }, + { + "epoch": 0.35, + "grad_norm": 1.0971031970802028, + "learning_rate": 1.515371360544465e-05, + "loss": 0.1729, + "step": 6840 + }, + { + "epoch": 0.35, + "grad_norm": 0.9096219601191085, + "learning_rate": 1.5152302137866653e-05, + "loss": 0.1937, + "step": 6841 + }, + { + "epoch": 0.35, + "grad_norm": 0.9816498176529843, + "learning_rate": 1.5150890530532165e-05, + "loss": 0.1817, + "step": 6842 + }, + { + "epoch": 0.35, + "grad_norm": 1.3215012680366147, + "learning_rate": 1.5149478783479484e-05, + "loss": 0.2153, + "step": 6843 + }, + { + "epoch": 0.35, + "grad_norm": 1.101868774315477, + "learning_rate": 1.51480668967469e-05, + "loss": 0.2143, + "step": 6844 + }, + { + "epoch": 0.35, + "grad_norm": 0.8953111804081765, + "learning_rate": 1.514665487037271e-05, + "loss": 0.2136, + "step": 6845 + }, + { + "epoch": 0.35, + "grad_norm": 0.9155024355336064, + "learning_rate": 1.5145242704395215e-05, + "loss": 0.1709, + "step": 6846 + }, + { + "epoch": 0.35, + "grad_norm": 1.194699702974594, + "learning_rate": 1.5143830398852722e-05, + "loss": 0.2143, + "step": 6847 + }, + { + "epoch": 0.35, + "grad_norm": 0.815983227890357, + "learning_rate": 1.5142417953783536e-05, + "loss": 0.1805, + "step": 6848 + }, + { + "epoch": 0.35, + "grad_norm": 1.3899500734068588, + "learning_rate": 1.5141005369225976e-05, + "loss": 0.1977, + "step": 6849 + }, + { + "epoch": 0.35, + "grad_norm": 1.8772900169020614, + "learning_rate": 1.5139592645218355e-05, + "loss": 0.2173, + "step": 6850 + }, + { + "epoch": 0.35, + "grad_norm": 4.3715237882830085, + "learning_rate": 1.5138179781798994e-05, + "loss": 0.2088, + "step": 6851 + }, + { + "epoch": 0.35, + "grad_norm": 0.9728600232206785, + "learning_rate": 1.513676677900621e-05, + "loss": 0.2141, + "step": 6852 + }, + { + "epoch": 0.35, + "grad_norm": 1.2467143391094206, + "learning_rate": 1.5135353636878343e-05, + "loss": 0.2127, + "step": 6853 + }, + { + "epoch": 0.35, + "grad_norm": 2.4410997273671753, + "learning_rate": 1.5133940355453717e-05, + "loss": 0.1994, + "step": 6854 + }, + { + "epoch": 0.35, + "grad_norm": 1.0130725754368426, + "learning_rate": 1.513252693477067e-05, + "loss": 0.191, + "step": 6855 + }, + { + "epoch": 0.35, + "grad_norm": 0.9178135147372484, + "learning_rate": 1.5131113374867537e-05, + "loss": 0.157, + "step": 6856 + }, + { + "epoch": 0.35, + "grad_norm": 1.0776349600749444, + "learning_rate": 1.5129699675782666e-05, + "loss": 0.2282, + "step": 6857 + }, + { + "epoch": 0.35, + "grad_norm": 1.2034064612598838, + "learning_rate": 1.5128285837554404e-05, + "loss": 0.2008, + "step": 6858 + }, + { + "epoch": 0.35, + "grad_norm": 1.0843112113793854, + "learning_rate": 1.5126871860221098e-05, + "loss": 0.1881, + "step": 6859 + }, + { + "epoch": 0.35, + "grad_norm": 1.0250256906050836, + "learning_rate": 1.5125457743821098e-05, + "loss": 0.2129, + "step": 6860 + }, + { + "epoch": 0.35, + "grad_norm": 0.9537784444716515, + "learning_rate": 1.5124043488392772e-05, + "loss": 0.2287, + "step": 6861 + }, + { + "epoch": 0.35, + "grad_norm": 1.0963638776333486, + "learning_rate": 1.5122629093974476e-05, + "loss": 0.1926, + "step": 6862 + }, + { + "epoch": 0.35, + "grad_norm": 1.18682479755038, + "learning_rate": 1.5121214560604579e-05, + "loss": 0.2174, + "step": 6863 + }, + { + "epoch": 0.35, + "grad_norm": 0.8760608982889208, + "learning_rate": 1.5119799888321444e-05, + "loss": 0.2168, + "step": 6864 + }, + { + "epoch": 0.35, + "grad_norm": 1.0627961084019029, + "learning_rate": 1.5118385077163446e-05, + "loss": 0.1941, + "step": 6865 + }, + { + "epoch": 0.35, + "grad_norm": 1.6957928296348652, + "learning_rate": 1.5116970127168969e-05, + "loss": 0.2307, + "step": 6866 + }, + { + "epoch": 0.35, + "grad_norm": 1.0456603258940167, + "learning_rate": 1.5115555038376386e-05, + "loss": 0.2038, + "step": 6867 + }, + { + "epoch": 0.35, + "grad_norm": 1.0644078326243325, + "learning_rate": 1.5114139810824084e-05, + "loss": 0.2085, + "step": 6868 + }, + { + "epoch": 0.35, + "grad_norm": 0.9346639998075899, + "learning_rate": 1.5112724444550449e-05, + "loss": 0.1976, + "step": 6869 + }, + { + "epoch": 0.35, + "grad_norm": 0.9871990707491872, + "learning_rate": 1.5111308939593876e-05, + "loss": 0.2325, + "step": 6870 + }, + { + "epoch": 0.35, + "grad_norm": 0.862600105336854, + "learning_rate": 1.510989329599276e-05, + "loss": 0.2257, + "step": 6871 + }, + { + "epoch": 0.35, + "grad_norm": 0.8081876993422035, + "learning_rate": 1.51084775137855e-05, + "loss": 0.1908, + "step": 6872 + }, + { + "epoch": 0.35, + "grad_norm": 1.1136152808647861, + "learning_rate": 1.5107061593010497e-05, + "loss": 0.1845, + "step": 6873 + }, + { + "epoch": 0.35, + "grad_norm": 0.9049398819784213, + "learning_rate": 1.5105645533706161e-05, + "loss": 0.2169, + "step": 6874 + }, + { + "epoch": 0.35, + "grad_norm": 0.8437673319992209, + "learning_rate": 1.5104229335910901e-05, + "loss": 0.1851, + "step": 6875 + }, + { + "epoch": 0.35, + "grad_norm": 0.9911256372429206, + "learning_rate": 1.5102812999663136e-05, + "loss": 0.204, + "step": 6876 + }, + { + "epoch": 0.35, + "grad_norm": 1.059383024074309, + "learning_rate": 1.5101396525001275e-05, + "loss": 0.2161, + "step": 6877 + }, + { + "epoch": 0.35, + "grad_norm": 0.807238408215533, + "learning_rate": 1.5099979911963747e-05, + "loss": 0.1751, + "step": 6878 + }, + { + "epoch": 0.35, + "grad_norm": 1.0651713275527612, + "learning_rate": 1.5098563160588975e-05, + "loss": 0.1957, + "step": 6879 + }, + { + "epoch": 0.35, + "grad_norm": 1.1544476706394258, + "learning_rate": 1.5097146270915391e-05, + "loss": 0.2063, + "step": 6880 + }, + { + "epoch": 0.35, + "grad_norm": 1.0130726774046317, + "learning_rate": 1.5095729242981426e-05, + "loss": 0.2123, + "step": 6881 + }, + { + "epoch": 0.35, + "grad_norm": 0.946893609225108, + "learning_rate": 1.5094312076825514e-05, + "loss": 0.1984, + "step": 6882 + }, + { + "epoch": 0.35, + "grad_norm": 0.9464456103251006, + "learning_rate": 1.5092894772486104e-05, + "loss": 0.2102, + "step": 6883 + }, + { + "epoch": 0.35, + "grad_norm": 1.0216789633156547, + "learning_rate": 1.5091477330001634e-05, + "loss": 0.1997, + "step": 6884 + }, + { + "epoch": 0.35, + "grad_norm": 1.6466262185119827, + "learning_rate": 1.5090059749410553e-05, + "loss": 0.2085, + "step": 6885 + }, + { + "epoch": 0.35, + "grad_norm": 0.9729954755515294, + "learning_rate": 1.5088642030751314e-05, + "loss": 0.171, + "step": 6886 + }, + { + "epoch": 0.35, + "grad_norm": 5.385590133366258, + "learning_rate": 1.5087224174062371e-05, + "loss": 0.1922, + "step": 6887 + }, + { + "epoch": 0.35, + "grad_norm": 0.8916447836837915, + "learning_rate": 1.5085806179382188e-05, + "loss": 0.2069, + "step": 6888 + }, + { + "epoch": 0.35, + "grad_norm": 1.2889193077338137, + "learning_rate": 1.5084388046749224e-05, + "loss": 0.2201, + "step": 6889 + }, + { + "epoch": 0.35, + "grad_norm": 1.921388178955955, + "learning_rate": 1.5082969776201948e-05, + "loss": 0.2002, + "step": 6890 + }, + { + "epoch": 0.35, + "grad_norm": 1.2911495515489098, + "learning_rate": 1.5081551367778828e-05, + "loss": 0.2063, + "step": 6891 + }, + { + "epoch": 0.35, + "grad_norm": 1.54823405008985, + "learning_rate": 1.508013282151834e-05, + "loss": 0.186, + "step": 6892 + }, + { + "epoch": 0.35, + "grad_norm": 0.8821794175637585, + "learning_rate": 1.5078714137458965e-05, + "loss": 0.2036, + "step": 6893 + }, + { + "epoch": 0.35, + "grad_norm": 1.5243194241716584, + "learning_rate": 1.5077295315639183e-05, + "loss": 0.2197, + "step": 6894 + }, + { + "epoch": 0.35, + "grad_norm": 1.7558299954036014, + "learning_rate": 1.5075876356097472e-05, + "loss": 0.1974, + "step": 6895 + }, + { + "epoch": 0.35, + "grad_norm": 1.0411962229686005, + "learning_rate": 1.5074457258872332e-05, + "loss": 0.1895, + "step": 6896 + }, + { + "epoch": 0.35, + "grad_norm": 1.1459677441955394, + "learning_rate": 1.5073038024002254e-05, + "loss": 0.2032, + "step": 6897 + }, + { + "epoch": 0.35, + "grad_norm": 1.3905183101820433, + "learning_rate": 1.5071618651525733e-05, + "loss": 0.2074, + "step": 6898 + }, + { + "epoch": 0.35, + "grad_norm": 0.9654236834489405, + "learning_rate": 1.5070199141481267e-05, + "loss": 0.21, + "step": 6899 + }, + { + "epoch": 0.35, + "grad_norm": 1.0754205007601063, + "learning_rate": 1.5068779493907364e-05, + "loss": 0.2037, + "step": 6900 + }, + { + "epoch": 0.35, + "grad_norm": 1.1811109530453496, + "learning_rate": 1.5067359708842531e-05, + "loss": 0.2191, + "step": 6901 + }, + { + "epoch": 0.35, + "grad_norm": 0.8896704583059244, + "learning_rate": 1.506593978632528e-05, + "loss": 0.1918, + "step": 6902 + }, + { + "epoch": 0.35, + "grad_norm": 0.8190830743448189, + "learning_rate": 1.5064519726394127e-05, + "loss": 0.1848, + "step": 6903 + }, + { + "epoch": 0.35, + "grad_norm": 1.4839731613633682, + "learning_rate": 1.5063099529087588e-05, + "loss": 0.1942, + "step": 6904 + }, + { + "epoch": 0.35, + "grad_norm": 0.8611499038512758, + "learning_rate": 1.506167919444419e-05, + "loss": 0.1833, + "step": 6905 + }, + { + "epoch": 0.35, + "grad_norm": 0.9749771586556178, + "learning_rate": 1.5060258722502457e-05, + "loss": 0.2128, + "step": 6906 + }, + { + "epoch": 0.35, + "grad_norm": 1.1688444786842203, + "learning_rate": 1.5058838113300922e-05, + "loss": 0.1839, + "step": 6907 + }, + { + "epoch": 0.35, + "grad_norm": 0.9682085388119024, + "learning_rate": 1.5057417366878117e-05, + "loss": 0.201, + "step": 6908 + }, + { + "epoch": 0.35, + "grad_norm": 1.0901203502465975, + "learning_rate": 1.505599648327258e-05, + "loss": 0.2072, + "step": 6909 + }, + { + "epoch": 0.35, + "grad_norm": 1.404939365745371, + "learning_rate": 1.505457546252285e-05, + "loss": 0.226, + "step": 6910 + }, + { + "epoch": 0.35, + "grad_norm": 1.0672437834253885, + "learning_rate": 1.5053154304667481e-05, + "loss": 0.2089, + "step": 6911 + }, + { + "epoch": 0.35, + "grad_norm": 1.0637403739652012, + "learning_rate": 1.5051733009745013e-05, + "loss": 0.1869, + "step": 6912 + }, + { + "epoch": 0.35, + "grad_norm": 0.9076819960560645, + "learning_rate": 1.5050311577794002e-05, + "loss": 0.1877, + "step": 6913 + }, + { + "epoch": 0.35, + "grad_norm": 0.9717309390704598, + "learning_rate": 1.5048890008853004e-05, + "loss": 0.1993, + "step": 6914 + }, + { + "epoch": 0.35, + "grad_norm": 0.9407606112256421, + "learning_rate": 1.5047468302960577e-05, + "loss": 0.2085, + "step": 6915 + }, + { + "epoch": 0.35, + "grad_norm": 1.4738042456951326, + "learning_rate": 1.504604646015529e-05, + "loss": 0.184, + "step": 6916 + }, + { + "epoch": 0.35, + "grad_norm": 1.7211102814223973, + "learning_rate": 1.5044624480475704e-05, + "loss": 0.2234, + "step": 6917 + }, + { + "epoch": 0.35, + "grad_norm": 1.4920048618936634, + "learning_rate": 1.50432023639604e-05, + "loss": 0.1944, + "step": 6918 + }, + { + "epoch": 0.35, + "grad_norm": 1.0084709392210685, + "learning_rate": 1.5041780110647945e-05, + "loss": 0.1899, + "step": 6919 + }, + { + "epoch": 0.35, + "grad_norm": 1.1115772913320738, + "learning_rate": 1.5040357720576917e-05, + "loss": 0.2065, + "step": 6920 + }, + { + "epoch": 0.35, + "grad_norm": 1.0794035077770285, + "learning_rate": 1.5038935193785904e-05, + "loss": 0.2026, + "step": 6921 + }, + { + "epoch": 0.35, + "grad_norm": 0.9209720954523564, + "learning_rate": 1.5037512530313487e-05, + "loss": 0.1812, + "step": 6922 + }, + { + "epoch": 0.35, + "grad_norm": 1.3927461318168444, + "learning_rate": 1.503608973019826e-05, + "loss": 0.2163, + "step": 6923 + }, + { + "epoch": 0.35, + "grad_norm": 0.8818625228439795, + "learning_rate": 1.5034666793478814e-05, + "loss": 0.2027, + "step": 6924 + }, + { + "epoch": 0.35, + "grad_norm": 0.9774795678754011, + "learning_rate": 1.5033243720193746e-05, + "loss": 0.1986, + "step": 6925 + }, + { + "epoch": 0.35, + "grad_norm": 1.1421701507240238, + "learning_rate": 1.5031820510381661e-05, + "loss": 0.2108, + "step": 6926 + }, + { + "epoch": 0.35, + "grad_norm": 0.9000965134357147, + "learning_rate": 1.5030397164081157e-05, + "loss": 0.2099, + "step": 6927 + }, + { + "epoch": 0.35, + "grad_norm": 1.095911521854206, + "learning_rate": 1.502897368133085e-05, + "loss": 0.2009, + "step": 6928 + }, + { + "epoch": 0.35, + "grad_norm": 0.8489297047692981, + "learning_rate": 1.5027550062169343e-05, + "loss": 0.1884, + "step": 6929 + }, + { + "epoch": 0.35, + "grad_norm": 0.997414952129682, + "learning_rate": 1.5026126306635256e-05, + "loss": 0.1995, + "step": 6930 + }, + { + "epoch": 0.35, + "grad_norm": 0.9949856638154007, + "learning_rate": 1.5024702414767212e-05, + "loss": 0.2306, + "step": 6931 + }, + { + "epoch": 0.35, + "grad_norm": 0.8183068565989067, + "learning_rate": 1.5023278386603832e-05, + "loss": 0.1763, + "step": 6932 + }, + { + "epoch": 0.35, + "grad_norm": 2.1024967773167957, + "learning_rate": 1.502185422218374e-05, + "loss": 0.2041, + "step": 6933 + }, + { + "epoch": 0.35, + "grad_norm": 1.0647604029272146, + "learning_rate": 1.5020429921545572e-05, + "loss": 0.2104, + "step": 6934 + }, + { + "epoch": 0.35, + "grad_norm": 1.2222934743683858, + "learning_rate": 1.5019005484727953e-05, + "loss": 0.1928, + "step": 6935 + }, + { + "epoch": 0.35, + "grad_norm": 0.949740467637576, + "learning_rate": 1.501758091176953e-05, + "loss": 0.2148, + "step": 6936 + }, + { + "epoch": 0.35, + "grad_norm": 1.057136051282396, + "learning_rate": 1.5016156202708942e-05, + "loss": 0.2267, + "step": 6937 + }, + { + "epoch": 0.35, + "grad_norm": 0.9248210993970946, + "learning_rate": 1.5014731357584835e-05, + "loss": 0.1782, + "step": 6938 + }, + { + "epoch": 0.35, + "grad_norm": 1.0259772541994394, + "learning_rate": 1.5013306376435852e-05, + "loss": 0.2216, + "step": 6939 + }, + { + "epoch": 0.35, + "grad_norm": 1.6482033868110175, + "learning_rate": 1.5011881259300654e-05, + "loss": 0.2146, + "step": 6940 + }, + { + "epoch": 0.35, + "grad_norm": 1.137004966715899, + "learning_rate": 1.5010456006217892e-05, + "loss": 0.2031, + "step": 6941 + }, + { + "epoch": 0.35, + "grad_norm": 0.8897749780607621, + "learning_rate": 1.5009030617226227e-05, + "loss": 0.2103, + "step": 6942 + }, + { + "epoch": 0.35, + "grad_norm": 1.7726206917351606, + "learning_rate": 1.5007605092364329e-05, + "loss": 0.2039, + "step": 6943 + }, + { + "epoch": 0.35, + "grad_norm": 0.9287323221846299, + "learning_rate": 1.5006179431670853e-05, + "loss": 0.2021, + "step": 6944 + }, + { + "epoch": 0.35, + "grad_norm": 0.8194781241438943, + "learning_rate": 1.5004753635184482e-05, + "loss": 0.2027, + "step": 6945 + }, + { + "epoch": 0.35, + "grad_norm": 1.3093313015385672, + "learning_rate": 1.5003327702943886e-05, + "loss": 0.2096, + "step": 6946 + }, + { + "epoch": 0.35, + "grad_norm": 0.751471795194662, + "learning_rate": 1.5001901634987741e-05, + "loss": 0.1795, + "step": 6947 + }, + { + "epoch": 0.35, + "grad_norm": 0.8658145897043582, + "learning_rate": 1.500047543135473e-05, + "loss": 0.198, + "step": 6948 + }, + { + "epoch": 0.35, + "grad_norm": 0.927812722124989, + "learning_rate": 1.4999049092083546e-05, + "loss": 0.204, + "step": 6949 + }, + { + "epoch": 0.35, + "grad_norm": 1.0196694768308388, + "learning_rate": 1.499762261721287e-05, + "loss": 0.2097, + "step": 6950 + }, + { + "epoch": 0.35, + "grad_norm": 1.1506071629214236, + "learning_rate": 1.4996196006781398e-05, + "loss": 0.2234, + "step": 6951 + }, + { + "epoch": 0.35, + "grad_norm": 1.0757424215079343, + "learning_rate": 1.4994769260827825e-05, + "loss": 0.2187, + "step": 6952 + }, + { + "epoch": 0.35, + "grad_norm": 0.8124410268278714, + "learning_rate": 1.4993342379390859e-05, + "loss": 0.2122, + "step": 6953 + }, + { + "epoch": 0.35, + "grad_norm": 4.055419148942473, + "learning_rate": 1.4991915362509196e-05, + "loss": 0.1975, + "step": 6954 + }, + { + "epoch": 0.35, + "grad_norm": 1.074342125307164, + "learning_rate": 1.4990488210221545e-05, + "loss": 0.2257, + "step": 6955 + }, + { + "epoch": 0.35, + "grad_norm": 0.8775555168046059, + "learning_rate": 1.4989060922566623e-05, + "loss": 0.1871, + "step": 6956 + }, + { + "epoch": 0.35, + "grad_norm": 0.9828925549110333, + "learning_rate": 1.4987633499583138e-05, + "loss": 0.2055, + "step": 6957 + }, + { + "epoch": 0.35, + "grad_norm": 1.0764743184030248, + "learning_rate": 1.4986205941309818e-05, + "loss": 0.205, + "step": 6958 + }, + { + "epoch": 0.35, + "grad_norm": 0.9363216899302678, + "learning_rate": 1.4984778247785375e-05, + "loss": 0.1804, + "step": 6959 + }, + { + "epoch": 0.35, + "grad_norm": 0.931636292642853, + "learning_rate": 1.4983350419048544e-05, + "loss": 0.1797, + "step": 6960 + }, + { + "epoch": 0.35, + "grad_norm": 0.9904177526333235, + "learning_rate": 1.498192245513805e-05, + "loss": 0.1902, + "step": 6961 + }, + { + "epoch": 0.35, + "grad_norm": 0.8392932536433085, + "learning_rate": 1.4980494356092626e-05, + "loss": 0.206, + "step": 6962 + }, + { + "epoch": 0.35, + "grad_norm": 1.8715178627695, + "learning_rate": 1.4979066121951014e-05, + "loss": 0.1972, + "step": 6963 + }, + { + "epoch": 0.35, + "grad_norm": 1.1678631470250729, + "learning_rate": 1.4977637752751953e-05, + "loss": 0.1891, + "step": 6964 + }, + { + "epoch": 0.35, + "grad_norm": 0.7647725339845265, + "learning_rate": 1.4976209248534183e-05, + "loss": 0.1959, + "step": 6965 + }, + { + "epoch": 0.35, + "grad_norm": 0.8787375165725418, + "learning_rate": 1.4974780609336459e-05, + "loss": 0.2146, + "step": 6966 + }, + { + "epoch": 0.35, + "grad_norm": 0.8037882146692238, + "learning_rate": 1.497335183519753e-05, + "loss": 0.2131, + "step": 6967 + }, + { + "epoch": 0.35, + "grad_norm": 0.9437660746025482, + "learning_rate": 1.497192292615615e-05, + "loss": 0.1672, + "step": 6968 + }, + { + "epoch": 0.35, + "grad_norm": 0.9548608764647659, + "learning_rate": 1.497049388225108e-05, + "loss": 0.1965, + "step": 6969 + }, + { + "epoch": 0.35, + "grad_norm": 0.8126396317616935, + "learning_rate": 1.4969064703521082e-05, + "loss": 0.206, + "step": 6970 + }, + { + "epoch": 0.35, + "grad_norm": 4.600045784968216, + "learning_rate": 1.4967635390004924e-05, + "loss": 0.181, + "step": 6971 + }, + { + "epoch": 0.35, + "grad_norm": 1.1969410873168724, + "learning_rate": 1.496620594174138e-05, + "loss": 0.2295, + "step": 6972 + }, + { + "epoch": 0.35, + "grad_norm": 0.9415730347137686, + "learning_rate": 1.4964776358769213e-05, + "loss": 0.2113, + "step": 6973 + }, + { + "epoch": 0.35, + "grad_norm": 1.163755494326272, + "learning_rate": 1.496334664112721e-05, + "loss": 0.2001, + "step": 6974 + }, + { + "epoch": 0.35, + "grad_norm": 0.9033674842959668, + "learning_rate": 1.4961916788854147e-05, + "loss": 0.1906, + "step": 6975 + }, + { + "epoch": 0.35, + "grad_norm": 1.1287017676044906, + "learning_rate": 1.4960486801988811e-05, + "loss": 0.2106, + "step": 6976 + }, + { + "epoch": 0.35, + "grad_norm": 1.1671485012836642, + "learning_rate": 1.4959056680569992e-05, + "loss": 0.1838, + "step": 6977 + }, + { + "epoch": 0.35, + "grad_norm": 0.9909927570451686, + "learning_rate": 1.4957626424636482e-05, + "loss": 0.2005, + "step": 6978 + }, + { + "epoch": 0.35, + "grad_norm": 0.7024089312556172, + "learning_rate": 1.495619603422707e-05, + "loss": 0.1863, + "step": 6979 + }, + { + "epoch": 0.35, + "grad_norm": 0.8736425378884803, + "learning_rate": 1.4954765509380565e-05, + "loss": 0.2134, + "step": 6980 + }, + { + "epoch": 0.35, + "grad_norm": 1.0248494978958178, + "learning_rate": 1.4953334850135765e-05, + "loss": 0.2203, + "step": 6981 + }, + { + "epoch": 0.36, + "grad_norm": 1.1433657512824882, + "learning_rate": 1.495190405653148e-05, + "loss": 0.2075, + "step": 6982 + }, + { + "epoch": 0.36, + "grad_norm": 0.797062012180071, + "learning_rate": 1.4950473128606513e-05, + "loss": 0.1951, + "step": 6983 + }, + { + "epoch": 0.36, + "grad_norm": 0.9021502798448437, + "learning_rate": 1.4949042066399684e-05, + "loss": 0.1921, + "step": 6984 + }, + { + "epoch": 0.36, + "grad_norm": 0.9354344111856875, + "learning_rate": 1.4947610869949811e-05, + "loss": 0.2152, + "step": 6985 + }, + { + "epoch": 0.36, + "grad_norm": 1.2873601297598602, + "learning_rate": 1.4946179539295713e-05, + "loss": 0.2066, + "step": 6986 + }, + { + "epoch": 0.36, + "grad_norm": 1.1742473061174559, + "learning_rate": 1.4944748074476211e-05, + "loss": 0.2242, + "step": 6987 + }, + { + "epoch": 0.36, + "grad_norm": 1.027615230081389, + "learning_rate": 1.4943316475530145e-05, + "loss": 0.2241, + "step": 6988 + }, + { + "epoch": 0.36, + "grad_norm": 1.4938750326455092, + "learning_rate": 1.4941884742496338e-05, + "loss": 0.2058, + "step": 6989 + }, + { + "epoch": 0.36, + "grad_norm": 0.8638539100195232, + "learning_rate": 1.4940452875413627e-05, + "loss": 0.2076, + "step": 6990 + }, + { + "epoch": 0.36, + "grad_norm": 1.8909119223100532, + "learning_rate": 1.4939020874320856e-05, + "loss": 0.2382, + "step": 6991 + }, + { + "epoch": 0.36, + "grad_norm": 0.959846464244343, + "learning_rate": 1.4937588739256861e-05, + "loss": 0.1874, + "step": 6992 + }, + { + "epoch": 0.36, + "grad_norm": 0.8957640924216222, + "learning_rate": 1.4936156470260494e-05, + "loss": 0.1897, + "step": 6993 + }, + { + "epoch": 0.36, + "grad_norm": 0.9760286467099434, + "learning_rate": 1.4934724067370604e-05, + "loss": 0.1896, + "step": 6994 + }, + { + "epoch": 0.36, + "grad_norm": 0.7759132698260575, + "learning_rate": 1.4933291530626047e-05, + "loss": 0.1905, + "step": 6995 + }, + { + "epoch": 0.36, + "grad_norm": 1.6179256438101102, + "learning_rate": 1.4931858860065676e-05, + "loss": 0.1889, + "step": 6996 + }, + { + "epoch": 0.36, + "grad_norm": 1.1805021494268257, + "learning_rate": 1.4930426055728354e-05, + "loss": 0.2042, + "step": 6997 + }, + { + "epoch": 0.36, + "grad_norm": 0.9349921230667492, + "learning_rate": 1.492899311765295e-05, + "loss": 0.1933, + "step": 6998 + }, + { + "epoch": 0.36, + "grad_norm": 1.0884404176575875, + "learning_rate": 1.4927560045878328e-05, + "loss": 0.1874, + "step": 6999 + }, + { + "epoch": 0.36, + "grad_norm": 1.371867437927246, + "learning_rate": 1.492612684044336e-05, + "loss": 0.1859, + "step": 7000 + }, + { + "epoch": 0.36, + "grad_norm": 1.5014463510510556, + "learning_rate": 1.4924693501386925e-05, + "loss": 0.1972, + "step": 7001 + }, + { + "epoch": 0.36, + "grad_norm": 1.658847598307763, + "learning_rate": 1.4923260028747899e-05, + "loss": 0.1931, + "step": 7002 + }, + { + "epoch": 0.36, + "grad_norm": 0.9026298128366345, + "learning_rate": 1.4921826422565169e-05, + "loss": 0.2159, + "step": 7003 + }, + { + "epoch": 0.36, + "grad_norm": 1.740979167213659, + "learning_rate": 1.4920392682877618e-05, + "loss": 0.1958, + "step": 7004 + }, + { + "epoch": 0.36, + "grad_norm": 0.8531910200230725, + "learning_rate": 1.4918958809724135e-05, + "loss": 0.1846, + "step": 7005 + }, + { + "epoch": 0.36, + "grad_norm": 0.7906279464839503, + "learning_rate": 1.491752480314362e-05, + "loss": 0.2073, + "step": 7006 + }, + { + "epoch": 0.36, + "grad_norm": 0.9873303650652501, + "learning_rate": 1.4916090663174966e-05, + "loss": 0.1949, + "step": 7007 + }, + { + "epoch": 0.36, + "grad_norm": 1.881165750191825, + "learning_rate": 1.4914656389857076e-05, + "loss": 0.2105, + "step": 7008 + }, + { + "epoch": 0.36, + "grad_norm": 0.9239039519027061, + "learning_rate": 1.4913221983228851e-05, + "loss": 0.2083, + "step": 7009 + }, + { + "epoch": 0.36, + "grad_norm": 1.0745052245332594, + "learning_rate": 1.4911787443329204e-05, + "loss": 0.2027, + "step": 7010 + }, + { + "epoch": 0.36, + "grad_norm": 0.8717707200753981, + "learning_rate": 1.4910352770197044e-05, + "loss": 0.2013, + "step": 7011 + }, + { + "epoch": 0.36, + "grad_norm": 0.8507639295424179, + "learning_rate": 1.4908917963871292e-05, + "loss": 0.2, + "step": 7012 + }, + { + "epoch": 0.36, + "grad_norm": 2.7524155811833033, + "learning_rate": 1.4907483024390859e-05, + "loss": 0.1818, + "step": 7013 + }, + { + "epoch": 0.36, + "grad_norm": 1.8231612141524862, + "learning_rate": 1.4906047951794671e-05, + "loss": 0.1897, + "step": 7014 + }, + { + "epoch": 0.36, + "grad_norm": 1.0364202333602426, + "learning_rate": 1.4904612746121657e-05, + "loss": 0.2018, + "step": 7015 + }, + { + "epoch": 0.36, + "grad_norm": 1.3215366699557016, + "learning_rate": 1.4903177407410749e-05, + "loss": 0.1915, + "step": 7016 + }, + { + "epoch": 0.36, + "grad_norm": 0.8798560451011856, + "learning_rate": 1.4901741935700873e-05, + "loss": 0.2068, + "step": 7017 + }, + { + "epoch": 0.36, + "grad_norm": 0.8262393299901581, + "learning_rate": 1.4900306331030967e-05, + "loss": 0.2209, + "step": 7018 + }, + { + "epoch": 0.36, + "grad_norm": 0.817376552435723, + "learning_rate": 1.489887059343998e-05, + "loss": 0.2057, + "step": 7019 + }, + { + "epoch": 0.36, + "grad_norm": 1.2862260016588265, + "learning_rate": 1.4897434722966851e-05, + "loss": 0.2076, + "step": 7020 + }, + { + "epoch": 0.36, + "grad_norm": 0.8367966460811618, + "learning_rate": 1.4895998719650526e-05, + "loss": 0.2173, + "step": 7021 + }, + { + "epoch": 0.36, + "grad_norm": 0.6914550113491021, + "learning_rate": 1.4894562583529961e-05, + "loss": 0.197, + "step": 7022 + }, + { + "epoch": 0.36, + "grad_norm": 1.0438177686296022, + "learning_rate": 1.4893126314644106e-05, + "loss": 0.1845, + "step": 7023 + }, + { + "epoch": 0.36, + "grad_norm": 0.8025077624120668, + "learning_rate": 1.4891689913031928e-05, + "loss": 0.1915, + "step": 7024 + }, + { + "epoch": 0.36, + "grad_norm": 0.9815403441598226, + "learning_rate": 1.4890253378732385e-05, + "loss": 0.2024, + "step": 7025 + }, + { + "epoch": 0.36, + "grad_norm": 1.4646887196561633, + "learning_rate": 1.488881671178444e-05, + "loss": 0.2244, + "step": 7026 + }, + { + "epoch": 0.36, + "grad_norm": 0.8324670358442616, + "learning_rate": 1.4887379912227064e-05, + "loss": 0.2185, + "step": 7027 + }, + { + "epoch": 0.36, + "grad_norm": 0.870578956635888, + "learning_rate": 1.4885942980099236e-05, + "loss": 0.2036, + "step": 7028 + }, + { + "epoch": 0.36, + "grad_norm": 1.3454427996276077, + "learning_rate": 1.488450591543993e-05, + "loss": 0.2091, + "step": 7029 + }, + { + "epoch": 0.36, + "grad_norm": 1.1991823657329022, + "learning_rate": 1.4883068718288121e-05, + "loss": 0.2294, + "step": 7030 + }, + { + "epoch": 0.36, + "grad_norm": 0.8558186090970756, + "learning_rate": 1.4881631388682797e-05, + "loss": 0.1883, + "step": 7031 + }, + { + "epoch": 0.36, + "grad_norm": 1.0901259681525386, + "learning_rate": 1.4880193926662948e-05, + "loss": 0.1977, + "step": 7032 + }, + { + "epoch": 0.36, + "grad_norm": 1.000147735629142, + "learning_rate": 1.4878756332267563e-05, + "loss": 0.1937, + "step": 7033 + }, + { + "epoch": 0.36, + "grad_norm": 0.9307866687602493, + "learning_rate": 1.4877318605535638e-05, + "loss": 0.2057, + "step": 7034 + }, + { + "epoch": 0.36, + "grad_norm": 0.8674026920795987, + "learning_rate": 1.4875880746506169e-05, + "loss": 0.1858, + "step": 7035 + }, + { + "epoch": 0.36, + "grad_norm": 1.066773643311902, + "learning_rate": 1.4874442755218156e-05, + "loss": 0.1948, + "step": 7036 + }, + { + "epoch": 0.36, + "grad_norm": 0.9281596268519601, + "learning_rate": 1.4873004631710613e-05, + "loss": 0.2231, + "step": 7037 + }, + { + "epoch": 0.36, + "grad_norm": 0.8973859838471487, + "learning_rate": 1.4871566376022542e-05, + "loss": 0.1794, + "step": 7038 + }, + { + "epoch": 0.36, + "grad_norm": 0.9926979536022584, + "learning_rate": 1.4870127988192957e-05, + "loss": 0.1973, + "step": 7039 + }, + { + "epoch": 0.36, + "grad_norm": 1.1506318008619696, + "learning_rate": 1.4868689468260876e-05, + "loss": 0.2115, + "step": 7040 + }, + { + "epoch": 0.36, + "grad_norm": 1.0734519175053021, + "learning_rate": 1.4867250816265318e-05, + "loss": 0.182, + "step": 7041 + }, + { + "epoch": 0.36, + "grad_norm": 1.229578365298459, + "learning_rate": 1.4865812032245308e-05, + "loss": 0.1948, + "step": 7042 + }, + { + "epoch": 0.36, + "grad_norm": 0.9429939490887894, + "learning_rate": 1.4864373116239872e-05, + "loss": 0.2185, + "step": 7043 + }, + { + "epoch": 0.36, + "grad_norm": 1.1696130548769823, + "learning_rate": 1.486293406828804e-05, + "loss": 0.2244, + "step": 7044 + }, + { + "epoch": 0.36, + "grad_norm": 1.382434322965532, + "learning_rate": 1.4861494888428845e-05, + "loss": 0.2003, + "step": 7045 + }, + { + "epoch": 0.36, + "grad_norm": 1.0071367527080957, + "learning_rate": 1.4860055576701327e-05, + "loss": 0.1995, + "step": 7046 + }, + { + "epoch": 0.36, + "grad_norm": 1.1758286360602521, + "learning_rate": 1.4858616133144527e-05, + "loss": 0.1906, + "step": 7047 + }, + { + "epoch": 0.36, + "grad_norm": 1.0913830281505044, + "learning_rate": 1.4857176557797493e-05, + "loss": 0.2076, + "step": 7048 + }, + { + "epoch": 0.36, + "grad_norm": 0.8585908518502955, + "learning_rate": 1.4855736850699268e-05, + "loss": 0.2014, + "step": 7049 + }, + { + "epoch": 0.36, + "grad_norm": 1.6561467985014158, + "learning_rate": 1.4854297011888905e-05, + "loss": 0.2059, + "step": 7050 + }, + { + "epoch": 0.36, + "grad_norm": 1.4893530541324937, + "learning_rate": 1.4852857041405466e-05, + "loss": 0.1821, + "step": 7051 + }, + { + "epoch": 0.36, + "grad_norm": 1.1291541762800112, + "learning_rate": 1.4851416939288003e-05, + "loss": 0.2142, + "step": 7052 + }, + { + "epoch": 0.36, + "grad_norm": 0.889361943725798, + "learning_rate": 1.484997670557558e-05, + "loss": 0.1824, + "step": 7053 + }, + { + "epoch": 0.36, + "grad_norm": 0.8613533659766596, + "learning_rate": 1.4848536340307267e-05, + "loss": 0.2051, + "step": 7054 + }, + { + "epoch": 0.36, + "grad_norm": 1.3496139019332325, + "learning_rate": 1.4847095843522133e-05, + "loss": 0.2061, + "step": 7055 + }, + { + "epoch": 0.36, + "grad_norm": 0.8141814539341413, + "learning_rate": 1.4845655215259249e-05, + "loss": 0.1955, + "step": 7056 + }, + { + "epoch": 0.36, + "grad_norm": 0.7389541225733909, + "learning_rate": 1.4844214455557693e-05, + "loss": 0.1944, + "step": 7057 + }, + { + "epoch": 0.36, + "grad_norm": 1.100671299271619, + "learning_rate": 1.4842773564456545e-05, + "loss": 0.2101, + "step": 7058 + }, + { + "epoch": 0.36, + "grad_norm": 1.1313577250220692, + "learning_rate": 1.4841332541994893e-05, + "loss": 0.2043, + "step": 7059 + }, + { + "epoch": 0.36, + "grad_norm": 0.7963336071239322, + "learning_rate": 1.4839891388211822e-05, + "loss": 0.2073, + "step": 7060 + }, + { + "epoch": 0.36, + "grad_norm": 0.9057394292716163, + "learning_rate": 1.4838450103146424e-05, + "loss": 0.2432, + "step": 7061 + }, + { + "epoch": 0.36, + "grad_norm": 2.270303282906903, + "learning_rate": 1.4837008686837791e-05, + "loss": 0.2007, + "step": 7062 + }, + { + "epoch": 0.36, + "grad_norm": 0.8417121898936355, + "learning_rate": 1.4835567139325026e-05, + "loss": 0.2002, + "step": 7063 + }, + { + "epoch": 0.36, + "grad_norm": 0.8494513029708324, + "learning_rate": 1.4834125460647231e-05, + "loss": 0.1912, + "step": 7064 + }, + { + "epoch": 0.36, + "grad_norm": 1.1110096798803188, + "learning_rate": 1.483268365084351e-05, + "loss": 0.1917, + "step": 7065 + }, + { + "epoch": 0.36, + "grad_norm": 1.2050849762105256, + "learning_rate": 1.4831241709952969e-05, + "loss": 0.2051, + "step": 7066 + }, + { + "epoch": 0.36, + "grad_norm": 1.1217378012786416, + "learning_rate": 1.4829799638014724e-05, + "loss": 0.2045, + "step": 7067 + }, + { + "epoch": 0.36, + "grad_norm": 1.118223071720989, + "learning_rate": 1.4828357435067895e-05, + "loss": 0.1838, + "step": 7068 + }, + { + "epoch": 0.36, + "grad_norm": 2.5884446354343718, + "learning_rate": 1.4826915101151595e-05, + "loss": 0.2154, + "step": 7069 + }, + { + "epoch": 0.36, + "grad_norm": 0.9053567421261107, + "learning_rate": 1.4825472636304952e-05, + "loss": 0.2375, + "step": 7070 + }, + { + "epoch": 0.36, + "grad_norm": 0.8367797883521595, + "learning_rate": 1.4824030040567086e-05, + "loss": 0.1871, + "step": 7071 + }, + { + "epoch": 0.36, + "grad_norm": 1.7442778069105056, + "learning_rate": 1.4822587313977137e-05, + "loss": 0.2197, + "step": 7072 + }, + { + "epoch": 0.36, + "grad_norm": 1.46195344130919, + "learning_rate": 1.4821144456574235e-05, + "loss": 0.1843, + "step": 7073 + }, + { + "epoch": 0.36, + "grad_norm": 0.871892278282872, + "learning_rate": 1.4819701468397516e-05, + "loss": 0.1998, + "step": 7074 + }, + { + "epoch": 0.36, + "grad_norm": 1.0590408819917725, + "learning_rate": 1.4818258349486121e-05, + "loss": 0.2003, + "step": 7075 + }, + { + "epoch": 0.36, + "grad_norm": 0.9773154856611629, + "learning_rate": 1.4816815099879199e-05, + "loss": 0.2079, + "step": 7076 + }, + { + "epoch": 0.36, + "grad_norm": 0.9642359721391925, + "learning_rate": 1.4815371719615895e-05, + "loss": 0.2001, + "step": 7077 + }, + { + "epoch": 0.36, + "grad_norm": 0.8066183899205053, + "learning_rate": 1.481392820873536e-05, + "loss": 0.1761, + "step": 7078 + }, + { + "epoch": 0.36, + "grad_norm": 1.0007716990939366, + "learning_rate": 1.481248456727675e-05, + "loss": 0.209, + "step": 7079 + }, + { + "epoch": 0.36, + "grad_norm": 0.8113832340307315, + "learning_rate": 1.4811040795279223e-05, + "loss": 0.1979, + "step": 7080 + }, + { + "epoch": 0.36, + "grad_norm": 1.0519165147720977, + "learning_rate": 1.4809596892781946e-05, + "loss": 0.2089, + "step": 7081 + }, + { + "epoch": 0.36, + "grad_norm": 0.8878676928357122, + "learning_rate": 1.480815285982408e-05, + "loss": 0.2352, + "step": 7082 + }, + { + "epoch": 0.36, + "grad_norm": 0.8365083305411455, + "learning_rate": 1.4806708696444796e-05, + "loss": 0.2044, + "step": 7083 + }, + { + "epoch": 0.36, + "grad_norm": 1.128972835355977, + "learning_rate": 1.4805264402683268e-05, + "loss": 0.1973, + "step": 7084 + }, + { + "epoch": 0.36, + "grad_norm": 1.2933985370978633, + "learning_rate": 1.480381997857867e-05, + "loss": 0.2061, + "step": 7085 + }, + { + "epoch": 0.36, + "grad_norm": 1.1958606496129567, + "learning_rate": 1.4802375424170187e-05, + "loss": 0.2116, + "step": 7086 + }, + { + "epoch": 0.36, + "grad_norm": 0.9234574265240589, + "learning_rate": 1.4800930739497e-05, + "loss": 0.185, + "step": 7087 + }, + { + "epoch": 0.36, + "grad_norm": 0.9777658505483192, + "learning_rate": 1.4799485924598292e-05, + "loss": 0.2006, + "step": 7088 + }, + { + "epoch": 0.36, + "grad_norm": 0.8632732697046354, + "learning_rate": 1.4798040979513258e-05, + "loss": 0.1881, + "step": 7089 + }, + { + "epoch": 0.36, + "grad_norm": 1.0198595161320083, + "learning_rate": 1.479659590428109e-05, + "loss": 0.2127, + "step": 7090 + }, + { + "epoch": 0.36, + "grad_norm": 0.9215793081250065, + "learning_rate": 1.479515069894099e-05, + "loss": 0.1954, + "step": 7091 + }, + { + "epoch": 0.36, + "grad_norm": 0.9916694114763142, + "learning_rate": 1.4793705363532156e-05, + "loss": 0.1978, + "step": 7092 + }, + { + "epoch": 0.36, + "grad_norm": 1.592385505546323, + "learning_rate": 1.4792259898093791e-05, + "loss": 0.2388, + "step": 7093 + }, + { + "epoch": 0.36, + "grad_norm": 1.012073087478319, + "learning_rate": 1.4790814302665107e-05, + "loss": 0.1814, + "step": 7094 + }, + { + "epoch": 0.36, + "grad_norm": 1.90806762365769, + "learning_rate": 1.4789368577285314e-05, + "loss": 0.2186, + "step": 7095 + }, + { + "epoch": 0.36, + "grad_norm": 0.8809231723802302, + "learning_rate": 1.478792272199363e-05, + "loss": 0.1912, + "step": 7096 + }, + { + "epoch": 0.36, + "grad_norm": 0.8225179914084455, + "learning_rate": 1.4786476736829267e-05, + "loss": 0.1968, + "step": 7097 + }, + { + "epoch": 0.36, + "grad_norm": 0.8506939817328405, + "learning_rate": 1.4785030621831458e-05, + "loss": 0.1949, + "step": 7098 + }, + { + "epoch": 0.36, + "grad_norm": 0.9884889449489223, + "learning_rate": 1.478358437703942e-05, + "loss": 0.2254, + "step": 7099 + }, + { + "epoch": 0.36, + "grad_norm": 7.4120500445281, + "learning_rate": 1.4782138002492385e-05, + "loss": 0.2025, + "step": 7100 + }, + { + "epoch": 0.36, + "grad_norm": 1.6147739500037153, + "learning_rate": 1.4780691498229588e-05, + "loss": 0.1985, + "step": 7101 + }, + { + "epoch": 0.36, + "grad_norm": 1.9097814287498975, + "learning_rate": 1.4779244864290264e-05, + "loss": 0.2045, + "step": 7102 + }, + { + "epoch": 0.36, + "grad_norm": 0.8348803855026005, + "learning_rate": 1.4777798100713655e-05, + "loss": 0.1769, + "step": 7103 + }, + { + "epoch": 0.36, + "grad_norm": 1.1900370878471898, + "learning_rate": 1.4776351207538999e-05, + "loss": 0.1719, + "step": 7104 + }, + { + "epoch": 0.36, + "grad_norm": 1.2091092328565474, + "learning_rate": 1.477490418480555e-05, + "loss": 0.1898, + "step": 7105 + }, + { + "epoch": 0.36, + "grad_norm": 1.5838612601289892, + "learning_rate": 1.4773457032552551e-05, + "loss": 0.224, + "step": 7106 + }, + { + "epoch": 0.36, + "grad_norm": 4.945232541245546, + "learning_rate": 1.4772009750819262e-05, + "loss": 0.2107, + "step": 7107 + }, + { + "epoch": 0.36, + "grad_norm": 0.9322743971599436, + "learning_rate": 1.4770562339644943e-05, + "loss": 0.235, + "step": 7108 + }, + { + "epoch": 0.36, + "grad_norm": 1.1461959180016132, + "learning_rate": 1.4769114799068847e-05, + "loss": 0.2051, + "step": 7109 + }, + { + "epoch": 0.36, + "grad_norm": 1.636216515301017, + "learning_rate": 1.4767667129130243e-05, + "loss": 0.221, + "step": 7110 + }, + { + "epoch": 0.36, + "grad_norm": 1.1442129928861824, + "learning_rate": 1.4766219329868399e-05, + "loss": 0.2033, + "step": 7111 + }, + { + "epoch": 0.36, + "grad_norm": 1.4749380356676163, + "learning_rate": 1.4764771401322588e-05, + "loss": 0.1876, + "step": 7112 + }, + { + "epoch": 0.36, + "grad_norm": 0.982928862953428, + "learning_rate": 1.4763323343532083e-05, + "loss": 0.2089, + "step": 7113 + }, + { + "epoch": 0.36, + "grad_norm": 1.0080157854821021, + "learning_rate": 1.4761875156536163e-05, + "loss": 0.2059, + "step": 7114 + }, + { + "epoch": 0.36, + "grad_norm": 0.9390907781578757, + "learning_rate": 1.476042684037411e-05, + "loss": 0.1792, + "step": 7115 + }, + { + "epoch": 0.36, + "grad_norm": 3.299468734945118, + "learning_rate": 1.4758978395085208e-05, + "loss": 0.2316, + "step": 7116 + }, + { + "epoch": 0.36, + "grad_norm": 0.9578609983641396, + "learning_rate": 1.4757529820708754e-05, + "loss": 0.1964, + "step": 7117 + }, + { + "epoch": 0.36, + "grad_norm": 0.8416583906906582, + "learning_rate": 1.475608111728403e-05, + "loss": 0.1789, + "step": 7118 + }, + { + "epoch": 0.36, + "grad_norm": 0.9457542076436503, + "learning_rate": 1.4754632284850338e-05, + "loss": 0.1833, + "step": 7119 + }, + { + "epoch": 0.36, + "grad_norm": 0.9545702827475283, + "learning_rate": 1.4753183323446978e-05, + "loss": 0.2101, + "step": 7120 + }, + { + "epoch": 0.36, + "grad_norm": 1.088974962874592, + "learning_rate": 1.4751734233113253e-05, + "loss": 0.2124, + "step": 7121 + }, + { + "epoch": 0.36, + "grad_norm": 0.9677299958915025, + "learning_rate": 1.4750285013888466e-05, + "loss": 0.1812, + "step": 7122 + }, + { + "epoch": 0.36, + "grad_norm": 1.9383390825244804, + "learning_rate": 1.474883566581193e-05, + "loss": 0.2187, + "step": 7123 + }, + { + "epoch": 0.36, + "grad_norm": 1.049802052114001, + "learning_rate": 1.474738618892296e-05, + "loss": 0.1903, + "step": 7124 + }, + { + "epoch": 0.36, + "grad_norm": 1.2918584280404934, + "learning_rate": 1.474593658326087e-05, + "loss": 0.2075, + "step": 7125 + }, + { + "epoch": 0.36, + "grad_norm": 1.2190451902433268, + "learning_rate": 1.4744486848864982e-05, + "loss": 0.1998, + "step": 7126 + }, + { + "epoch": 0.36, + "grad_norm": 2.2539579091445217, + "learning_rate": 1.4743036985774621e-05, + "loss": 0.1809, + "step": 7127 + }, + { + "epoch": 0.36, + "grad_norm": 0.8069144816251373, + "learning_rate": 1.4741586994029113e-05, + "loss": 0.1841, + "step": 7128 + }, + { + "epoch": 0.36, + "grad_norm": 0.8739653517671883, + "learning_rate": 1.474013687366779e-05, + "loss": 0.1951, + "step": 7129 + }, + { + "epoch": 0.36, + "grad_norm": 1.1976902309195656, + "learning_rate": 1.4738686624729987e-05, + "loss": 0.2206, + "step": 7130 + }, + { + "epoch": 0.36, + "grad_norm": 1.1363599177482622, + "learning_rate": 1.4737236247255045e-05, + "loss": 0.1928, + "step": 7131 + }, + { + "epoch": 0.36, + "grad_norm": 1.0141472201673492, + "learning_rate": 1.4735785741282298e-05, + "loss": 0.17, + "step": 7132 + }, + { + "epoch": 0.36, + "grad_norm": 0.8962760373086162, + "learning_rate": 1.4734335106851095e-05, + "loss": 0.1788, + "step": 7133 + }, + { + "epoch": 0.36, + "grad_norm": 1.0975108161546672, + "learning_rate": 1.4732884344000787e-05, + "loss": 0.1932, + "step": 7134 + }, + { + "epoch": 0.36, + "grad_norm": 1.1304931506411495, + "learning_rate": 1.4731433452770723e-05, + "loss": 0.2232, + "step": 7135 + }, + { + "epoch": 0.36, + "grad_norm": 0.9684346870572593, + "learning_rate": 1.4729982433200261e-05, + "loss": 0.2124, + "step": 7136 + }, + { + "epoch": 0.36, + "grad_norm": 2.5665638986844876, + "learning_rate": 1.4728531285328753e-05, + "loss": 0.2219, + "step": 7137 + }, + { + "epoch": 0.36, + "grad_norm": 0.9854542831804197, + "learning_rate": 1.4727080009195573e-05, + "loss": 0.217, + "step": 7138 + }, + { + "epoch": 0.36, + "grad_norm": 1.0710552426739544, + "learning_rate": 1.4725628604840078e-05, + "loss": 0.1855, + "step": 7139 + }, + { + "epoch": 0.36, + "grad_norm": 1.2343554376103933, + "learning_rate": 1.4724177072301642e-05, + "loss": 0.2069, + "step": 7140 + }, + { + "epoch": 0.36, + "grad_norm": 0.9514401751415055, + "learning_rate": 1.4722725411619634e-05, + "loss": 0.2106, + "step": 7141 + }, + { + "epoch": 0.36, + "grad_norm": 2.030571956714932, + "learning_rate": 1.4721273622833432e-05, + "loss": 0.2027, + "step": 7142 + }, + { + "epoch": 0.36, + "grad_norm": 1.506294984385844, + "learning_rate": 1.4719821705982417e-05, + "loss": 0.199, + "step": 7143 + }, + { + "epoch": 0.36, + "grad_norm": 1.258818755205757, + "learning_rate": 1.4718369661105973e-05, + "loss": 0.2058, + "step": 7144 + }, + { + "epoch": 0.36, + "grad_norm": 0.7594735060195563, + "learning_rate": 1.4716917488243485e-05, + "loss": 0.1839, + "step": 7145 + }, + { + "epoch": 0.36, + "grad_norm": 1.3661682393111336, + "learning_rate": 1.4715465187434342e-05, + "loss": 0.1978, + "step": 7146 + }, + { + "epoch": 0.36, + "grad_norm": 1.1330714487070268, + "learning_rate": 1.4714012758717941e-05, + "loss": 0.1889, + "step": 7147 + }, + { + "epoch": 0.36, + "grad_norm": 1.1189407523486754, + "learning_rate": 1.4712560202133679e-05, + "loss": 0.2008, + "step": 7148 + }, + { + "epoch": 0.36, + "grad_norm": 1.8707259683031279, + "learning_rate": 1.4711107517720953e-05, + "loss": 0.1833, + "step": 7149 + }, + { + "epoch": 0.36, + "grad_norm": 1.05214004080036, + "learning_rate": 1.4709654705519168e-05, + "loss": 0.1987, + "step": 7150 + }, + { + "epoch": 0.36, + "grad_norm": 1.0682777254050864, + "learning_rate": 1.4708201765567736e-05, + "loss": 0.1798, + "step": 7151 + }, + { + "epoch": 0.36, + "grad_norm": 1.1839164935435411, + "learning_rate": 1.4706748697906065e-05, + "loss": 0.201, + "step": 7152 + }, + { + "epoch": 0.36, + "grad_norm": 1.1184231051281768, + "learning_rate": 1.4705295502573571e-05, + "loss": 0.2206, + "step": 7153 + }, + { + "epoch": 0.36, + "grad_norm": 0.8979461790109566, + "learning_rate": 1.4703842179609668e-05, + "loss": 0.2269, + "step": 7154 + }, + { + "epoch": 0.36, + "grad_norm": 0.9264038467836124, + "learning_rate": 1.470238872905378e-05, + "loss": 0.1965, + "step": 7155 + }, + { + "epoch": 0.36, + "grad_norm": 0.8079661378752513, + "learning_rate": 1.4700935150945334e-05, + "loss": 0.1951, + "step": 7156 + }, + { + "epoch": 0.36, + "grad_norm": 1.1541144569815773, + "learning_rate": 1.4699481445323757e-05, + "loss": 0.2105, + "step": 7157 + }, + { + "epoch": 0.36, + "grad_norm": 1.1023098748718994, + "learning_rate": 1.4698027612228478e-05, + "loss": 0.2134, + "step": 7158 + }, + { + "epoch": 0.36, + "grad_norm": 0.9886838350178068, + "learning_rate": 1.4696573651698937e-05, + "loss": 0.2033, + "step": 7159 + }, + { + "epoch": 0.36, + "grad_norm": 0.6959608514363341, + "learning_rate": 1.4695119563774568e-05, + "loss": 0.1985, + "step": 7160 + }, + { + "epoch": 0.36, + "grad_norm": 43.03118587420249, + "learning_rate": 1.4693665348494819e-05, + "loss": 0.1765, + "step": 7161 + }, + { + "epoch": 0.36, + "grad_norm": 1.0170267614935629, + "learning_rate": 1.469221100589913e-05, + "loss": 0.182, + "step": 7162 + }, + { + "epoch": 0.36, + "grad_norm": 0.7638325075184338, + "learning_rate": 1.4690756536026952e-05, + "loss": 0.2022, + "step": 7163 + }, + { + "epoch": 0.36, + "grad_norm": 0.7929466280136379, + "learning_rate": 1.4689301938917737e-05, + "loss": 0.1787, + "step": 7164 + }, + { + "epoch": 0.36, + "grad_norm": 0.9428273508130095, + "learning_rate": 1.4687847214610944e-05, + "loss": 0.1829, + "step": 7165 + }, + { + "epoch": 0.36, + "grad_norm": 1.7559421200980139, + "learning_rate": 1.4686392363146032e-05, + "loss": 0.2028, + "step": 7166 + }, + { + "epoch": 0.36, + "grad_norm": 0.9443670684991925, + "learning_rate": 1.468493738456246e-05, + "loss": 0.1837, + "step": 7167 + }, + { + "epoch": 0.36, + "grad_norm": 1.187792744550687, + "learning_rate": 1.4683482278899696e-05, + "loss": 0.2238, + "step": 7168 + }, + { + "epoch": 0.36, + "grad_norm": 1.1056454767564257, + "learning_rate": 1.4682027046197214e-05, + "loss": 0.2076, + "step": 7169 + }, + { + "epoch": 0.36, + "grad_norm": 0.8500674701813452, + "learning_rate": 1.4680571686494483e-05, + "loss": 0.2032, + "step": 7170 + }, + { + "epoch": 0.36, + "grad_norm": 1.1124369387671862, + "learning_rate": 1.4679116199830978e-05, + "loss": 0.1817, + "step": 7171 + }, + { + "epoch": 0.36, + "grad_norm": 1.0194354860544301, + "learning_rate": 1.4677660586246183e-05, + "loss": 0.1875, + "step": 7172 + }, + { + "epoch": 0.36, + "grad_norm": 0.8772144588619847, + "learning_rate": 1.4676204845779585e-05, + "loss": 0.1882, + "step": 7173 + }, + { + "epoch": 0.36, + "grad_norm": 1.169638257110541, + "learning_rate": 1.4674748978470663e-05, + "loss": 0.195, + "step": 7174 + }, + { + "epoch": 0.36, + "grad_norm": 1.0328090611192982, + "learning_rate": 1.4673292984358911e-05, + "loss": 0.1922, + "step": 7175 + }, + { + "epoch": 0.36, + "grad_norm": 0.8585369331116066, + "learning_rate": 1.4671836863483819e-05, + "loss": 0.1723, + "step": 7176 + }, + { + "epoch": 0.36, + "grad_norm": 0.9757881472890122, + "learning_rate": 1.4670380615884891e-05, + "loss": 0.2182, + "step": 7177 + }, + { + "epoch": 0.37, + "grad_norm": 1.1618572034347405, + "learning_rate": 1.4668924241601627e-05, + "loss": 0.1893, + "step": 7178 + }, + { + "epoch": 0.37, + "grad_norm": 0.8359477577026576, + "learning_rate": 1.4667467740673528e-05, + "loss": 0.1752, + "step": 7179 + }, + { + "epoch": 0.37, + "grad_norm": 0.9593541875986683, + "learning_rate": 1.4666011113140103e-05, + "loss": 0.2085, + "step": 7180 + }, + { + "epoch": 0.37, + "grad_norm": 0.9373030226818541, + "learning_rate": 1.4664554359040862e-05, + "loss": 0.2049, + "step": 7181 + }, + { + "epoch": 0.37, + "grad_norm": 1.046339481942067, + "learning_rate": 1.4663097478415322e-05, + "loss": 0.2115, + "step": 7182 + }, + { + "epoch": 0.37, + "grad_norm": 1.16317388913559, + "learning_rate": 1.4661640471302998e-05, + "loss": 0.1874, + "step": 7183 + }, + { + "epoch": 0.37, + "grad_norm": 1.0277480391584366, + "learning_rate": 1.4660183337743414e-05, + "loss": 0.2128, + "step": 7184 + }, + { + "epoch": 0.37, + "grad_norm": 1.18453723334063, + "learning_rate": 1.4658726077776093e-05, + "loss": 0.2066, + "step": 7185 + }, + { + "epoch": 0.37, + "grad_norm": 1.0996444774192848, + "learning_rate": 1.4657268691440564e-05, + "loss": 0.2115, + "step": 7186 + }, + { + "epoch": 0.37, + "grad_norm": 0.8886067711511231, + "learning_rate": 1.465581117877636e-05, + "loss": 0.178, + "step": 7187 + }, + { + "epoch": 0.37, + "grad_norm": 1.113528221312272, + "learning_rate": 1.4654353539823014e-05, + "loss": 0.2113, + "step": 7188 + }, + { + "epoch": 0.37, + "grad_norm": 2.355610201304584, + "learning_rate": 1.4652895774620066e-05, + "loss": 0.1913, + "step": 7189 + }, + { + "epoch": 0.37, + "grad_norm": 0.9662037049135782, + "learning_rate": 1.4651437883207056e-05, + "loss": 0.2023, + "step": 7190 + }, + { + "epoch": 0.37, + "grad_norm": 1.791090091633652, + "learning_rate": 1.4649979865623531e-05, + "loss": 0.1935, + "step": 7191 + }, + { + "epoch": 0.37, + "grad_norm": 1.1973119684181808, + "learning_rate": 1.4648521721909042e-05, + "loss": 0.1807, + "step": 7192 + }, + { + "epoch": 0.37, + "grad_norm": 0.7438397230367796, + "learning_rate": 1.4647063452103135e-05, + "loss": 0.1938, + "step": 7193 + }, + { + "epoch": 0.37, + "grad_norm": 1.017059128027557, + "learning_rate": 1.464560505624537e-05, + "loss": 0.1893, + "step": 7194 + }, + { + "epoch": 0.37, + "grad_norm": 1.1394480960376436, + "learning_rate": 1.4644146534375307e-05, + "loss": 0.1966, + "step": 7195 + }, + { + "epoch": 0.37, + "grad_norm": 7.309035780184552, + "learning_rate": 1.4642687886532507e-05, + "loss": 0.2063, + "step": 7196 + }, + { + "epoch": 0.37, + "grad_norm": 0.965556266585576, + "learning_rate": 1.4641229112756537e-05, + "loss": 0.2061, + "step": 7197 + }, + { + "epoch": 0.37, + "grad_norm": 0.9247343728821459, + "learning_rate": 1.4639770213086962e-05, + "loss": 0.2257, + "step": 7198 + }, + { + "epoch": 0.37, + "grad_norm": 1.2873322184303597, + "learning_rate": 1.463831118756336e-05, + "loss": 0.2219, + "step": 7199 + }, + { + "epoch": 0.37, + "grad_norm": 1.0503873999197977, + "learning_rate": 1.4636852036225304e-05, + "loss": 0.1835, + "step": 7200 + }, + { + "epoch": 0.37, + "grad_norm": 1.0441262383963452, + "learning_rate": 1.4635392759112374e-05, + "loss": 0.2196, + "step": 7201 + }, + { + "epoch": 0.37, + "grad_norm": 1.0886323583800523, + "learning_rate": 1.4633933356264156e-05, + "loss": 0.2264, + "step": 7202 + }, + { + "epoch": 0.37, + "grad_norm": 1.0740117152730773, + "learning_rate": 1.463247382772023e-05, + "loss": 0.2016, + "step": 7203 + }, + { + "epoch": 0.37, + "grad_norm": 1.0431229186759934, + "learning_rate": 1.4631014173520192e-05, + "loss": 0.2064, + "step": 7204 + }, + { + "epoch": 0.37, + "grad_norm": 1.2250889849637991, + "learning_rate": 1.4629554393703635e-05, + "loss": 0.1874, + "step": 7205 + }, + { + "epoch": 0.37, + "grad_norm": 0.8985830906857448, + "learning_rate": 1.4628094488310154e-05, + "loss": 0.1906, + "step": 7206 + }, + { + "epoch": 0.37, + "grad_norm": 1.4827841710717786, + "learning_rate": 1.4626634457379343e-05, + "loss": 0.1823, + "step": 7207 + }, + { + "epoch": 0.37, + "grad_norm": 2.3135064026868895, + "learning_rate": 1.4625174300950817e-05, + "loss": 0.2345, + "step": 7208 + }, + { + "epoch": 0.37, + "grad_norm": 1.3767976555094654, + "learning_rate": 1.4623714019064178e-05, + "loss": 0.2239, + "step": 7209 + }, + { + "epoch": 0.37, + "grad_norm": 1.0399025276719043, + "learning_rate": 1.462225361175903e-05, + "loss": 0.1845, + "step": 7210 + }, + { + "epoch": 0.37, + "grad_norm": 1.1365818325865469, + "learning_rate": 1.4620793079074991e-05, + "loss": 0.2074, + "step": 7211 + }, + { + "epoch": 0.37, + "grad_norm": 0.9650778862329886, + "learning_rate": 1.4619332421051682e-05, + "loss": 0.201, + "step": 7212 + }, + { + "epoch": 0.37, + "grad_norm": 1.2317824187188442, + "learning_rate": 1.4617871637728719e-05, + "loss": 0.181, + "step": 7213 + }, + { + "epoch": 0.37, + "grad_norm": 1.2293569861227824, + "learning_rate": 1.461641072914573e-05, + "loss": 0.1788, + "step": 7214 + }, + { + "epoch": 0.37, + "grad_norm": 0.8421056403657364, + "learning_rate": 1.4614949695342335e-05, + "loss": 0.1826, + "step": 7215 + }, + { + "epoch": 0.37, + "grad_norm": 1.7556245059003615, + "learning_rate": 1.461348853635817e-05, + "loss": 0.198, + "step": 7216 + }, + { + "epoch": 0.37, + "grad_norm": 0.8373865110950348, + "learning_rate": 1.4612027252232868e-05, + "loss": 0.1869, + "step": 7217 + }, + { + "epoch": 0.37, + "grad_norm": 1.4859909383682117, + "learning_rate": 1.4610565843006066e-05, + "loss": 0.2279, + "step": 7218 + }, + { + "epoch": 0.37, + "grad_norm": 0.9840411718276176, + "learning_rate": 1.4609104308717405e-05, + "loss": 0.2095, + "step": 7219 + }, + { + "epoch": 0.37, + "grad_norm": 1.0813751383332257, + "learning_rate": 1.4607642649406529e-05, + "loss": 0.1955, + "step": 7220 + }, + { + "epoch": 0.37, + "grad_norm": 1.7086729803488199, + "learning_rate": 1.4606180865113087e-05, + "loss": 0.1809, + "step": 7221 + }, + { + "epoch": 0.37, + "grad_norm": 2.0543148531001396, + "learning_rate": 1.460471895587673e-05, + "loss": 0.2351, + "step": 7222 + }, + { + "epoch": 0.37, + "grad_norm": 1.5590508133007468, + "learning_rate": 1.460325692173711e-05, + "loss": 0.1929, + "step": 7223 + }, + { + "epoch": 0.37, + "grad_norm": 1.8575470786430182, + "learning_rate": 1.4601794762733885e-05, + "loss": 0.2414, + "step": 7224 + }, + { + "epoch": 0.37, + "grad_norm": 1.0713855039589257, + "learning_rate": 1.4600332478906718e-05, + "loss": 0.191, + "step": 7225 + }, + { + "epoch": 0.37, + "grad_norm": 1.050489799066373, + "learning_rate": 1.4598870070295274e-05, + "loss": 0.1957, + "step": 7226 + }, + { + "epoch": 0.37, + "grad_norm": 1.1241824559481621, + "learning_rate": 1.4597407536939221e-05, + "loss": 0.2044, + "step": 7227 + }, + { + "epoch": 0.37, + "grad_norm": 0.9398946171930076, + "learning_rate": 1.4595944878878226e-05, + "loss": 0.1788, + "step": 7228 + }, + { + "epoch": 0.37, + "grad_norm": 2.6923161481460482, + "learning_rate": 1.4594482096151965e-05, + "loss": 0.1869, + "step": 7229 + }, + { + "epoch": 0.37, + "grad_norm": 0.8537422935352744, + "learning_rate": 1.459301918880012e-05, + "loss": 0.1784, + "step": 7230 + }, + { + "epoch": 0.37, + "grad_norm": 1.251240074898674, + "learning_rate": 1.459155615686237e-05, + "loss": 0.2119, + "step": 7231 + }, + { + "epoch": 0.37, + "grad_norm": 1.04468001598618, + "learning_rate": 1.45900930003784e-05, + "loss": 0.2061, + "step": 7232 + }, + { + "epoch": 0.37, + "grad_norm": 0.9481835045186339, + "learning_rate": 1.4588629719387895e-05, + "loss": 0.1806, + "step": 7233 + }, + { + "epoch": 0.37, + "grad_norm": 1.5470941140754693, + "learning_rate": 1.4587166313930551e-05, + "loss": 0.1995, + "step": 7234 + }, + { + "epoch": 0.37, + "grad_norm": 0.8879328653548417, + "learning_rate": 1.4585702784046065e-05, + "loss": 0.2003, + "step": 7235 + }, + { + "epoch": 0.37, + "grad_norm": 0.9206448905493988, + "learning_rate": 1.4584239129774131e-05, + "loss": 0.1888, + "step": 7236 + }, + { + "epoch": 0.37, + "grad_norm": 1.7014245751046675, + "learning_rate": 1.458277535115445e-05, + "loss": 0.1778, + "step": 7237 + }, + { + "epoch": 0.37, + "grad_norm": 1.0059919959825143, + "learning_rate": 1.458131144822673e-05, + "loss": 0.2028, + "step": 7238 + }, + { + "epoch": 0.37, + "grad_norm": 1.0593151433111, + "learning_rate": 1.4579847421030677e-05, + "loss": 0.2188, + "step": 7239 + }, + { + "epoch": 0.37, + "grad_norm": 1.4445554920908814, + "learning_rate": 1.4578383269606004e-05, + "loss": 0.1781, + "step": 7240 + }, + { + "epoch": 0.37, + "grad_norm": 1.774630230057031, + "learning_rate": 1.4576918993992429e-05, + "loss": 0.1994, + "step": 7241 + }, + { + "epoch": 0.37, + "grad_norm": 1.711497024864439, + "learning_rate": 1.4575454594229666e-05, + "loss": 0.2623, + "step": 7242 + }, + { + "epoch": 0.37, + "grad_norm": 0.8612693862757297, + "learning_rate": 1.4573990070357437e-05, + "loss": 0.1703, + "step": 7243 + }, + { + "epoch": 0.37, + "grad_norm": 2.0058030680566827, + "learning_rate": 1.4572525422415471e-05, + "loss": 0.2049, + "step": 7244 + }, + { + "epoch": 0.37, + "grad_norm": 0.913388143288835, + "learning_rate": 1.4571060650443494e-05, + "loss": 0.197, + "step": 7245 + }, + { + "epoch": 0.37, + "grad_norm": 1.4343812327113545, + "learning_rate": 1.4569595754481238e-05, + "loss": 0.1961, + "step": 7246 + }, + { + "epoch": 0.37, + "grad_norm": 1.2043942073468759, + "learning_rate": 1.456813073456844e-05, + "loss": 0.1734, + "step": 7247 + }, + { + "epoch": 0.37, + "grad_norm": 1.0221013480787766, + "learning_rate": 1.4566665590744838e-05, + "loss": 0.1844, + "step": 7248 + }, + { + "epoch": 0.37, + "grad_norm": 2.427492512904666, + "learning_rate": 1.4565200323050173e-05, + "loss": 0.1967, + "step": 7249 + }, + { + "epoch": 0.37, + "grad_norm": 0.8977229493298299, + "learning_rate": 1.4563734931524191e-05, + "loss": 0.197, + "step": 7250 + }, + { + "epoch": 0.37, + "grad_norm": 0.8534301150679654, + "learning_rate": 1.4562269416206642e-05, + "loss": 0.1963, + "step": 7251 + }, + { + "epoch": 0.37, + "grad_norm": 1.7761348402379218, + "learning_rate": 1.4560803777137279e-05, + "loss": 0.195, + "step": 7252 + }, + { + "epoch": 0.37, + "grad_norm": 1.1073974709757195, + "learning_rate": 1.4559338014355853e-05, + "loss": 0.2045, + "step": 7253 + }, + { + "epoch": 0.37, + "grad_norm": 1.488909966122852, + "learning_rate": 1.455787212790213e-05, + "loss": 0.2125, + "step": 7254 + }, + { + "epoch": 0.37, + "grad_norm": 0.8852613049754224, + "learning_rate": 1.4556406117815864e-05, + "loss": 0.1959, + "step": 7255 + }, + { + "epoch": 0.37, + "grad_norm": 0.8339723125115669, + "learning_rate": 1.4554939984136824e-05, + "loss": 0.1803, + "step": 7256 + }, + { + "epoch": 0.37, + "grad_norm": 0.9529550397471396, + "learning_rate": 1.4553473726904783e-05, + "loss": 0.223, + "step": 7257 + }, + { + "epoch": 0.37, + "grad_norm": 0.8851176434942727, + "learning_rate": 1.4552007346159509e-05, + "loss": 0.1759, + "step": 7258 + }, + { + "epoch": 0.37, + "grad_norm": 0.8007076173251486, + "learning_rate": 1.4550540841940778e-05, + "loss": 0.2015, + "step": 7259 + }, + { + "epoch": 0.37, + "grad_norm": 0.9714155323978105, + "learning_rate": 1.4549074214288368e-05, + "loss": 0.1795, + "step": 7260 + }, + { + "epoch": 0.37, + "grad_norm": 1.6302081131518966, + "learning_rate": 1.4547607463242068e-05, + "loss": 0.1857, + "step": 7261 + }, + { + "epoch": 0.37, + "grad_norm": 1.1714822203917645, + "learning_rate": 1.4546140588841656e-05, + "loss": 0.2201, + "step": 7262 + }, + { + "epoch": 0.37, + "grad_norm": 1.3467129135182294, + "learning_rate": 1.4544673591126924e-05, + "loss": 0.1882, + "step": 7263 + }, + { + "epoch": 0.37, + "grad_norm": 1.0569269794229985, + "learning_rate": 1.4543206470137663e-05, + "loss": 0.2448, + "step": 7264 + }, + { + "epoch": 0.37, + "grad_norm": 1.2157216517942167, + "learning_rate": 1.4541739225913669e-05, + "loss": 0.2044, + "step": 7265 + }, + { + "epoch": 0.37, + "grad_norm": 0.9933802128857785, + "learning_rate": 1.4540271858494746e-05, + "loss": 0.2042, + "step": 7266 + }, + { + "epoch": 0.37, + "grad_norm": 0.933445012615465, + "learning_rate": 1.453880436792069e-05, + "loss": 0.2293, + "step": 7267 + }, + { + "epoch": 0.37, + "grad_norm": 2.7724878186469613, + "learning_rate": 1.4537336754231307e-05, + "loss": 0.1888, + "step": 7268 + }, + { + "epoch": 0.37, + "grad_norm": 1.4015165482196508, + "learning_rate": 1.453586901746641e-05, + "loss": 0.1998, + "step": 7269 + }, + { + "epoch": 0.37, + "grad_norm": 0.8703397411856586, + "learning_rate": 1.4534401157665812e-05, + "loss": 0.2205, + "step": 7270 + }, + { + "epoch": 0.37, + "grad_norm": 1.0065068650542348, + "learning_rate": 1.4532933174869323e-05, + "loss": 0.1992, + "step": 7271 + }, + { + "epoch": 0.37, + "grad_norm": 1.3699495130815498, + "learning_rate": 1.4531465069116771e-05, + "loss": 0.2028, + "step": 7272 + }, + { + "epoch": 0.37, + "grad_norm": 1.0793924999933862, + "learning_rate": 1.4529996840447966e-05, + "loss": 0.2349, + "step": 7273 + }, + { + "epoch": 0.37, + "grad_norm": 0.7948066903330879, + "learning_rate": 1.4528528488902745e-05, + "loss": 0.1809, + "step": 7274 + }, + { + "epoch": 0.37, + "grad_norm": 1.1284683175111199, + "learning_rate": 1.4527060014520932e-05, + "loss": 0.217, + "step": 7275 + }, + { + "epoch": 0.37, + "grad_norm": 1.5284300188468116, + "learning_rate": 1.452559141734236e-05, + "loss": 0.2062, + "step": 7276 + }, + { + "epoch": 0.37, + "grad_norm": 0.781343543319652, + "learning_rate": 1.4524122697406866e-05, + "loss": 0.1962, + "step": 7277 + }, + { + "epoch": 0.37, + "grad_norm": 1.295639290670492, + "learning_rate": 1.452265385475429e-05, + "loss": 0.212, + "step": 7278 + }, + { + "epoch": 0.37, + "grad_norm": 1.740549444374038, + "learning_rate": 1.4521184889424472e-05, + "loss": 0.2036, + "step": 7279 + }, + { + "epoch": 0.37, + "grad_norm": 1.0011987948570245, + "learning_rate": 1.4519715801457256e-05, + "loss": 0.1967, + "step": 7280 + }, + { + "epoch": 0.37, + "grad_norm": 2.5341229496916164, + "learning_rate": 1.4518246590892493e-05, + "loss": 0.2037, + "step": 7281 + }, + { + "epoch": 0.37, + "grad_norm": 1.0225133863288018, + "learning_rate": 1.4516777257770039e-05, + "loss": 0.1988, + "step": 7282 + }, + { + "epoch": 0.37, + "grad_norm": 0.8600004079706661, + "learning_rate": 1.4515307802129746e-05, + "loss": 0.1886, + "step": 7283 + }, + { + "epoch": 0.37, + "grad_norm": 0.8580445077526663, + "learning_rate": 1.4513838224011474e-05, + "loss": 0.1931, + "step": 7284 + }, + { + "epoch": 0.37, + "grad_norm": 1.277280462094834, + "learning_rate": 1.4512368523455085e-05, + "loss": 0.2025, + "step": 7285 + }, + { + "epoch": 0.37, + "grad_norm": 1.00902764493686, + "learning_rate": 1.4510898700500442e-05, + "loss": 0.2112, + "step": 7286 + }, + { + "epoch": 0.37, + "grad_norm": 0.9961082386523968, + "learning_rate": 1.450942875518742e-05, + "loss": 0.1924, + "step": 7287 + }, + { + "epoch": 0.37, + "grad_norm": 1.0077175637689286, + "learning_rate": 1.4507958687555887e-05, + "loss": 0.1927, + "step": 7288 + }, + { + "epoch": 0.37, + "grad_norm": 1.064534723234433, + "learning_rate": 1.4506488497645724e-05, + "loss": 0.1952, + "step": 7289 + }, + { + "epoch": 0.37, + "grad_norm": 0.8798686625193131, + "learning_rate": 1.4505018185496802e-05, + "loss": 0.2013, + "step": 7290 + }, + { + "epoch": 0.37, + "grad_norm": 0.9306270609813555, + "learning_rate": 1.4503547751149007e-05, + "loss": 0.2206, + "step": 7291 + }, + { + "epoch": 0.37, + "grad_norm": 1.1938661547775145, + "learning_rate": 1.4502077194642229e-05, + "loss": 0.2108, + "step": 7292 + }, + { + "epoch": 0.37, + "grad_norm": 1.0111927132329457, + "learning_rate": 1.4500606516016347e-05, + "loss": 0.1839, + "step": 7293 + }, + { + "epoch": 0.37, + "grad_norm": 1.2037406245910078, + "learning_rate": 1.4499135715311262e-05, + "loss": 0.212, + "step": 7294 + }, + { + "epoch": 0.37, + "grad_norm": 1.6647718656816444, + "learning_rate": 1.4497664792566865e-05, + "loss": 0.224, + "step": 7295 + }, + { + "epoch": 0.37, + "grad_norm": 0.9636072832329631, + "learning_rate": 1.4496193747823062e-05, + "loss": 0.2038, + "step": 7296 + }, + { + "epoch": 0.37, + "grad_norm": 0.8220322234345318, + "learning_rate": 1.4494722581119748e-05, + "loss": 0.1819, + "step": 7297 + }, + { + "epoch": 0.37, + "grad_norm": 1.1926677975279927, + "learning_rate": 1.4493251292496826e-05, + "loss": 0.1974, + "step": 7298 + }, + { + "epoch": 0.37, + "grad_norm": 1.2505678449152033, + "learning_rate": 1.4491779881994208e-05, + "loss": 0.2185, + "step": 7299 + }, + { + "epoch": 0.37, + "grad_norm": 1.339102115123226, + "learning_rate": 1.4490308349651812e-05, + "loss": 0.1729, + "step": 7300 + }, + { + "epoch": 0.37, + "grad_norm": 1.3449780056843892, + "learning_rate": 1.4488836695509545e-05, + "loss": 0.1973, + "step": 7301 + }, + { + "epoch": 0.37, + "grad_norm": 1.0480233615897767, + "learning_rate": 1.4487364919607331e-05, + "loss": 0.2261, + "step": 7302 + }, + { + "epoch": 0.37, + "grad_norm": 1.1627938006870673, + "learning_rate": 1.4485893021985091e-05, + "loss": 0.1835, + "step": 7303 + }, + { + "epoch": 0.37, + "grad_norm": 0.9359446400184624, + "learning_rate": 1.4484421002682745e-05, + "loss": 0.2001, + "step": 7304 + }, + { + "epoch": 0.37, + "grad_norm": 1.7920756240041322, + "learning_rate": 1.4482948861740229e-05, + "loss": 0.2011, + "step": 7305 + }, + { + "epoch": 0.37, + "grad_norm": 0.998299872861058, + "learning_rate": 1.4481476599197473e-05, + "loss": 0.2082, + "step": 7306 + }, + { + "epoch": 0.37, + "grad_norm": 0.9021048208921083, + "learning_rate": 1.448000421509441e-05, + "loss": 0.2173, + "step": 7307 + }, + { + "epoch": 0.37, + "grad_norm": 0.8386435457194779, + "learning_rate": 1.4478531709470978e-05, + "loss": 0.2075, + "step": 7308 + }, + { + "epoch": 0.37, + "grad_norm": 1.4749227890431893, + "learning_rate": 1.4477059082367122e-05, + "loss": 0.1917, + "step": 7309 + }, + { + "epoch": 0.37, + "grad_norm": 0.9811174959203954, + "learning_rate": 1.4475586333822787e-05, + "loss": 0.1978, + "step": 7310 + }, + { + "epoch": 0.37, + "grad_norm": 1.4198098792649345, + "learning_rate": 1.4474113463877917e-05, + "loss": 0.2122, + "step": 7311 + }, + { + "epoch": 0.37, + "grad_norm": 0.9695964071852331, + "learning_rate": 1.4472640472572468e-05, + "loss": 0.2082, + "step": 7312 + }, + { + "epoch": 0.37, + "grad_norm": 0.9142693265376828, + "learning_rate": 1.4471167359946394e-05, + "loss": 0.186, + "step": 7313 + }, + { + "epoch": 0.37, + "grad_norm": 1.0171477584067625, + "learning_rate": 1.4469694126039653e-05, + "loss": 0.1787, + "step": 7314 + }, + { + "epoch": 0.37, + "grad_norm": 1.016606679527248, + "learning_rate": 1.4468220770892208e-05, + "loss": 0.1863, + "step": 7315 + }, + { + "epoch": 0.37, + "grad_norm": 0.969808532614358, + "learning_rate": 1.4466747294544017e-05, + "loss": 0.2009, + "step": 7316 + }, + { + "epoch": 0.37, + "grad_norm": 0.820909974683755, + "learning_rate": 1.4465273697035055e-05, + "loss": 0.2328, + "step": 7317 + }, + { + "epoch": 0.37, + "grad_norm": 0.894778322942877, + "learning_rate": 1.4463799978405295e-05, + "loss": 0.2129, + "step": 7318 + }, + { + "epoch": 0.37, + "grad_norm": 1.4698423912176912, + "learning_rate": 1.4462326138694706e-05, + "loss": 0.2089, + "step": 7319 + }, + { + "epoch": 0.37, + "grad_norm": 0.8868102604174576, + "learning_rate": 1.446085217794327e-05, + "loss": 0.1753, + "step": 7320 + }, + { + "epoch": 0.37, + "grad_norm": 1.1381290790671303, + "learning_rate": 1.4459378096190966e-05, + "loss": 0.2157, + "step": 7321 + }, + { + "epoch": 0.37, + "grad_norm": 0.9477793579680197, + "learning_rate": 1.4457903893477779e-05, + "loss": 0.2068, + "step": 7322 + }, + { + "epoch": 0.37, + "grad_norm": 1.0369651684066326, + "learning_rate": 1.4456429569843698e-05, + "loss": 0.1826, + "step": 7323 + }, + { + "epoch": 0.37, + "grad_norm": 0.8862359245446289, + "learning_rate": 1.4454955125328711e-05, + "loss": 0.2096, + "step": 7324 + }, + { + "epoch": 0.37, + "grad_norm": 0.971297388987907, + "learning_rate": 1.4453480559972817e-05, + "loss": 0.2092, + "step": 7325 + }, + { + "epoch": 0.37, + "grad_norm": 1.621655362230059, + "learning_rate": 1.4452005873816009e-05, + "loss": 0.1852, + "step": 7326 + }, + { + "epoch": 0.37, + "grad_norm": 0.9555106190526966, + "learning_rate": 1.4450531066898293e-05, + "loss": 0.1971, + "step": 7327 + }, + { + "epoch": 0.37, + "grad_norm": 1.1937414917284117, + "learning_rate": 1.4449056139259667e-05, + "loss": 0.2201, + "step": 7328 + }, + { + "epoch": 0.37, + "grad_norm": 0.8844007950976225, + "learning_rate": 1.4447581090940144e-05, + "loss": 0.2079, + "step": 7329 + }, + { + "epoch": 0.37, + "grad_norm": 1.3966735678188713, + "learning_rate": 1.4446105921979731e-05, + "loss": 0.1648, + "step": 7330 + }, + { + "epoch": 0.37, + "grad_norm": 0.8794422204306628, + "learning_rate": 1.4444630632418449e-05, + "loss": 0.1935, + "step": 7331 + }, + { + "epoch": 0.37, + "grad_norm": 1.1053488098602837, + "learning_rate": 1.4443155222296305e-05, + "loss": 0.2236, + "step": 7332 + }, + { + "epoch": 0.37, + "grad_norm": 1.165254773028472, + "learning_rate": 1.4441679691653327e-05, + "loss": 0.1892, + "step": 7333 + }, + { + "epoch": 0.37, + "grad_norm": 0.9169400364770195, + "learning_rate": 1.4440204040529536e-05, + "loss": 0.1849, + "step": 7334 + }, + { + "epoch": 0.37, + "grad_norm": 0.8791850419992676, + "learning_rate": 1.4438728268964956e-05, + "loss": 0.195, + "step": 7335 + }, + { + "epoch": 0.37, + "grad_norm": 0.8901616149988154, + "learning_rate": 1.4437252376999627e-05, + "loss": 0.2114, + "step": 7336 + }, + { + "epoch": 0.37, + "grad_norm": 1.0122521359050152, + "learning_rate": 1.4435776364673573e-05, + "loss": 0.195, + "step": 7337 + }, + { + "epoch": 0.37, + "grad_norm": 0.9392493572957487, + "learning_rate": 1.4434300232026837e-05, + "loss": 0.1802, + "step": 7338 + }, + { + "epoch": 0.37, + "grad_norm": 1.1768578635257316, + "learning_rate": 1.4432823979099453e-05, + "loss": 0.2006, + "step": 7339 + }, + { + "epoch": 0.37, + "grad_norm": 1.3795244554400317, + "learning_rate": 1.443134760593147e-05, + "loss": 0.1662, + "step": 7340 + }, + { + "epoch": 0.37, + "grad_norm": 1.452143408625368, + "learning_rate": 1.4429871112562935e-05, + "loss": 0.197, + "step": 7341 + }, + { + "epoch": 0.37, + "grad_norm": 1.1602637874632054, + "learning_rate": 1.4428394499033893e-05, + "loss": 0.2098, + "step": 7342 + }, + { + "epoch": 0.37, + "grad_norm": 1.4394723268569924, + "learning_rate": 1.4426917765384398e-05, + "loss": 0.2087, + "step": 7343 + }, + { + "epoch": 0.37, + "grad_norm": 1.0905020016637779, + "learning_rate": 1.4425440911654514e-05, + "loss": 0.207, + "step": 7344 + }, + { + "epoch": 0.37, + "grad_norm": 1.5623935818671604, + "learning_rate": 1.4423963937884293e-05, + "loss": 0.2205, + "step": 7345 + }, + { + "epoch": 0.37, + "grad_norm": 0.9808215660267777, + "learning_rate": 1.44224868441138e-05, + "loss": 0.1919, + "step": 7346 + }, + { + "epoch": 0.37, + "grad_norm": 1.0041036859515802, + "learning_rate": 1.4421009630383103e-05, + "loss": 0.18, + "step": 7347 + }, + { + "epoch": 0.37, + "grad_norm": 1.1655556834955962, + "learning_rate": 1.4419532296732271e-05, + "loss": 0.1819, + "step": 7348 + }, + { + "epoch": 0.37, + "grad_norm": 1.073691610451434, + "learning_rate": 1.4418054843201373e-05, + "loss": 0.1892, + "step": 7349 + }, + { + "epoch": 0.37, + "grad_norm": 0.9877252828329729, + "learning_rate": 1.441657726983049e-05, + "loss": 0.195, + "step": 7350 + }, + { + "epoch": 0.37, + "grad_norm": 1.0427460053130362, + "learning_rate": 1.4415099576659698e-05, + "loss": 0.1944, + "step": 7351 + }, + { + "epoch": 0.37, + "grad_norm": 0.9397173058898916, + "learning_rate": 1.4413621763729077e-05, + "loss": 0.1876, + "step": 7352 + }, + { + "epoch": 0.37, + "grad_norm": 1.1310771497695997, + "learning_rate": 1.4412143831078722e-05, + "loss": 0.1863, + "step": 7353 + }, + { + "epoch": 0.37, + "grad_norm": 1.048926621899362, + "learning_rate": 1.4410665778748714e-05, + "loss": 0.2142, + "step": 7354 + }, + { + "epoch": 0.37, + "grad_norm": 0.9894402322478005, + "learning_rate": 1.4409187606779149e-05, + "loss": 0.1851, + "step": 7355 + }, + { + "epoch": 0.37, + "grad_norm": 0.9000909516825687, + "learning_rate": 1.4407709315210117e-05, + "loss": 0.1865, + "step": 7356 + }, + { + "epoch": 0.37, + "grad_norm": 0.6959787873830762, + "learning_rate": 1.4406230904081724e-05, + "loss": 0.194, + "step": 7357 + }, + { + "epoch": 0.37, + "grad_norm": 1.5086135648888905, + "learning_rate": 1.440475237343407e-05, + "loss": 0.2017, + "step": 7358 + }, + { + "epoch": 0.37, + "grad_norm": 1.5819638309039437, + "learning_rate": 1.4403273723307259e-05, + "loss": 0.1928, + "step": 7359 + }, + { + "epoch": 0.37, + "grad_norm": 1.6196587202106798, + "learning_rate": 1.4401794953741397e-05, + "loss": 0.1949, + "step": 7360 + }, + { + "epoch": 0.37, + "grad_norm": 1.0728889712252914, + "learning_rate": 1.4400316064776598e-05, + "loss": 0.2076, + "step": 7361 + }, + { + "epoch": 0.37, + "grad_norm": 0.9184986881056747, + "learning_rate": 1.4398837056452979e-05, + "loss": 0.1919, + "step": 7362 + }, + { + "epoch": 0.37, + "grad_norm": 0.8316174688989357, + "learning_rate": 1.4397357928810657e-05, + "loss": 0.1754, + "step": 7363 + }, + { + "epoch": 0.37, + "grad_norm": 0.7761411361598949, + "learning_rate": 1.4395878681889753e-05, + "loss": 0.2109, + "step": 7364 + }, + { + "epoch": 0.37, + "grad_norm": 1.2223321694261715, + "learning_rate": 1.4394399315730389e-05, + "loss": 0.2328, + "step": 7365 + }, + { + "epoch": 0.37, + "grad_norm": 0.9737362109650243, + "learning_rate": 1.4392919830372698e-05, + "loss": 0.2028, + "step": 7366 + }, + { + "epoch": 0.37, + "grad_norm": 0.9702903964399952, + "learning_rate": 1.4391440225856807e-05, + "loss": 0.1712, + "step": 7367 + }, + { + "epoch": 0.37, + "grad_norm": 0.7614903580983567, + "learning_rate": 1.4389960502222855e-05, + "loss": 0.2001, + "step": 7368 + }, + { + "epoch": 0.37, + "grad_norm": 0.9259848994936244, + "learning_rate": 1.438848065951097e-05, + "loss": 0.2102, + "step": 7369 + }, + { + "epoch": 0.37, + "grad_norm": 1.48112954255448, + "learning_rate": 1.4387000697761305e-05, + "loss": 0.193, + "step": 7370 + }, + { + "epoch": 0.37, + "grad_norm": 0.8531334505763309, + "learning_rate": 1.4385520617013998e-05, + "loss": 0.1966, + "step": 7371 + }, + { + "epoch": 0.37, + "grad_norm": 1.103475362674807, + "learning_rate": 1.4384040417309194e-05, + "loss": 0.1842, + "step": 7372 + }, + { + "epoch": 0.37, + "grad_norm": 1.266035539089789, + "learning_rate": 1.4382560098687045e-05, + "loss": 0.2053, + "step": 7373 + }, + { + "epoch": 0.37, + "grad_norm": 1.0031143586674998, + "learning_rate": 1.4381079661187708e-05, + "loss": 0.2169, + "step": 7374 + }, + { + "epoch": 0.38, + "grad_norm": 0.9897718835102313, + "learning_rate": 1.4379599104851336e-05, + "loss": 0.2133, + "step": 7375 + }, + { + "epoch": 0.38, + "grad_norm": 4.726912981667751, + "learning_rate": 1.4378118429718093e-05, + "loss": 0.249, + "step": 7376 + }, + { + "epoch": 0.38, + "grad_norm": 0.84419251677018, + "learning_rate": 1.4376637635828142e-05, + "loss": 0.1989, + "step": 7377 + }, + { + "epoch": 0.38, + "grad_norm": 0.9721913264285105, + "learning_rate": 1.4375156723221642e-05, + "loss": 0.2163, + "step": 7378 + }, + { + "epoch": 0.38, + "grad_norm": 1.0242439576936002, + "learning_rate": 1.4373675691938773e-05, + "loss": 0.2078, + "step": 7379 + }, + { + "epoch": 0.38, + "grad_norm": 0.9130983342073931, + "learning_rate": 1.4372194542019705e-05, + "loss": 0.2066, + "step": 7380 + }, + { + "epoch": 0.38, + "grad_norm": 1.3429044399629129, + "learning_rate": 1.4370713273504611e-05, + "loss": 0.1946, + "step": 7381 + }, + { + "epoch": 0.38, + "grad_norm": 1.2895851595974883, + "learning_rate": 1.4369231886433672e-05, + "loss": 0.1807, + "step": 7382 + }, + { + "epoch": 0.38, + "grad_norm": 1.0427998515920929, + "learning_rate": 1.4367750380847073e-05, + "loss": 0.2046, + "step": 7383 + }, + { + "epoch": 0.38, + "grad_norm": 0.7801428574537339, + "learning_rate": 1.4366268756784998e-05, + "loss": 0.1846, + "step": 7384 + }, + { + "epoch": 0.38, + "grad_norm": 0.8523927910703909, + "learning_rate": 1.4364787014287636e-05, + "loss": 0.1846, + "step": 7385 + }, + { + "epoch": 0.38, + "grad_norm": 0.8105319148802839, + "learning_rate": 1.4363305153395179e-05, + "loss": 0.1764, + "step": 7386 + }, + { + "epoch": 0.38, + "grad_norm": 0.856027812134765, + "learning_rate": 1.4361823174147822e-05, + "loss": 0.2052, + "step": 7387 + }, + { + "epoch": 0.38, + "grad_norm": 0.8132340069486842, + "learning_rate": 1.4360341076585769e-05, + "loss": 0.1954, + "step": 7388 + }, + { + "epoch": 0.38, + "grad_norm": 0.7088325614833517, + "learning_rate": 1.4358858860749213e-05, + "loss": 0.1911, + "step": 7389 + }, + { + "epoch": 0.38, + "grad_norm": 0.8891267655937457, + "learning_rate": 1.4357376526678368e-05, + "loss": 0.1839, + "step": 7390 + }, + { + "epoch": 0.38, + "grad_norm": 0.866720734186058, + "learning_rate": 1.4355894074413436e-05, + "loss": 0.2069, + "step": 7391 + }, + { + "epoch": 0.38, + "grad_norm": 1.0826656456304604, + "learning_rate": 1.4354411503994634e-05, + "loss": 0.2116, + "step": 7392 + }, + { + "epoch": 0.38, + "grad_norm": 1.064708131590027, + "learning_rate": 1.4352928815462175e-05, + "loss": 0.1811, + "step": 7393 + }, + { + "epoch": 0.38, + "grad_norm": 0.8589484404006575, + "learning_rate": 1.4351446008856274e-05, + "loss": 0.2032, + "step": 7394 + }, + { + "epoch": 0.38, + "grad_norm": 0.7668618139828199, + "learning_rate": 1.4349963084217154e-05, + "loss": 0.2154, + "step": 7395 + }, + { + "epoch": 0.38, + "grad_norm": 0.7697956911015835, + "learning_rate": 1.4348480041585037e-05, + "loss": 0.2101, + "step": 7396 + }, + { + "epoch": 0.38, + "grad_norm": 0.8783925945149618, + "learning_rate": 1.4346996881000157e-05, + "loss": 0.193, + "step": 7397 + }, + { + "epoch": 0.38, + "grad_norm": 2.150747005100924, + "learning_rate": 1.434551360250274e-05, + "loss": 0.1899, + "step": 7398 + }, + { + "epoch": 0.38, + "grad_norm": 1.1226882074944422, + "learning_rate": 1.4344030206133022e-05, + "loss": 0.214, + "step": 7399 + }, + { + "epoch": 0.38, + "grad_norm": 0.8692486363623312, + "learning_rate": 1.4342546691931238e-05, + "loss": 0.2153, + "step": 7400 + }, + { + "epoch": 0.38, + "grad_norm": 1.4976160493577089, + "learning_rate": 1.4341063059937631e-05, + "loss": 0.2022, + "step": 7401 + }, + { + "epoch": 0.38, + "grad_norm": 1.1188129385862706, + "learning_rate": 1.4339579310192444e-05, + "loss": 0.2345, + "step": 7402 + }, + { + "epoch": 0.38, + "grad_norm": 1.0964171334407278, + "learning_rate": 1.4338095442735923e-05, + "loss": 0.1857, + "step": 7403 + }, + { + "epoch": 0.38, + "grad_norm": 1.8696443304840087, + "learning_rate": 1.4336611457608314e-05, + "loss": 0.2192, + "step": 7404 + }, + { + "epoch": 0.38, + "grad_norm": 1.1151258175613268, + "learning_rate": 1.4335127354849876e-05, + "loss": 0.1896, + "step": 7405 + }, + { + "epoch": 0.38, + "grad_norm": 0.8802160194360952, + "learning_rate": 1.4333643134500865e-05, + "loss": 0.2021, + "step": 7406 + }, + { + "epoch": 0.38, + "grad_norm": 0.8063957735402743, + "learning_rate": 1.433215879660154e-05, + "loss": 0.1948, + "step": 7407 + }, + { + "epoch": 0.38, + "grad_norm": 0.913986916751216, + "learning_rate": 1.4330674341192163e-05, + "loss": 0.1951, + "step": 7408 + }, + { + "epoch": 0.38, + "grad_norm": 1.3042370977547812, + "learning_rate": 1.4329189768312997e-05, + "loss": 0.2044, + "step": 7409 + }, + { + "epoch": 0.38, + "grad_norm": 1.18897831691075, + "learning_rate": 1.4327705078004317e-05, + "loss": 0.1884, + "step": 7410 + }, + { + "epoch": 0.38, + "grad_norm": 1.80509128784862, + "learning_rate": 1.432622027030639e-05, + "loss": 0.1911, + "step": 7411 + }, + { + "epoch": 0.38, + "grad_norm": 1.1516519194253798, + "learning_rate": 1.4324735345259495e-05, + "loss": 0.1702, + "step": 7412 + }, + { + "epoch": 0.38, + "grad_norm": 0.8693490250888909, + "learning_rate": 1.432325030290391e-05, + "loss": 0.1905, + "step": 7413 + }, + { + "epoch": 0.38, + "grad_norm": 0.8508701142949681, + "learning_rate": 1.4321765143279916e-05, + "loss": 0.2042, + "step": 7414 + }, + { + "epoch": 0.38, + "grad_norm": 1.2434167042103295, + "learning_rate": 1.4320279866427798e-05, + "loss": 0.2234, + "step": 7415 + }, + { + "epoch": 0.38, + "grad_norm": 0.8106162299147703, + "learning_rate": 1.4318794472387845e-05, + "loss": 0.1855, + "step": 7416 + }, + { + "epoch": 0.38, + "grad_norm": 0.8147454401274455, + "learning_rate": 1.4317308961200347e-05, + "loss": 0.1982, + "step": 7417 + }, + { + "epoch": 0.38, + "grad_norm": 0.7589531970409737, + "learning_rate": 1.43158233329056e-05, + "loss": 0.1847, + "step": 7418 + }, + { + "epoch": 0.38, + "grad_norm": 0.7345539558339856, + "learning_rate": 1.4314337587543903e-05, + "loss": 0.1827, + "step": 7419 + }, + { + "epoch": 0.38, + "grad_norm": 0.8228984501748001, + "learning_rate": 1.4312851725155554e-05, + "loss": 0.1878, + "step": 7420 + }, + { + "epoch": 0.38, + "grad_norm": 1.7492280721147175, + "learning_rate": 1.431136574578086e-05, + "loss": 0.1696, + "step": 7421 + }, + { + "epoch": 0.38, + "grad_norm": 1.1691536457309282, + "learning_rate": 1.4309879649460123e-05, + "loss": 0.1944, + "step": 7422 + }, + { + "epoch": 0.38, + "grad_norm": 0.9063338491403092, + "learning_rate": 1.4308393436233658e-05, + "loss": 0.1912, + "step": 7423 + }, + { + "epoch": 0.38, + "grad_norm": 1.2322521307302403, + "learning_rate": 1.4306907106141781e-05, + "loss": 0.1909, + "step": 7424 + }, + { + "epoch": 0.38, + "grad_norm": 0.9291985895172377, + "learning_rate": 1.4305420659224802e-05, + "loss": 0.2008, + "step": 7425 + }, + { + "epoch": 0.38, + "grad_norm": 0.8540370909551723, + "learning_rate": 1.4303934095523046e-05, + "loss": 0.1881, + "step": 7426 + }, + { + "epoch": 0.38, + "grad_norm": 0.8181487956916195, + "learning_rate": 1.4302447415076835e-05, + "loss": 0.2017, + "step": 7427 + }, + { + "epoch": 0.38, + "grad_norm": 0.8370007506643188, + "learning_rate": 1.4300960617926495e-05, + "loss": 0.192, + "step": 7428 + }, + { + "epoch": 0.38, + "grad_norm": 0.6955940772303095, + "learning_rate": 1.4299473704112355e-05, + "loss": 0.2156, + "step": 7429 + }, + { + "epoch": 0.38, + "grad_norm": 0.9342504930900565, + "learning_rate": 1.4297986673674747e-05, + "loss": 0.2549, + "step": 7430 + }, + { + "epoch": 0.38, + "grad_norm": 0.9147917475970455, + "learning_rate": 1.4296499526654004e-05, + "loss": 0.1773, + "step": 7431 + }, + { + "epoch": 0.38, + "grad_norm": 0.8086678880363469, + "learning_rate": 1.4295012263090475e-05, + "loss": 0.1908, + "step": 7432 + }, + { + "epoch": 0.38, + "grad_norm": 0.95526409863462, + "learning_rate": 1.4293524883024494e-05, + "loss": 0.1924, + "step": 7433 + }, + { + "epoch": 0.38, + "grad_norm": 0.9898514284623698, + "learning_rate": 1.4292037386496407e-05, + "loss": 0.1975, + "step": 7434 + }, + { + "epoch": 0.38, + "grad_norm": 1.214407578688893, + "learning_rate": 1.4290549773546565e-05, + "loss": 0.2102, + "step": 7435 + }, + { + "epoch": 0.38, + "grad_norm": 0.7862349311310997, + "learning_rate": 1.4289062044215318e-05, + "loss": 0.1952, + "step": 7436 + }, + { + "epoch": 0.38, + "grad_norm": 0.805684277750261, + "learning_rate": 1.428757419854302e-05, + "loss": 0.2077, + "step": 7437 + }, + { + "epoch": 0.38, + "grad_norm": 1.52754589046096, + "learning_rate": 1.428608623657003e-05, + "loss": 0.2141, + "step": 7438 + }, + { + "epoch": 0.38, + "grad_norm": 0.995220102864857, + "learning_rate": 1.4284598158336707e-05, + "loss": 0.2145, + "step": 7439 + }, + { + "epoch": 0.38, + "grad_norm": 0.9295949768900353, + "learning_rate": 1.428310996388342e-05, + "loss": 0.2259, + "step": 7440 + }, + { + "epoch": 0.38, + "grad_norm": 0.9462065479377103, + "learning_rate": 1.428162165325053e-05, + "loss": 0.2235, + "step": 7441 + }, + { + "epoch": 0.38, + "grad_norm": 1.2948090680155522, + "learning_rate": 1.4280133226478413e-05, + "loss": 0.1929, + "step": 7442 + }, + { + "epoch": 0.38, + "grad_norm": 1.4418156783883276, + "learning_rate": 1.4278644683607442e-05, + "loss": 0.1942, + "step": 7443 + }, + { + "epoch": 0.38, + "grad_norm": 0.8066543175705367, + "learning_rate": 1.4277156024677987e-05, + "loss": 0.1654, + "step": 7444 + }, + { + "epoch": 0.38, + "grad_norm": 1.230279856593095, + "learning_rate": 1.4275667249730437e-05, + "loss": 0.2238, + "step": 7445 + }, + { + "epoch": 0.38, + "grad_norm": 1.1038156894093265, + "learning_rate": 1.427417835880517e-05, + "loss": 0.2287, + "step": 7446 + }, + { + "epoch": 0.38, + "grad_norm": 1.218749043552778, + "learning_rate": 1.4272689351942577e-05, + "loss": 0.2005, + "step": 7447 + }, + { + "epoch": 0.38, + "grad_norm": 1.5229360109931684, + "learning_rate": 1.4271200229183043e-05, + "loss": 0.1836, + "step": 7448 + }, + { + "epoch": 0.38, + "grad_norm": 1.0477126003594448, + "learning_rate": 1.4269710990566958e-05, + "loss": 0.2061, + "step": 7449 + }, + { + "epoch": 0.38, + "grad_norm": 1.009328444666866, + "learning_rate": 1.4268221636134722e-05, + "loss": 0.2418, + "step": 7450 + }, + { + "epoch": 0.38, + "grad_norm": 0.8603144680485548, + "learning_rate": 1.4266732165926735e-05, + "loss": 0.2214, + "step": 7451 + }, + { + "epoch": 0.38, + "grad_norm": 1.0262286597197119, + "learning_rate": 1.4265242579983398e-05, + "loss": 0.2136, + "step": 7452 + }, + { + "epoch": 0.38, + "grad_norm": 0.7806878375610516, + "learning_rate": 1.4263752878345112e-05, + "loss": 0.2169, + "step": 7453 + }, + { + "epoch": 0.38, + "grad_norm": 1.1225825958989664, + "learning_rate": 1.4262263061052291e-05, + "loss": 0.2022, + "step": 7454 + }, + { + "epoch": 0.38, + "grad_norm": 1.6108452092479106, + "learning_rate": 1.4260773128145341e-05, + "loss": 0.1909, + "step": 7455 + }, + { + "epoch": 0.38, + "grad_norm": 1.0173183086801756, + "learning_rate": 1.4259283079664683e-05, + "loss": 0.1895, + "step": 7456 + }, + { + "epoch": 0.38, + "grad_norm": 0.8439453311015034, + "learning_rate": 1.4257792915650728e-05, + "loss": 0.1973, + "step": 7457 + }, + { + "epoch": 0.38, + "grad_norm": 0.8723570887290385, + "learning_rate": 1.42563026361439e-05, + "loss": 0.2065, + "step": 7458 + }, + { + "epoch": 0.38, + "grad_norm": 0.8536901531933275, + "learning_rate": 1.4254812241184623e-05, + "loss": 0.2027, + "step": 7459 + }, + { + "epoch": 0.38, + "grad_norm": 1.3662775011425288, + "learning_rate": 1.4253321730813326e-05, + "loss": 0.2275, + "step": 7460 + }, + { + "epoch": 0.38, + "grad_norm": 2.97442449862382, + "learning_rate": 1.4251831105070433e-05, + "loss": 0.2079, + "step": 7461 + }, + { + "epoch": 0.38, + "grad_norm": 0.790465580849618, + "learning_rate": 1.4250340363996382e-05, + "loss": 0.2051, + "step": 7462 + }, + { + "epoch": 0.38, + "grad_norm": 1.093983192420429, + "learning_rate": 1.4248849507631608e-05, + "loss": 0.212, + "step": 7463 + }, + { + "epoch": 0.38, + "grad_norm": 0.9556791044852044, + "learning_rate": 1.4247358536016554e-05, + "loss": 0.2081, + "step": 7464 + }, + { + "epoch": 0.38, + "grad_norm": 1.5223824451295664, + "learning_rate": 1.424586744919166e-05, + "loss": 0.1855, + "step": 7465 + }, + { + "epoch": 0.38, + "grad_norm": 0.8448789582666036, + "learning_rate": 1.4244376247197367e-05, + "loss": 0.2186, + "step": 7466 + }, + { + "epoch": 0.38, + "grad_norm": 1.3207283654711193, + "learning_rate": 1.424288493007413e-05, + "loss": 0.2033, + "step": 7467 + }, + { + "epoch": 0.38, + "grad_norm": 1.0601494536216605, + "learning_rate": 1.4241393497862403e-05, + "loss": 0.2069, + "step": 7468 + }, + { + "epoch": 0.38, + "grad_norm": 1.0956312757128686, + "learning_rate": 1.4239901950602635e-05, + "loss": 0.2081, + "step": 7469 + }, + { + "epoch": 0.38, + "grad_norm": 1.2581936092687398, + "learning_rate": 1.4238410288335289e-05, + "loss": 0.1978, + "step": 7470 + }, + { + "epoch": 0.38, + "grad_norm": 0.9771948394045026, + "learning_rate": 1.4236918511100824e-05, + "loss": 0.2054, + "step": 7471 + }, + { + "epoch": 0.38, + "grad_norm": 1.166621037901537, + "learning_rate": 1.4235426618939704e-05, + "loss": 0.2156, + "step": 7472 + }, + { + "epoch": 0.38, + "grad_norm": 0.9622643795882899, + "learning_rate": 1.4233934611892399e-05, + "loss": 0.2164, + "step": 7473 + }, + { + "epoch": 0.38, + "grad_norm": 1.004021903210207, + "learning_rate": 1.423244248999938e-05, + "loss": 0.1907, + "step": 7474 + }, + { + "epoch": 0.38, + "grad_norm": 1.0349109753246033, + "learning_rate": 1.4230950253301115e-05, + "loss": 0.1946, + "step": 7475 + }, + { + "epoch": 0.38, + "grad_norm": 1.343381097620725, + "learning_rate": 1.4229457901838087e-05, + "loss": 0.2022, + "step": 7476 + }, + { + "epoch": 0.38, + "grad_norm": 1.08645471980482, + "learning_rate": 1.4227965435650774e-05, + "loss": 0.1711, + "step": 7477 + }, + { + "epoch": 0.38, + "grad_norm": 1.376681948697426, + "learning_rate": 1.4226472854779663e-05, + "loss": 0.1741, + "step": 7478 + }, + { + "epoch": 0.38, + "grad_norm": 0.8444694354214273, + "learning_rate": 1.4224980159265234e-05, + "loss": 0.1995, + "step": 7479 + }, + { + "epoch": 0.38, + "grad_norm": 1.358410968788333, + "learning_rate": 1.422348734914798e-05, + "loss": 0.1953, + "step": 7480 + }, + { + "epoch": 0.38, + "grad_norm": 1.0823915381063276, + "learning_rate": 1.4221994424468395e-05, + "loss": 0.1951, + "step": 7481 + }, + { + "epoch": 0.38, + "grad_norm": 1.2856834550347218, + "learning_rate": 1.4220501385266971e-05, + "loss": 0.205, + "step": 7482 + }, + { + "epoch": 0.38, + "grad_norm": 1.185673366244246, + "learning_rate": 1.4219008231584211e-05, + "loss": 0.205, + "step": 7483 + }, + { + "epoch": 0.38, + "grad_norm": 1.0602670135534036, + "learning_rate": 1.421751496346061e-05, + "loss": 0.1888, + "step": 7484 + }, + { + "epoch": 0.38, + "grad_norm": 1.1431011201509098, + "learning_rate": 1.4216021580936681e-05, + "loss": 0.201, + "step": 7485 + }, + { + "epoch": 0.38, + "grad_norm": 1.9083813915428518, + "learning_rate": 1.4214528084052925e-05, + "loss": 0.1979, + "step": 7486 + }, + { + "epoch": 0.38, + "grad_norm": 1.0573463288335716, + "learning_rate": 1.421303447284986e-05, + "loss": 0.1886, + "step": 7487 + }, + { + "epoch": 0.38, + "grad_norm": 0.9609320810574705, + "learning_rate": 1.4211540747367998e-05, + "loss": 0.1885, + "step": 7488 + }, + { + "epoch": 0.38, + "grad_norm": 0.9505170150612685, + "learning_rate": 1.4210046907647852e-05, + "loss": 0.2028, + "step": 7489 + }, + { + "epoch": 0.38, + "grad_norm": 1.1493563606221087, + "learning_rate": 1.4208552953729949e-05, + "loss": 0.2101, + "step": 7490 + }, + { + "epoch": 0.38, + "grad_norm": 0.9009289958930716, + "learning_rate": 1.4207058885654807e-05, + "loss": 0.2003, + "step": 7491 + }, + { + "epoch": 0.38, + "grad_norm": 1.0713540112615993, + "learning_rate": 1.4205564703462955e-05, + "loss": 0.1874, + "step": 7492 + }, + { + "epoch": 0.38, + "grad_norm": 1.0702896376829447, + "learning_rate": 1.4204070407194923e-05, + "loss": 0.1953, + "step": 7493 + }, + { + "epoch": 0.38, + "grad_norm": 1.272473715175916, + "learning_rate": 1.4202575996891246e-05, + "loss": 0.2053, + "step": 7494 + }, + { + "epoch": 0.38, + "grad_norm": 0.908677908811022, + "learning_rate": 1.4201081472592455e-05, + "loss": 0.2103, + "step": 7495 + }, + { + "epoch": 0.38, + "grad_norm": 1.013295489272494, + "learning_rate": 1.4199586834339093e-05, + "loss": 0.2058, + "step": 7496 + }, + { + "epoch": 0.38, + "grad_norm": 1.0451142798524864, + "learning_rate": 1.41980920821717e-05, + "loss": 0.1923, + "step": 7497 + }, + { + "epoch": 0.38, + "grad_norm": 3.086382225473405, + "learning_rate": 1.4196597216130823e-05, + "loss": 0.1879, + "step": 7498 + }, + { + "epoch": 0.38, + "grad_norm": 1.0869633424813556, + "learning_rate": 1.4195102236257011e-05, + "loss": 0.2132, + "step": 7499 + }, + { + "epoch": 0.38, + "grad_norm": 1.0450687978973898, + "learning_rate": 1.4193607142590812e-05, + "loss": 0.2081, + "step": 7500 + }, + { + "epoch": 0.38, + "grad_norm": 1.1569443333472407, + "learning_rate": 1.4192111935172781e-05, + "loss": 0.221, + "step": 7501 + }, + { + "epoch": 0.38, + "grad_norm": 1.0329301579206815, + "learning_rate": 1.419061661404348e-05, + "loss": 0.1791, + "step": 7502 + }, + { + "epoch": 0.38, + "grad_norm": 1.6107073747509522, + "learning_rate": 1.4189121179243466e-05, + "loss": 0.189, + "step": 7503 + }, + { + "epoch": 0.38, + "grad_norm": 0.9123903187240583, + "learning_rate": 1.4187625630813303e-05, + "loss": 0.1986, + "step": 7504 + }, + { + "epoch": 0.38, + "grad_norm": 0.9802098693497492, + "learning_rate": 1.4186129968793558e-05, + "loss": 0.1743, + "step": 7505 + }, + { + "epoch": 0.38, + "grad_norm": 1.0183773359868546, + "learning_rate": 1.41846341932248e-05, + "loss": 0.1797, + "step": 7506 + }, + { + "epoch": 0.38, + "grad_norm": 1.0375210887014656, + "learning_rate": 1.4183138304147605e-05, + "loss": 0.1949, + "step": 7507 + }, + { + "epoch": 0.38, + "grad_norm": 0.9658665195692431, + "learning_rate": 1.4181642301602547e-05, + "loss": 0.195, + "step": 7508 + }, + { + "epoch": 0.38, + "grad_norm": 1.3377407285594807, + "learning_rate": 1.4180146185630205e-05, + "loss": 0.2411, + "step": 7509 + }, + { + "epoch": 0.38, + "grad_norm": 1.2619605983747744, + "learning_rate": 1.4178649956271162e-05, + "loss": 0.1842, + "step": 7510 + }, + { + "epoch": 0.38, + "grad_norm": 0.96968818167038, + "learning_rate": 1.4177153613566002e-05, + "loss": 0.191, + "step": 7511 + }, + { + "epoch": 0.38, + "grad_norm": 0.992558708079268, + "learning_rate": 1.4175657157555316e-05, + "loss": 0.182, + "step": 7512 + }, + { + "epoch": 0.38, + "grad_norm": 1.4435494809485996, + "learning_rate": 1.4174160588279692e-05, + "loss": 0.2136, + "step": 7513 + }, + { + "epoch": 0.38, + "grad_norm": 1.0321323815625674, + "learning_rate": 1.4172663905779725e-05, + "loss": 0.198, + "step": 7514 + }, + { + "epoch": 0.38, + "grad_norm": 0.7564634893716371, + "learning_rate": 1.4171167110096017e-05, + "loss": 0.1783, + "step": 7515 + }, + { + "epoch": 0.38, + "grad_norm": 0.9330135482607076, + "learning_rate": 1.4169670201269164e-05, + "loss": 0.2053, + "step": 7516 + }, + { + "epoch": 0.38, + "grad_norm": 0.8472369283082383, + "learning_rate": 1.4168173179339772e-05, + "loss": 0.2047, + "step": 7517 + }, + { + "epoch": 0.38, + "grad_norm": 0.8801592059554315, + "learning_rate": 1.4166676044348448e-05, + "loss": 0.1955, + "step": 7518 + }, + { + "epoch": 0.38, + "grad_norm": 1.0766442544705805, + "learning_rate": 1.4165178796335797e-05, + "loss": 0.2025, + "step": 7519 + }, + { + "epoch": 0.38, + "grad_norm": 1.0174652520991234, + "learning_rate": 1.416368143534244e-05, + "loss": 0.23, + "step": 7520 + }, + { + "epoch": 0.38, + "grad_norm": 0.9339941915171122, + "learning_rate": 1.4162183961408987e-05, + "loss": 0.1897, + "step": 7521 + }, + { + "epoch": 0.38, + "grad_norm": 1.5105807693516058, + "learning_rate": 1.416068637457606e-05, + "loss": 0.1781, + "step": 7522 + }, + { + "epoch": 0.38, + "grad_norm": 0.935675378996467, + "learning_rate": 1.4159188674884279e-05, + "loss": 0.2052, + "step": 7523 + }, + { + "epoch": 0.38, + "grad_norm": 1.1427958146169708, + "learning_rate": 1.4157690862374272e-05, + "loss": 0.2039, + "step": 7524 + }, + { + "epoch": 0.38, + "grad_norm": 1.658788262602051, + "learning_rate": 1.4156192937086666e-05, + "loss": 0.2107, + "step": 7525 + }, + { + "epoch": 0.38, + "grad_norm": 0.8955541412189186, + "learning_rate": 1.4154694899062089e-05, + "loss": 0.1832, + "step": 7526 + }, + { + "epoch": 0.38, + "grad_norm": 0.8889059275984288, + "learning_rate": 1.4153196748341179e-05, + "loss": 0.2379, + "step": 7527 + }, + { + "epoch": 0.38, + "grad_norm": 1.0487778302161304, + "learning_rate": 1.4151698484964574e-05, + "loss": 0.2044, + "step": 7528 + }, + { + "epoch": 0.38, + "grad_norm": 0.8812194186270116, + "learning_rate": 1.4150200108972912e-05, + "loss": 0.2, + "step": 7529 + }, + { + "epoch": 0.38, + "grad_norm": 0.9193206661664284, + "learning_rate": 1.4148701620406838e-05, + "loss": 0.2098, + "step": 7530 + }, + { + "epoch": 0.38, + "grad_norm": 0.993980966928357, + "learning_rate": 1.4147203019306997e-05, + "loss": 0.2048, + "step": 7531 + }, + { + "epoch": 0.38, + "grad_norm": 1.0750132796267666, + "learning_rate": 1.4145704305714038e-05, + "loss": 0.1998, + "step": 7532 + }, + { + "epoch": 0.38, + "grad_norm": 0.6834923657312354, + "learning_rate": 1.414420547966862e-05, + "loss": 0.1738, + "step": 7533 + }, + { + "epoch": 0.38, + "grad_norm": 0.7890938616586272, + "learning_rate": 1.4142706541211392e-05, + "loss": 0.1867, + "step": 7534 + }, + { + "epoch": 0.38, + "grad_norm": 1.2978997036990871, + "learning_rate": 1.4141207490383018e-05, + "loss": 0.1919, + "step": 7535 + }, + { + "epoch": 0.38, + "grad_norm": 0.8117457987197362, + "learning_rate": 1.4139708327224155e-05, + "loss": 0.1937, + "step": 7536 + }, + { + "epoch": 0.38, + "grad_norm": 0.9688092437532801, + "learning_rate": 1.4138209051775467e-05, + "loss": 0.2102, + "step": 7537 + }, + { + "epoch": 0.38, + "grad_norm": 1.0092214002394286, + "learning_rate": 1.4136709664077628e-05, + "loss": 0.2067, + "step": 7538 + }, + { + "epoch": 0.38, + "grad_norm": 1.2446739747004254, + "learning_rate": 1.4135210164171306e-05, + "loss": 0.2075, + "step": 7539 + }, + { + "epoch": 0.38, + "grad_norm": 1.409950331815906, + "learning_rate": 1.4133710552097175e-05, + "loss": 0.1752, + "step": 7540 + }, + { + "epoch": 0.38, + "grad_norm": 1.0317598910736085, + "learning_rate": 1.4132210827895909e-05, + "loss": 0.2115, + "step": 7541 + }, + { + "epoch": 0.38, + "grad_norm": 1.363377833518173, + "learning_rate": 1.4130710991608194e-05, + "loss": 0.2003, + "step": 7542 + }, + { + "epoch": 0.38, + "grad_norm": 1.0121446264043472, + "learning_rate": 1.4129211043274709e-05, + "loss": 0.2075, + "step": 7543 + }, + { + "epoch": 0.38, + "grad_norm": 0.8417773315163373, + "learning_rate": 1.412771098293614e-05, + "loss": 0.1946, + "step": 7544 + }, + { + "epoch": 0.38, + "grad_norm": 1.1855268084816621, + "learning_rate": 1.4126210810633176e-05, + "loss": 0.2152, + "step": 7545 + }, + { + "epoch": 0.38, + "grad_norm": 1.0237124307200967, + "learning_rate": 1.4124710526406516e-05, + "loss": 0.2443, + "step": 7546 + }, + { + "epoch": 0.38, + "grad_norm": 1.404831522108842, + "learning_rate": 1.4123210130296845e-05, + "loss": 0.198, + "step": 7547 + }, + { + "epoch": 0.38, + "grad_norm": 0.9074529008026088, + "learning_rate": 1.412170962234487e-05, + "loss": 0.1943, + "step": 7548 + }, + { + "epoch": 0.38, + "grad_norm": 1.51360435231819, + "learning_rate": 1.4120209002591286e-05, + "loss": 0.1871, + "step": 7549 + }, + { + "epoch": 0.38, + "grad_norm": 0.9224528569596793, + "learning_rate": 1.41187082710768e-05, + "loss": 0.1955, + "step": 7550 + }, + { + "epoch": 0.38, + "grad_norm": 0.8130885991407517, + "learning_rate": 1.4117207427842122e-05, + "loss": 0.1914, + "step": 7551 + }, + { + "epoch": 0.38, + "grad_norm": 1.0662624440238975, + "learning_rate": 1.4115706472927957e-05, + "loss": 0.1817, + "step": 7552 + }, + { + "epoch": 0.38, + "grad_norm": 0.8750154796571464, + "learning_rate": 1.4114205406375025e-05, + "loss": 0.2128, + "step": 7553 + }, + { + "epoch": 0.38, + "grad_norm": 1.0251582509921566, + "learning_rate": 1.4112704228224034e-05, + "loss": 0.1807, + "step": 7554 + }, + { + "epoch": 0.38, + "grad_norm": 0.8656304735082289, + "learning_rate": 1.4111202938515711e-05, + "loss": 0.1855, + "step": 7555 + }, + { + "epoch": 0.38, + "grad_norm": 0.9354978653471507, + "learning_rate": 1.4109701537290779e-05, + "loss": 0.2093, + "step": 7556 + }, + { + "epoch": 0.38, + "grad_norm": 0.9342037452734095, + "learning_rate": 1.4108200024589958e-05, + "loss": 0.1875, + "step": 7557 + }, + { + "epoch": 0.38, + "grad_norm": 0.9043537891505412, + "learning_rate": 1.410669840045398e-05, + "loss": 0.1928, + "step": 7558 + }, + { + "epoch": 0.38, + "grad_norm": 1.0488505805359303, + "learning_rate": 1.4105196664923577e-05, + "loss": 0.2078, + "step": 7559 + }, + { + "epoch": 0.38, + "grad_norm": 1.0669685487941905, + "learning_rate": 1.4103694818039483e-05, + "loss": 0.1933, + "step": 7560 + }, + { + "epoch": 0.38, + "grad_norm": 1.0754201019931622, + "learning_rate": 1.4102192859842432e-05, + "loss": 0.2111, + "step": 7561 + }, + { + "epoch": 0.38, + "grad_norm": 1.0524796337983948, + "learning_rate": 1.4100690790373168e-05, + "loss": 0.1853, + "step": 7562 + }, + { + "epoch": 0.38, + "grad_norm": 1.023500297719706, + "learning_rate": 1.4099188609672436e-05, + "loss": 0.1698, + "step": 7563 + }, + { + "epoch": 0.38, + "grad_norm": 0.6937197254418842, + "learning_rate": 1.409768631778098e-05, + "loss": 0.1687, + "step": 7564 + }, + { + "epoch": 0.38, + "grad_norm": 0.9742004715333842, + "learning_rate": 1.4096183914739554e-05, + "loss": 0.1947, + "step": 7565 + }, + { + "epoch": 0.38, + "grad_norm": 0.920578932705953, + "learning_rate": 1.4094681400588908e-05, + "loss": 0.1946, + "step": 7566 + }, + { + "epoch": 0.38, + "grad_norm": 1.3660496533422637, + "learning_rate": 1.4093178775369793e-05, + "loss": 0.2007, + "step": 7567 + }, + { + "epoch": 0.38, + "grad_norm": 0.8202394736784563, + "learning_rate": 1.4091676039122977e-05, + "loss": 0.2181, + "step": 7568 + }, + { + "epoch": 0.38, + "grad_norm": 0.8823324782020567, + "learning_rate": 1.4090173191889216e-05, + "loss": 0.2063, + "step": 7569 + }, + { + "epoch": 0.38, + "grad_norm": 0.6719778905366667, + "learning_rate": 1.4088670233709278e-05, + "loss": 0.185, + "step": 7570 + }, + { + "epoch": 0.38, + "grad_norm": 0.9286681884469269, + "learning_rate": 1.4087167164623927e-05, + "loss": 0.1838, + "step": 7571 + }, + { + "epoch": 0.39, + "grad_norm": 0.9178992253331234, + "learning_rate": 1.4085663984673936e-05, + "loss": 0.2011, + "step": 7572 + }, + { + "epoch": 0.39, + "grad_norm": 0.867270369064487, + "learning_rate": 1.408416069390008e-05, + "loss": 0.1782, + "step": 7573 + }, + { + "epoch": 0.39, + "grad_norm": 0.6760773325114068, + "learning_rate": 1.4082657292343131e-05, + "loss": 0.2006, + "step": 7574 + }, + { + "epoch": 0.39, + "grad_norm": 0.9197042191937326, + "learning_rate": 1.4081153780043878e-05, + "loss": 0.1803, + "step": 7575 + }, + { + "epoch": 0.39, + "grad_norm": 0.7566894677669311, + "learning_rate": 1.4079650157043095e-05, + "loss": 0.2011, + "step": 7576 + }, + { + "epoch": 0.39, + "grad_norm": 1.9803458625908985, + "learning_rate": 1.4078146423381571e-05, + "loss": 0.2276, + "step": 7577 + }, + { + "epoch": 0.39, + "grad_norm": 0.8948601920458561, + "learning_rate": 1.4076642579100095e-05, + "loss": 0.1957, + "step": 7578 + }, + { + "epoch": 0.39, + "grad_norm": 0.8251175785770059, + "learning_rate": 1.407513862423946e-05, + "loss": 0.2001, + "step": 7579 + }, + { + "epoch": 0.39, + "grad_norm": 0.8159217586434515, + "learning_rate": 1.4073634558840454e-05, + "loss": 0.1887, + "step": 7580 + }, + { + "epoch": 0.39, + "grad_norm": 1.712292700290352, + "learning_rate": 1.4072130382943888e-05, + "loss": 0.1882, + "step": 7581 + }, + { + "epoch": 0.39, + "grad_norm": 1.0144574994086635, + "learning_rate": 1.4070626096590552e-05, + "loss": 0.2054, + "step": 7582 + }, + { + "epoch": 0.39, + "grad_norm": 0.8214830372217938, + "learning_rate": 1.4069121699821253e-05, + "loss": 0.1965, + "step": 7583 + }, + { + "epoch": 0.39, + "grad_norm": 1.0635809609229483, + "learning_rate": 1.4067617192676799e-05, + "loss": 0.1768, + "step": 7584 + }, + { + "epoch": 0.39, + "grad_norm": 0.9959854507722363, + "learning_rate": 1.4066112575197996e-05, + "loss": 0.1972, + "step": 7585 + }, + { + "epoch": 0.39, + "grad_norm": 1.0418149972153863, + "learning_rate": 1.4064607847425664e-05, + "loss": 0.216, + "step": 7586 + }, + { + "epoch": 0.39, + "grad_norm": 0.9272614314221803, + "learning_rate": 1.4063103009400613e-05, + "loss": 0.2134, + "step": 7587 + }, + { + "epoch": 0.39, + "grad_norm": 0.8282259836531397, + "learning_rate": 1.4061598061163664e-05, + "loss": 0.2053, + "step": 7588 + }, + { + "epoch": 0.39, + "grad_norm": 1.518892077046803, + "learning_rate": 1.4060093002755636e-05, + "loss": 0.21, + "step": 7589 + }, + { + "epoch": 0.39, + "grad_norm": 0.872535797315508, + "learning_rate": 1.4058587834217356e-05, + "loss": 0.1814, + "step": 7590 + }, + { + "epoch": 0.39, + "grad_norm": 1.2812768967861146, + "learning_rate": 1.4057082555589654e-05, + "loss": 0.2118, + "step": 7591 + }, + { + "epoch": 0.39, + "grad_norm": 1.8252964740636697, + "learning_rate": 1.4055577166913358e-05, + "loss": 0.2015, + "step": 7592 + }, + { + "epoch": 0.39, + "grad_norm": 1.0118958760314445, + "learning_rate": 1.4054071668229304e-05, + "loss": 0.207, + "step": 7593 + }, + { + "epoch": 0.39, + "grad_norm": 0.8168769002372456, + "learning_rate": 1.4052566059578326e-05, + "loss": 0.1981, + "step": 7594 + }, + { + "epoch": 0.39, + "grad_norm": 0.9911446249748135, + "learning_rate": 1.4051060341001264e-05, + "loss": 0.2068, + "step": 7595 + }, + { + "epoch": 0.39, + "grad_norm": 0.8867221481455925, + "learning_rate": 1.4049554512538958e-05, + "loss": 0.1939, + "step": 7596 + }, + { + "epoch": 0.39, + "grad_norm": 0.8235464707735634, + "learning_rate": 1.4048048574232261e-05, + "loss": 0.1808, + "step": 7597 + }, + { + "epoch": 0.39, + "grad_norm": 0.8495631625786931, + "learning_rate": 1.4046542526122018e-05, + "loss": 0.2044, + "step": 7598 + }, + { + "epoch": 0.39, + "grad_norm": 1.0312588114857115, + "learning_rate": 1.4045036368249079e-05, + "loss": 0.1923, + "step": 7599 + }, + { + "epoch": 0.39, + "grad_norm": 1.2257480499644202, + "learning_rate": 1.4043530100654301e-05, + "loss": 0.2095, + "step": 7600 + }, + { + "epoch": 0.39, + "grad_norm": 0.9347192728461339, + "learning_rate": 1.4042023723378541e-05, + "loss": 0.1793, + "step": 7601 + }, + { + "epoch": 0.39, + "grad_norm": 0.8903951969647127, + "learning_rate": 1.4040517236462656e-05, + "loss": 0.1929, + "step": 7602 + }, + { + "epoch": 0.39, + "grad_norm": 1.3242813549316574, + "learning_rate": 1.4039010639947516e-05, + "loss": 0.1972, + "step": 7603 + }, + { + "epoch": 0.39, + "grad_norm": 1.201248666535143, + "learning_rate": 1.4037503933873984e-05, + "loss": 0.2101, + "step": 7604 + }, + { + "epoch": 0.39, + "grad_norm": 1.1902203553720785, + "learning_rate": 1.403599711828293e-05, + "loss": 0.2285, + "step": 7605 + }, + { + "epoch": 0.39, + "grad_norm": 0.9544624554727649, + "learning_rate": 1.4034490193215224e-05, + "loss": 0.1957, + "step": 7606 + }, + { + "epoch": 0.39, + "grad_norm": 1.0195966094164068, + "learning_rate": 1.4032983158711744e-05, + "loss": 0.2003, + "step": 7607 + }, + { + "epoch": 0.39, + "grad_norm": 1.3276408942198594, + "learning_rate": 1.403147601481337e-05, + "loss": 0.2001, + "step": 7608 + }, + { + "epoch": 0.39, + "grad_norm": 0.7275677793246561, + "learning_rate": 1.4029968761560979e-05, + "loss": 0.1805, + "step": 7609 + }, + { + "epoch": 0.39, + "grad_norm": 0.8479951597951434, + "learning_rate": 1.402846139899546e-05, + "loss": 0.2048, + "step": 7610 + }, + { + "epoch": 0.39, + "grad_norm": 1.838782389886137, + "learning_rate": 1.4026953927157698e-05, + "loss": 0.2025, + "step": 7611 + }, + { + "epoch": 0.39, + "grad_norm": 0.8453036827999479, + "learning_rate": 1.4025446346088582e-05, + "loss": 0.1987, + "step": 7612 + }, + { + "epoch": 0.39, + "grad_norm": 1.0457773409872628, + "learning_rate": 1.402393865582901e-05, + "loss": 0.1991, + "step": 7613 + }, + { + "epoch": 0.39, + "grad_norm": 0.8654771992999472, + "learning_rate": 1.4022430856419872e-05, + "loss": 0.1999, + "step": 7614 + }, + { + "epoch": 0.39, + "grad_norm": 1.1394541387656685, + "learning_rate": 1.4020922947902067e-05, + "loss": 0.2206, + "step": 7615 + }, + { + "epoch": 0.39, + "grad_norm": 0.8091480970897881, + "learning_rate": 1.4019414930316501e-05, + "loss": 0.1888, + "step": 7616 + }, + { + "epoch": 0.39, + "grad_norm": 1.0685495245375725, + "learning_rate": 1.4017906803704083e-05, + "loss": 0.1874, + "step": 7617 + }, + { + "epoch": 0.39, + "grad_norm": 0.872612439298588, + "learning_rate": 1.401639856810571e-05, + "loss": 0.1935, + "step": 7618 + }, + { + "epoch": 0.39, + "grad_norm": 1.1548406631959833, + "learning_rate": 1.4014890223562303e-05, + "loss": 0.1862, + "step": 7619 + }, + { + "epoch": 0.39, + "grad_norm": 0.9949755985513199, + "learning_rate": 1.401338177011477e-05, + "loss": 0.1795, + "step": 7620 + }, + { + "epoch": 0.39, + "grad_norm": 0.9879919215858408, + "learning_rate": 1.401187320780403e-05, + "loss": 0.1913, + "step": 7621 + }, + { + "epoch": 0.39, + "grad_norm": 0.8082593760487321, + "learning_rate": 1.4010364536671004e-05, + "loss": 0.1858, + "step": 7622 + }, + { + "epoch": 0.39, + "grad_norm": 1.4274036461103985, + "learning_rate": 1.4008855756756612e-05, + "loss": 0.2293, + "step": 7623 + }, + { + "epoch": 0.39, + "grad_norm": 0.9867689521147746, + "learning_rate": 1.4007346868101779e-05, + "loss": 0.1839, + "step": 7624 + }, + { + "epoch": 0.39, + "grad_norm": 0.9102121091949064, + "learning_rate": 1.4005837870747439e-05, + "loss": 0.2006, + "step": 7625 + }, + { + "epoch": 0.39, + "grad_norm": 0.854600912758212, + "learning_rate": 1.400432876473452e-05, + "loss": 0.1996, + "step": 7626 + }, + { + "epoch": 0.39, + "grad_norm": 1.2096446917432433, + "learning_rate": 1.4002819550103958e-05, + "loss": 0.1817, + "step": 7627 + }, + { + "epoch": 0.39, + "grad_norm": 1.0627923072240306, + "learning_rate": 1.4001310226896689e-05, + "loss": 0.2394, + "step": 7628 + }, + { + "epoch": 0.39, + "grad_norm": 1.2724119703874543, + "learning_rate": 1.3999800795153652e-05, + "loss": 0.186, + "step": 7629 + }, + { + "epoch": 0.39, + "grad_norm": 0.9999657164938784, + "learning_rate": 1.3998291254915797e-05, + "loss": 0.197, + "step": 7630 + }, + { + "epoch": 0.39, + "grad_norm": 1.2678985627552222, + "learning_rate": 1.3996781606224063e-05, + "loss": 0.1817, + "step": 7631 + }, + { + "epoch": 0.39, + "grad_norm": 0.9871813158684251, + "learning_rate": 1.3995271849119403e-05, + "loss": 0.2123, + "step": 7632 + }, + { + "epoch": 0.39, + "grad_norm": 0.7917965541568505, + "learning_rate": 1.3993761983642765e-05, + "loss": 0.1854, + "step": 7633 + }, + { + "epoch": 0.39, + "grad_norm": 0.948713022523547, + "learning_rate": 1.3992252009835112e-05, + "loss": 0.1835, + "step": 7634 + }, + { + "epoch": 0.39, + "grad_norm": 1.013801688872813, + "learning_rate": 1.3990741927737395e-05, + "loss": 0.1914, + "step": 7635 + }, + { + "epoch": 0.39, + "grad_norm": 1.0338027271382328, + "learning_rate": 1.3989231737390578e-05, + "loss": 0.2086, + "step": 7636 + }, + { + "epoch": 0.39, + "grad_norm": 1.0038262656980823, + "learning_rate": 1.3987721438835626e-05, + "loss": 0.2164, + "step": 7637 + }, + { + "epoch": 0.39, + "grad_norm": 1.198776705998106, + "learning_rate": 1.3986211032113504e-05, + "loss": 0.2259, + "step": 7638 + }, + { + "epoch": 0.39, + "grad_norm": 1.1559578172326987, + "learning_rate": 1.3984700517265184e-05, + "loss": 0.2053, + "step": 7639 + }, + { + "epoch": 0.39, + "grad_norm": 1.9478082394204406, + "learning_rate": 1.3983189894331636e-05, + "loss": 0.2075, + "step": 7640 + }, + { + "epoch": 0.39, + "grad_norm": 1.0319933891708784, + "learning_rate": 1.3981679163353839e-05, + "loss": 0.1778, + "step": 7641 + }, + { + "epoch": 0.39, + "grad_norm": 1.2467961556378275, + "learning_rate": 1.3980168324372768e-05, + "loss": 0.1881, + "step": 7642 + }, + { + "epoch": 0.39, + "grad_norm": 1.097771450032399, + "learning_rate": 1.3978657377429405e-05, + "loss": 0.2074, + "step": 7643 + }, + { + "epoch": 0.39, + "grad_norm": 0.7948181272258855, + "learning_rate": 1.3977146322564739e-05, + "loss": 0.1797, + "step": 7644 + }, + { + "epoch": 0.39, + "grad_norm": 1.6906764323928702, + "learning_rate": 1.3975635159819757e-05, + "loss": 0.1994, + "step": 7645 + }, + { + "epoch": 0.39, + "grad_norm": 0.8555549281608942, + "learning_rate": 1.3974123889235437e-05, + "loss": 0.1963, + "step": 7646 + }, + { + "epoch": 0.39, + "grad_norm": 0.8152633474642856, + "learning_rate": 1.3972612510852789e-05, + "loss": 0.2028, + "step": 7647 + }, + { + "epoch": 0.39, + "grad_norm": 0.834483546768694, + "learning_rate": 1.3971101024712803e-05, + "loss": 0.2009, + "step": 7648 + }, + { + "epoch": 0.39, + "grad_norm": 0.8059419858590291, + "learning_rate": 1.3969589430856476e-05, + "loss": 0.1705, + "step": 7649 + }, + { + "epoch": 0.39, + "grad_norm": 0.8663444660848391, + "learning_rate": 1.3968077729324808e-05, + "loss": 0.2038, + "step": 7650 + }, + { + "epoch": 0.39, + "grad_norm": 0.9524959304972902, + "learning_rate": 1.3966565920158811e-05, + "loss": 0.1794, + "step": 7651 + }, + { + "epoch": 0.39, + "grad_norm": 0.9974758966169943, + "learning_rate": 1.3965054003399493e-05, + "loss": 0.1747, + "step": 7652 + }, + { + "epoch": 0.39, + "grad_norm": 1.0143608736447438, + "learning_rate": 1.3963541979087855e-05, + "loss": 0.1919, + "step": 7653 + }, + { + "epoch": 0.39, + "grad_norm": 0.8770771941762959, + "learning_rate": 1.396202984726492e-05, + "loss": 0.2121, + "step": 7654 + }, + { + "epoch": 0.39, + "grad_norm": 1.058384880625591, + "learning_rate": 1.3960517607971697e-05, + "loss": 0.2058, + "step": 7655 + }, + { + "epoch": 0.39, + "grad_norm": 1.2266740375032155, + "learning_rate": 1.3959005261249217e-05, + "loss": 0.1884, + "step": 7656 + }, + { + "epoch": 0.39, + "grad_norm": 0.803399173151223, + "learning_rate": 1.3957492807138491e-05, + "loss": 0.1858, + "step": 7657 + }, + { + "epoch": 0.39, + "grad_norm": 2.5027341436903714, + "learning_rate": 1.3955980245680551e-05, + "loss": 0.2012, + "step": 7658 + }, + { + "epoch": 0.39, + "grad_norm": 0.7537433177825293, + "learning_rate": 1.3954467576916422e-05, + "loss": 0.197, + "step": 7659 + }, + { + "epoch": 0.39, + "grad_norm": 1.0281275927478755, + "learning_rate": 1.3952954800887137e-05, + "loss": 0.1985, + "step": 7660 + }, + { + "epoch": 0.39, + "grad_norm": 3.4007636048539807, + "learning_rate": 1.3951441917633732e-05, + "loss": 0.2119, + "step": 7661 + }, + { + "epoch": 0.39, + "grad_norm": 1.5874162488104218, + "learning_rate": 1.394992892719724e-05, + "loss": 0.1999, + "step": 7662 + }, + { + "epoch": 0.39, + "grad_norm": 0.90732612942027, + "learning_rate": 1.3948415829618705e-05, + "loss": 0.1906, + "step": 7663 + }, + { + "epoch": 0.39, + "grad_norm": 0.8673104556121487, + "learning_rate": 1.3946902624939164e-05, + "loss": 0.2082, + "step": 7664 + }, + { + "epoch": 0.39, + "grad_norm": 1.037346088119156, + "learning_rate": 1.3945389313199669e-05, + "loss": 0.2019, + "step": 7665 + }, + { + "epoch": 0.39, + "grad_norm": 1.123917949735145, + "learning_rate": 1.3943875894441265e-05, + "loss": 0.2053, + "step": 7666 + }, + { + "epoch": 0.39, + "grad_norm": 0.9346108283378147, + "learning_rate": 1.3942362368705007e-05, + "loss": 0.1752, + "step": 7667 + }, + { + "epoch": 0.39, + "grad_norm": 0.8821888566223758, + "learning_rate": 1.3940848736031943e-05, + "loss": 0.1883, + "step": 7668 + }, + { + "epoch": 0.39, + "grad_norm": 1.0537674571428322, + "learning_rate": 1.3939334996463136e-05, + "loss": 0.2164, + "step": 7669 + }, + { + "epoch": 0.39, + "grad_norm": 1.0721094239860878, + "learning_rate": 1.3937821150039645e-05, + "loss": 0.2313, + "step": 7670 + }, + { + "epoch": 0.39, + "grad_norm": 0.8856670502546947, + "learning_rate": 1.3936307196802532e-05, + "loss": 0.1929, + "step": 7671 + }, + { + "epoch": 0.39, + "grad_norm": 1.111960198508083, + "learning_rate": 1.3934793136792863e-05, + "loss": 0.2132, + "step": 7672 + }, + { + "epoch": 0.39, + "grad_norm": 1.1384490541523558, + "learning_rate": 1.3933278970051712e-05, + "loss": 0.2056, + "step": 7673 + }, + { + "epoch": 0.39, + "grad_norm": 1.2061262362828225, + "learning_rate": 1.3931764696620144e-05, + "loss": 0.2098, + "step": 7674 + }, + { + "epoch": 0.39, + "grad_norm": 1.6609050792349747, + "learning_rate": 1.3930250316539237e-05, + "loss": 0.1937, + "step": 7675 + }, + { + "epoch": 0.39, + "grad_norm": 1.0601825483540026, + "learning_rate": 1.3928735829850069e-05, + "loss": 0.2108, + "step": 7676 + }, + { + "epoch": 0.39, + "grad_norm": 0.9550895412085639, + "learning_rate": 1.3927221236593717e-05, + "loss": 0.1926, + "step": 7677 + }, + { + "epoch": 0.39, + "grad_norm": 0.8557196616439343, + "learning_rate": 1.392570653681127e-05, + "loss": 0.196, + "step": 7678 + }, + { + "epoch": 0.39, + "grad_norm": 0.9880124750398763, + "learning_rate": 1.392419173054381e-05, + "loss": 0.2059, + "step": 7679 + }, + { + "epoch": 0.39, + "grad_norm": 0.8231742504746878, + "learning_rate": 1.3922676817832432e-05, + "loss": 0.1963, + "step": 7680 + }, + { + "epoch": 0.39, + "grad_norm": 1.1466035230522185, + "learning_rate": 1.3921161798718217e-05, + "loss": 0.1918, + "step": 7681 + }, + { + "epoch": 0.39, + "grad_norm": 0.8208451969313353, + "learning_rate": 1.3919646673242272e-05, + "loss": 0.1905, + "step": 7682 + }, + { + "epoch": 0.39, + "grad_norm": 1.3997750841679093, + "learning_rate": 1.3918131441445687e-05, + "loss": 0.2143, + "step": 7683 + }, + { + "epoch": 0.39, + "grad_norm": 0.9636337485045307, + "learning_rate": 1.3916616103369567e-05, + "loss": 0.1944, + "step": 7684 + }, + { + "epoch": 0.39, + "grad_norm": 1.6144844863183354, + "learning_rate": 1.391510065905501e-05, + "loss": 0.1966, + "step": 7685 + }, + { + "epoch": 0.39, + "grad_norm": 0.9800808641664451, + "learning_rate": 1.3913585108543131e-05, + "loss": 0.1754, + "step": 7686 + }, + { + "epoch": 0.39, + "grad_norm": 0.8861425397209246, + "learning_rate": 1.3912069451875032e-05, + "loss": 0.192, + "step": 7687 + }, + { + "epoch": 0.39, + "grad_norm": 0.9087226051425511, + "learning_rate": 1.391055368909183e-05, + "loss": 0.1769, + "step": 7688 + }, + { + "epoch": 0.39, + "grad_norm": 1.054864288688696, + "learning_rate": 1.3909037820234637e-05, + "loss": 0.1894, + "step": 7689 + }, + { + "epoch": 0.39, + "grad_norm": 1.1708274247339223, + "learning_rate": 1.3907521845344571e-05, + "loss": 0.1985, + "step": 7690 + }, + { + "epoch": 0.39, + "grad_norm": 1.0713949500783901, + "learning_rate": 1.3906005764462758e-05, + "loss": 0.2113, + "step": 7691 + }, + { + "epoch": 0.39, + "grad_norm": 1.5889139746273195, + "learning_rate": 1.3904489577630316e-05, + "loss": 0.1841, + "step": 7692 + }, + { + "epoch": 0.39, + "grad_norm": 1.091357490656932, + "learning_rate": 1.3902973284888375e-05, + "loss": 0.1912, + "step": 7693 + }, + { + "epoch": 0.39, + "grad_norm": 0.8368483420261934, + "learning_rate": 1.3901456886278063e-05, + "loss": 0.204, + "step": 7694 + }, + { + "epoch": 0.39, + "grad_norm": 1.3583693284481406, + "learning_rate": 1.389994038184051e-05, + "loss": 0.203, + "step": 7695 + }, + { + "epoch": 0.39, + "grad_norm": 0.9312531164878359, + "learning_rate": 1.3898423771616854e-05, + "loss": 0.2161, + "step": 7696 + }, + { + "epoch": 0.39, + "grad_norm": 1.0903451646742162, + "learning_rate": 1.3896907055648235e-05, + "loss": 0.1956, + "step": 7697 + }, + { + "epoch": 0.39, + "grad_norm": 1.3439929901930354, + "learning_rate": 1.3895390233975793e-05, + "loss": 0.1906, + "step": 7698 + }, + { + "epoch": 0.39, + "grad_norm": 0.9825101816701041, + "learning_rate": 1.3893873306640669e-05, + "loss": 0.1943, + "step": 7699 + }, + { + "epoch": 0.39, + "grad_norm": 1.7285069594345552, + "learning_rate": 1.3892356273684012e-05, + "loss": 0.1971, + "step": 7700 + }, + { + "epoch": 0.39, + "grad_norm": 0.8837068614510146, + "learning_rate": 1.3890839135146972e-05, + "loss": 0.2041, + "step": 7701 + }, + { + "epoch": 0.39, + "grad_norm": 1.0181464717308626, + "learning_rate": 1.38893218910707e-05, + "loss": 0.1595, + "step": 7702 + }, + { + "epoch": 0.39, + "grad_norm": 1.1712417123030014, + "learning_rate": 1.388780454149635e-05, + "loss": 0.1995, + "step": 7703 + }, + { + "epoch": 0.39, + "grad_norm": 1.4386055659474735, + "learning_rate": 1.3886287086465085e-05, + "loss": 0.199, + "step": 7704 + }, + { + "epoch": 0.39, + "grad_norm": 0.9347759139666304, + "learning_rate": 1.3884769526018063e-05, + "loss": 0.1871, + "step": 7705 + }, + { + "epoch": 0.39, + "grad_norm": 1.035172557245781, + "learning_rate": 1.3883251860196446e-05, + "loss": 0.2382, + "step": 7706 + }, + { + "epoch": 0.39, + "grad_norm": 1.0406406386957872, + "learning_rate": 1.3881734089041405e-05, + "loss": 0.1744, + "step": 7707 + }, + { + "epoch": 0.39, + "grad_norm": 1.0977754191926146, + "learning_rate": 1.388021621259411e-05, + "loss": 0.197, + "step": 7708 + }, + { + "epoch": 0.39, + "grad_norm": 1.0018304593302076, + "learning_rate": 1.3878698230895726e-05, + "loss": 0.1818, + "step": 7709 + }, + { + "epoch": 0.39, + "grad_norm": 0.9539595526956814, + "learning_rate": 1.3877180143987436e-05, + "loss": 0.1904, + "step": 7710 + }, + { + "epoch": 0.39, + "grad_norm": 1.3344670181819707, + "learning_rate": 1.3875661951910417e-05, + "loss": 0.1971, + "step": 7711 + }, + { + "epoch": 0.39, + "grad_norm": 0.960361113128031, + "learning_rate": 1.3874143654705845e-05, + "loss": 0.1771, + "step": 7712 + }, + { + "epoch": 0.39, + "grad_norm": 1.2090564270325268, + "learning_rate": 1.387262525241491e-05, + "loss": 0.2206, + "step": 7713 + }, + { + "epoch": 0.39, + "grad_norm": 0.986088483516514, + "learning_rate": 1.3871106745078798e-05, + "loss": 0.1889, + "step": 7714 + }, + { + "epoch": 0.39, + "grad_norm": 1.434936865372642, + "learning_rate": 1.3869588132738696e-05, + "loss": 0.2354, + "step": 7715 + }, + { + "epoch": 0.39, + "grad_norm": 1.0088629743524504, + "learning_rate": 1.3868069415435795e-05, + "loss": 0.184, + "step": 7716 + }, + { + "epoch": 0.39, + "grad_norm": 1.6641815842457266, + "learning_rate": 1.3866550593211292e-05, + "loss": 0.2017, + "step": 7717 + }, + { + "epoch": 0.39, + "grad_norm": 1.3170396920934737, + "learning_rate": 1.386503166610639e-05, + "loss": 0.1842, + "step": 7718 + }, + { + "epoch": 0.39, + "grad_norm": 0.7855746745131865, + "learning_rate": 1.3863512634162283e-05, + "loss": 0.1922, + "step": 7719 + }, + { + "epoch": 0.39, + "grad_norm": 0.9041492336676009, + "learning_rate": 1.3861993497420176e-05, + "loss": 0.1913, + "step": 7720 + }, + { + "epoch": 0.39, + "grad_norm": 1.1314961895930964, + "learning_rate": 1.3860474255921279e-05, + "loss": 0.2246, + "step": 7721 + }, + { + "epoch": 0.39, + "grad_norm": 1.5367511841726282, + "learning_rate": 1.38589549097068e-05, + "loss": 0.1881, + "step": 7722 + }, + { + "epoch": 0.39, + "grad_norm": 0.9589789968589525, + "learning_rate": 1.3857435458817952e-05, + "loss": 0.2037, + "step": 7723 + }, + { + "epoch": 0.39, + "grad_norm": 1.1288332166730548, + "learning_rate": 1.3855915903295949e-05, + "loss": 0.1902, + "step": 7724 + }, + { + "epoch": 0.39, + "grad_norm": 0.960376489981517, + "learning_rate": 1.3854396243182007e-05, + "loss": 0.2171, + "step": 7725 + }, + { + "epoch": 0.39, + "grad_norm": 0.9273880760394527, + "learning_rate": 1.3852876478517352e-05, + "loss": 0.2086, + "step": 7726 + }, + { + "epoch": 0.39, + "grad_norm": 1.2440017744030931, + "learning_rate": 1.3851356609343202e-05, + "loss": 0.1798, + "step": 7727 + }, + { + "epoch": 0.39, + "grad_norm": 1.8535773225164793, + "learning_rate": 1.3849836635700791e-05, + "loss": 0.1772, + "step": 7728 + }, + { + "epoch": 0.39, + "grad_norm": 0.963785047508875, + "learning_rate": 1.384831655763134e-05, + "loss": 0.2068, + "step": 7729 + }, + { + "epoch": 0.39, + "grad_norm": 1.214837663043247, + "learning_rate": 1.3846796375176083e-05, + "loss": 0.1957, + "step": 7730 + }, + { + "epoch": 0.39, + "grad_norm": 1.1073446266276163, + "learning_rate": 1.3845276088376262e-05, + "loss": 0.2112, + "step": 7731 + }, + { + "epoch": 0.39, + "grad_norm": 1.0268993364703471, + "learning_rate": 1.3843755697273109e-05, + "loss": 0.1911, + "step": 7732 + }, + { + "epoch": 0.39, + "grad_norm": 0.9552792651480384, + "learning_rate": 1.3842235201907865e-05, + "loss": 0.2289, + "step": 7733 + }, + { + "epoch": 0.39, + "grad_norm": 1.0202539534015398, + "learning_rate": 1.3840714602321774e-05, + "loss": 0.1571, + "step": 7734 + }, + { + "epoch": 0.39, + "grad_norm": 0.8635891135148347, + "learning_rate": 1.3839193898556083e-05, + "loss": 0.1876, + "step": 7735 + }, + { + "epoch": 0.39, + "grad_norm": 1.1217723335679453, + "learning_rate": 1.383767309065204e-05, + "loss": 0.1988, + "step": 7736 + }, + { + "epoch": 0.39, + "grad_norm": 1.0698837728586303, + "learning_rate": 1.3836152178650899e-05, + "loss": 0.1921, + "step": 7737 + }, + { + "epoch": 0.39, + "grad_norm": 1.0058637146346, + "learning_rate": 1.383463116259391e-05, + "loss": 0.2095, + "step": 7738 + }, + { + "epoch": 0.39, + "grad_norm": 0.9999314188011952, + "learning_rate": 1.3833110042522337e-05, + "loss": 0.2046, + "step": 7739 + }, + { + "epoch": 0.39, + "grad_norm": 1.4672213955523357, + "learning_rate": 1.3831588818477437e-05, + "loss": 0.2053, + "step": 7740 + }, + { + "epoch": 0.39, + "grad_norm": 1.0337800417999594, + "learning_rate": 1.3830067490500474e-05, + "loss": 0.2175, + "step": 7741 + }, + { + "epoch": 0.39, + "grad_norm": 1.2319011038386658, + "learning_rate": 1.3828546058632714e-05, + "loss": 0.1902, + "step": 7742 + }, + { + "epoch": 0.39, + "grad_norm": 1.3044079880196222, + "learning_rate": 1.3827024522915425e-05, + "loss": 0.2291, + "step": 7743 + }, + { + "epoch": 0.39, + "grad_norm": 0.8538781044258716, + "learning_rate": 1.382550288338988e-05, + "loss": 0.2123, + "step": 7744 + }, + { + "epoch": 0.39, + "grad_norm": 0.879168221739137, + "learning_rate": 1.3823981140097354e-05, + "loss": 0.179, + "step": 7745 + }, + { + "epoch": 0.39, + "grad_norm": 0.9413694338433246, + "learning_rate": 1.3822459293079122e-05, + "loss": 0.1967, + "step": 7746 + }, + { + "epoch": 0.39, + "grad_norm": 1.1056894724859516, + "learning_rate": 1.3820937342376467e-05, + "loss": 0.191, + "step": 7747 + }, + { + "epoch": 0.39, + "grad_norm": 0.9126483606695378, + "learning_rate": 1.3819415288030672e-05, + "loss": 0.1762, + "step": 7748 + }, + { + "epoch": 0.39, + "grad_norm": 1.1355339214771147, + "learning_rate": 1.381789313008302e-05, + "loss": 0.2235, + "step": 7749 + }, + { + "epoch": 0.39, + "grad_norm": 0.9826179082927652, + "learning_rate": 1.3816370868574804e-05, + "loss": 0.2131, + "step": 7750 + }, + { + "epoch": 0.39, + "grad_norm": 2.8860565691463593, + "learning_rate": 1.3814848503547308e-05, + "loss": 0.1782, + "step": 7751 + }, + { + "epoch": 0.39, + "grad_norm": 1.5081861453338383, + "learning_rate": 1.3813326035041832e-05, + "loss": 0.2113, + "step": 7752 + }, + { + "epoch": 0.39, + "grad_norm": 1.3892948185606584, + "learning_rate": 1.3811803463099675e-05, + "loss": 0.2063, + "step": 7753 + }, + { + "epoch": 0.39, + "grad_norm": 0.9911639651533942, + "learning_rate": 1.3810280787762131e-05, + "loss": 0.1817, + "step": 7754 + }, + { + "epoch": 0.39, + "grad_norm": 2.022925229717426, + "learning_rate": 1.3808758009070506e-05, + "loss": 0.1943, + "step": 7755 + }, + { + "epoch": 0.39, + "grad_norm": 1.3548483263654096, + "learning_rate": 1.3807235127066104e-05, + "loss": 0.2179, + "step": 7756 + }, + { + "epoch": 0.39, + "grad_norm": 1.1166557524441694, + "learning_rate": 1.3805712141790237e-05, + "loss": 0.2168, + "step": 7757 + }, + { + "epoch": 0.39, + "grad_norm": 0.9892567026131145, + "learning_rate": 1.380418905328421e-05, + "loss": 0.177, + "step": 7758 + }, + { + "epoch": 0.39, + "grad_norm": 1.0970740628385713, + "learning_rate": 1.3802665861589342e-05, + "loss": 0.229, + "step": 7759 + }, + { + "epoch": 0.39, + "grad_norm": 1.8689804869980224, + "learning_rate": 1.3801142566746945e-05, + "loss": 0.167, + "step": 7760 + }, + { + "epoch": 0.39, + "grad_norm": 1.081339687802085, + "learning_rate": 1.3799619168798346e-05, + "loss": 0.1883, + "step": 7761 + }, + { + "epoch": 0.39, + "grad_norm": 1.7086114323382755, + "learning_rate": 1.3798095667784859e-05, + "loss": 0.2153, + "step": 7762 + }, + { + "epoch": 0.39, + "grad_norm": 1.2132432860413234, + "learning_rate": 1.3796572063747813e-05, + "loss": 0.2145, + "step": 7763 + }, + { + "epoch": 0.39, + "grad_norm": 0.9325342418973277, + "learning_rate": 1.3795048356728538e-05, + "loss": 0.173, + "step": 7764 + }, + { + "epoch": 0.39, + "grad_norm": 1.1566731200513392, + "learning_rate": 1.3793524546768358e-05, + "loss": 0.1983, + "step": 7765 + }, + { + "epoch": 0.39, + "grad_norm": 1.9601961276441815, + "learning_rate": 1.3792000633908612e-05, + "loss": 0.2059, + "step": 7766 + }, + { + "epoch": 0.39, + "grad_norm": 0.8099371655872485, + "learning_rate": 1.3790476618190634e-05, + "loss": 0.2094, + "step": 7767 + }, + { + "epoch": 0.4, + "grad_norm": 1.7473037502181532, + "learning_rate": 1.3788952499655765e-05, + "loss": 0.1907, + "step": 7768 + }, + { + "epoch": 0.4, + "grad_norm": 0.9096819251192658, + "learning_rate": 1.3787428278345344e-05, + "loss": 0.1857, + "step": 7769 + }, + { + "epoch": 0.4, + "grad_norm": 0.9874258926455962, + "learning_rate": 1.378590395430072e-05, + "loss": 0.1808, + "step": 7770 + }, + { + "epoch": 0.4, + "grad_norm": 0.9443362322677041, + "learning_rate": 1.3784379527563233e-05, + "loss": 0.1781, + "step": 7771 + }, + { + "epoch": 0.4, + "grad_norm": 1.1366189588352082, + "learning_rate": 1.3782854998174243e-05, + "loss": 0.2119, + "step": 7772 + }, + { + "epoch": 0.4, + "grad_norm": 0.9441430740776766, + "learning_rate": 1.3781330366175093e-05, + "loss": 0.1915, + "step": 7773 + }, + { + "epoch": 0.4, + "grad_norm": 1.1722347958655013, + "learning_rate": 1.3779805631607144e-05, + "loss": 0.2206, + "step": 7774 + }, + { + "epoch": 0.4, + "grad_norm": 1.011282633832701, + "learning_rate": 1.3778280794511753e-05, + "loss": 0.1823, + "step": 7775 + }, + { + "epoch": 0.4, + "grad_norm": 1.1645863169044224, + "learning_rate": 1.3776755854930285e-05, + "loss": 0.2104, + "step": 7776 + }, + { + "epoch": 0.4, + "grad_norm": 1.1038162763376442, + "learning_rate": 1.3775230812904101e-05, + "loss": 0.2008, + "step": 7777 + }, + { + "epoch": 0.4, + "grad_norm": 1.5586894325978662, + "learning_rate": 1.3773705668474564e-05, + "loss": 0.2054, + "step": 7778 + }, + { + "epoch": 0.4, + "grad_norm": 1.0103587347743808, + "learning_rate": 1.377218042168305e-05, + "loss": 0.2099, + "step": 7779 + }, + { + "epoch": 0.4, + "grad_norm": 1.0698592267223566, + "learning_rate": 1.3770655072570929e-05, + "loss": 0.2056, + "step": 7780 + }, + { + "epoch": 0.4, + "grad_norm": 1.1252856387119448, + "learning_rate": 1.3769129621179578e-05, + "loss": 0.1959, + "step": 7781 + }, + { + "epoch": 0.4, + "grad_norm": 1.3938755135494856, + "learning_rate": 1.3767604067550369e-05, + "loss": 0.2072, + "step": 7782 + }, + { + "epoch": 0.4, + "grad_norm": 1.043094931379382, + "learning_rate": 1.376607841172469e-05, + "loss": 0.1888, + "step": 7783 + }, + { + "epoch": 0.4, + "grad_norm": 0.9900803380343406, + "learning_rate": 1.376455265374392e-05, + "loss": 0.199, + "step": 7784 + }, + { + "epoch": 0.4, + "grad_norm": 1.3610036980785805, + "learning_rate": 1.376302679364945e-05, + "loss": 0.17, + "step": 7785 + }, + { + "epoch": 0.4, + "grad_norm": 0.8993769498920858, + "learning_rate": 1.3761500831482661e-05, + "loss": 0.1989, + "step": 7786 + }, + { + "epoch": 0.4, + "grad_norm": 0.9511935062881577, + "learning_rate": 1.3759974767284954e-05, + "loss": 0.1929, + "step": 7787 + }, + { + "epoch": 0.4, + "grad_norm": 1.04482184811609, + "learning_rate": 1.3758448601097715e-05, + "loss": 0.1905, + "step": 7788 + }, + { + "epoch": 0.4, + "grad_norm": 1.0151552795044474, + "learning_rate": 1.3756922332962349e-05, + "loss": 0.1841, + "step": 7789 + }, + { + "epoch": 0.4, + "grad_norm": 1.1638260461131706, + "learning_rate": 1.375539596292025e-05, + "loss": 0.203, + "step": 7790 + }, + { + "epoch": 0.4, + "grad_norm": 1.1345663298986353, + "learning_rate": 1.3753869491012822e-05, + "loss": 0.1927, + "step": 7791 + }, + { + "epoch": 0.4, + "grad_norm": 1.274166847278216, + "learning_rate": 1.3752342917281474e-05, + "loss": 0.1876, + "step": 7792 + }, + { + "epoch": 0.4, + "grad_norm": 3.2116466657736265, + "learning_rate": 1.3750816241767612e-05, + "loss": 0.2162, + "step": 7793 + }, + { + "epoch": 0.4, + "grad_norm": 3.292280681415955, + "learning_rate": 1.374928946451265e-05, + "loss": 0.2137, + "step": 7794 + }, + { + "epoch": 0.4, + "grad_norm": 0.9538512883912856, + "learning_rate": 1.3747762585557995e-05, + "loss": 0.1932, + "step": 7795 + }, + { + "epoch": 0.4, + "grad_norm": 0.902903017738043, + "learning_rate": 1.374623560494507e-05, + "loss": 0.2053, + "step": 7796 + }, + { + "epoch": 0.4, + "grad_norm": 0.9089488041497878, + "learning_rate": 1.3744708522715295e-05, + "loss": 0.1868, + "step": 7797 + }, + { + "epoch": 0.4, + "grad_norm": 1.036402555926078, + "learning_rate": 1.3743181338910088e-05, + "loss": 0.175, + "step": 7798 + }, + { + "epoch": 0.4, + "grad_norm": 1.1412140081353452, + "learning_rate": 1.3741654053570877e-05, + "loss": 0.2069, + "step": 7799 + }, + { + "epoch": 0.4, + "grad_norm": 0.9812081753904945, + "learning_rate": 1.3740126666739086e-05, + "loss": 0.1921, + "step": 7800 + }, + { + "epoch": 0.4, + "grad_norm": 0.8599736416295825, + "learning_rate": 1.3738599178456149e-05, + "loss": 0.1993, + "step": 7801 + }, + { + "epoch": 0.4, + "grad_norm": 1.3933742441258217, + "learning_rate": 1.37370715887635e-05, + "loss": 0.2095, + "step": 7802 + }, + { + "epoch": 0.4, + "grad_norm": 0.9186112337500361, + "learning_rate": 1.3735543897702572e-05, + "loss": 0.2101, + "step": 7803 + }, + { + "epoch": 0.4, + "grad_norm": 0.7780230586723094, + "learning_rate": 1.3734016105314803e-05, + "loss": 0.1716, + "step": 7804 + }, + { + "epoch": 0.4, + "grad_norm": 0.912468228908501, + "learning_rate": 1.3732488211641638e-05, + "loss": 0.1777, + "step": 7805 + }, + { + "epoch": 0.4, + "grad_norm": 1.0843294009632642, + "learning_rate": 1.3730960216724518e-05, + "loss": 0.1716, + "step": 7806 + }, + { + "epoch": 0.4, + "grad_norm": 1.3501717023837643, + "learning_rate": 1.3729432120604895e-05, + "loss": 0.1975, + "step": 7807 + }, + { + "epoch": 0.4, + "grad_norm": 1.033915342936664, + "learning_rate": 1.3727903923324211e-05, + "loss": 0.1954, + "step": 7808 + }, + { + "epoch": 0.4, + "grad_norm": 0.9776691062281382, + "learning_rate": 1.3726375624923925e-05, + "loss": 0.1895, + "step": 7809 + }, + { + "epoch": 0.4, + "grad_norm": 0.8463396302747215, + "learning_rate": 1.3724847225445488e-05, + "loss": 0.1929, + "step": 7810 + }, + { + "epoch": 0.4, + "grad_norm": 1.1511623956776489, + "learning_rate": 1.3723318724930362e-05, + "loss": 0.168, + "step": 7811 + }, + { + "epoch": 0.4, + "grad_norm": 1.6733247641066649, + "learning_rate": 1.3721790123420002e-05, + "loss": 0.2306, + "step": 7812 + }, + { + "epoch": 0.4, + "grad_norm": 1.0738043696689645, + "learning_rate": 1.3720261420955874e-05, + "loss": 0.1865, + "step": 7813 + }, + { + "epoch": 0.4, + "grad_norm": 0.9350696789972645, + "learning_rate": 1.3718732617579449e-05, + "loss": 0.1919, + "step": 7814 + }, + { + "epoch": 0.4, + "grad_norm": 1.0062826557369187, + "learning_rate": 1.371720371333219e-05, + "loss": 0.178, + "step": 7815 + }, + { + "epoch": 0.4, + "grad_norm": 1.095925858088011, + "learning_rate": 1.3715674708255571e-05, + "loss": 0.1966, + "step": 7816 + }, + { + "epoch": 0.4, + "grad_norm": 1.072683440548299, + "learning_rate": 1.3714145602391063e-05, + "loss": 0.1699, + "step": 7817 + }, + { + "epoch": 0.4, + "grad_norm": 1.0745173488233095, + "learning_rate": 1.3712616395780148e-05, + "loss": 0.1882, + "step": 7818 + }, + { + "epoch": 0.4, + "grad_norm": 0.9455046613837105, + "learning_rate": 1.3711087088464303e-05, + "loss": 0.1703, + "step": 7819 + }, + { + "epoch": 0.4, + "grad_norm": 2.875943906638408, + "learning_rate": 1.3709557680485013e-05, + "loss": 0.21, + "step": 7820 + }, + { + "epoch": 0.4, + "grad_norm": 3.555443947600797, + "learning_rate": 1.3708028171883757e-05, + "loss": 0.2067, + "step": 7821 + }, + { + "epoch": 0.4, + "grad_norm": 1.019118936893599, + "learning_rate": 1.3706498562702032e-05, + "loss": 0.1905, + "step": 7822 + }, + { + "epoch": 0.4, + "grad_norm": 0.9698314019007279, + "learning_rate": 1.3704968852981322e-05, + "loss": 0.2184, + "step": 7823 + }, + { + "epoch": 0.4, + "grad_norm": 0.9142833794886264, + "learning_rate": 1.3703439042763122e-05, + "loss": 0.2014, + "step": 7824 + }, + { + "epoch": 0.4, + "grad_norm": 0.9029009788321303, + "learning_rate": 1.3701909132088931e-05, + "loss": 0.2014, + "step": 7825 + }, + { + "epoch": 0.4, + "grad_norm": 1.252169628887097, + "learning_rate": 1.3700379121000245e-05, + "loss": 0.2111, + "step": 7826 + }, + { + "epoch": 0.4, + "grad_norm": 1.3070075999421975, + "learning_rate": 1.3698849009538564e-05, + "loss": 0.2055, + "step": 7827 + }, + { + "epoch": 0.4, + "grad_norm": 1.3437122932148957, + "learning_rate": 1.3697318797745399e-05, + "loss": 0.1853, + "step": 7828 + }, + { + "epoch": 0.4, + "grad_norm": 1.0379265187071747, + "learning_rate": 1.3695788485662248e-05, + "loss": 0.1936, + "step": 7829 + }, + { + "epoch": 0.4, + "grad_norm": 1.1959800320200566, + "learning_rate": 1.3694258073330626e-05, + "loss": 0.1847, + "step": 7830 + }, + { + "epoch": 0.4, + "grad_norm": 1.0100684236799955, + "learning_rate": 1.3692727560792048e-05, + "loss": 0.2191, + "step": 7831 + }, + { + "epoch": 0.4, + "grad_norm": 1.0045741496998244, + "learning_rate": 1.3691196948088026e-05, + "loss": 0.2086, + "step": 7832 + }, + { + "epoch": 0.4, + "grad_norm": 0.880542853536741, + "learning_rate": 1.3689666235260078e-05, + "loss": 0.2093, + "step": 7833 + }, + { + "epoch": 0.4, + "grad_norm": 0.8663881683353095, + "learning_rate": 1.3688135422349724e-05, + "loss": 0.1842, + "step": 7834 + }, + { + "epoch": 0.4, + "grad_norm": 1.1894788719346479, + "learning_rate": 1.3686604509398489e-05, + "loss": 0.215, + "step": 7835 + }, + { + "epoch": 0.4, + "grad_norm": 1.239200767986605, + "learning_rate": 1.3685073496447898e-05, + "loss": 0.2123, + "step": 7836 + }, + { + "epoch": 0.4, + "grad_norm": 1.0676797817770391, + "learning_rate": 1.3683542383539482e-05, + "loss": 0.1925, + "step": 7837 + }, + { + "epoch": 0.4, + "grad_norm": 1.179582556696482, + "learning_rate": 1.3682011170714771e-05, + "loss": 0.1993, + "step": 7838 + }, + { + "epoch": 0.4, + "grad_norm": 0.9022362728092863, + "learning_rate": 1.3680479858015297e-05, + "loss": 0.184, + "step": 7839 + }, + { + "epoch": 0.4, + "grad_norm": 0.8072628959257323, + "learning_rate": 1.3678948445482598e-05, + "loss": 0.1991, + "step": 7840 + }, + { + "epoch": 0.4, + "grad_norm": 1.1446276432557878, + "learning_rate": 1.3677416933158216e-05, + "loss": 0.1997, + "step": 7841 + }, + { + "epoch": 0.4, + "grad_norm": 1.3285163458160731, + "learning_rate": 1.3675885321083693e-05, + "loss": 0.1934, + "step": 7842 + }, + { + "epoch": 0.4, + "grad_norm": 1.0662290021048162, + "learning_rate": 1.3674353609300571e-05, + "loss": 0.1701, + "step": 7843 + }, + { + "epoch": 0.4, + "grad_norm": 0.8600687517356913, + "learning_rate": 1.3672821797850402e-05, + "loss": 0.1782, + "step": 7844 + }, + { + "epoch": 0.4, + "grad_norm": 0.9653570695824466, + "learning_rate": 1.3671289886774733e-05, + "loss": 0.2028, + "step": 7845 + }, + { + "epoch": 0.4, + "grad_norm": 0.9470181500627005, + "learning_rate": 1.3669757876115117e-05, + "loss": 0.2137, + "step": 7846 + }, + { + "epoch": 0.4, + "grad_norm": 1.3353820715794826, + "learning_rate": 1.3668225765913114e-05, + "loss": 0.1769, + "step": 7847 + }, + { + "epoch": 0.4, + "grad_norm": 1.0425801707777265, + "learning_rate": 1.3666693556210278e-05, + "loss": 0.1965, + "step": 7848 + }, + { + "epoch": 0.4, + "grad_norm": 0.9905064591861154, + "learning_rate": 1.3665161247048173e-05, + "loss": 0.1833, + "step": 7849 + }, + { + "epoch": 0.4, + "grad_norm": 1.2775710642059581, + "learning_rate": 1.3663628838468364e-05, + "loss": 0.1912, + "step": 7850 + }, + { + "epoch": 0.4, + "grad_norm": 1.2917881575654395, + "learning_rate": 1.3662096330512413e-05, + "loss": 0.2011, + "step": 7851 + }, + { + "epoch": 0.4, + "grad_norm": 1.0359447949370741, + "learning_rate": 1.3660563723221894e-05, + "loss": 0.2311, + "step": 7852 + }, + { + "epoch": 0.4, + "grad_norm": 0.8779461362875117, + "learning_rate": 1.3659031016638376e-05, + "loss": 0.2028, + "step": 7853 + }, + { + "epoch": 0.4, + "grad_norm": 1.0540674291123995, + "learning_rate": 1.3657498210803435e-05, + "loss": 0.1967, + "step": 7854 + }, + { + "epoch": 0.4, + "grad_norm": 1.4087280910830706, + "learning_rate": 1.3655965305758652e-05, + "loss": 0.1976, + "step": 7855 + }, + { + "epoch": 0.4, + "grad_norm": 0.8105676637666767, + "learning_rate": 1.36544323015456e-05, + "loss": 0.214, + "step": 7856 + }, + { + "epoch": 0.4, + "grad_norm": 0.9090277786069536, + "learning_rate": 1.3652899198205864e-05, + "loss": 0.1919, + "step": 7857 + }, + { + "epoch": 0.4, + "grad_norm": 1.0320170900219516, + "learning_rate": 1.3651365995781034e-05, + "loss": 0.1886, + "step": 7858 + }, + { + "epoch": 0.4, + "grad_norm": 0.8236703494092736, + "learning_rate": 1.3649832694312695e-05, + "loss": 0.2061, + "step": 7859 + }, + { + "epoch": 0.4, + "grad_norm": 1.380715986676055, + "learning_rate": 1.3648299293842438e-05, + "loss": 0.1852, + "step": 7860 + }, + { + "epoch": 0.4, + "grad_norm": 0.7081578575084013, + "learning_rate": 1.3646765794411854e-05, + "loss": 0.172, + "step": 7861 + }, + { + "epoch": 0.4, + "grad_norm": 0.8488985864335844, + "learning_rate": 1.3645232196062544e-05, + "loss": 0.2182, + "step": 7862 + }, + { + "epoch": 0.4, + "grad_norm": 1.006743983617717, + "learning_rate": 1.3643698498836104e-05, + "loss": 0.1942, + "step": 7863 + }, + { + "epoch": 0.4, + "grad_norm": 1.14256700029364, + "learning_rate": 1.3642164702774137e-05, + "loss": 0.196, + "step": 7864 + }, + { + "epoch": 0.4, + "grad_norm": 1.1811893526465074, + "learning_rate": 1.3640630807918246e-05, + "loss": 0.1893, + "step": 7865 + }, + { + "epoch": 0.4, + "grad_norm": 0.8455848742795187, + "learning_rate": 1.3639096814310037e-05, + "loss": 0.2269, + "step": 7866 + }, + { + "epoch": 0.4, + "grad_norm": 0.7749192958494059, + "learning_rate": 1.3637562721991127e-05, + "loss": 0.1869, + "step": 7867 + }, + { + "epoch": 0.4, + "grad_norm": 0.8349972385642351, + "learning_rate": 1.3636028531003118e-05, + "loss": 0.2016, + "step": 7868 + }, + { + "epoch": 0.4, + "grad_norm": 0.9953791132956865, + "learning_rate": 1.3634494241387632e-05, + "loss": 0.2042, + "step": 7869 + }, + { + "epoch": 0.4, + "grad_norm": 0.9222005288634191, + "learning_rate": 1.363295985318628e-05, + "loss": 0.2072, + "step": 7870 + }, + { + "epoch": 0.4, + "grad_norm": 1.075824838479233, + "learning_rate": 1.3631425366440691e-05, + "loss": 0.1992, + "step": 7871 + }, + { + "epoch": 0.4, + "grad_norm": 1.5194579873674021, + "learning_rate": 1.3629890781192486e-05, + "loss": 0.1939, + "step": 7872 + }, + { + "epoch": 0.4, + "grad_norm": 0.7643820525970437, + "learning_rate": 1.3628356097483288e-05, + "loss": 0.1971, + "step": 7873 + }, + { + "epoch": 0.4, + "grad_norm": 1.197943921189177, + "learning_rate": 1.362682131535472e-05, + "loss": 0.1876, + "step": 7874 + }, + { + "epoch": 0.4, + "grad_norm": 1.2330583994530917, + "learning_rate": 1.3625286434848424e-05, + "loss": 0.1771, + "step": 7875 + }, + { + "epoch": 0.4, + "grad_norm": 1.1913529068812612, + "learning_rate": 1.3623751456006027e-05, + "loss": 0.201, + "step": 7876 + }, + { + "epoch": 0.4, + "grad_norm": 1.1474609620458969, + "learning_rate": 1.3622216378869169e-05, + "loss": 0.1877, + "step": 7877 + }, + { + "epoch": 0.4, + "grad_norm": 1.5636328342053893, + "learning_rate": 1.3620681203479484e-05, + "loss": 0.1911, + "step": 7878 + }, + { + "epoch": 0.4, + "grad_norm": 1.1944564472343593, + "learning_rate": 1.3619145929878617e-05, + "loss": 0.1999, + "step": 7879 + }, + { + "epoch": 0.4, + "grad_norm": 0.9179919973426882, + "learning_rate": 1.3617610558108214e-05, + "loss": 0.2123, + "step": 7880 + }, + { + "epoch": 0.4, + "grad_norm": 1.060644212888549, + "learning_rate": 1.3616075088209921e-05, + "loss": 0.2137, + "step": 7881 + }, + { + "epoch": 0.4, + "grad_norm": 0.857124640067879, + "learning_rate": 1.3614539520225388e-05, + "loss": 0.1664, + "step": 7882 + }, + { + "epoch": 0.4, + "grad_norm": 1.6058142618696125, + "learning_rate": 1.361300385419626e-05, + "loss": 0.1857, + "step": 7883 + }, + { + "epoch": 0.4, + "grad_norm": 1.335899099030283, + "learning_rate": 1.3611468090164203e-05, + "loss": 0.2105, + "step": 7884 + }, + { + "epoch": 0.4, + "grad_norm": 1.0110629983418822, + "learning_rate": 1.3609932228170873e-05, + "loss": 0.189, + "step": 7885 + }, + { + "epoch": 0.4, + "grad_norm": 0.9841281444487436, + "learning_rate": 1.3608396268257922e-05, + "loss": 0.1885, + "step": 7886 + }, + { + "epoch": 0.4, + "grad_norm": 0.9728002662716808, + "learning_rate": 1.360686021046702e-05, + "loss": 0.2076, + "step": 7887 + }, + { + "epoch": 0.4, + "grad_norm": 1.1365569391346437, + "learning_rate": 1.360532405483983e-05, + "loss": 0.2065, + "step": 7888 + }, + { + "epoch": 0.4, + "grad_norm": 1.5428500470719355, + "learning_rate": 1.3603787801418025e-05, + "loss": 0.1843, + "step": 7889 + }, + { + "epoch": 0.4, + "grad_norm": 0.9149812568191804, + "learning_rate": 1.3602251450243273e-05, + "loss": 0.2111, + "step": 7890 + }, + { + "epoch": 0.4, + "grad_norm": 1.4272311709229732, + "learning_rate": 1.3600715001357241e-05, + "loss": 0.2076, + "step": 7891 + }, + { + "epoch": 0.4, + "grad_norm": 1.1625936777606585, + "learning_rate": 1.3599178454801615e-05, + "loss": 0.2171, + "step": 7892 + }, + { + "epoch": 0.4, + "grad_norm": 0.8890855284761452, + "learning_rate": 1.3597641810618071e-05, + "loss": 0.2132, + "step": 7893 + }, + { + "epoch": 0.4, + "grad_norm": 0.9647647431889569, + "learning_rate": 1.3596105068848289e-05, + "loss": 0.2097, + "step": 7894 + }, + { + "epoch": 0.4, + "grad_norm": 0.9228742641602002, + "learning_rate": 1.3594568229533953e-05, + "loss": 0.1896, + "step": 7895 + }, + { + "epoch": 0.4, + "grad_norm": 0.9523702934499297, + "learning_rate": 1.359303129271675e-05, + "loss": 0.2188, + "step": 7896 + }, + { + "epoch": 0.4, + "grad_norm": 1.2907224300111102, + "learning_rate": 1.3591494258438372e-05, + "loss": 0.2196, + "step": 7897 + }, + { + "epoch": 0.4, + "grad_norm": 1.4941226422465974, + "learning_rate": 1.3589957126740508e-05, + "loss": 0.1844, + "step": 7898 + }, + { + "epoch": 0.4, + "grad_norm": 1.39262869659887, + "learning_rate": 1.3588419897664855e-05, + "loss": 0.2123, + "step": 7899 + }, + { + "epoch": 0.4, + "grad_norm": 1.3557954440930702, + "learning_rate": 1.358688257125311e-05, + "loss": 0.2364, + "step": 7900 + }, + { + "epoch": 0.4, + "grad_norm": 2.8274355379770744, + "learning_rate": 1.3585345147546971e-05, + "loss": 0.2012, + "step": 7901 + }, + { + "epoch": 0.4, + "grad_norm": 1.445332020814645, + "learning_rate": 1.3583807626588143e-05, + "loss": 0.2053, + "step": 7902 + }, + { + "epoch": 0.4, + "grad_norm": 1.0809255064379315, + "learning_rate": 1.3582270008418332e-05, + "loss": 0.1686, + "step": 7903 + }, + { + "epoch": 0.4, + "grad_norm": 1.1250535830016777, + "learning_rate": 1.3580732293079244e-05, + "loss": 0.1923, + "step": 7904 + }, + { + "epoch": 0.4, + "grad_norm": 0.9187987560101761, + "learning_rate": 1.357919448061259e-05, + "loss": 0.1737, + "step": 7905 + }, + { + "epoch": 0.4, + "grad_norm": 0.9939882858974505, + "learning_rate": 1.3577656571060082e-05, + "loss": 0.1889, + "step": 7906 + }, + { + "epoch": 0.4, + "grad_norm": 1.1124667983403917, + "learning_rate": 1.357611856446344e-05, + "loss": 0.204, + "step": 7907 + }, + { + "epoch": 0.4, + "grad_norm": 1.2556400683929114, + "learning_rate": 1.3574580460864381e-05, + "loss": 0.1767, + "step": 7908 + }, + { + "epoch": 0.4, + "grad_norm": 1.139433671837242, + "learning_rate": 1.3573042260304623e-05, + "loss": 0.2201, + "step": 7909 + }, + { + "epoch": 0.4, + "grad_norm": 0.7612526668488224, + "learning_rate": 1.3571503962825892e-05, + "loss": 0.1781, + "step": 7910 + }, + { + "epoch": 0.4, + "grad_norm": 0.9627113385215519, + "learning_rate": 1.3569965568469915e-05, + "loss": 0.1901, + "step": 7911 + }, + { + "epoch": 0.4, + "grad_norm": 0.8696829378181286, + "learning_rate": 1.3568427077278422e-05, + "loss": 0.1642, + "step": 7912 + }, + { + "epoch": 0.4, + "grad_norm": 0.8150121776016657, + "learning_rate": 1.356688848929314e-05, + "loss": 0.1953, + "step": 7913 + }, + { + "epoch": 0.4, + "grad_norm": 1.192895214033769, + "learning_rate": 1.3565349804555805e-05, + "loss": 0.1792, + "step": 7914 + }, + { + "epoch": 0.4, + "grad_norm": 0.9312534145645553, + "learning_rate": 1.3563811023108157e-05, + "loss": 0.1973, + "step": 7915 + }, + { + "epoch": 0.4, + "grad_norm": 1.102289423897505, + "learning_rate": 1.3562272144991934e-05, + "loss": 0.187, + "step": 7916 + }, + { + "epoch": 0.4, + "grad_norm": 1.5551690732592423, + "learning_rate": 1.3560733170248878e-05, + "loss": 0.2014, + "step": 7917 + }, + { + "epoch": 0.4, + "grad_norm": 0.9967216384377598, + "learning_rate": 1.3559194098920732e-05, + "loss": 0.1793, + "step": 7918 + }, + { + "epoch": 0.4, + "grad_norm": 0.98329460157344, + "learning_rate": 1.3557654931049247e-05, + "loss": 0.1944, + "step": 7919 + }, + { + "epoch": 0.4, + "grad_norm": 2.558620907919238, + "learning_rate": 1.355611566667617e-05, + "loss": 0.1906, + "step": 7920 + }, + { + "epoch": 0.4, + "grad_norm": 1.1066331357961985, + "learning_rate": 1.3554576305843257e-05, + "loss": 0.2229, + "step": 7921 + }, + { + "epoch": 0.4, + "grad_norm": 0.9693267653975826, + "learning_rate": 1.3553036848592261e-05, + "loss": 0.2108, + "step": 7922 + }, + { + "epoch": 0.4, + "grad_norm": 0.8412025440545284, + "learning_rate": 1.3551497294964935e-05, + "loss": 0.2106, + "step": 7923 + }, + { + "epoch": 0.4, + "grad_norm": 1.1110383384259537, + "learning_rate": 1.3549957645003046e-05, + "loss": 0.1907, + "step": 7924 + }, + { + "epoch": 0.4, + "grad_norm": 0.8034700067923729, + "learning_rate": 1.3548417898748361e-05, + "loss": 0.2206, + "step": 7925 + }, + { + "epoch": 0.4, + "grad_norm": 0.7850263892895291, + "learning_rate": 1.3546878056242632e-05, + "loss": 0.1809, + "step": 7926 + }, + { + "epoch": 0.4, + "grad_norm": 1.1270612963348563, + "learning_rate": 1.354533811752764e-05, + "loss": 0.1852, + "step": 7927 + }, + { + "epoch": 0.4, + "grad_norm": 0.9435404761329577, + "learning_rate": 1.3543798082645152e-05, + "loss": 0.2031, + "step": 7928 + }, + { + "epoch": 0.4, + "grad_norm": 0.8921086902767899, + "learning_rate": 1.3542257951636939e-05, + "loss": 0.2034, + "step": 7929 + }, + { + "epoch": 0.4, + "grad_norm": 1.6722635092982678, + "learning_rate": 1.354071772454478e-05, + "loss": 0.2064, + "step": 7930 + }, + { + "epoch": 0.4, + "grad_norm": 2.201211190418289, + "learning_rate": 1.353917740141045e-05, + "loss": 0.2042, + "step": 7931 + }, + { + "epoch": 0.4, + "grad_norm": 0.9930918100552604, + "learning_rate": 1.3537636982275734e-05, + "loss": 0.1872, + "step": 7932 + }, + { + "epoch": 0.4, + "grad_norm": 0.9903922908206649, + "learning_rate": 1.3536096467182418e-05, + "loss": 0.1829, + "step": 7933 + }, + { + "epoch": 0.4, + "grad_norm": 0.8269314138269611, + "learning_rate": 1.3534555856172285e-05, + "loss": 0.2094, + "step": 7934 + }, + { + "epoch": 0.4, + "grad_norm": 1.193805357692574, + "learning_rate": 1.3533015149287123e-05, + "loss": 0.1892, + "step": 7935 + }, + { + "epoch": 0.4, + "grad_norm": 2.504864313457114, + "learning_rate": 1.3531474346568724e-05, + "loss": 0.1833, + "step": 7936 + }, + { + "epoch": 0.4, + "grad_norm": 0.8930310541385884, + "learning_rate": 1.3529933448058885e-05, + "loss": 0.1991, + "step": 7937 + }, + { + "epoch": 0.4, + "grad_norm": 1.112814505702536, + "learning_rate": 1.3528392453799403e-05, + "loss": 0.2099, + "step": 7938 + }, + { + "epoch": 0.4, + "grad_norm": 1.1446346643591627, + "learning_rate": 1.352685136383208e-05, + "loss": 0.2001, + "step": 7939 + }, + { + "epoch": 0.4, + "grad_norm": 0.8912392415537557, + "learning_rate": 1.3525310178198707e-05, + "loss": 0.1905, + "step": 7940 + }, + { + "epoch": 0.4, + "grad_norm": 1.1065270220767909, + "learning_rate": 1.3523768896941101e-05, + "loss": 0.2287, + "step": 7941 + }, + { + "epoch": 0.4, + "grad_norm": 1.0090741292296308, + "learning_rate": 1.3522227520101064e-05, + "loss": 0.1835, + "step": 7942 + }, + { + "epoch": 0.4, + "grad_norm": 0.9676105550984178, + "learning_rate": 1.3520686047720409e-05, + "loss": 0.1913, + "step": 7943 + }, + { + "epoch": 0.4, + "grad_norm": 1.073028571367729, + "learning_rate": 1.3519144479840942e-05, + "loss": 0.2014, + "step": 7944 + }, + { + "epoch": 0.4, + "grad_norm": 0.796391365846085, + "learning_rate": 1.3517602816504482e-05, + "loss": 0.193, + "step": 7945 + }, + { + "epoch": 0.4, + "grad_norm": 1.0138190132905434, + "learning_rate": 1.351606105775285e-05, + "loss": 0.1995, + "step": 7946 + }, + { + "epoch": 0.4, + "grad_norm": 1.0018287764775855, + "learning_rate": 1.3514519203627863e-05, + "loss": 0.1938, + "step": 7947 + }, + { + "epoch": 0.4, + "grad_norm": 0.8105897346963113, + "learning_rate": 1.3512977254171343e-05, + "loss": 0.1963, + "step": 7948 + }, + { + "epoch": 0.4, + "grad_norm": 0.8502494921740894, + "learning_rate": 1.3511435209425115e-05, + "loss": 0.2106, + "step": 7949 + }, + { + "epoch": 0.4, + "grad_norm": 1.1600577932155047, + "learning_rate": 1.3509893069431011e-05, + "loss": 0.1868, + "step": 7950 + }, + { + "epoch": 0.4, + "grad_norm": 1.1029932924061727, + "learning_rate": 1.3508350834230857e-05, + "loss": 0.2033, + "step": 7951 + }, + { + "epoch": 0.4, + "grad_norm": 0.8956362559777115, + "learning_rate": 1.3506808503866491e-05, + "loss": 0.1889, + "step": 7952 + }, + { + "epoch": 0.4, + "grad_norm": 1.144965652654032, + "learning_rate": 1.3505266078379741e-05, + "loss": 0.1805, + "step": 7953 + }, + { + "epoch": 0.4, + "grad_norm": 0.9835022229395287, + "learning_rate": 1.3503723557812455e-05, + "loss": 0.1973, + "step": 7954 + }, + { + "epoch": 0.4, + "grad_norm": 0.7857287589948215, + "learning_rate": 1.3502180942206472e-05, + "loss": 0.1822, + "step": 7955 + }, + { + "epoch": 0.4, + "grad_norm": 1.045687291906436, + "learning_rate": 1.350063823160363e-05, + "loss": 0.1969, + "step": 7956 + }, + { + "epoch": 0.4, + "grad_norm": 0.8156672686588005, + "learning_rate": 1.3499095426045779e-05, + "loss": 0.1872, + "step": 7957 + }, + { + "epoch": 0.4, + "grad_norm": 1.2209587892466776, + "learning_rate": 1.3497552525574763e-05, + "loss": 0.2176, + "step": 7958 + }, + { + "epoch": 0.4, + "grad_norm": 0.8184576292391839, + "learning_rate": 1.3496009530232444e-05, + "loss": 0.2073, + "step": 7959 + }, + { + "epoch": 0.4, + "grad_norm": 0.7409523366673464, + "learning_rate": 1.3494466440060667e-05, + "loss": 0.1898, + "step": 7960 + }, + { + "epoch": 0.4, + "grad_norm": 0.9175046443611684, + "learning_rate": 1.349292325510129e-05, + "loss": 0.1953, + "step": 7961 + }, + { + "epoch": 0.4, + "grad_norm": 0.991920168357191, + "learning_rate": 1.3491379975396171e-05, + "loss": 0.1846, + "step": 7962 + }, + { + "epoch": 0.4, + "grad_norm": 0.9289700337194272, + "learning_rate": 1.3489836600987173e-05, + "loss": 0.1884, + "step": 7963 + }, + { + "epoch": 0.4, + "grad_norm": 0.9731661635607622, + "learning_rate": 1.3488293131916161e-05, + "loss": 0.1642, + "step": 7964 + }, + { + "epoch": 0.41, + "grad_norm": 0.8785920997118186, + "learning_rate": 1.3486749568225002e-05, + "loss": 0.1948, + "step": 7965 + }, + { + "epoch": 0.41, + "grad_norm": 0.9600923761808118, + "learning_rate": 1.3485205909955562e-05, + "loss": 0.1848, + "step": 7966 + }, + { + "epoch": 0.41, + "grad_norm": 0.9832943610219766, + "learning_rate": 1.3483662157149713e-05, + "loss": 0.181, + "step": 7967 + }, + { + "epoch": 0.41, + "grad_norm": 1.164552711160505, + "learning_rate": 1.3482118309849335e-05, + "loss": 0.2095, + "step": 7968 + }, + { + "epoch": 0.41, + "grad_norm": 0.8209572252079305, + "learning_rate": 1.34805743680963e-05, + "loss": 0.1783, + "step": 7969 + }, + { + "epoch": 0.41, + "grad_norm": 2.279067529820421, + "learning_rate": 1.3479030331932488e-05, + "loss": 0.18, + "step": 7970 + }, + { + "epoch": 0.41, + "grad_norm": 0.70461197632034, + "learning_rate": 1.347748620139978e-05, + "loss": 0.1622, + "step": 7971 + }, + { + "epoch": 0.41, + "grad_norm": 0.9765087054600745, + "learning_rate": 1.3475941976540066e-05, + "loss": 0.2011, + "step": 7972 + }, + { + "epoch": 0.41, + "grad_norm": 1.6868005859222541, + "learning_rate": 1.3474397657395231e-05, + "loss": 0.1957, + "step": 7973 + }, + { + "epoch": 0.41, + "grad_norm": 0.8892658074672084, + "learning_rate": 1.347285324400716e-05, + "loss": 0.2046, + "step": 7974 + }, + { + "epoch": 0.41, + "grad_norm": 1.2671014445922417, + "learning_rate": 1.347130873641775e-05, + "loss": 0.2059, + "step": 7975 + }, + { + "epoch": 0.41, + "grad_norm": 0.8699850071913555, + "learning_rate": 1.346976413466889e-05, + "loss": 0.1762, + "step": 7976 + }, + { + "epoch": 0.41, + "grad_norm": 1.8392292585051886, + "learning_rate": 1.3468219438802487e-05, + "loss": 0.1654, + "step": 7977 + }, + { + "epoch": 0.41, + "grad_norm": 1.5843249219827986, + "learning_rate": 1.3466674648860436e-05, + "loss": 0.2023, + "step": 7978 + }, + { + "epoch": 0.41, + "grad_norm": 0.8306870529471548, + "learning_rate": 1.3465129764884636e-05, + "loss": 0.1781, + "step": 7979 + }, + { + "epoch": 0.41, + "grad_norm": 0.9424208434182422, + "learning_rate": 1.3463584786916997e-05, + "loss": 0.2124, + "step": 7980 + }, + { + "epoch": 0.41, + "grad_norm": 1.0553131794893045, + "learning_rate": 1.3462039714999426e-05, + "loss": 0.2425, + "step": 7981 + }, + { + "epoch": 0.41, + "grad_norm": 0.8565393972030418, + "learning_rate": 1.3460494549173833e-05, + "loss": 0.1893, + "step": 7982 + }, + { + "epoch": 0.41, + "grad_norm": 1.3153057889606654, + "learning_rate": 1.3458949289482126e-05, + "loss": 0.174, + "step": 7983 + }, + { + "epoch": 0.41, + "grad_norm": 0.972633249567405, + "learning_rate": 1.3457403935966227e-05, + "loss": 0.1992, + "step": 7984 + }, + { + "epoch": 0.41, + "grad_norm": 1.1382920500187748, + "learning_rate": 1.345585848866805e-05, + "loss": 0.2359, + "step": 7985 + }, + { + "epoch": 0.41, + "grad_norm": 0.8575969850834319, + "learning_rate": 1.3454312947629515e-05, + "loss": 0.1995, + "step": 7986 + }, + { + "epoch": 0.41, + "grad_norm": 1.0979026964467675, + "learning_rate": 1.345276731289255e-05, + "loss": 0.2002, + "step": 7987 + }, + { + "epoch": 0.41, + "grad_norm": 1.079195049925033, + "learning_rate": 1.3451221584499073e-05, + "loss": 0.2269, + "step": 7988 + }, + { + "epoch": 0.41, + "grad_norm": 0.9035088056320559, + "learning_rate": 1.3449675762491017e-05, + "loss": 0.1953, + "step": 7989 + }, + { + "epoch": 0.41, + "grad_norm": 0.8710857518370761, + "learning_rate": 1.3448129846910312e-05, + "loss": 0.2033, + "step": 7990 + }, + { + "epoch": 0.41, + "grad_norm": 0.7995548269495811, + "learning_rate": 1.344658383779889e-05, + "loss": 0.2021, + "step": 7991 + }, + { + "epoch": 0.41, + "grad_norm": 1.317150873310952, + "learning_rate": 1.3445037735198684e-05, + "loss": 0.2065, + "step": 7992 + }, + { + "epoch": 0.41, + "grad_norm": 0.8921681775584891, + "learning_rate": 1.3443491539151636e-05, + "loss": 0.2054, + "step": 7993 + }, + { + "epoch": 0.41, + "grad_norm": 0.8545579901327621, + "learning_rate": 1.3441945249699687e-05, + "loss": 0.2003, + "step": 7994 + }, + { + "epoch": 0.41, + "grad_norm": 1.0063728134691161, + "learning_rate": 1.3440398866884781e-05, + "loss": 0.2002, + "step": 7995 + }, + { + "epoch": 0.41, + "grad_norm": 0.9023502076409247, + "learning_rate": 1.343885239074886e-05, + "loss": 0.2044, + "step": 7996 + }, + { + "epoch": 0.41, + "grad_norm": 0.7519701313854469, + "learning_rate": 1.343730582133387e-05, + "loss": 0.1963, + "step": 7997 + }, + { + "epoch": 0.41, + "grad_norm": 0.9367432902659597, + "learning_rate": 1.3435759158681767e-05, + "loss": 0.1965, + "step": 7998 + }, + { + "epoch": 0.41, + "grad_norm": 0.9849913851261063, + "learning_rate": 1.3434212402834503e-05, + "loss": 0.1797, + "step": 7999 + }, + { + "epoch": 0.41, + "grad_norm": 0.9043783031126352, + "learning_rate": 1.3432665553834036e-05, + "loss": 0.1825, + "step": 8000 + }, + { + "epoch": 0.41, + "grad_norm": 0.9773402570390824, + "learning_rate": 1.3431118611722317e-05, + "loss": 0.1956, + "step": 8001 + }, + { + "epoch": 0.41, + "grad_norm": 0.9198481783329289, + "learning_rate": 1.3429571576541315e-05, + "loss": 0.1996, + "step": 8002 + }, + { + "epoch": 0.41, + "grad_norm": 0.9380731530654524, + "learning_rate": 1.3428024448332992e-05, + "loss": 0.1938, + "step": 8003 + }, + { + "epoch": 0.41, + "grad_norm": 1.1878699897617486, + "learning_rate": 1.342647722713931e-05, + "loss": 0.1915, + "step": 8004 + }, + { + "epoch": 0.41, + "grad_norm": 0.7892945827995911, + "learning_rate": 1.342492991300224e-05, + "loss": 0.1886, + "step": 8005 + }, + { + "epoch": 0.41, + "grad_norm": 0.8627428951792862, + "learning_rate": 1.3423382505963752e-05, + "loss": 0.1665, + "step": 8006 + }, + { + "epoch": 0.41, + "grad_norm": 0.8358965250113496, + "learning_rate": 1.3421835006065821e-05, + "loss": 0.1845, + "step": 8007 + }, + { + "epoch": 0.41, + "grad_norm": 0.8940788101936157, + "learning_rate": 1.3420287413350424e-05, + "loss": 0.1943, + "step": 8008 + }, + { + "epoch": 0.41, + "grad_norm": 0.8457027511164052, + "learning_rate": 1.3418739727859536e-05, + "loss": 0.1884, + "step": 8009 + }, + { + "epoch": 0.41, + "grad_norm": 0.9616924394345931, + "learning_rate": 1.3417191949635137e-05, + "loss": 0.2005, + "step": 8010 + }, + { + "epoch": 0.41, + "grad_norm": 0.9765691241789635, + "learning_rate": 1.3415644078719216e-05, + "loss": 0.2, + "step": 8011 + }, + { + "epoch": 0.41, + "grad_norm": 1.0198592589757345, + "learning_rate": 1.3414096115153758e-05, + "loss": 0.2008, + "step": 8012 + }, + { + "epoch": 0.41, + "grad_norm": 1.078405481686966, + "learning_rate": 1.341254805898075e-05, + "loss": 0.1754, + "step": 8013 + }, + { + "epoch": 0.41, + "grad_norm": 1.1520658043098602, + "learning_rate": 1.341099991024218e-05, + "loss": 0.195, + "step": 8014 + }, + { + "epoch": 0.41, + "grad_norm": 0.8431762267412917, + "learning_rate": 1.3409451668980047e-05, + "loss": 0.1917, + "step": 8015 + }, + { + "epoch": 0.41, + "grad_norm": 0.9709340793317897, + "learning_rate": 1.3407903335236342e-05, + "loss": 0.1988, + "step": 8016 + }, + { + "epoch": 0.41, + "grad_norm": 0.9562143034143634, + "learning_rate": 1.3406354909053072e-05, + "loss": 0.1865, + "step": 8017 + }, + { + "epoch": 0.41, + "grad_norm": 1.8903264345603317, + "learning_rate": 1.340480639047223e-05, + "loss": 0.1795, + "step": 8018 + }, + { + "epoch": 0.41, + "grad_norm": 0.9493680900295547, + "learning_rate": 1.340325777953582e-05, + "loss": 0.196, + "step": 8019 + }, + { + "epoch": 0.41, + "grad_norm": 0.892180765456923, + "learning_rate": 1.3401709076285854e-05, + "loss": 0.1962, + "step": 8020 + }, + { + "epoch": 0.41, + "grad_norm": 1.0172462805046316, + "learning_rate": 1.3400160280764334e-05, + "loss": 0.1945, + "step": 8021 + }, + { + "epoch": 0.41, + "grad_norm": 0.8633358976212213, + "learning_rate": 1.3398611393013276e-05, + "loss": 0.184, + "step": 8022 + }, + { + "epoch": 0.41, + "grad_norm": 0.8679448290497178, + "learning_rate": 1.3397062413074692e-05, + "loss": 0.2077, + "step": 8023 + }, + { + "epoch": 0.41, + "grad_norm": 0.8340346318477808, + "learning_rate": 1.3395513340990599e-05, + "loss": 0.1961, + "step": 8024 + }, + { + "epoch": 0.41, + "grad_norm": 1.2509060316174685, + "learning_rate": 1.3393964176803014e-05, + "loss": 0.2064, + "step": 8025 + }, + { + "epoch": 0.41, + "grad_norm": 0.7697603617815317, + "learning_rate": 1.3392414920553958e-05, + "loss": 0.1696, + "step": 8026 + }, + { + "epoch": 0.41, + "grad_norm": 0.8871225258106962, + "learning_rate": 1.3390865572285456e-05, + "loss": 0.191, + "step": 8027 + }, + { + "epoch": 0.41, + "grad_norm": 0.9158079068168967, + "learning_rate": 1.3389316132039534e-05, + "loss": 0.1744, + "step": 8028 + }, + { + "epoch": 0.41, + "grad_norm": 0.7186316771574106, + "learning_rate": 1.3387766599858223e-05, + "loss": 0.2006, + "step": 8029 + }, + { + "epoch": 0.41, + "grad_norm": 1.4655735167646904, + "learning_rate": 1.338621697578355e-05, + "loss": 0.1928, + "step": 8030 + }, + { + "epoch": 0.41, + "grad_norm": 3.429526068500552, + "learning_rate": 1.338466725985755e-05, + "loss": 0.1723, + "step": 8031 + }, + { + "epoch": 0.41, + "grad_norm": 1.057773856701616, + "learning_rate": 1.3383117452122259e-05, + "loss": 0.1977, + "step": 8032 + }, + { + "epoch": 0.41, + "grad_norm": 1.035697937100189, + "learning_rate": 1.3381567552619716e-05, + "loss": 0.1796, + "step": 8033 + }, + { + "epoch": 0.41, + "grad_norm": 1.8543769830582006, + "learning_rate": 1.3380017561391964e-05, + "loss": 0.1993, + "step": 8034 + }, + { + "epoch": 0.41, + "grad_norm": 1.1232548014489392, + "learning_rate": 1.3378467478481043e-05, + "loss": 0.2145, + "step": 8035 + }, + { + "epoch": 0.41, + "grad_norm": 0.7160729312689819, + "learning_rate": 1.3376917303929e-05, + "loss": 0.1762, + "step": 8036 + }, + { + "epoch": 0.41, + "grad_norm": 1.0801627888125969, + "learning_rate": 1.3375367037777887e-05, + "loss": 0.223, + "step": 8037 + }, + { + "epoch": 0.41, + "grad_norm": 2.247750747667892, + "learning_rate": 1.3373816680069749e-05, + "loss": 0.1975, + "step": 8038 + }, + { + "epoch": 0.41, + "grad_norm": 0.8611190036702995, + "learning_rate": 1.3372266230846647e-05, + "loss": 0.2073, + "step": 8039 + }, + { + "epoch": 0.41, + "grad_norm": 1.0154499916864541, + "learning_rate": 1.3370715690150631e-05, + "loss": 0.1905, + "step": 8040 + }, + { + "epoch": 0.41, + "grad_norm": 0.9680673347591338, + "learning_rate": 1.336916505802376e-05, + "loss": 0.1972, + "step": 8041 + }, + { + "epoch": 0.41, + "grad_norm": 0.9171675293224171, + "learning_rate": 1.3367614334508097e-05, + "loss": 0.2005, + "step": 8042 + }, + { + "epoch": 0.41, + "grad_norm": 0.7850773437724924, + "learning_rate": 1.3366063519645707e-05, + "loss": 0.2221, + "step": 8043 + }, + { + "epoch": 0.41, + "grad_norm": 0.9294976754762169, + "learning_rate": 1.3364512613478654e-05, + "loss": 0.175, + "step": 8044 + }, + { + "epoch": 0.41, + "grad_norm": 0.8474386472668382, + "learning_rate": 1.3362961616049006e-05, + "loss": 0.1965, + "step": 8045 + }, + { + "epoch": 0.41, + "grad_norm": 0.8462067143572174, + "learning_rate": 1.336141052739883e-05, + "loss": 0.2185, + "step": 8046 + }, + { + "epoch": 0.41, + "grad_norm": 0.895323741516153, + "learning_rate": 1.335985934757021e-05, + "loss": 0.2025, + "step": 8047 + }, + { + "epoch": 0.41, + "grad_norm": 1.1842591109627791, + "learning_rate": 1.3358308076605213e-05, + "loss": 0.1718, + "step": 8048 + }, + { + "epoch": 0.41, + "grad_norm": 1.358975880195966, + "learning_rate": 1.3356756714545917e-05, + "loss": 0.1839, + "step": 8049 + }, + { + "epoch": 0.41, + "grad_norm": 1.2801457031286867, + "learning_rate": 1.3355205261434408e-05, + "loss": 0.1898, + "step": 8050 + }, + { + "epoch": 0.41, + "grad_norm": 0.9469890610484961, + "learning_rate": 1.3353653717312767e-05, + "loss": 0.1808, + "step": 8051 + }, + { + "epoch": 0.41, + "grad_norm": 2.865045301335842, + "learning_rate": 1.335210208222308e-05, + "loss": 0.2102, + "step": 8052 + }, + { + "epoch": 0.41, + "grad_norm": 1.0032497809315304, + "learning_rate": 1.3350550356207435e-05, + "loss": 0.2145, + "step": 8053 + }, + { + "epoch": 0.41, + "grad_norm": 1.1154342561170936, + "learning_rate": 1.3348998539307919e-05, + "loss": 0.1813, + "step": 8054 + }, + { + "epoch": 0.41, + "grad_norm": 1.0187690617804463, + "learning_rate": 1.334744663156663e-05, + "loss": 0.1844, + "step": 8055 + }, + { + "epoch": 0.41, + "grad_norm": 1.0827207692673682, + "learning_rate": 1.3345894633025662e-05, + "loss": 0.2129, + "step": 8056 + }, + { + "epoch": 0.41, + "grad_norm": 1.2555042431760213, + "learning_rate": 1.3344342543727115e-05, + "loss": 0.2022, + "step": 8057 + }, + { + "epoch": 0.41, + "grad_norm": 0.8291008640026881, + "learning_rate": 1.3342790363713088e-05, + "loss": 0.182, + "step": 8058 + }, + { + "epoch": 0.41, + "grad_norm": 1.4148639718049554, + "learning_rate": 1.3341238093025679e-05, + "loss": 0.1941, + "step": 8059 + }, + { + "epoch": 0.41, + "grad_norm": 0.9735277681041818, + "learning_rate": 1.3339685731707002e-05, + "loss": 0.1892, + "step": 8060 + }, + { + "epoch": 0.41, + "grad_norm": 0.923651269778692, + "learning_rate": 1.3338133279799159e-05, + "loss": 0.1852, + "step": 8061 + }, + { + "epoch": 0.41, + "grad_norm": 1.071944681667006, + "learning_rate": 1.3336580737344265e-05, + "loss": 0.2174, + "step": 8062 + }, + { + "epoch": 0.41, + "grad_norm": 1.6682334959961802, + "learning_rate": 1.3335028104384424e-05, + "loss": 0.2173, + "step": 8063 + }, + { + "epoch": 0.41, + "grad_norm": 1.2152012251443567, + "learning_rate": 1.3333475380961762e-05, + "loss": 0.1892, + "step": 8064 + }, + { + "epoch": 0.41, + "grad_norm": 1.0145161664772557, + "learning_rate": 1.3331922567118394e-05, + "loss": 0.2089, + "step": 8065 + }, + { + "epoch": 0.41, + "grad_norm": 1.1507511222245936, + "learning_rate": 1.3330369662896437e-05, + "loss": 0.214, + "step": 8066 + }, + { + "epoch": 0.41, + "grad_norm": 0.9486939578167846, + "learning_rate": 1.3328816668338012e-05, + "loss": 0.2021, + "step": 8067 + }, + { + "epoch": 0.41, + "grad_norm": 1.114137032112211, + "learning_rate": 1.3327263583485248e-05, + "loss": 0.1919, + "step": 8068 + }, + { + "epoch": 0.41, + "grad_norm": 1.1410597004242617, + "learning_rate": 1.3325710408380272e-05, + "loss": 0.2153, + "step": 8069 + }, + { + "epoch": 0.41, + "grad_norm": 0.8798023649166551, + "learning_rate": 1.3324157143065213e-05, + "loss": 0.195, + "step": 8070 + }, + { + "epoch": 0.41, + "grad_norm": 0.9157107395312563, + "learning_rate": 1.3322603787582205e-05, + "loss": 0.1989, + "step": 8071 + }, + { + "epoch": 0.41, + "grad_norm": 0.8099024728710942, + "learning_rate": 1.3321050341973378e-05, + "loss": 0.2094, + "step": 8072 + }, + { + "epoch": 0.41, + "grad_norm": 1.0920822863428858, + "learning_rate": 1.3319496806280877e-05, + "loss": 0.1846, + "step": 8073 + }, + { + "epoch": 0.41, + "grad_norm": 2.260886118708162, + "learning_rate": 1.3317943180546836e-05, + "loss": 0.1959, + "step": 8074 + }, + { + "epoch": 0.41, + "grad_norm": 1.0141664566644863, + "learning_rate": 1.3316389464813397e-05, + "loss": 0.1988, + "step": 8075 + }, + { + "epoch": 0.41, + "grad_norm": 0.8211926777508631, + "learning_rate": 1.3314835659122707e-05, + "loss": 0.1757, + "step": 8076 + }, + { + "epoch": 0.41, + "grad_norm": 0.949669333859531, + "learning_rate": 1.3313281763516915e-05, + "loss": 0.1939, + "step": 8077 + }, + { + "epoch": 0.41, + "grad_norm": 0.9081621642327156, + "learning_rate": 1.3311727778038165e-05, + "loss": 0.1835, + "step": 8078 + }, + { + "epoch": 0.41, + "grad_norm": 0.9381499951139102, + "learning_rate": 1.3310173702728614e-05, + "loss": 0.196, + "step": 8079 + }, + { + "epoch": 0.41, + "grad_norm": 1.4136313347465652, + "learning_rate": 1.3308619537630416e-05, + "loss": 0.2122, + "step": 8080 + }, + { + "epoch": 0.41, + "grad_norm": 0.7596837076372943, + "learning_rate": 1.3307065282785723e-05, + "loss": 0.2217, + "step": 8081 + }, + { + "epoch": 0.41, + "grad_norm": 1.1243254028756764, + "learning_rate": 1.33055109382367e-05, + "loss": 0.1903, + "step": 8082 + }, + { + "epoch": 0.41, + "grad_norm": 0.9040116470936199, + "learning_rate": 1.3303956504025506e-05, + "loss": 0.2046, + "step": 8083 + }, + { + "epoch": 0.41, + "grad_norm": 1.1134412438267782, + "learning_rate": 1.3302401980194303e-05, + "loss": 0.1973, + "step": 8084 + }, + { + "epoch": 0.41, + "grad_norm": 1.311444185148507, + "learning_rate": 1.3300847366785261e-05, + "loss": 0.2263, + "step": 8085 + }, + { + "epoch": 0.41, + "grad_norm": 1.6291026055605924, + "learning_rate": 1.3299292663840546e-05, + "loss": 0.205, + "step": 8086 + }, + { + "epoch": 0.41, + "grad_norm": 1.1327737332574543, + "learning_rate": 1.3297737871402333e-05, + "loss": 0.2018, + "step": 8087 + }, + { + "epoch": 0.41, + "grad_norm": 0.9299835949328888, + "learning_rate": 1.3296182989512794e-05, + "loss": 0.1824, + "step": 8088 + }, + { + "epoch": 0.41, + "grad_norm": 0.8818644971901961, + "learning_rate": 1.3294628018214105e-05, + "loss": 0.2143, + "step": 8089 + }, + { + "epoch": 0.41, + "grad_norm": 0.9567065848980094, + "learning_rate": 1.3293072957548443e-05, + "loss": 0.1917, + "step": 8090 + }, + { + "epoch": 0.41, + "grad_norm": 0.9484045798329989, + "learning_rate": 1.3291517807557994e-05, + "loss": 0.1908, + "step": 8091 + }, + { + "epoch": 0.41, + "grad_norm": 1.0639832334067085, + "learning_rate": 1.3289962568284937e-05, + "loss": 0.1944, + "step": 8092 + }, + { + "epoch": 0.41, + "grad_norm": 1.2129082756641967, + "learning_rate": 1.3288407239771462e-05, + "loss": 0.1931, + "step": 8093 + }, + { + "epoch": 0.41, + "grad_norm": 1.5496534950187957, + "learning_rate": 1.328685182205975e-05, + "loss": 0.1984, + "step": 8094 + }, + { + "epoch": 0.41, + "grad_norm": 0.7701213256886666, + "learning_rate": 1.3285296315192e-05, + "loss": 0.1883, + "step": 8095 + }, + { + "epoch": 0.41, + "grad_norm": 1.2667584968529702, + "learning_rate": 1.32837407192104e-05, + "loss": 0.196, + "step": 8096 + }, + { + "epoch": 0.41, + "grad_norm": 1.4217626914391113, + "learning_rate": 1.3282185034157151e-05, + "loss": 0.2033, + "step": 8097 + }, + { + "epoch": 0.41, + "grad_norm": 0.9766157940575106, + "learning_rate": 1.3280629260074442e-05, + "loss": 0.1865, + "step": 8098 + }, + { + "epoch": 0.41, + "grad_norm": 0.7803982157049401, + "learning_rate": 1.3279073397004485e-05, + "loss": 0.1695, + "step": 8099 + }, + { + "epoch": 0.41, + "grad_norm": 1.0041553192413004, + "learning_rate": 1.3277517444989476e-05, + "loss": 0.1876, + "step": 8100 + }, + { + "epoch": 0.41, + "grad_norm": 0.8422004848718346, + "learning_rate": 1.327596140407162e-05, + "loss": 0.195, + "step": 8101 + }, + { + "epoch": 0.41, + "grad_norm": 1.126805813050145, + "learning_rate": 1.3274405274293122e-05, + "loss": 0.1922, + "step": 8102 + }, + { + "epoch": 0.41, + "grad_norm": 1.3489940250126804, + "learning_rate": 1.3272849055696203e-05, + "loss": 0.1994, + "step": 8103 + }, + { + "epoch": 0.41, + "grad_norm": 1.2051419046998122, + "learning_rate": 1.3271292748323064e-05, + "loss": 0.2189, + "step": 8104 + }, + { + "epoch": 0.41, + "grad_norm": 0.9861422782076765, + "learning_rate": 1.3269736352215925e-05, + "loss": 0.2058, + "step": 8105 + }, + { + "epoch": 0.41, + "grad_norm": 0.7770971461313525, + "learning_rate": 1.3268179867417004e-05, + "loss": 0.1905, + "step": 8106 + }, + { + "epoch": 0.41, + "grad_norm": 0.9277715775572186, + "learning_rate": 1.3266623293968518e-05, + "loss": 0.1919, + "step": 8107 + }, + { + "epoch": 0.41, + "grad_norm": 0.9695669138804043, + "learning_rate": 1.326506663191269e-05, + "loss": 0.1894, + "step": 8108 + }, + { + "epoch": 0.41, + "grad_norm": 0.9098778962285857, + "learning_rate": 1.3263509881291748e-05, + "loss": 0.1768, + "step": 8109 + }, + { + "epoch": 0.41, + "grad_norm": 0.8360349993777317, + "learning_rate": 1.3261953042147915e-05, + "loss": 0.1895, + "step": 8110 + }, + { + "epoch": 0.41, + "grad_norm": 1.147216914825333, + "learning_rate": 1.326039611452342e-05, + "loss": 0.1896, + "step": 8111 + }, + { + "epoch": 0.41, + "grad_norm": 0.8526896466281547, + "learning_rate": 1.3258839098460496e-05, + "loss": 0.1937, + "step": 8112 + }, + { + "epoch": 0.41, + "grad_norm": 1.1435652297643273, + "learning_rate": 1.325728199400138e-05, + "loss": 0.2093, + "step": 8113 + }, + { + "epoch": 0.41, + "grad_norm": 0.8185991738538979, + "learning_rate": 1.3255724801188305e-05, + "loss": 0.1922, + "step": 8114 + }, + { + "epoch": 0.41, + "grad_norm": 1.0561514708661517, + "learning_rate": 1.325416752006351e-05, + "loss": 0.191, + "step": 8115 + }, + { + "epoch": 0.41, + "grad_norm": 0.7351805421960459, + "learning_rate": 1.3252610150669236e-05, + "loss": 0.1879, + "step": 8116 + }, + { + "epoch": 0.41, + "grad_norm": 2.522389693987051, + "learning_rate": 1.3251052693047732e-05, + "loss": 0.1902, + "step": 8117 + }, + { + "epoch": 0.41, + "grad_norm": 0.9716643739300673, + "learning_rate": 1.324949514724124e-05, + "loss": 0.1864, + "step": 8118 + }, + { + "epoch": 0.41, + "grad_norm": 1.372780519038876, + "learning_rate": 1.3247937513292007e-05, + "loss": 0.2246, + "step": 8119 + }, + { + "epoch": 0.41, + "grad_norm": 1.1258389056492442, + "learning_rate": 1.3246379791242284e-05, + "loss": 0.2059, + "step": 8120 + }, + { + "epoch": 0.41, + "grad_norm": 0.6779761807315811, + "learning_rate": 1.3244821981134326e-05, + "loss": 0.1742, + "step": 8121 + }, + { + "epoch": 0.41, + "grad_norm": 0.9024791913390288, + "learning_rate": 1.324326408301039e-05, + "loss": 0.214, + "step": 8122 + }, + { + "epoch": 0.41, + "grad_norm": 0.9387075044127428, + "learning_rate": 1.3241706096912731e-05, + "loss": 0.1813, + "step": 8123 + }, + { + "epoch": 0.41, + "grad_norm": 1.0895957947682025, + "learning_rate": 1.324014802288361e-05, + "loss": 0.2017, + "step": 8124 + }, + { + "epoch": 0.41, + "grad_norm": 0.8091951199061025, + "learning_rate": 1.3238589860965295e-05, + "loss": 0.1847, + "step": 8125 + }, + { + "epoch": 0.41, + "grad_norm": 0.9760857115992949, + "learning_rate": 1.3237031611200044e-05, + "loss": 0.2073, + "step": 8126 + }, + { + "epoch": 0.41, + "grad_norm": 0.8664661099865184, + "learning_rate": 1.3235473273630128e-05, + "loss": 0.1859, + "step": 8127 + }, + { + "epoch": 0.41, + "grad_norm": 1.08176486103619, + "learning_rate": 1.3233914848297817e-05, + "loss": 0.1751, + "step": 8128 + }, + { + "epoch": 0.41, + "grad_norm": 1.0521867252969814, + "learning_rate": 1.3232356335245381e-05, + "loss": 0.1917, + "step": 8129 + }, + { + "epoch": 0.41, + "grad_norm": 1.8501926151086432, + "learning_rate": 1.3230797734515102e-05, + "loss": 0.1832, + "step": 8130 + }, + { + "epoch": 0.41, + "grad_norm": 0.8890514909572997, + "learning_rate": 1.3229239046149249e-05, + "loss": 0.1979, + "step": 8131 + }, + { + "epoch": 0.41, + "grad_norm": 1.0085527273462358, + "learning_rate": 1.3227680270190106e-05, + "loss": 0.1977, + "step": 8132 + }, + { + "epoch": 0.41, + "grad_norm": 1.7789829508973025, + "learning_rate": 1.322612140667995e-05, + "loss": 0.1936, + "step": 8133 + }, + { + "epoch": 0.41, + "grad_norm": 1.0270694342051736, + "learning_rate": 1.3224562455661069e-05, + "loss": 0.2294, + "step": 8134 + }, + { + "epoch": 0.41, + "grad_norm": 0.8857809528499025, + "learning_rate": 1.3223003417175755e-05, + "loss": 0.2033, + "step": 8135 + }, + { + "epoch": 0.41, + "grad_norm": 0.8872779913474352, + "learning_rate": 1.3221444291266288e-05, + "loss": 0.209, + "step": 8136 + }, + { + "epoch": 0.41, + "grad_norm": 1.9116143631173013, + "learning_rate": 1.3219885077974959e-05, + "loss": 0.1896, + "step": 8137 + }, + { + "epoch": 0.41, + "grad_norm": 0.8408863730472417, + "learning_rate": 1.321832577734407e-05, + "loss": 0.2315, + "step": 8138 + }, + { + "epoch": 0.41, + "grad_norm": 0.9435006909990574, + "learning_rate": 1.3216766389415909e-05, + "loss": 0.2047, + "step": 8139 + }, + { + "epoch": 0.41, + "grad_norm": 0.9122559504169311, + "learning_rate": 1.321520691423278e-05, + "loss": 0.1946, + "step": 8140 + }, + { + "epoch": 0.41, + "grad_norm": 0.9570472472937975, + "learning_rate": 1.3213647351836985e-05, + "loss": 0.1801, + "step": 8141 + }, + { + "epoch": 0.41, + "grad_norm": 1.3682519206358967, + "learning_rate": 1.3212087702270817e-05, + "loss": 0.2392, + "step": 8142 + }, + { + "epoch": 0.41, + "grad_norm": 0.7131643812133875, + "learning_rate": 1.3210527965576594e-05, + "loss": 0.165, + "step": 8143 + }, + { + "epoch": 0.41, + "grad_norm": 1.0247955772632986, + "learning_rate": 1.3208968141796616e-05, + "loss": 0.169, + "step": 8144 + }, + { + "epoch": 0.41, + "grad_norm": 0.7389961092282391, + "learning_rate": 1.3207408230973198e-05, + "loss": 0.1991, + "step": 8145 + }, + { + "epoch": 0.41, + "grad_norm": 0.987690758612029, + "learning_rate": 1.3205848233148649e-05, + "loss": 0.2091, + "step": 8146 + }, + { + "epoch": 0.41, + "grad_norm": 1.162320781384892, + "learning_rate": 1.3204288148365285e-05, + "loss": 0.1972, + "step": 8147 + }, + { + "epoch": 0.41, + "grad_norm": 1.259879163113445, + "learning_rate": 1.3202727976665426e-05, + "loss": 0.1776, + "step": 8148 + }, + { + "epoch": 0.41, + "grad_norm": 2.6173310688838565, + "learning_rate": 1.320116771809139e-05, + "loss": 0.2075, + "step": 8149 + }, + { + "epoch": 0.41, + "grad_norm": 1.15319608640954, + "learning_rate": 1.3199607372685497e-05, + "loss": 0.2367, + "step": 8150 + }, + { + "epoch": 0.41, + "grad_norm": 1.1085735482195873, + "learning_rate": 1.3198046940490072e-05, + "loss": 0.1925, + "step": 8151 + }, + { + "epoch": 0.41, + "grad_norm": 0.9526619299240466, + "learning_rate": 1.3196486421547447e-05, + "loss": 0.2255, + "step": 8152 + }, + { + "epoch": 0.41, + "grad_norm": 1.186250935829803, + "learning_rate": 1.3194925815899946e-05, + "loss": 0.1938, + "step": 8153 + }, + { + "epoch": 0.41, + "grad_norm": 0.9462820876927331, + "learning_rate": 1.3193365123589904e-05, + "loss": 0.2193, + "step": 8154 + }, + { + "epoch": 0.41, + "grad_norm": 1.0680716000128938, + "learning_rate": 1.3191804344659647e-05, + "loss": 0.2051, + "step": 8155 + }, + { + "epoch": 0.41, + "grad_norm": 0.890509111019689, + "learning_rate": 1.319024347915152e-05, + "loss": 0.197, + "step": 8156 + }, + { + "epoch": 0.41, + "grad_norm": 0.9695353673509965, + "learning_rate": 1.3188682527107856e-05, + "loss": 0.2027, + "step": 8157 + }, + { + "epoch": 0.41, + "grad_norm": 0.8790321917308074, + "learning_rate": 1.3187121488571001e-05, + "loss": 0.1975, + "step": 8158 + }, + { + "epoch": 0.41, + "grad_norm": 1.0904526851948093, + "learning_rate": 1.3185560363583294e-05, + "loss": 0.2031, + "step": 8159 + }, + { + "epoch": 0.41, + "grad_norm": 1.0304998013343536, + "learning_rate": 1.3183999152187084e-05, + "loss": 0.1907, + "step": 8160 + }, + { + "epoch": 0.42, + "grad_norm": 1.3517359749264424, + "learning_rate": 1.3182437854424716e-05, + "loss": 0.2229, + "step": 8161 + }, + { + "epoch": 0.42, + "grad_norm": 0.9239570982372851, + "learning_rate": 1.3180876470338545e-05, + "loss": 0.2058, + "step": 8162 + }, + { + "epoch": 0.42, + "grad_norm": 0.8717308844191373, + "learning_rate": 1.3179314999970915e-05, + "loss": 0.1981, + "step": 8163 + }, + { + "epoch": 0.42, + "grad_norm": 0.8246230871063063, + "learning_rate": 1.3177753443364188e-05, + "loss": 0.1909, + "step": 8164 + }, + { + "epoch": 0.42, + "grad_norm": 2.4695548290453426, + "learning_rate": 1.317619180056072e-05, + "loss": 0.1985, + "step": 8165 + }, + { + "epoch": 0.42, + "grad_norm": 0.8400845928983217, + "learning_rate": 1.317463007160287e-05, + "loss": 0.1846, + "step": 8166 + }, + { + "epoch": 0.42, + "grad_norm": 0.7093872820281457, + "learning_rate": 1.3173068256533e-05, + "loss": 0.1812, + "step": 8167 + }, + { + "epoch": 0.42, + "grad_norm": 0.9064441522202039, + "learning_rate": 1.3171506355393473e-05, + "loss": 0.2122, + "step": 8168 + }, + { + "epoch": 0.42, + "grad_norm": 1.367856901127145, + "learning_rate": 1.3169944368226655e-05, + "loss": 0.2047, + "step": 8169 + }, + { + "epoch": 0.42, + "grad_norm": 0.8530899850369583, + "learning_rate": 1.3168382295074923e-05, + "loss": 0.1974, + "step": 8170 + }, + { + "epoch": 0.42, + "grad_norm": 0.9398278061149311, + "learning_rate": 1.316682013598064e-05, + "loss": 0.1778, + "step": 8171 + }, + { + "epoch": 0.42, + "grad_norm": 1.2818932363910829, + "learning_rate": 1.3165257890986178e-05, + "loss": 0.1952, + "step": 8172 + }, + { + "epoch": 0.42, + "grad_norm": 0.7649808941738352, + "learning_rate": 1.3163695560133922e-05, + "loss": 0.198, + "step": 8173 + }, + { + "epoch": 0.42, + "grad_norm": 1.2894358803233346, + "learning_rate": 1.3162133143466242e-05, + "loss": 0.1856, + "step": 8174 + }, + { + "epoch": 0.42, + "grad_norm": 1.665193046043179, + "learning_rate": 1.3160570641025526e-05, + "loss": 0.1955, + "step": 8175 + }, + { + "epoch": 0.42, + "grad_norm": 1.157724088983787, + "learning_rate": 1.3159008052854147e-05, + "loss": 0.1865, + "step": 8176 + }, + { + "epoch": 0.42, + "grad_norm": 1.0373369719316299, + "learning_rate": 1.3157445378994498e-05, + "loss": 0.2099, + "step": 8177 + }, + { + "epoch": 0.42, + "grad_norm": 1.8229843102712258, + "learning_rate": 1.3155882619488967e-05, + "loss": 0.1947, + "step": 8178 + }, + { + "epoch": 0.42, + "grad_norm": 1.0086904947534578, + "learning_rate": 1.315431977437994e-05, + "loss": 0.2151, + "step": 8179 + }, + { + "epoch": 0.42, + "grad_norm": 1.1766623038489123, + "learning_rate": 1.3152756843709814e-05, + "loss": 0.1941, + "step": 8180 + }, + { + "epoch": 0.42, + "grad_norm": 4.190846556534522, + "learning_rate": 1.3151193827520975e-05, + "loss": 0.1877, + "step": 8181 + }, + { + "epoch": 0.42, + "grad_norm": 1.2251108654586467, + "learning_rate": 1.3149630725855828e-05, + "loss": 0.2026, + "step": 8182 + }, + { + "epoch": 0.42, + "grad_norm": 1.0803278763437667, + "learning_rate": 1.314806753875677e-05, + "loss": 0.1935, + "step": 8183 + }, + { + "epoch": 0.42, + "grad_norm": 0.9777490138647087, + "learning_rate": 1.3146504266266202e-05, + "loss": 0.2, + "step": 8184 + }, + { + "epoch": 0.42, + "grad_norm": 0.8264047600720581, + "learning_rate": 1.3144940908426532e-05, + "loss": 0.1841, + "step": 8185 + }, + { + "epoch": 0.42, + "grad_norm": 1.5967147063009872, + "learning_rate": 1.3143377465280155e-05, + "loss": 0.1837, + "step": 8186 + }, + { + "epoch": 0.42, + "grad_norm": 0.9558789187028566, + "learning_rate": 1.3141813936869494e-05, + "loss": 0.2139, + "step": 8187 + }, + { + "epoch": 0.42, + "grad_norm": 0.8320678707779134, + "learning_rate": 1.314025032323695e-05, + "loss": 0.201, + "step": 8188 + }, + { + "epoch": 0.42, + "grad_norm": 0.9471377988177363, + "learning_rate": 1.3138686624424937e-05, + "loss": 0.2062, + "step": 8189 + }, + { + "epoch": 0.42, + "grad_norm": 0.9599458958587899, + "learning_rate": 1.313712284047587e-05, + "loss": 0.1865, + "step": 8190 + }, + { + "epoch": 0.42, + "grad_norm": 1.3681755844392067, + "learning_rate": 1.3135558971432172e-05, + "loss": 0.1965, + "step": 8191 + }, + { + "epoch": 0.42, + "grad_norm": 0.8330320087338403, + "learning_rate": 1.3133995017336259e-05, + "loss": 0.2007, + "step": 8192 + }, + { + "epoch": 0.42, + "grad_norm": 0.9023380179478936, + "learning_rate": 1.3132430978230555e-05, + "loss": 0.182, + "step": 8193 + }, + { + "epoch": 0.42, + "grad_norm": 2.4851398984555777, + "learning_rate": 1.3130866854157482e-05, + "loss": 0.1946, + "step": 8194 + }, + { + "epoch": 0.42, + "grad_norm": 0.8570937044939294, + "learning_rate": 1.312930264515947e-05, + "loss": 0.1703, + "step": 8195 + }, + { + "epoch": 0.42, + "grad_norm": 2.3065861691308562, + "learning_rate": 1.3127738351278946e-05, + "loss": 0.1996, + "step": 8196 + }, + { + "epoch": 0.42, + "grad_norm": 1.3622707775354201, + "learning_rate": 1.3126173972558345e-05, + "loss": 0.1964, + "step": 8197 + }, + { + "epoch": 0.42, + "grad_norm": 1.1894897579210884, + "learning_rate": 1.3124609509040095e-05, + "loss": 0.1916, + "step": 8198 + }, + { + "epoch": 0.42, + "grad_norm": 1.0950019770268824, + "learning_rate": 1.3123044960766638e-05, + "loss": 0.1734, + "step": 8199 + }, + { + "epoch": 0.42, + "grad_norm": 1.4604954107534205, + "learning_rate": 1.3121480327780409e-05, + "loss": 0.1936, + "step": 8200 + }, + { + "epoch": 0.42, + "grad_norm": 1.0583170625721745, + "learning_rate": 1.311991561012385e-05, + "loss": 0.1843, + "step": 8201 + }, + { + "epoch": 0.42, + "grad_norm": 2.0871249935829725, + "learning_rate": 1.3118350807839404e-05, + "loss": 0.2015, + "step": 8202 + }, + { + "epoch": 0.42, + "grad_norm": 1.1809231282154058, + "learning_rate": 1.3116785920969517e-05, + "loss": 0.1884, + "step": 8203 + }, + { + "epoch": 0.42, + "grad_norm": 1.1723187003173714, + "learning_rate": 1.3115220949556635e-05, + "loss": 0.1776, + "step": 8204 + }, + { + "epoch": 0.42, + "grad_norm": 1.4721516041880833, + "learning_rate": 1.3113655893643208e-05, + "loss": 0.1993, + "step": 8205 + }, + { + "epoch": 0.42, + "grad_norm": 1.3186840370466455, + "learning_rate": 1.3112090753271693e-05, + "loss": 0.1927, + "step": 8206 + }, + { + "epoch": 0.42, + "grad_norm": 1.0011682661603702, + "learning_rate": 1.3110525528484535e-05, + "loss": 0.1978, + "step": 8207 + }, + { + "epoch": 0.42, + "grad_norm": 1.07523059426114, + "learning_rate": 1.3108960219324201e-05, + "loss": 0.1978, + "step": 8208 + }, + { + "epoch": 0.42, + "grad_norm": 1.2301338016941046, + "learning_rate": 1.3107394825833142e-05, + "loss": 0.192, + "step": 8209 + }, + { + "epoch": 0.42, + "grad_norm": 1.430169746032634, + "learning_rate": 1.3105829348053824e-05, + "loss": 0.2017, + "step": 8210 + }, + { + "epoch": 0.42, + "grad_norm": 0.9970915919775609, + "learning_rate": 1.310426378602871e-05, + "loss": 0.202, + "step": 8211 + }, + { + "epoch": 0.42, + "grad_norm": 1.7391855967398469, + "learning_rate": 1.3102698139800266e-05, + "loss": 0.2163, + "step": 8212 + }, + { + "epoch": 0.42, + "grad_norm": 1.366594790036297, + "learning_rate": 1.3101132409410957e-05, + "loss": 0.2044, + "step": 8213 + }, + { + "epoch": 0.42, + "grad_norm": 1.329529746225666, + "learning_rate": 1.309956659490326e-05, + "loss": 0.2073, + "step": 8214 + }, + { + "epoch": 0.42, + "grad_norm": 0.864839755199251, + "learning_rate": 1.3098000696319642e-05, + "loss": 0.1674, + "step": 8215 + }, + { + "epoch": 0.42, + "grad_norm": 0.9127792797131793, + "learning_rate": 1.3096434713702579e-05, + "loss": 0.1944, + "step": 8216 + }, + { + "epoch": 0.42, + "grad_norm": 0.8344816953431473, + "learning_rate": 1.309486864709455e-05, + "loss": 0.1928, + "step": 8217 + }, + { + "epoch": 0.42, + "grad_norm": 0.7475117512260006, + "learning_rate": 1.3093302496538036e-05, + "loss": 0.1854, + "step": 8218 + }, + { + "epoch": 0.42, + "grad_norm": 0.8922062545828179, + "learning_rate": 1.3091736262075516e-05, + "loss": 0.182, + "step": 8219 + }, + { + "epoch": 0.42, + "grad_norm": 1.0794784191756126, + "learning_rate": 1.3090169943749475e-05, + "loss": 0.2091, + "step": 8220 + }, + { + "epoch": 0.42, + "grad_norm": 1.1386734232776148, + "learning_rate": 1.3088603541602401e-05, + "loss": 0.1989, + "step": 8221 + }, + { + "epoch": 0.42, + "grad_norm": 0.97328561492684, + "learning_rate": 1.3087037055676782e-05, + "loss": 0.2353, + "step": 8222 + }, + { + "epoch": 0.42, + "grad_norm": 0.7729143032057222, + "learning_rate": 1.3085470486015106e-05, + "loss": 0.2031, + "step": 8223 + }, + { + "epoch": 0.42, + "grad_norm": 0.9819343230584475, + "learning_rate": 1.308390383265987e-05, + "loss": 0.2244, + "step": 8224 + }, + { + "epoch": 0.42, + "grad_norm": 0.8593104249308028, + "learning_rate": 1.3082337095653569e-05, + "loss": 0.1637, + "step": 8225 + }, + { + "epoch": 0.42, + "grad_norm": 1.200272710074114, + "learning_rate": 1.30807702750387e-05, + "loss": 0.2117, + "step": 8226 + }, + { + "epoch": 0.42, + "grad_norm": 1.3002822205995266, + "learning_rate": 1.307920337085776e-05, + "loss": 0.2086, + "step": 8227 + }, + { + "epoch": 0.42, + "grad_norm": 0.870542691222419, + "learning_rate": 1.3077636383153258e-05, + "loss": 0.1817, + "step": 8228 + }, + { + "epoch": 0.42, + "grad_norm": 0.9153276421843458, + "learning_rate": 1.3076069311967696e-05, + "loss": 0.2096, + "step": 8229 + }, + { + "epoch": 0.42, + "grad_norm": 1.2293731778608028, + "learning_rate": 1.3074502157343575e-05, + "loss": 0.2198, + "step": 8230 + }, + { + "epoch": 0.42, + "grad_norm": 0.8688580667543977, + "learning_rate": 1.3072934919323414e-05, + "loss": 0.1632, + "step": 8231 + }, + { + "epoch": 0.42, + "grad_norm": 1.409556841159473, + "learning_rate": 1.307136759794972e-05, + "loss": 0.1991, + "step": 8232 + }, + { + "epoch": 0.42, + "grad_norm": 0.8755205631804428, + "learning_rate": 1.3069800193265005e-05, + "loss": 0.1977, + "step": 8233 + }, + { + "epoch": 0.42, + "grad_norm": 0.9340942948944939, + "learning_rate": 1.3068232705311784e-05, + "loss": 0.188, + "step": 8234 + }, + { + "epoch": 0.42, + "grad_norm": 0.9190292523868503, + "learning_rate": 1.3066665134132584e-05, + "loss": 0.2043, + "step": 8235 + }, + { + "epoch": 0.42, + "grad_norm": 1.1836600836195899, + "learning_rate": 1.3065097479769915e-05, + "loss": 0.2094, + "step": 8236 + }, + { + "epoch": 0.42, + "grad_norm": 0.8521202367831131, + "learning_rate": 1.3063529742266304e-05, + "loss": 0.2102, + "step": 8237 + }, + { + "epoch": 0.42, + "grad_norm": 1.0102823962086578, + "learning_rate": 1.3061961921664276e-05, + "loss": 0.1673, + "step": 8238 + }, + { + "epoch": 0.42, + "grad_norm": 0.8387168527811484, + "learning_rate": 1.3060394018006357e-05, + "loss": 0.1924, + "step": 8239 + }, + { + "epoch": 0.42, + "grad_norm": 1.0532826607045784, + "learning_rate": 1.305882603133508e-05, + "loss": 0.1784, + "step": 8240 + }, + { + "epoch": 0.42, + "grad_norm": 0.8118358299802393, + "learning_rate": 1.305725796169297e-05, + "loss": 0.1818, + "step": 8241 + }, + { + "epoch": 0.42, + "grad_norm": 0.9002539298249029, + "learning_rate": 1.3055689809122569e-05, + "loss": 0.2066, + "step": 8242 + }, + { + "epoch": 0.42, + "grad_norm": 0.9585852126504476, + "learning_rate": 1.3054121573666408e-05, + "loss": 0.168, + "step": 8243 + }, + { + "epoch": 0.42, + "grad_norm": 0.8460014460500977, + "learning_rate": 1.3052553255367024e-05, + "loss": 0.1781, + "step": 8244 + }, + { + "epoch": 0.42, + "grad_norm": 1.0891528902857406, + "learning_rate": 1.3050984854266963e-05, + "loss": 0.197, + "step": 8245 + }, + { + "epoch": 0.42, + "grad_norm": 0.7424589594293574, + "learning_rate": 1.3049416370408768e-05, + "loss": 0.1609, + "step": 8246 + }, + { + "epoch": 0.42, + "grad_norm": 1.135546053361693, + "learning_rate": 1.3047847803834976e-05, + "loss": 0.2029, + "step": 8247 + }, + { + "epoch": 0.42, + "grad_norm": 1.0569142092023314, + "learning_rate": 1.3046279154588146e-05, + "loss": 0.2046, + "step": 8248 + }, + { + "epoch": 0.42, + "grad_norm": 1.7796741457759377, + "learning_rate": 1.3044710422710818e-05, + "loss": 0.2229, + "step": 8249 + }, + { + "epoch": 0.42, + "grad_norm": 1.0588020023041, + "learning_rate": 1.3043141608245551e-05, + "loss": 0.2038, + "step": 8250 + }, + { + "epoch": 0.42, + "grad_norm": 1.1450205997702398, + "learning_rate": 1.3041572711234893e-05, + "loss": 0.1869, + "step": 8251 + }, + { + "epoch": 0.42, + "grad_norm": 1.2504897105794466, + "learning_rate": 1.3040003731721402e-05, + "loss": 0.2002, + "step": 8252 + }, + { + "epoch": 0.42, + "grad_norm": 0.7303802878941769, + "learning_rate": 1.3038434669747644e-05, + "loss": 0.1793, + "step": 8253 + }, + { + "epoch": 0.42, + "grad_norm": 1.1163159311554531, + "learning_rate": 1.3036865525356168e-05, + "loss": 0.1896, + "step": 8254 + }, + { + "epoch": 0.42, + "grad_norm": 1.3143525222797499, + "learning_rate": 1.3035296298589549e-05, + "loss": 0.2188, + "step": 8255 + }, + { + "epoch": 0.42, + "grad_norm": 1.1341176124382015, + "learning_rate": 1.3033726989490341e-05, + "loss": 0.2011, + "step": 8256 + }, + { + "epoch": 0.42, + "grad_norm": 1.1201148079058703, + "learning_rate": 1.303215759810112e-05, + "loss": 0.1983, + "step": 8257 + }, + { + "epoch": 0.42, + "grad_norm": 1.014726310011175, + "learning_rate": 1.3030588124464453e-05, + "loss": 0.1923, + "step": 8258 + }, + { + "epoch": 0.42, + "grad_norm": 2.5739238510028213, + "learning_rate": 1.302901856862291e-05, + "loss": 0.1954, + "step": 8259 + }, + { + "epoch": 0.42, + "grad_norm": 1.6029945998288895, + "learning_rate": 1.302744893061907e-05, + "loss": 0.2006, + "step": 8260 + }, + { + "epoch": 0.42, + "grad_norm": 1.152759348936254, + "learning_rate": 1.3025879210495505e-05, + "loss": 0.2025, + "step": 8261 + }, + { + "epoch": 0.42, + "grad_norm": 0.9133265209563681, + "learning_rate": 1.3024309408294795e-05, + "loss": 0.1915, + "step": 8262 + }, + { + "epoch": 0.42, + "grad_norm": 0.8821852254147305, + "learning_rate": 1.3022739524059521e-05, + "loss": 0.1704, + "step": 8263 + }, + { + "epoch": 0.42, + "grad_norm": 0.7904481085547684, + "learning_rate": 1.3021169557832269e-05, + "loss": 0.187, + "step": 8264 + }, + { + "epoch": 0.42, + "grad_norm": 0.9535276347236445, + "learning_rate": 1.301959950965562e-05, + "loss": 0.1792, + "step": 8265 + }, + { + "epoch": 0.42, + "grad_norm": 1.4203925883538724, + "learning_rate": 1.3018029379572163e-05, + "loss": 0.2237, + "step": 8266 + }, + { + "epoch": 0.42, + "grad_norm": 1.147777070235677, + "learning_rate": 1.3016459167624494e-05, + "loss": 0.2015, + "step": 8267 + }, + { + "epoch": 0.42, + "grad_norm": 0.8485290112978522, + "learning_rate": 1.3014888873855194e-05, + "loss": 0.2094, + "step": 8268 + }, + { + "epoch": 0.42, + "grad_norm": 0.8724810785819876, + "learning_rate": 1.3013318498306864e-05, + "loss": 0.1882, + "step": 8269 + }, + { + "epoch": 0.42, + "grad_norm": 1.0869723494866173, + "learning_rate": 1.3011748041022101e-05, + "loss": 0.2007, + "step": 8270 + }, + { + "epoch": 0.42, + "grad_norm": 0.8982814311359465, + "learning_rate": 1.3010177502043502e-05, + "loss": 0.2056, + "step": 8271 + }, + { + "epoch": 0.42, + "grad_norm": 0.9975730113802105, + "learning_rate": 1.3008606881413668e-05, + "loss": 0.2134, + "step": 8272 + }, + { + "epoch": 0.42, + "grad_norm": 1.1142646889120968, + "learning_rate": 1.3007036179175203e-05, + "loss": 0.2105, + "step": 8273 + }, + { + "epoch": 0.42, + "grad_norm": 0.8904410113100958, + "learning_rate": 1.300546539537071e-05, + "loss": 0.2045, + "step": 8274 + }, + { + "epoch": 0.42, + "grad_norm": 0.9203103644433339, + "learning_rate": 1.3003894530042803e-05, + "loss": 0.2076, + "step": 8275 + }, + { + "epoch": 0.42, + "grad_norm": 0.9444630885033088, + "learning_rate": 1.3002323583234082e-05, + "loss": 0.1697, + "step": 8276 + }, + { + "epoch": 0.42, + "grad_norm": 0.8244494027922004, + "learning_rate": 1.3000752554987166e-05, + "loss": 0.1908, + "step": 8277 + }, + { + "epoch": 0.42, + "grad_norm": 0.8985091790541165, + "learning_rate": 1.2999181445344666e-05, + "loss": 0.1874, + "step": 8278 + }, + { + "epoch": 0.42, + "grad_norm": 1.04902462783325, + "learning_rate": 1.2997610254349203e-05, + "loss": 0.1777, + "step": 8279 + }, + { + "epoch": 0.42, + "grad_norm": 0.8914789760341593, + "learning_rate": 1.299603898204339e-05, + "loss": 0.2008, + "step": 8280 + }, + { + "epoch": 0.42, + "grad_norm": 0.8652214926690659, + "learning_rate": 1.2994467628469853e-05, + "loss": 0.1843, + "step": 8281 + }, + { + "epoch": 0.42, + "grad_norm": 0.7753537627462095, + "learning_rate": 1.299289619367121e-05, + "loss": 0.1718, + "step": 8282 + }, + { + "epoch": 0.42, + "grad_norm": 0.9956931480728481, + "learning_rate": 1.299132467769009e-05, + "loss": 0.1941, + "step": 8283 + }, + { + "epoch": 0.42, + "grad_norm": 1.201701752879004, + "learning_rate": 1.2989753080569119e-05, + "loss": 0.2011, + "step": 8284 + }, + { + "epoch": 0.42, + "grad_norm": 0.9191160847591369, + "learning_rate": 1.2988181402350926e-05, + "loss": 0.1923, + "step": 8285 + }, + { + "epoch": 0.42, + "grad_norm": 0.9220969801360414, + "learning_rate": 1.2986609643078145e-05, + "loss": 0.2123, + "step": 8286 + }, + { + "epoch": 0.42, + "grad_norm": 0.9169015302630571, + "learning_rate": 1.2985037802793405e-05, + "loss": 0.2273, + "step": 8287 + }, + { + "epoch": 0.42, + "grad_norm": 0.8521753910449956, + "learning_rate": 1.298346588153935e-05, + "loss": 0.2115, + "step": 8288 + }, + { + "epoch": 0.42, + "grad_norm": 1.2120447235605898, + "learning_rate": 1.2981893879358616e-05, + "loss": 0.2047, + "step": 8289 + }, + { + "epoch": 0.42, + "grad_norm": 3.101266182978855, + "learning_rate": 1.2980321796293838e-05, + "loss": 0.2001, + "step": 8290 + }, + { + "epoch": 0.42, + "grad_norm": 1.174558001806414, + "learning_rate": 1.2978749632387665e-05, + "loss": 0.1848, + "step": 8291 + }, + { + "epoch": 0.42, + "grad_norm": 0.8953418782568198, + "learning_rate": 1.297717738768274e-05, + "loss": 0.2, + "step": 8292 + }, + { + "epoch": 0.42, + "grad_norm": 1.6475492947463053, + "learning_rate": 1.297560506222171e-05, + "loss": 0.1929, + "step": 8293 + }, + { + "epoch": 0.42, + "grad_norm": 1.157541240282237, + "learning_rate": 1.2974032656047223e-05, + "loss": 0.2041, + "step": 8294 + }, + { + "epoch": 0.42, + "grad_norm": 2.4156656473816827, + "learning_rate": 1.2972460169201933e-05, + "loss": 0.1935, + "step": 8295 + }, + { + "epoch": 0.42, + "grad_norm": 0.9467195639374285, + "learning_rate": 1.2970887601728495e-05, + "loss": 0.2219, + "step": 8296 + }, + { + "epoch": 0.42, + "grad_norm": 1.6312858345818504, + "learning_rate": 1.2969314953669563e-05, + "loss": 0.1676, + "step": 8297 + }, + { + "epoch": 0.42, + "grad_norm": 1.4278720908189264, + "learning_rate": 1.2967742225067792e-05, + "loss": 0.1909, + "step": 8298 + }, + { + "epoch": 0.42, + "grad_norm": 0.9080688833167727, + "learning_rate": 1.2966169415965847e-05, + "loss": 0.1995, + "step": 8299 + }, + { + "epoch": 0.42, + "grad_norm": 0.9863305922628434, + "learning_rate": 1.296459652640639e-05, + "loss": 0.1974, + "step": 8300 + }, + { + "epoch": 0.42, + "grad_norm": 0.9664763132904046, + "learning_rate": 1.2963023556432083e-05, + "loss": 0.1962, + "step": 8301 + }, + { + "epoch": 0.42, + "grad_norm": 0.9776353817339578, + "learning_rate": 1.2961450506085597e-05, + "loss": 0.1841, + "step": 8302 + }, + { + "epoch": 0.42, + "grad_norm": 0.8443028312632579, + "learning_rate": 1.2959877375409598e-05, + "loss": 0.1786, + "step": 8303 + }, + { + "epoch": 0.42, + "grad_norm": 1.9857253074754555, + "learning_rate": 1.2958304164446758e-05, + "loss": 0.2038, + "step": 8304 + }, + { + "epoch": 0.42, + "grad_norm": 0.8825347887344253, + "learning_rate": 1.2956730873239746e-05, + "loss": 0.203, + "step": 8305 + }, + { + "epoch": 0.42, + "grad_norm": 0.9936270761193413, + "learning_rate": 1.2955157501831248e-05, + "loss": 0.206, + "step": 8306 + }, + { + "epoch": 0.42, + "grad_norm": 0.9087815807999129, + "learning_rate": 1.2953584050263935e-05, + "loss": 0.2033, + "step": 8307 + }, + { + "epoch": 0.42, + "grad_norm": 0.9204698155344063, + "learning_rate": 1.2952010518580487e-05, + "loss": 0.1886, + "step": 8308 + }, + { + "epoch": 0.42, + "grad_norm": 1.6209462461027426, + "learning_rate": 1.2950436906823584e-05, + "loss": 0.2012, + "step": 8309 + }, + { + "epoch": 0.42, + "grad_norm": 0.9357386669354606, + "learning_rate": 1.2948863215035918e-05, + "loss": 0.195, + "step": 8310 + }, + { + "epoch": 0.42, + "grad_norm": 0.8333558655932062, + "learning_rate": 1.2947289443260172e-05, + "loss": 0.1844, + "step": 8311 + }, + { + "epoch": 0.42, + "grad_norm": 1.3236927019756342, + "learning_rate": 1.2945715591539028e-05, + "loss": 0.208, + "step": 8312 + }, + { + "epoch": 0.42, + "grad_norm": 1.0149795519327343, + "learning_rate": 1.2944141659915184e-05, + "loss": 0.1809, + "step": 8313 + }, + { + "epoch": 0.42, + "grad_norm": 0.7928022468669075, + "learning_rate": 1.2942567648431333e-05, + "loss": 0.1779, + "step": 8314 + }, + { + "epoch": 0.42, + "grad_norm": 1.2245378017402813, + "learning_rate": 1.2940993557130166e-05, + "loss": 0.1938, + "step": 8315 + }, + { + "epoch": 0.42, + "grad_norm": 0.8517131351401706, + "learning_rate": 1.2939419386054384e-05, + "loss": 0.2013, + "step": 8316 + }, + { + "epoch": 0.42, + "grad_norm": 0.9578387527059876, + "learning_rate": 1.2937845135246682e-05, + "loss": 0.1977, + "step": 8317 + }, + { + "epoch": 0.42, + "grad_norm": 1.0551568804695122, + "learning_rate": 1.2936270804749769e-05, + "loss": 0.2133, + "step": 8318 + }, + { + "epoch": 0.42, + "grad_norm": 0.849943397128867, + "learning_rate": 1.2934696394606344e-05, + "loss": 0.1761, + "step": 8319 + }, + { + "epoch": 0.42, + "grad_norm": 0.8992953513527367, + "learning_rate": 1.2933121904859111e-05, + "loss": 0.2131, + "step": 8320 + }, + { + "epoch": 0.42, + "grad_norm": 1.9666343163181608, + "learning_rate": 1.2931547335550782e-05, + "loss": 0.205, + "step": 8321 + }, + { + "epoch": 0.42, + "grad_norm": 0.8788238128220377, + "learning_rate": 1.2929972686724066e-05, + "loss": 0.1943, + "step": 8322 + }, + { + "epoch": 0.42, + "grad_norm": 0.9164644091486011, + "learning_rate": 1.2928397958421674e-05, + "loss": 0.1861, + "step": 8323 + }, + { + "epoch": 0.42, + "grad_norm": 1.0092684889010668, + "learning_rate": 1.2926823150686325e-05, + "loss": 0.1942, + "step": 8324 + }, + { + "epoch": 0.42, + "grad_norm": 1.7327146496169548, + "learning_rate": 1.2925248263560733e-05, + "loss": 0.1946, + "step": 8325 + }, + { + "epoch": 0.42, + "grad_norm": 0.9039676989792145, + "learning_rate": 1.2923673297087613e-05, + "loss": 0.1961, + "step": 8326 + }, + { + "epoch": 0.42, + "grad_norm": 1.06902187233108, + "learning_rate": 1.2922098251309694e-05, + "loss": 0.204, + "step": 8327 + }, + { + "epoch": 0.42, + "grad_norm": 0.9821026300743326, + "learning_rate": 1.2920523126269692e-05, + "loss": 0.1915, + "step": 8328 + }, + { + "epoch": 0.42, + "grad_norm": 0.905264461468117, + "learning_rate": 1.2918947922010336e-05, + "loss": 0.1825, + "step": 8329 + }, + { + "epoch": 0.42, + "grad_norm": 0.8558085429359792, + "learning_rate": 1.291737263857435e-05, + "loss": 0.1857, + "step": 8330 + }, + { + "epoch": 0.42, + "grad_norm": 0.8567074452471907, + "learning_rate": 1.2915797276004469e-05, + "loss": 0.1843, + "step": 8331 + }, + { + "epoch": 0.42, + "grad_norm": 0.8409836164037255, + "learning_rate": 1.2914221834343423e-05, + "loss": 0.2138, + "step": 8332 + }, + { + "epoch": 0.42, + "grad_norm": 1.0864523715789525, + "learning_rate": 1.2912646313633945e-05, + "loss": 0.1695, + "step": 8333 + }, + { + "epoch": 0.42, + "grad_norm": 0.9739854947766973, + "learning_rate": 1.2911070713918772e-05, + "loss": 0.2029, + "step": 8334 + }, + { + "epoch": 0.42, + "grad_norm": 1.0051465038200151, + "learning_rate": 1.2909495035240638e-05, + "loss": 0.2148, + "step": 8335 + }, + { + "epoch": 0.42, + "grad_norm": 1.2388383034363506, + "learning_rate": 1.2907919277642287e-05, + "loss": 0.1904, + "step": 8336 + }, + { + "epoch": 0.42, + "grad_norm": 0.7647215138884825, + "learning_rate": 1.2906343441166465e-05, + "loss": 0.1729, + "step": 8337 + }, + { + "epoch": 0.42, + "grad_norm": 0.7929162694707192, + "learning_rate": 1.290476752585591e-05, + "loss": 0.173, + "step": 8338 + }, + { + "epoch": 0.42, + "grad_norm": 0.8801109553531988, + "learning_rate": 1.2903191531753373e-05, + "loss": 0.1831, + "step": 8339 + }, + { + "epoch": 0.42, + "grad_norm": 0.8117842995770987, + "learning_rate": 1.2901615458901602e-05, + "loss": 0.1862, + "step": 8340 + }, + { + "epoch": 0.42, + "grad_norm": 0.8564129234299726, + "learning_rate": 1.2900039307343345e-05, + "loss": 0.1944, + "step": 8341 + }, + { + "epoch": 0.42, + "grad_norm": 0.8326250384944172, + "learning_rate": 1.2898463077121361e-05, + "loss": 0.1922, + "step": 8342 + }, + { + "epoch": 0.42, + "grad_norm": 0.8476141151237748, + "learning_rate": 1.2896886768278406e-05, + "loss": 0.2183, + "step": 8343 + }, + { + "epoch": 0.42, + "grad_norm": 0.9497331360658231, + "learning_rate": 1.2895310380857224e-05, + "loss": 0.1823, + "step": 8344 + }, + { + "epoch": 0.42, + "grad_norm": 0.896734197061469, + "learning_rate": 1.2893733914900595e-05, + "loss": 0.1855, + "step": 8345 + }, + { + "epoch": 0.42, + "grad_norm": 0.9376838004640948, + "learning_rate": 1.2892157370451263e-05, + "loss": 0.1936, + "step": 8346 + }, + { + "epoch": 0.42, + "grad_norm": 1.2314498483898435, + "learning_rate": 1.2890580747552002e-05, + "loss": 0.2073, + "step": 8347 + }, + { + "epoch": 0.42, + "grad_norm": 0.9690556766269256, + "learning_rate": 1.2889004046245574e-05, + "loss": 0.1872, + "step": 8348 + }, + { + "epoch": 0.42, + "grad_norm": 0.8238006996316732, + "learning_rate": 1.2887427266574748e-05, + "loss": 0.2207, + "step": 8349 + }, + { + "epoch": 0.42, + "grad_norm": 0.8068928897313348, + "learning_rate": 1.2885850408582295e-05, + "loss": 0.1946, + "step": 8350 + }, + { + "epoch": 0.42, + "grad_norm": 0.859272390151615, + "learning_rate": 1.2884273472310986e-05, + "loss": 0.2203, + "step": 8351 + }, + { + "epoch": 0.42, + "grad_norm": 0.946612446604715, + "learning_rate": 1.2882696457803597e-05, + "loss": 0.159, + "step": 8352 + }, + { + "epoch": 0.42, + "grad_norm": 0.9089696914287012, + "learning_rate": 1.2881119365102901e-05, + "loss": 0.2278, + "step": 8353 + }, + { + "epoch": 0.42, + "grad_norm": 1.2959784235147993, + "learning_rate": 1.2879542194251681e-05, + "loss": 0.1795, + "step": 8354 + }, + { + "epoch": 0.42, + "grad_norm": 0.8042643958346923, + "learning_rate": 1.2877964945292717e-05, + "loss": 0.1843, + "step": 8355 + }, + { + "epoch": 0.42, + "grad_norm": 1.3629437841374252, + "learning_rate": 1.2876387618268793e-05, + "loss": 0.2049, + "step": 8356 + }, + { + "epoch": 0.42, + "grad_norm": 2.958231619240513, + "learning_rate": 1.2874810213222689e-05, + "loss": 0.1776, + "step": 8357 + }, + { + "epoch": 0.43, + "grad_norm": 1.5744423044538112, + "learning_rate": 1.2873232730197197e-05, + "loss": 0.2109, + "step": 8358 + }, + { + "epoch": 0.43, + "grad_norm": 0.936861568438983, + "learning_rate": 1.2871655169235104e-05, + "loss": 0.2103, + "step": 8359 + }, + { + "epoch": 0.43, + "grad_norm": 0.8437459959051726, + "learning_rate": 1.2870077530379205e-05, + "loss": 0.1947, + "step": 8360 + }, + { + "epoch": 0.43, + "grad_norm": 1.081735761931492, + "learning_rate": 1.286849981367229e-05, + "loss": 0.1811, + "step": 8361 + }, + { + "epoch": 0.43, + "grad_norm": 1.1563276410499428, + "learning_rate": 1.2866922019157155e-05, + "loss": 0.1966, + "step": 8362 + }, + { + "epoch": 0.43, + "grad_norm": 2.80787778076596, + "learning_rate": 1.28653441468766e-05, + "loss": 0.1989, + "step": 8363 + }, + { + "epoch": 0.43, + "grad_norm": 1.0611653379046442, + "learning_rate": 1.2863766196873419e-05, + "loss": 0.213, + "step": 8364 + }, + { + "epoch": 0.43, + "grad_norm": 0.90607110241615, + "learning_rate": 1.2862188169190419e-05, + "loss": 0.2084, + "step": 8365 + }, + { + "epoch": 0.43, + "grad_norm": 0.8946529178848721, + "learning_rate": 1.2860610063870405e-05, + "loss": 0.1944, + "step": 8366 + }, + { + "epoch": 0.43, + "grad_norm": 0.8532649529386813, + "learning_rate": 1.2859031880956181e-05, + "loss": 0.1794, + "step": 8367 + }, + { + "epoch": 0.43, + "grad_norm": 1.392909454230544, + "learning_rate": 1.2857453620490557e-05, + "loss": 0.1786, + "step": 8368 + }, + { + "epoch": 0.43, + "grad_norm": 0.8397705370290642, + "learning_rate": 1.2855875282516342e-05, + "loss": 0.2058, + "step": 8369 + }, + { + "epoch": 0.43, + "grad_norm": 1.0693770104817486, + "learning_rate": 1.2854296867076346e-05, + "loss": 0.1955, + "step": 8370 + }, + { + "epoch": 0.43, + "grad_norm": 0.7623931235591676, + "learning_rate": 1.2852718374213389e-05, + "loss": 0.17, + "step": 8371 + }, + { + "epoch": 0.43, + "grad_norm": 0.9831192248267664, + "learning_rate": 1.2851139803970285e-05, + "loss": 0.2333, + "step": 8372 + }, + { + "epoch": 0.43, + "grad_norm": 0.8845103966984852, + "learning_rate": 1.2849561156389851e-05, + "loss": 0.1856, + "step": 8373 + }, + { + "epoch": 0.43, + "grad_norm": 1.0431153361684566, + "learning_rate": 1.2847982431514911e-05, + "loss": 0.1819, + "step": 8374 + }, + { + "epoch": 0.43, + "grad_norm": 0.9885410602615909, + "learning_rate": 1.2846403629388285e-05, + "loss": 0.1831, + "step": 8375 + }, + { + "epoch": 0.43, + "grad_norm": 0.8649856620753934, + "learning_rate": 1.28448247500528e-05, + "loss": 0.1959, + "step": 8376 + }, + { + "epoch": 0.43, + "grad_norm": 1.3401866519005952, + "learning_rate": 1.2843245793551284e-05, + "loss": 0.2121, + "step": 8377 + }, + { + "epoch": 0.43, + "grad_norm": 0.9549802734325415, + "learning_rate": 1.2841666759926566e-05, + "loss": 0.1829, + "step": 8378 + }, + { + "epoch": 0.43, + "grad_norm": 0.8970244098474727, + "learning_rate": 1.2840087649221476e-05, + "loss": 0.1897, + "step": 8379 + }, + { + "epoch": 0.43, + "grad_norm": 0.9181250871587772, + "learning_rate": 1.283850846147885e-05, + "loss": 0.1814, + "step": 8380 + }, + { + "epoch": 0.43, + "grad_norm": 0.9677772504052429, + "learning_rate": 1.2836929196741518e-05, + "loss": 0.2052, + "step": 8381 + }, + { + "epoch": 0.43, + "grad_norm": 1.3704789262684547, + "learning_rate": 1.2835349855052324e-05, + "loss": 0.1896, + "step": 8382 + }, + { + "epoch": 0.43, + "grad_norm": 1.0717251508841295, + "learning_rate": 1.2833770436454103e-05, + "loss": 0.2023, + "step": 8383 + }, + { + "epoch": 0.43, + "grad_norm": 0.9885072449972804, + "learning_rate": 1.2832190940989699e-05, + "loss": 0.1995, + "step": 8384 + }, + { + "epoch": 0.43, + "grad_norm": 0.896312751966259, + "learning_rate": 1.2830611368701957e-05, + "loss": 0.2039, + "step": 8385 + }, + { + "epoch": 0.43, + "grad_norm": 1.7110044731285126, + "learning_rate": 1.2829031719633722e-05, + "loss": 0.1865, + "step": 8386 + }, + { + "epoch": 0.43, + "grad_norm": 0.9079049703949122, + "learning_rate": 1.2827451993827841e-05, + "loss": 0.1777, + "step": 8387 + }, + { + "epoch": 0.43, + "grad_norm": 1.2645822150236385, + "learning_rate": 1.2825872191327164e-05, + "loss": 0.1932, + "step": 8388 + }, + { + "epoch": 0.43, + "grad_norm": 1.7242785606944482, + "learning_rate": 1.2824292312174547e-05, + "loss": 0.1961, + "step": 8389 + }, + { + "epoch": 0.43, + "grad_norm": 0.8742480736548195, + "learning_rate": 1.282271235641284e-05, + "loss": 0.1985, + "step": 8390 + }, + { + "epoch": 0.43, + "grad_norm": 1.0083321320072796, + "learning_rate": 1.28211323240849e-05, + "loss": 0.1837, + "step": 8391 + }, + { + "epoch": 0.43, + "grad_norm": 0.7590299682094305, + "learning_rate": 1.2819552215233585e-05, + "loss": 0.1861, + "step": 8392 + }, + { + "epoch": 0.43, + "grad_norm": 0.7632714309789075, + "learning_rate": 1.2817972029901759e-05, + "loss": 0.1687, + "step": 8393 + }, + { + "epoch": 0.43, + "grad_norm": 0.8250873902419275, + "learning_rate": 1.2816391768132284e-05, + "loss": 0.2067, + "step": 8394 + }, + { + "epoch": 0.43, + "grad_norm": 0.9130439612947502, + "learning_rate": 1.2814811429968022e-05, + "loss": 0.1961, + "step": 8395 + }, + { + "epoch": 0.43, + "grad_norm": 1.1247865595713258, + "learning_rate": 1.2813231015451842e-05, + "loss": 0.1934, + "step": 8396 + }, + { + "epoch": 0.43, + "grad_norm": 2.418166635996874, + "learning_rate": 1.2811650524626608e-05, + "loss": 0.2263, + "step": 8397 + }, + { + "epoch": 0.43, + "grad_norm": 1.4470070889450257, + "learning_rate": 1.2810069957535198e-05, + "loss": 0.1728, + "step": 8398 + }, + { + "epoch": 0.43, + "grad_norm": 1.002144849625383, + "learning_rate": 1.2808489314220483e-05, + "loss": 0.1923, + "step": 8399 + }, + { + "epoch": 0.43, + "grad_norm": 1.3487231469057854, + "learning_rate": 1.2806908594725335e-05, + "loss": 0.1714, + "step": 8400 + }, + { + "epoch": 0.43, + "grad_norm": 1.0348615877972056, + "learning_rate": 1.280532779909263e-05, + "loss": 0.19, + "step": 8401 + }, + { + "epoch": 0.43, + "grad_norm": 0.818483891086596, + "learning_rate": 1.2803746927365252e-05, + "loss": 0.1832, + "step": 8402 + }, + { + "epoch": 0.43, + "grad_norm": 0.9141276415044727, + "learning_rate": 1.2802165979586084e-05, + "loss": 0.2016, + "step": 8403 + }, + { + "epoch": 0.43, + "grad_norm": 1.0645918152401999, + "learning_rate": 1.2800584955798e-05, + "loss": 0.1963, + "step": 8404 + }, + { + "epoch": 0.43, + "grad_norm": 0.9973262286041973, + "learning_rate": 1.2799003856043893e-05, + "loss": 0.207, + "step": 8405 + }, + { + "epoch": 0.43, + "grad_norm": 0.9579121869369324, + "learning_rate": 1.2797422680366649e-05, + "loss": 0.1928, + "step": 8406 + }, + { + "epoch": 0.43, + "grad_norm": 1.0122691887181539, + "learning_rate": 1.2795841428809155e-05, + "loss": 0.1898, + "step": 8407 + }, + { + "epoch": 0.43, + "grad_norm": 0.8437227917655225, + "learning_rate": 1.2794260101414307e-05, + "loss": 0.2134, + "step": 8408 + }, + { + "epoch": 0.43, + "grad_norm": 1.298694300803485, + "learning_rate": 1.2792678698224995e-05, + "loss": 0.1703, + "step": 8409 + }, + { + "epoch": 0.43, + "grad_norm": 0.8327681044213492, + "learning_rate": 1.2791097219284115e-05, + "loss": 0.2284, + "step": 8410 + }, + { + "epoch": 0.43, + "grad_norm": 1.6873455193552223, + "learning_rate": 1.2789515664634564e-05, + "loss": 0.2054, + "step": 8411 + }, + { + "epoch": 0.43, + "grad_norm": 1.0429920359977503, + "learning_rate": 1.2787934034319245e-05, + "loss": 0.1796, + "step": 8412 + }, + { + "epoch": 0.43, + "grad_norm": 0.831414425233925, + "learning_rate": 1.2786352328381057e-05, + "loss": 0.1732, + "step": 8413 + }, + { + "epoch": 0.43, + "grad_norm": 1.0479045262632114, + "learning_rate": 1.2784770546862905e-05, + "loss": 0.2097, + "step": 8414 + }, + { + "epoch": 0.43, + "grad_norm": 0.9688634782356663, + "learning_rate": 1.2783188689807697e-05, + "loss": 0.1877, + "step": 8415 + }, + { + "epoch": 0.43, + "grad_norm": 0.9186954634465992, + "learning_rate": 1.2781606757258335e-05, + "loss": 0.2137, + "step": 8416 + }, + { + "epoch": 0.43, + "grad_norm": 0.8804643222421765, + "learning_rate": 1.2780024749257736e-05, + "loss": 0.187, + "step": 8417 + }, + { + "epoch": 0.43, + "grad_norm": 1.0031110219542805, + "learning_rate": 1.2778442665848805e-05, + "loss": 0.2042, + "step": 8418 + }, + { + "epoch": 0.43, + "grad_norm": 0.7537420190969669, + "learning_rate": 1.277686050707446e-05, + "loss": 0.1772, + "step": 8419 + }, + { + "epoch": 0.43, + "grad_norm": 1.0582699108537157, + "learning_rate": 1.277527827297762e-05, + "loss": 0.1974, + "step": 8420 + }, + { + "epoch": 0.43, + "grad_norm": 1.1413213859696738, + "learning_rate": 1.2773695963601199e-05, + "loss": 0.1822, + "step": 8421 + }, + { + "epoch": 0.43, + "grad_norm": 0.8847067204436639, + "learning_rate": 1.2772113578988117e-05, + "loss": 0.1918, + "step": 8422 + }, + { + "epoch": 0.43, + "grad_norm": 1.3751260454860468, + "learning_rate": 1.2770531119181295e-05, + "loss": 0.1992, + "step": 8423 + }, + { + "epoch": 0.43, + "grad_norm": 1.181669385070735, + "learning_rate": 1.2768948584223666e-05, + "loss": 0.1808, + "step": 8424 + }, + { + "epoch": 0.43, + "grad_norm": 0.9003848410651017, + "learning_rate": 1.2767365974158146e-05, + "loss": 0.1724, + "step": 8425 + }, + { + "epoch": 0.43, + "grad_norm": 1.1547567937475915, + "learning_rate": 1.2765783289027671e-05, + "loss": 0.2013, + "step": 8426 + }, + { + "epoch": 0.43, + "grad_norm": 0.9531012582250008, + "learning_rate": 1.2764200528875164e-05, + "loss": 0.2085, + "step": 8427 + }, + { + "epoch": 0.43, + "grad_norm": 1.4375307799373926, + "learning_rate": 1.2762617693743562e-05, + "loss": 0.1851, + "step": 8428 + }, + { + "epoch": 0.43, + "grad_norm": 0.9412421733587898, + "learning_rate": 1.2761034783675803e-05, + "loss": 0.1906, + "step": 8429 + }, + { + "epoch": 0.43, + "grad_norm": 0.9939292690651369, + "learning_rate": 1.2759451798714816e-05, + "loss": 0.1956, + "step": 8430 + }, + { + "epoch": 0.43, + "grad_norm": 0.7829017756475471, + "learning_rate": 1.2757868738903545e-05, + "loss": 0.1985, + "step": 8431 + }, + { + "epoch": 0.43, + "grad_norm": 1.1982992843299392, + "learning_rate": 1.2756285604284928e-05, + "loss": 0.1817, + "step": 8432 + }, + { + "epoch": 0.43, + "grad_norm": 0.9110729910834504, + "learning_rate": 1.275470239490191e-05, + "loss": 0.2057, + "step": 8433 + }, + { + "epoch": 0.43, + "grad_norm": 1.2121332217693976, + "learning_rate": 1.2753119110797432e-05, + "loss": 0.2112, + "step": 8434 + }, + { + "epoch": 0.43, + "grad_norm": 1.2017977376703233, + "learning_rate": 1.2751535752014444e-05, + "loss": 0.1876, + "step": 8435 + }, + { + "epoch": 0.43, + "grad_norm": 1.162603626637288, + "learning_rate": 1.274995231859589e-05, + "loss": 0.161, + "step": 8436 + }, + { + "epoch": 0.43, + "grad_norm": 1.8380059596350338, + "learning_rate": 1.2748368810584725e-05, + "loss": 0.1758, + "step": 8437 + }, + { + "epoch": 0.43, + "grad_norm": 1.424109898645111, + "learning_rate": 1.2746785228023904e-05, + "loss": 0.1961, + "step": 8438 + }, + { + "epoch": 0.43, + "grad_norm": 1.3728472348261007, + "learning_rate": 1.2745201570956379e-05, + "loss": 0.1872, + "step": 8439 + }, + { + "epoch": 0.43, + "grad_norm": 2.2101805295026007, + "learning_rate": 1.27436178394251e-05, + "loss": 0.1694, + "step": 8440 + }, + { + "epoch": 0.43, + "grad_norm": 0.9012241586461739, + "learning_rate": 1.2742034033473037e-05, + "loss": 0.1645, + "step": 8441 + }, + { + "epoch": 0.43, + "grad_norm": 1.4716426150058821, + "learning_rate": 1.2740450153143144e-05, + "loss": 0.1965, + "step": 8442 + }, + { + "epoch": 0.43, + "grad_norm": 0.8971699492952784, + "learning_rate": 1.2738866198478388e-05, + "loss": 0.2057, + "step": 8443 + }, + { + "epoch": 0.43, + "grad_norm": 0.8749718882080885, + "learning_rate": 1.2737282169521732e-05, + "loss": 0.1899, + "step": 8444 + }, + { + "epoch": 0.43, + "grad_norm": 0.9340663748135312, + "learning_rate": 1.2735698066316138e-05, + "loss": 0.2032, + "step": 8445 + }, + { + "epoch": 0.43, + "grad_norm": 0.9371509604709034, + "learning_rate": 1.2734113888904584e-05, + "loss": 0.1909, + "step": 8446 + }, + { + "epoch": 0.43, + "grad_norm": 1.2213004935273502, + "learning_rate": 1.2732529637330036e-05, + "loss": 0.1849, + "step": 8447 + }, + { + "epoch": 0.43, + "grad_norm": 0.9992218155973259, + "learning_rate": 1.2730945311635465e-05, + "loss": 0.198, + "step": 8448 + }, + { + "epoch": 0.43, + "grad_norm": 0.8369839626639749, + "learning_rate": 1.272936091186385e-05, + "loss": 0.1964, + "step": 8449 + }, + { + "epoch": 0.43, + "grad_norm": 0.9779530137060174, + "learning_rate": 1.2727776438058166e-05, + "loss": 0.2013, + "step": 8450 + }, + { + "epoch": 0.43, + "grad_norm": 1.746936294451183, + "learning_rate": 1.2726191890261393e-05, + "loss": 0.19, + "step": 8451 + }, + { + "epoch": 0.43, + "grad_norm": 1.2654470125593666, + "learning_rate": 1.272460726851651e-05, + "loss": 0.1716, + "step": 8452 + }, + { + "epoch": 0.43, + "grad_norm": 1.6691442469544424, + "learning_rate": 1.2723022572866497e-05, + "loss": 0.1844, + "step": 8453 + }, + { + "epoch": 0.43, + "grad_norm": 1.4040660915184713, + "learning_rate": 1.2721437803354348e-05, + "loss": 0.2178, + "step": 8454 + }, + { + "epoch": 0.43, + "grad_norm": 3.168892417400603, + "learning_rate": 1.2719852960023043e-05, + "loss": 0.206, + "step": 8455 + }, + { + "epoch": 0.43, + "grad_norm": 1.2688061946410185, + "learning_rate": 1.2718268042915574e-05, + "loss": 0.1774, + "step": 8456 + }, + { + "epoch": 0.43, + "grad_norm": 1.2904127430747385, + "learning_rate": 1.271668305207493e-05, + "loss": 0.206, + "step": 8457 + }, + { + "epoch": 0.43, + "grad_norm": 1.1834309738463864, + "learning_rate": 1.2715097987544104e-05, + "loss": 0.2018, + "step": 8458 + }, + { + "epoch": 0.43, + "grad_norm": 1.3807424297364006, + "learning_rate": 1.2713512849366092e-05, + "loss": 0.1826, + "step": 8459 + }, + { + "epoch": 0.43, + "grad_norm": 1.1125232250625734, + "learning_rate": 1.2711927637583892e-05, + "loss": 0.1961, + "step": 8460 + }, + { + "epoch": 0.43, + "grad_norm": 0.8912732082138243, + "learning_rate": 1.2710342352240498e-05, + "loss": 0.2065, + "step": 8461 + }, + { + "epoch": 0.43, + "grad_norm": 1.2823631484358686, + "learning_rate": 1.270875699337892e-05, + "loss": 0.2041, + "step": 8462 + }, + { + "epoch": 0.43, + "grad_norm": 0.9186526027198525, + "learning_rate": 1.270717156104215e-05, + "loss": 0.2414, + "step": 8463 + }, + { + "epoch": 0.43, + "grad_norm": 0.9547704881649449, + "learning_rate": 1.2705586055273202e-05, + "loss": 0.2094, + "step": 8464 + }, + { + "epoch": 0.43, + "grad_norm": 0.9810576545703948, + "learning_rate": 1.2704000476115079e-05, + "loss": 0.218, + "step": 8465 + }, + { + "epoch": 0.43, + "grad_norm": 0.8829659202427127, + "learning_rate": 1.2702414823610791e-05, + "loss": 0.1928, + "step": 8466 + }, + { + "epoch": 0.43, + "grad_norm": 1.0581841757246446, + "learning_rate": 1.2700829097803347e-05, + "loss": 0.1974, + "step": 8467 + }, + { + "epoch": 0.43, + "grad_norm": 0.9613440736632844, + "learning_rate": 1.2699243298735762e-05, + "loss": 0.189, + "step": 8468 + }, + { + "epoch": 0.43, + "grad_norm": 1.1715634892409108, + "learning_rate": 1.2697657426451051e-05, + "loss": 0.1986, + "step": 8469 + }, + { + "epoch": 0.43, + "grad_norm": 0.9137834082634491, + "learning_rate": 1.2696071480992229e-05, + "loss": 0.2278, + "step": 8470 + }, + { + "epoch": 0.43, + "grad_norm": 0.8783778890963508, + "learning_rate": 1.2694485462402315e-05, + "loss": 0.1965, + "step": 8471 + }, + { + "epoch": 0.43, + "grad_norm": 1.1505391364195132, + "learning_rate": 1.269289937072433e-05, + "loss": 0.2037, + "step": 8472 + }, + { + "epoch": 0.43, + "grad_norm": 1.4161874301172996, + "learning_rate": 1.2691313206001298e-05, + "loss": 0.2112, + "step": 8473 + }, + { + "epoch": 0.43, + "grad_norm": 1.647897930955205, + "learning_rate": 1.2689726968276246e-05, + "loss": 0.2061, + "step": 8474 + }, + { + "epoch": 0.43, + "grad_norm": 0.9562300082535361, + "learning_rate": 1.2688140657592195e-05, + "loss": 0.2063, + "step": 8475 + }, + { + "epoch": 0.43, + "grad_norm": 2.6988876400429254, + "learning_rate": 1.2686554273992177e-05, + "loss": 0.2024, + "step": 8476 + }, + { + "epoch": 0.43, + "grad_norm": 1.0352568876456878, + "learning_rate": 1.2684967817519222e-05, + "loss": 0.2233, + "step": 8477 + }, + { + "epoch": 0.43, + "grad_norm": 1.6628966240070668, + "learning_rate": 1.2683381288216368e-05, + "loss": 0.2163, + "step": 8478 + }, + { + "epoch": 0.43, + "grad_norm": 1.0610061654242153, + "learning_rate": 1.268179468612664e-05, + "loss": 0.2103, + "step": 8479 + }, + { + "epoch": 0.43, + "grad_norm": 1.0703551730464311, + "learning_rate": 1.268020801129308e-05, + "loss": 0.1937, + "step": 8480 + }, + { + "epoch": 0.43, + "grad_norm": 0.9026742355707935, + "learning_rate": 1.2678621263758728e-05, + "loss": 0.2067, + "step": 8481 + }, + { + "epoch": 0.43, + "grad_norm": 1.1872590164339674, + "learning_rate": 1.2677034443566623e-05, + "loss": 0.1944, + "step": 8482 + }, + { + "epoch": 0.43, + "grad_norm": 1.3809906224953725, + "learning_rate": 1.2675447550759807e-05, + "loss": 0.1892, + "step": 8483 + }, + { + "epoch": 0.43, + "grad_norm": 1.1300445482997878, + "learning_rate": 1.2673860585381329e-05, + "loss": 0.1978, + "step": 8484 + }, + { + "epoch": 0.43, + "grad_norm": 1.0810033265516914, + "learning_rate": 1.2672273547474225e-05, + "loss": 0.17, + "step": 8485 + }, + { + "epoch": 0.43, + "grad_norm": 1.190920506332146, + "learning_rate": 1.2670686437081554e-05, + "loss": 0.1965, + "step": 8486 + }, + { + "epoch": 0.43, + "grad_norm": 1.093795744759877, + "learning_rate": 1.2669099254246363e-05, + "loss": 0.2049, + "step": 8487 + }, + { + "epoch": 0.43, + "grad_norm": 1.4062279100264012, + "learning_rate": 1.2667511999011699e-05, + "loss": 0.1928, + "step": 8488 + }, + { + "epoch": 0.43, + "grad_norm": 1.3611497401758716, + "learning_rate": 1.2665924671420626e-05, + "loss": 0.1897, + "step": 8489 + }, + { + "epoch": 0.43, + "grad_norm": 0.8179653065978323, + "learning_rate": 1.2664337271516194e-05, + "loss": 0.1996, + "step": 8490 + }, + { + "epoch": 0.43, + "grad_norm": 2.221152195388021, + "learning_rate": 1.2662749799341464e-05, + "loss": 0.1651, + "step": 8491 + }, + { + "epoch": 0.43, + "grad_norm": 1.174205904888376, + "learning_rate": 1.2661162254939496e-05, + "loss": 0.2103, + "step": 8492 + }, + { + "epoch": 0.43, + "grad_norm": 1.3784424859747435, + "learning_rate": 1.2659574638353349e-05, + "loss": 0.1805, + "step": 8493 + }, + { + "epoch": 0.43, + "grad_norm": 1.1577475217917517, + "learning_rate": 1.2657986949626091e-05, + "loss": 0.1849, + "step": 8494 + }, + { + "epoch": 0.43, + "grad_norm": 0.8962746567132944, + "learning_rate": 1.2656399188800788e-05, + "loss": 0.1944, + "step": 8495 + }, + { + "epoch": 0.43, + "grad_norm": 0.9910380022034406, + "learning_rate": 1.2654811355920505e-05, + "loss": 0.1889, + "step": 8496 + }, + { + "epoch": 0.43, + "grad_norm": 1.0426069901807775, + "learning_rate": 1.2653223451028316e-05, + "loss": 0.1993, + "step": 8497 + }, + { + "epoch": 0.43, + "grad_norm": 2.1887980320486227, + "learning_rate": 1.2651635474167287e-05, + "loss": 0.177, + "step": 8498 + }, + { + "epoch": 0.43, + "grad_norm": 3.0126773212647495, + "learning_rate": 1.2650047425380501e-05, + "loss": 0.2027, + "step": 8499 + }, + { + "epoch": 0.43, + "grad_norm": 5.165218024798249, + "learning_rate": 1.2648459304711026e-05, + "loss": 0.1823, + "step": 8500 + }, + { + "epoch": 0.43, + "grad_norm": 0.9797609981321793, + "learning_rate": 1.2646871112201943e-05, + "loss": 0.1933, + "step": 8501 + }, + { + "epoch": 0.43, + "grad_norm": 1.344103428058884, + "learning_rate": 1.2645282847896335e-05, + "loss": 0.1963, + "step": 8502 + }, + { + "epoch": 0.43, + "grad_norm": 1.2433867429641254, + "learning_rate": 1.2643694511837278e-05, + "loss": 0.1712, + "step": 8503 + }, + { + "epoch": 0.43, + "grad_norm": 1.3329144189269893, + "learning_rate": 1.2642106104067857e-05, + "loss": 0.1836, + "step": 8504 + }, + { + "epoch": 0.43, + "grad_norm": 1.7401092387768005, + "learning_rate": 1.264051762463116e-05, + "loss": 0.2108, + "step": 8505 + }, + { + "epoch": 0.43, + "grad_norm": 0.7026921755147951, + "learning_rate": 1.2638929073570273e-05, + "loss": 0.1618, + "step": 8506 + }, + { + "epoch": 0.43, + "grad_norm": 1.188762430576397, + "learning_rate": 1.2637340450928284e-05, + "loss": 0.2004, + "step": 8507 + }, + { + "epoch": 0.43, + "grad_norm": 1.169245700509231, + "learning_rate": 1.263575175674829e-05, + "loss": 0.1656, + "step": 8508 + }, + { + "epoch": 0.43, + "grad_norm": 0.9169441692984407, + "learning_rate": 1.2634162991073376e-05, + "loss": 0.1608, + "step": 8509 + }, + { + "epoch": 0.43, + "grad_norm": 4.239622696551675, + "learning_rate": 1.2632574153946646e-05, + "loss": 0.1836, + "step": 8510 + }, + { + "epoch": 0.43, + "grad_norm": 1.0826608464510832, + "learning_rate": 1.263098524541119e-05, + "loss": 0.1777, + "step": 8511 + }, + { + "epoch": 0.43, + "grad_norm": 1.5205226845830233, + "learning_rate": 1.2629396265510113e-05, + "loss": 0.1969, + "step": 8512 + }, + { + "epoch": 0.43, + "grad_norm": 0.9138966349728019, + "learning_rate": 1.2627807214286514e-05, + "loss": 0.1927, + "step": 8513 + }, + { + "epoch": 0.43, + "grad_norm": 1.9706735417872003, + "learning_rate": 1.2626218091783496e-05, + "loss": 0.1932, + "step": 8514 + }, + { + "epoch": 0.43, + "grad_norm": 1.0667045902349253, + "learning_rate": 1.262462889804416e-05, + "loss": 0.1853, + "step": 8515 + }, + { + "epoch": 0.43, + "grad_norm": 0.9375839118068797, + "learning_rate": 1.2623039633111623e-05, + "loss": 0.1862, + "step": 8516 + }, + { + "epoch": 0.43, + "grad_norm": 0.9210059162958971, + "learning_rate": 1.2621450297028984e-05, + "loss": 0.1892, + "step": 8517 + }, + { + "epoch": 0.43, + "grad_norm": 1.321499182230024, + "learning_rate": 1.261986088983936e-05, + "loss": 0.1814, + "step": 8518 + }, + { + "epoch": 0.43, + "grad_norm": 1.2698154007000069, + "learning_rate": 1.2618271411585859e-05, + "loss": 0.1871, + "step": 8519 + }, + { + "epoch": 0.43, + "grad_norm": 0.8777129749240363, + "learning_rate": 1.26166818623116e-05, + "loss": 0.2097, + "step": 8520 + }, + { + "epoch": 0.43, + "grad_norm": 1.712414234101878, + "learning_rate": 1.2615092242059697e-05, + "loss": 0.1877, + "step": 8521 + }, + { + "epoch": 0.43, + "grad_norm": 1.0140691144950047, + "learning_rate": 1.2613502550873269e-05, + "loss": 0.1742, + "step": 8522 + }, + { + "epoch": 0.43, + "grad_norm": 1.5850344252998092, + "learning_rate": 1.2611912788795437e-05, + "loss": 0.2046, + "step": 8523 + }, + { + "epoch": 0.43, + "grad_norm": 1.329465016086951, + "learning_rate": 1.261032295586932e-05, + "loss": 0.1894, + "step": 8524 + }, + { + "epoch": 0.43, + "grad_norm": 0.8296339341431521, + "learning_rate": 1.260873305213805e-05, + "loss": 0.202, + "step": 8525 + }, + { + "epoch": 0.43, + "grad_norm": 0.9733137634418483, + "learning_rate": 1.2607143077644746e-05, + "loss": 0.194, + "step": 8526 + }, + { + "epoch": 0.43, + "grad_norm": 1.160550697515113, + "learning_rate": 1.260555303243254e-05, + "loss": 0.2085, + "step": 8527 + }, + { + "epoch": 0.43, + "grad_norm": 0.8414652418885628, + "learning_rate": 1.2603962916544558e-05, + "loss": 0.1943, + "step": 8528 + }, + { + "epoch": 0.43, + "grad_norm": 1.0728029220031154, + "learning_rate": 1.2602372730023938e-05, + "loss": 0.1731, + "step": 8529 + }, + { + "epoch": 0.43, + "grad_norm": 0.8849396662531649, + "learning_rate": 1.2600782472913811e-05, + "loss": 0.1975, + "step": 8530 + }, + { + "epoch": 0.43, + "grad_norm": 0.7267393674010236, + "learning_rate": 1.259919214525731e-05, + "loss": 0.1781, + "step": 8531 + }, + { + "epoch": 0.43, + "grad_norm": 0.8599653618866973, + "learning_rate": 1.2597601747097578e-05, + "loss": 0.212, + "step": 8532 + }, + { + "epoch": 0.43, + "grad_norm": 1.0116049169051977, + "learning_rate": 1.259601127847775e-05, + "loss": 0.1909, + "step": 8533 + }, + { + "epoch": 0.43, + "grad_norm": 1.0711190542073004, + "learning_rate": 1.259442073944097e-05, + "loss": 0.1807, + "step": 8534 + }, + { + "epoch": 0.43, + "grad_norm": 1.0228121141488749, + "learning_rate": 1.259283013003038e-05, + "loss": 0.2351, + "step": 8535 + }, + { + "epoch": 0.43, + "grad_norm": 0.9452815167442381, + "learning_rate": 1.2591239450289127e-05, + "loss": 0.2026, + "step": 8536 + }, + { + "epoch": 0.43, + "grad_norm": 0.9610403379351731, + "learning_rate": 1.2589648700260359e-05, + "loss": 0.221, + "step": 8537 + }, + { + "epoch": 0.43, + "grad_norm": 1.6244814059214097, + "learning_rate": 1.2588057879987223e-05, + "loss": 0.1703, + "step": 8538 + }, + { + "epoch": 0.43, + "grad_norm": 0.9149145552404859, + "learning_rate": 1.2586466989512872e-05, + "loss": 0.2023, + "step": 8539 + }, + { + "epoch": 0.43, + "grad_norm": 0.9812119245902255, + "learning_rate": 1.2584876028880455e-05, + "loss": 0.1755, + "step": 8540 + }, + { + "epoch": 0.43, + "grad_norm": 1.0256757321539571, + "learning_rate": 1.258328499813313e-05, + "loss": 0.1977, + "step": 8541 + }, + { + "epoch": 0.43, + "grad_norm": 1.0537374073758372, + "learning_rate": 1.2581693897314056e-05, + "loss": 0.1901, + "step": 8542 + }, + { + "epoch": 0.43, + "grad_norm": 0.9823156564187766, + "learning_rate": 1.2580102726466388e-05, + "loss": 0.2047, + "step": 8543 + }, + { + "epoch": 0.43, + "grad_norm": 1.1843880734627192, + "learning_rate": 1.2578511485633288e-05, + "loss": 0.1823, + "step": 8544 + }, + { + "epoch": 0.43, + "grad_norm": 2.93633345218317, + "learning_rate": 1.2576920174857917e-05, + "loss": 0.2008, + "step": 8545 + }, + { + "epoch": 0.43, + "grad_norm": 1.040963176406591, + "learning_rate": 1.2575328794183439e-05, + "loss": 0.1999, + "step": 8546 + }, + { + "epoch": 0.43, + "grad_norm": 1.0780583875307697, + "learning_rate": 1.2573737343653026e-05, + "loss": 0.2144, + "step": 8547 + }, + { + "epoch": 0.43, + "grad_norm": 0.9516385201913629, + "learning_rate": 1.257214582330984e-05, + "loss": 0.1737, + "step": 8548 + }, + { + "epoch": 0.43, + "grad_norm": 0.9613205197820011, + "learning_rate": 1.2570554233197054e-05, + "loss": 0.1744, + "step": 8549 + }, + { + "epoch": 0.43, + "grad_norm": 1.019145823439213, + "learning_rate": 1.2568962573357837e-05, + "loss": 0.2127, + "step": 8550 + }, + { + "epoch": 0.43, + "grad_norm": 0.873002514272877, + "learning_rate": 1.256737084383537e-05, + "loss": 0.2042, + "step": 8551 + }, + { + "epoch": 0.43, + "grad_norm": 1.050032016240503, + "learning_rate": 1.2565779044672821e-05, + "loss": 0.1894, + "step": 8552 + }, + { + "epoch": 0.43, + "grad_norm": 0.8068052044940806, + "learning_rate": 1.256418717591337e-05, + "loss": 0.1846, + "step": 8553 + }, + { + "epoch": 0.43, + "grad_norm": 1.3979721822608953, + "learning_rate": 1.25625952376002e-05, + "loss": 0.1842, + "step": 8554 + }, + { + "epoch": 0.44, + "grad_norm": 2.547408364045404, + "learning_rate": 1.2561003229776485e-05, + "loss": 0.182, + "step": 8555 + }, + { + "epoch": 0.44, + "grad_norm": 0.9049443458327565, + "learning_rate": 1.2559411152485414e-05, + "loss": 0.1992, + "step": 8556 + }, + { + "epoch": 0.44, + "grad_norm": 0.8407326410671497, + "learning_rate": 1.2557819005770174e-05, + "loss": 0.1639, + "step": 8557 + }, + { + "epoch": 0.44, + "grad_norm": 0.9925588954179055, + "learning_rate": 1.2556226789673946e-05, + "loss": 0.2002, + "step": 8558 + }, + { + "epoch": 0.44, + "grad_norm": 0.8053222521118775, + "learning_rate": 1.2554634504239923e-05, + "loss": 0.2065, + "step": 8559 + }, + { + "epoch": 0.44, + "grad_norm": 0.8122917580352957, + "learning_rate": 1.2553042149511295e-05, + "loss": 0.1815, + "step": 8560 + }, + { + "epoch": 0.44, + "grad_norm": 0.9270132598078689, + "learning_rate": 1.2551449725531254e-05, + "loss": 0.1895, + "step": 8561 + }, + { + "epoch": 0.44, + "grad_norm": 0.9402093869050032, + "learning_rate": 1.2549857232342995e-05, + "loss": 0.198, + "step": 8562 + }, + { + "epoch": 0.44, + "grad_norm": 0.9951990611606287, + "learning_rate": 1.2548264669989712e-05, + "loss": 0.2181, + "step": 8563 + }, + { + "epoch": 0.44, + "grad_norm": 1.195070825105065, + "learning_rate": 1.2546672038514608e-05, + "loss": 0.1918, + "step": 8564 + }, + { + "epoch": 0.44, + "grad_norm": 0.8772802211101891, + "learning_rate": 1.2545079337960883e-05, + "loss": 0.1969, + "step": 8565 + }, + { + "epoch": 0.44, + "grad_norm": 1.2160331913974665, + "learning_rate": 1.2543486568371736e-05, + "loss": 0.195, + "step": 8566 + }, + { + "epoch": 0.44, + "grad_norm": 2.994132559894767, + "learning_rate": 1.2541893729790374e-05, + "loss": 0.1901, + "step": 8567 + }, + { + "epoch": 0.44, + "grad_norm": 0.7510679233061346, + "learning_rate": 1.2540300822259996e-05, + "loss": 0.1633, + "step": 8568 + }, + { + "epoch": 0.44, + "grad_norm": 1.0775744359597887, + "learning_rate": 1.253870784582382e-05, + "loss": 0.1956, + "step": 8569 + }, + { + "epoch": 0.44, + "grad_norm": 1.0562615754525093, + "learning_rate": 1.2537114800525047e-05, + "loss": 0.182, + "step": 8570 + }, + { + "epoch": 0.44, + "grad_norm": 1.1632556247766093, + "learning_rate": 1.2535521686406892e-05, + "loss": 0.1942, + "step": 8571 + }, + { + "epoch": 0.44, + "grad_norm": 1.1858005068205464, + "learning_rate": 1.253392850351257e-05, + "loss": 0.2178, + "step": 8572 + }, + { + "epoch": 0.44, + "grad_norm": 0.9958119742587856, + "learning_rate": 1.2532335251885295e-05, + "loss": 0.1953, + "step": 8573 + }, + { + "epoch": 0.44, + "grad_norm": 1.1614657700601225, + "learning_rate": 1.253074193156828e-05, + "loss": 0.1933, + "step": 8574 + }, + { + "epoch": 0.44, + "grad_norm": 0.7794605022464707, + "learning_rate": 1.252914854260475e-05, + "loss": 0.1924, + "step": 8575 + }, + { + "epoch": 0.44, + "grad_norm": 8.52489821490543, + "learning_rate": 1.2527555085037919e-05, + "loss": 0.1658, + "step": 8576 + }, + { + "epoch": 0.44, + "grad_norm": 0.8420982181362046, + "learning_rate": 1.2525961558911018e-05, + "loss": 0.1941, + "step": 8577 + }, + { + "epoch": 0.44, + "grad_norm": 1.2260277438965748, + "learning_rate": 1.2524367964267264e-05, + "loss": 0.1826, + "step": 8578 + }, + { + "epoch": 0.44, + "grad_norm": 1.4095294544722203, + "learning_rate": 1.252277430114989e-05, + "loss": 0.2074, + "step": 8579 + }, + { + "epoch": 0.44, + "grad_norm": 1.062831301945892, + "learning_rate": 1.2521180569602117e-05, + "loss": 0.1782, + "step": 8580 + }, + { + "epoch": 0.44, + "grad_norm": 1.1477816960609704, + "learning_rate": 1.2519586769667178e-05, + "loss": 0.1647, + "step": 8581 + }, + { + "epoch": 0.44, + "grad_norm": 0.9151787324161524, + "learning_rate": 1.2517992901388308e-05, + "loss": 0.2288, + "step": 8582 + }, + { + "epoch": 0.44, + "grad_norm": 0.8072999718676849, + "learning_rate": 1.2516398964808735e-05, + "loss": 0.2063, + "step": 8583 + }, + { + "epoch": 0.44, + "grad_norm": 0.9536553411143446, + "learning_rate": 1.2514804959971703e-05, + "loss": 0.1889, + "step": 8584 + }, + { + "epoch": 0.44, + "grad_norm": 1.374498857012934, + "learning_rate": 1.251321088692044e-05, + "loss": 0.191, + "step": 8585 + }, + { + "epoch": 0.44, + "grad_norm": 0.7731476687954281, + "learning_rate": 1.2511616745698192e-05, + "loss": 0.1846, + "step": 8586 + }, + { + "epoch": 0.44, + "grad_norm": 1.0778842872188044, + "learning_rate": 1.2510022536348198e-05, + "loss": 0.1892, + "step": 8587 + }, + { + "epoch": 0.44, + "grad_norm": 0.9253449989932073, + "learning_rate": 1.2508428258913701e-05, + "loss": 0.187, + "step": 8588 + }, + { + "epoch": 0.44, + "grad_norm": 0.793090613678402, + "learning_rate": 1.2506833913437946e-05, + "loss": 0.1968, + "step": 8589 + }, + { + "epoch": 0.44, + "grad_norm": 1.019474888579202, + "learning_rate": 1.2505239499964179e-05, + "loss": 0.1851, + "step": 8590 + }, + { + "epoch": 0.44, + "grad_norm": 0.8630947482479875, + "learning_rate": 1.2503645018535649e-05, + "loss": 0.1793, + "step": 8591 + }, + { + "epoch": 0.44, + "grad_norm": 0.9521707788414885, + "learning_rate": 1.2502050469195609e-05, + "loss": 0.1933, + "step": 8592 + }, + { + "epoch": 0.44, + "grad_norm": 1.2404247773503827, + "learning_rate": 1.2500455851987306e-05, + "loss": 0.1703, + "step": 8593 + }, + { + "epoch": 0.44, + "grad_norm": 1.1472370304032953, + "learning_rate": 1.2498861166953995e-05, + "loss": 0.182, + "step": 8594 + }, + { + "epoch": 0.44, + "grad_norm": 0.9263589657793647, + "learning_rate": 1.2497266414138935e-05, + "loss": 0.2068, + "step": 8595 + }, + { + "epoch": 0.44, + "grad_norm": 1.0346555947683853, + "learning_rate": 1.2495671593585384e-05, + "loss": 0.182, + "step": 8596 + }, + { + "epoch": 0.44, + "grad_norm": 1.2960625843711235, + "learning_rate": 1.2494076705336599e-05, + "loss": 0.1918, + "step": 8597 + }, + { + "epoch": 0.44, + "grad_norm": 0.9689840253768952, + "learning_rate": 1.249248174943584e-05, + "loss": 0.2027, + "step": 8598 + }, + { + "epoch": 0.44, + "grad_norm": 0.8396890836603846, + "learning_rate": 1.2490886725926376e-05, + "loss": 0.1822, + "step": 8599 + }, + { + "epoch": 0.44, + "grad_norm": 0.7729532838421248, + "learning_rate": 1.248929163485147e-05, + "loss": 0.191, + "step": 8600 + }, + { + "epoch": 0.44, + "grad_norm": 1.332863405617956, + "learning_rate": 1.2487696476254385e-05, + "loss": 0.1975, + "step": 8601 + }, + { + "epoch": 0.44, + "grad_norm": 0.8057277936675971, + "learning_rate": 1.2486101250178394e-05, + "loss": 0.1848, + "step": 8602 + }, + { + "epoch": 0.44, + "grad_norm": 0.8889873160686912, + "learning_rate": 1.2484505956666765e-05, + "loss": 0.1798, + "step": 8603 + }, + { + "epoch": 0.44, + "grad_norm": 0.7659955529390862, + "learning_rate": 1.2482910595762774e-05, + "loss": 0.1898, + "step": 8604 + }, + { + "epoch": 0.44, + "grad_norm": 1.3749674199455812, + "learning_rate": 1.2481315167509691e-05, + "loss": 0.2143, + "step": 8605 + }, + { + "epoch": 0.44, + "grad_norm": 1.045463034779189, + "learning_rate": 1.2479719671950794e-05, + "loss": 0.1893, + "step": 8606 + }, + { + "epoch": 0.44, + "grad_norm": 0.957490242738464, + "learning_rate": 1.247812410912936e-05, + "loss": 0.1875, + "step": 8607 + }, + { + "epoch": 0.44, + "grad_norm": 0.8685192921852094, + "learning_rate": 1.2476528479088672e-05, + "loss": 0.1966, + "step": 8608 + }, + { + "epoch": 0.44, + "grad_norm": 0.7153859251377935, + "learning_rate": 1.247493278187201e-05, + "loss": 0.166, + "step": 8609 + }, + { + "epoch": 0.44, + "grad_norm": 0.9957059549307132, + "learning_rate": 1.2473337017522653e-05, + "loss": 0.1893, + "step": 8610 + }, + { + "epoch": 0.44, + "grad_norm": 0.8473339655405635, + "learning_rate": 1.247174118608389e-05, + "loss": 0.2085, + "step": 8611 + }, + { + "epoch": 0.44, + "grad_norm": 0.9887067152293455, + "learning_rate": 1.247014528759901e-05, + "loss": 0.202, + "step": 8612 + }, + { + "epoch": 0.44, + "grad_norm": 0.7880322623898394, + "learning_rate": 1.2468549322111299e-05, + "loss": 0.1973, + "step": 8613 + }, + { + "epoch": 0.44, + "grad_norm": 1.2680939216214728, + "learning_rate": 1.2466953289664047e-05, + "loss": 0.2221, + "step": 8614 + }, + { + "epoch": 0.44, + "grad_norm": 0.7955595735621311, + "learning_rate": 1.246535719030055e-05, + "loss": 0.2079, + "step": 8615 + }, + { + "epoch": 0.44, + "grad_norm": 0.9892586690670219, + "learning_rate": 1.2463761024064093e-05, + "loss": 0.1772, + "step": 8616 + }, + { + "epoch": 0.44, + "grad_norm": 0.8184148596800643, + "learning_rate": 1.2462164790997986e-05, + "loss": 0.1804, + "step": 8617 + }, + { + "epoch": 0.44, + "grad_norm": 1.0393142650583775, + "learning_rate": 1.246056849114552e-05, + "loss": 0.1957, + "step": 8618 + }, + { + "epoch": 0.44, + "grad_norm": 0.7627059001636234, + "learning_rate": 1.2458972124549993e-05, + "loss": 0.1964, + "step": 8619 + }, + { + "epoch": 0.44, + "grad_norm": 0.9186241307525763, + "learning_rate": 1.2457375691254707e-05, + "loss": 0.1814, + "step": 8620 + }, + { + "epoch": 0.44, + "grad_norm": 0.9818593160010094, + "learning_rate": 1.2455779191302968e-05, + "loss": 0.2123, + "step": 8621 + }, + { + "epoch": 0.44, + "grad_norm": 0.9572114580336248, + "learning_rate": 1.2454182624738079e-05, + "loss": 0.2078, + "step": 8622 + }, + { + "epoch": 0.44, + "grad_norm": 0.9285814703568301, + "learning_rate": 1.2452585991603347e-05, + "loss": 0.2018, + "step": 8623 + }, + { + "epoch": 0.44, + "grad_norm": 0.8095708824494355, + "learning_rate": 1.2450989291942084e-05, + "loss": 0.1798, + "step": 8624 + }, + { + "epoch": 0.44, + "grad_norm": 0.8823515473747471, + "learning_rate": 1.2449392525797597e-05, + "loss": 0.1979, + "step": 8625 + }, + { + "epoch": 0.44, + "grad_norm": 0.8613819549391174, + "learning_rate": 1.24477956932132e-05, + "loss": 0.1853, + "step": 8626 + }, + { + "epoch": 0.44, + "grad_norm": 0.8301435562156122, + "learning_rate": 1.2446198794232206e-05, + "loss": 0.2087, + "step": 8627 + }, + { + "epoch": 0.44, + "grad_norm": 0.9817789102752862, + "learning_rate": 1.2444601828897932e-05, + "loss": 0.1995, + "step": 8628 + }, + { + "epoch": 0.44, + "grad_norm": 0.8351663415207485, + "learning_rate": 1.2443004797253692e-05, + "loss": 0.1902, + "step": 8629 + }, + { + "epoch": 0.44, + "grad_norm": 1.132073956766278, + "learning_rate": 1.244140769934281e-05, + "loss": 0.2077, + "step": 8630 + }, + { + "epoch": 0.44, + "grad_norm": 1.7589643983073007, + "learning_rate": 1.243981053520861e-05, + "loss": 0.2104, + "step": 8631 + }, + { + "epoch": 0.44, + "grad_norm": 0.8034555252532624, + "learning_rate": 1.2438213304894408e-05, + "loss": 0.1927, + "step": 8632 + }, + { + "epoch": 0.44, + "grad_norm": 1.1157291738011732, + "learning_rate": 1.2436616008443534e-05, + "loss": 0.1892, + "step": 8633 + }, + { + "epoch": 0.44, + "grad_norm": 0.8209965863930293, + "learning_rate": 1.2435018645899312e-05, + "loss": 0.1695, + "step": 8634 + }, + { + "epoch": 0.44, + "grad_norm": 0.8722595948991724, + "learning_rate": 1.2433421217305072e-05, + "loss": 0.1867, + "step": 8635 + }, + { + "epoch": 0.44, + "grad_norm": 1.0124487539884681, + "learning_rate": 1.2431823722704147e-05, + "loss": 0.1798, + "step": 8636 + }, + { + "epoch": 0.44, + "grad_norm": 0.7754073159722372, + "learning_rate": 1.2430226162139863e-05, + "loss": 0.1993, + "step": 8637 + }, + { + "epoch": 0.44, + "grad_norm": 0.7968838934251788, + "learning_rate": 1.2428628535655557e-05, + "loss": 0.1781, + "step": 8638 + }, + { + "epoch": 0.44, + "grad_norm": 1.0162402030663464, + "learning_rate": 1.2427030843294562e-05, + "loss": 0.2094, + "step": 8639 + }, + { + "epoch": 0.44, + "grad_norm": 0.9488292199973177, + "learning_rate": 1.2425433085100224e-05, + "loss": 0.2202, + "step": 8640 + }, + { + "epoch": 0.44, + "grad_norm": 0.8235359537085621, + "learning_rate": 1.2423835261115875e-05, + "loss": 0.1932, + "step": 8641 + }, + { + "epoch": 0.44, + "grad_norm": 1.0966483166359258, + "learning_rate": 1.2422237371384857e-05, + "loss": 0.2221, + "step": 8642 + }, + { + "epoch": 0.44, + "grad_norm": 0.7067805471174222, + "learning_rate": 1.2420639415950512e-05, + "loss": 0.2121, + "step": 8643 + }, + { + "epoch": 0.44, + "grad_norm": 0.9066505172572226, + "learning_rate": 1.241904139485619e-05, + "loss": 0.1914, + "step": 8644 + }, + { + "epoch": 0.44, + "grad_norm": 0.8482878763826599, + "learning_rate": 1.2417443308145231e-05, + "loss": 0.2082, + "step": 8645 + }, + { + "epoch": 0.44, + "grad_norm": 0.902057632643992, + "learning_rate": 1.2415845155860985e-05, + "loss": 0.2329, + "step": 8646 + }, + { + "epoch": 0.44, + "grad_norm": 0.8031248362414674, + "learning_rate": 1.2414246938046803e-05, + "loss": 0.1731, + "step": 8647 + }, + { + "epoch": 0.44, + "grad_norm": 1.212999570557578, + "learning_rate": 1.2412648654746038e-05, + "loss": 0.1741, + "step": 8648 + }, + { + "epoch": 0.44, + "grad_norm": 1.054499635574648, + "learning_rate": 1.2411050306002042e-05, + "loss": 0.1829, + "step": 8649 + }, + { + "epoch": 0.44, + "grad_norm": 0.9135542379212512, + "learning_rate": 1.240945189185817e-05, + "loss": 0.1867, + "step": 8650 + }, + { + "epoch": 0.44, + "grad_norm": 1.405275321330997, + "learning_rate": 1.2407853412357775e-05, + "loss": 0.1889, + "step": 8651 + }, + { + "epoch": 0.44, + "grad_norm": 1.0048655495585312, + "learning_rate": 1.2406254867544225e-05, + "loss": 0.2119, + "step": 8652 + }, + { + "epoch": 0.44, + "grad_norm": 1.0534382352091345, + "learning_rate": 1.2404656257460875e-05, + "loss": 0.1997, + "step": 8653 + }, + { + "epoch": 0.44, + "grad_norm": 0.8011986524483066, + "learning_rate": 1.2403057582151088e-05, + "loss": 0.2032, + "step": 8654 + }, + { + "epoch": 0.44, + "grad_norm": 1.1453188395860927, + "learning_rate": 1.2401458841658227e-05, + "loss": 0.2022, + "step": 8655 + }, + { + "epoch": 0.44, + "grad_norm": 0.9429234081767248, + "learning_rate": 1.239986003602566e-05, + "loss": 0.1846, + "step": 8656 + }, + { + "epoch": 0.44, + "grad_norm": 1.2492399758575994, + "learning_rate": 1.2398261165296755e-05, + "loss": 0.186, + "step": 8657 + }, + { + "epoch": 0.44, + "grad_norm": 1.1247829996000132, + "learning_rate": 1.239666222951488e-05, + "loss": 0.1907, + "step": 8658 + }, + { + "epoch": 0.44, + "grad_norm": 0.9308794957701302, + "learning_rate": 1.2395063228723405e-05, + "loss": 0.1974, + "step": 8659 + }, + { + "epoch": 0.44, + "grad_norm": 0.8476358733321918, + "learning_rate": 1.2393464162965708e-05, + "loss": 0.2024, + "step": 8660 + }, + { + "epoch": 0.44, + "grad_norm": 0.8318120223195576, + "learning_rate": 1.239186503228516e-05, + "loss": 0.1754, + "step": 8661 + }, + { + "epoch": 0.44, + "grad_norm": 0.9189154775984179, + "learning_rate": 1.2390265836725136e-05, + "loss": 0.1938, + "step": 8662 + }, + { + "epoch": 0.44, + "grad_norm": 0.8411064285076155, + "learning_rate": 1.2388666576329016e-05, + "loss": 0.2287, + "step": 8663 + }, + { + "epoch": 0.44, + "grad_norm": 1.0606276167362687, + "learning_rate": 1.2387067251140178e-05, + "loss": 0.1818, + "step": 8664 + }, + { + "epoch": 0.44, + "grad_norm": 1.6362813671613476, + "learning_rate": 1.238546786120201e-05, + "loss": 0.2011, + "step": 8665 + }, + { + "epoch": 0.44, + "grad_norm": 1.162047843891554, + "learning_rate": 1.238386840655789e-05, + "loss": 0.1743, + "step": 8666 + }, + { + "epoch": 0.44, + "grad_norm": 0.8695016899684824, + "learning_rate": 1.2382268887251207e-05, + "loss": 0.1849, + "step": 8667 + }, + { + "epoch": 0.44, + "grad_norm": 0.9264986670392068, + "learning_rate": 1.2380669303325346e-05, + "loss": 0.1989, + "step": 8668 + }, + { + "epoch": 0.44, + "grad_norm": 1.1435490303453728, + "learning_rate": 1.237906965482369e-05, + "loss": 0.2015, + "step": 8669 + }, + { + "epoch": 0.44, + "grad_norm": 0.7978001269514786, + "learning_rate": 1.2377469941789639e-05, + "loss": 0.1993, + "step": 8670 + }, + { + "epoch": 0.44, + "grad_norm": 1.1432618438727802, + "learning_rate": 1.2375870164266584e-05, + "loss": 0.1714, + "step": 8671 + }, + { + "epoch": 0.44, + "grad_norm": 0.9594113068655842, + "learning_rate": 1.2374270322297912e-05, + "loss": 0.1852, + "step": 8672 + }, + { + "epoch": 0.44, + "grad_norm": 0.7286130896672868, + "learning_rate": 1.2372670415927023e-05, + "loss": 0.1797, + "step": 8673 + }, + { + "epoch": 0.44, + "grad_norm": 0.938152450920805, + "learning_rate": 1.2371070445197319e-05, + "loss": 0.2, + "step": 8674 + }, + { + "epoch": 0.44, + "grad_norm": 0.8560302080257822, + "learning_rate": 1.2369470410152195e-05, + "loss": 0.1802, + "step": 8675 + }, + { + "epoch": 0.44, + "grad_norm": 0.923050463393855, + "learning_rate": 1.236787031083505e-05, + "loss": 0.2041, + "step": 8676 + }, + { + "epoch": 0.44, + "grad_norm": 0.9821030954763804, + "learning_rate": 1.2366270147289292e-05, + "loss": 0.1782, + "step": 8677 + }, + { + "epoch": 0.44, + "grad_norm": 1.9208630447896724, + "learning_rate": 1.2364669919558321e-05, + "loss": 0.1772, + "step": 8678 + }, + { + "epoch": 0.44, + "grad_norm": 0.8104379453709678, + "learning_rate": 1.2363069627685545e-05, + "loss": 0.1839, + "step": 8679 + }, + { + "epoch": 0.44, + "grad_norm": 0.8191806600272565, + "learning_rate": 1.2361469271714372e-05, + "loss": 0.1912, + "step": 8680 + }, + { + "epoch": 0.44, + "grad_norm": 0.9325590163572611, + "learning_rate": 1.235986885168821e-05, + "loss": 0.1951, + "step": 8681 + }, + { + "epoch": 0.44, + "grad_norm": 1.0513257766154047, + "learning_rate": 1.2358268367650472e-05, + "loss": 0.1946, + "step": 8682 + }, + { + "epoch": 0.44, + "grad_norm": 1.2444637911653882, + "learning_rate": 1.2356667819644575e-05, + "loss": 0.1932, + "step": 8683 + }, + { + "epoch": 0.44, + "grad_norm": 0.9103909492122905, + "learning_rate": 1.2355067207713927e-05, + "loss": 0.1942, + "step": 8684 + }, + { + "epoch": 0.44, + "grad_norm": 0.9457849858116818, + "learning_rate": 1.235346653190195e-05, + "loss": 0.2188, + "step": 8685 + }, + { + "epoch": 0.44, + "grad_norm": 0.9842814315695442, + "learning_rate": 1.2351865792252056e-05, + "loss": 0.1956, + "step": 8686 + }, + { + "epoch": 0.44, + "grad_norm": 0.8901122886363906, + "learning_rate": 1.2350264988807674e-05, + "loss": 0.1979, + "step": 8687 + }, + { + "epoch": 0.44, + "grad_norm": 0.7798350694565372, + "learning_rate": 1.2348664121612219e-05, + "loss": 0.2042, + "step": 8688 + }, + { + "epoch": 0.44, + "grad_norm": 0.9090186696452542, + "learning_rate": 1.234706319070912e-05, + "loss": 0.185, + "step": 8689 + }, + { + "epoch": 0.44, + "grad_norm": 0.7624892317314006, + "learning_rate": 1.2345462196141797e-05, + "loss": 0.1861, + "step": 8690 + }, + { + "epoch": 0.44, + "grad_norm": 1.3538646256449431, + "learning_rate": 1.2343861137953678e-05, + "loss": 0.1867, + "step": 8691 + }, + { + "epoch": 0.44, + "grad_norm": 0.9224124338770778, + "learning_rate": 1.2342260016188197e-05, + "loss": 0.1833, + "step": 8692 + }, + { + "epoch": 0.44, + "grad_norm": 0.8073268179435598, + "learning_rate": 1.234065883088878e-05, + "loss": 0.1896, + "step": 8693 + }, + { + "epoch": 0.44, + "grad_norm": 0.8844325994489721, + "learning_rate": 1.2339057582098859e-05, + "loss": 0.1878, + "step": 8694 + }, + { + "epoch": 0.44, + "grad_norm": 0.91483931694918, + "learning_rate": 1.233745626986187e-05, + "loss": 0.2062, + "step": 8695 + }, + { + "epoch": 0.44, + "grad_norm": 1.1966454284281745, + "learning_rate": 1.2335854894221247e-05, + "loss": 0.1903, + "step": 8696 + }, + { + "epoch": 0.44, + "grad_norm": 1.0333757994680617, + "learning_rate": 1.2334253455220429e-05, + "loss": 0.1973, + "step": 8697 + }, + { + "epoch": 0.44, + "grad_norm": 0.8421908931074247, + "learning_rate": 1.2332651952902852e-05, + "loss": 0.1971, + "step": 8698 + }, + { + "epoch": 0.44, + "grad_norm": 0.9574744317950326, + "learning_rate": 1.2331050387311957e-05, + "loss": 0.2113, + "step": 8699 + }, + { + "epoch": 0.44, + "grad_norm": 0.9223770881917542, + "learning_rate": 1.2329448758491195e-05, + "loss": 0.1758, + "step": 8700 + }, + { + "epoch": 0.44, + "grad_norm": 0.9606924406565363, + "learning_rate": 1.2327847066484e-05, + "loss": 0.178, + "step": 8701 + }, + { + "epoch": 0.44, + "grad_norm": 1.2263409336211948, + "learning_rate": 1.2326245311333823e-05, + "loss": 0.1908, + "step": 8702 + }, + { + "epoch": 0.44, + "grad_norm": 2.184016379817194, + "learning_rate": 1.232464349308411e-05, + "loss": 0.1893, + "step": 8703 + }, + { + "epoch": 0.44, + "grad_norm": 0.8384118727319358, + "learning_rate": 1.2323041611778309e-05, + "loss": 0.2002, + "step": 8704 + }, + { + "epoch": 0.44, + "grad_norm": 0.9282092440214317, + "learning_rate": 1.2321439667459876e-05, + "loss": 0.1757, + "step": 8705 + }, + { + "epoch": 0.44, + "grad_norm": 0.7959667284276463, + "learning_rate": 1.2319837660172258e-05, + "loss": 0.1889, + "step": 8706 + }, + { + "epoch": 0.44, + "grad_norm": 0.750400860745341, + "learning_rate": 1.2318235589958916e-05, + "loss": 0.1915, + "step": 8707 + }, + { + "epoch": 0.44, + "grad_norm": 0.7735865678129724, + "learning_rate": 1.2316633456863299e-05, + "loss": 0.1712, + "step": 8708 + }, + { + "epoch": 0.44, + "grad_norm": 0.8724522023035849, + "learning_rate": 1.2315031260928872e-05, + "loss": 0.2235, + "step": 8709 + }, + { + "epoch": 0.44, + "grad_norm": 0.915554410318466, + "learning_rate": 1.2313429002199088e-05, + "loss": 0.2089, + "step": 8710 + }, + { + "epoch": 0.44, + "grad_norm": 0.78490783447735, + "learning_rate": 1.2311826680717416e-05, + "loss": 0.1776, + "step": 8711 + }, + { + "epoch": 0.44, + "grad_norm": 0.8774957388654196, + "learning_rate": 1.231022429652731e-05, + "loss": 0.1975, + "step": 8712 + }, + { + "epoch": 0.44, + "grad_norm": 0.775234896911333, + "learning_rate": 1.2308621849672244e-05, + "loss": 0.1829, + "step": 8713 + }, + { + "epoch": 0.44, + "grad_norm": 0.9202591567715592, + "learning_rate": 1.2307019340195679e-05, + "loss": 0.2139, + "step": 8714 + }, + { + "epoch": 0.44, + "grad_norm": 0.9882359732294717, + "learning_rate": 1.2305416768141082e-05, + "loss": 0.2124, + "step": 8715 + }, + { + "epoch": 0.44, + "grad_norm": 1.443374205415701, + "learning_rate": 1.2303814133551926e-05, + "loss": 0.1838, + "step": 8716 + }, + { + "epoch": 0.44, + "grad_norm": 0.8106286511053232, + "learning_rate": 1.230221143647168e-05, + "loss": 0.1945, + "step": 8717 + }, + { + "epoch": 0.44, + "grad_norm": 1.2442072877651187, + "learning_rate": 1.230060867694382e-05, + "loss": 0.1881, + "step": 8718 + }, + { + "epoch": 0.44, + "grad_norm": 0.9047866963976102, + "learning_rate": 1.229900585501182e-05, + "loss": 0.1872, + "step": 8719 + }, + { + "epoch": 0.44, + "grad_norm": 1.072887532874953, + "learning_rate": 1.2297402970719157e-05, + "loss": 0.2219, + "step": 8720 + }, + { + "epoch": 0.44, + "grad_norm": 0.8320286552867004, + "learning_rate": 1.2295800024109306e-05, + "loss": 0.2063, + "step": 8721 + }, + { + "epoch": 0.44, + "grad_norm": 0.9128948927663452, + "learning_rate": 1.2294197015225751e-05, + "loss": 0.1819, + "step": 8722 + }, + { + "epoch": 0.44, + "grad_norm": 0.7655232053293657, + "learning_rate": 1.2292593944111972e-05, + "loss": 0.1741, + "step": 8723 + }, + { + "epoch": 0.44, + "grad_norm": 1.0419077231983973, + "learning_rate": 1.2290990810811456e-05, + "loss": 0.1952, + "step": 8724 + }, + { + "epoch": 0.44, + "grad_norm": 0.7758100286467379, + "learning_rate": 1.2289387615367684e-05, + "loss": 0.1884, + "step": 8725 + }, + { + "epoch": 0.44, + "grad_norm": 0.7020149836767687, + "learning_rate": 1.2287784357824138e-05, + "loss": 0.2069, + "step": 8726 + }, + { + "epoch": 0.44, + "grad_norm": 0.8239198097590726, + "learning_rate": 1.2286181038224316e-05, + "loss": 0.1886, + "step": 8727 + }, + { + "epoch": 0.44, + "grad_norm": 0.928750257315147, + "learning_rate": 1.2284577656611706e-05, + "loss": 0.1989, + "step": 8728 + }, + { + "epoch": 0.44, + "grad_norm": 1.0055916699279317, + "learning_rate": 1.2282974213029797e-05, + "loss": 0.1863, + "step": 8729 + }, + { + "epoch": 0.44, + "grad_norm": 0.8478829767427825, + "learning_rate": 1.2281370707522083e-05, + "loss": 0.2072, + "step": 8730 + }, + { + "epoch": 0.44, + "grad_norm": 0.765500143011328, + "learning_rate": 1.2279767140132059e-05, + "loss": 0.21, + "step": 8731 + }, + { + "epoch": 0.44, + "grad_norm": 1.8223953910269926, + "learning_rate": 1.2278163510903222e-05, + "loss": 0.2074, + "step": 8732 + }, + { + "epoch": 0.44, + "grad_norm": 1.0654584856574105, + "learning_rate": 1.2276559819879075e-05, + "loss": 0.2128, + "step": 8733 + }, + { + "epoch": 0.44, + "grad_norm": 0.9679552401237536, + "learning_rate": 1.227495606710311e-05, + "loss": 0.2025, + "step": 8734 + }, + { + "epoch": 0.44, + "grad_norm": 0.9519636449630081, + "learning_rate": 1.2273352252618834e-05, + "loss": 0.1731, + "step": 8735 + }, + { + "epoch": 0.44, + "grad_norm": 0.7244513931276524, + "learning_rate": 1.2271748376469753e-05, + "loss": 0.1691, + "step": 8736 + }, + { + "epoch": 0.44, + "grad_norm": 1.253965355212945, + "learning_rate": 1.2270144438699365e-05, + "loss": 0.1948, + "step": 8737 + }, + { + "epoch": 0.44, + "grad_norm": 0.8691465810010405, + "learning_rate": 1.2268540439351183e-05, + "loss": 0.2224, + "step": 8738 + }, + { + "epoch": 0.44, + "grad_norm": 0.9405752587134205, + "learning_rate": 1.226693637846871e-05, + "loss": 0.1693, + "step": 8739 + }, + { + "epoch": 0.44, + "grad_norm": 0.9853075377180287, + "learning_rate": 1.2265332256095463e-05, + "loss": 0.1713, + "step": 8740 + }, + { + "epoch": 0.44, + "grad_norm": 0.899562324586662, + "learning_rate": 1.2263728072274952e-05, + "loss": 0.1964, + "step": 8741 + }, + { + "epoch": 0.44, + "grad_norm": 0.8181308450571411, + "learning_rate": 1.2262123827050686e-05, + "loss": 0.1864, + "step": 8742 + }, + { + "epoch": 0.44, + "grad_norm": 0.8100402362729071, + "learning_rate": 1.2260519520466185e-05, + "loss": 0.2008, + "step": 8743 + }, + { + "epoch": 0.44, + "grad_norm": 0.7879823698755953, + "learning_rate": 1.2258915152564964e-05, + "loss": 0.1935, + "step": 8744 + }, + { + "epoch": 0.44, + "grad_norm": 0.8888512146131431, + "learning_rate": 1.2257310723390541e-05, + "loss": 0.1891, + "step": 8745 + }, + { + "epoch": 0.44, + "grad_norm": 1.0620541290396086, + "learning_rate": 1.2255706232986438e-05, + "loss": 0.184, + "step": 8746 + }, + { + "epoch": 0.44, + "grad_norm": 0.8531629248382205, + "learning_rate": 1.2254101681396177e-05, + "loss": 0.1811, + "step": 8747 + }, + { + "epoch": 0.44, + "grad_norm": 0.7726202393329789, + "learning_rate": 1.2252497068663281e-05, + "loss": 0.2278, + "step": 8748 + }, + { + "epoch": 0.44, + "grad_norm": 1.094567252642259, + "learning_rate": 1.2250892394831272e-05, + "loss": 0.2069, + "step": 8749 + }, + { + "epoch": 0.44, + "grad_norm": 0.9823153214494921, + "learning_rate": 1.2249287659943682e-05, + "loss": 0.2044, + "step": 8750 + }, + { + "epoch": 0.45, + "grad_norm": 1.0498614567686715, + "learning_rate": 1.2247682864044037e-05, + "loss": 0.1827, + "step": 8751 + }, + { + "epoch": 0.45, + "grad_norm": 0.7545538618223903, + "learning_rate": 1.2246078007175866e-05, + "loss": 0.1968, + "step": 8752 + }, + { + "epoch": 0.45, + "grad_norm": 1.061379383257261, + "learning_rate": 1.2244473089382702e-05, + "loss": 0.2101, + "step": 8753 + }, + { + "epoch": 0.45, + "grad_norm": 0.9009511671275275, + "learning_rate": 1.2242868110708079e-05, + "loss": 0.1945, + "step": 8754 + }, + { + "epoch": 0.45, + "grad_norm": 0.790125825791761, + "learning_rate": 1.2241263071195535e-05, + "loss": 0.1818, + "step": 8755 + }, + { + "epoch": 0.45, + "grad_norm": 0.8315887432214394, + "learning_rate": 1.2239657970888598e-05, + "loss": 0.1918, + "step": 8756 + }, + { + "epoch": 0.45, + "grad_norm": 2.2449738158638226, + "learning_rate": 1.2238052809830816e-05, + "loss": 0.1872, + "step": 8757 + }, + { + "epoch": 0.45, + "grad_norm": 0.867437863360933, + "learning_rate": 1.2236447588065723e-05, + "loss": 0.1945, + "step": 8758 + }, + { + "epoch": 0.45, + "grad_norm": 1.0749293306637857, + "learning_rate": 1.2234842305636865e-05, + "loss": 0.2122, + "step": 8759 + }, + { + "epoch": 0.45, + "grad_norm": 1.0562899061895583, + "learning_rate": 1.2233236962587782e-05, + "loss": 0.1708, + "step": 8760 + }, + { + "epoch": 0.45, + "grad_norm": 1.034263512526097, + "learning_rate": 1.223163155896202e-05, + "loss": 0.195, + "step": 8761 + }, + { + "epoch": 0.45, + "grad_norm": 1.5855367010272412, + "learning_rate": 1.2230026094803127e-05, + "loss": 0.2104, + "step": 8762 + }, + { + "epoch": 0.45, + "grad_norm": 0.7252618781768035, + "learning_rate": 1.2228420570154649e-05, + "loss": 0.1752, + "step": 8763 + }, + { + "epoch": 0.45, + "grad_norm": 1.0131144932423213, + "learning_rate": 1.222681498506014e-05, + "loss": 0.2102, + "step": 8764 + }, + { + "epoch": 0.45, + "grad_norm": 0.8044676222681848, + "learning_rate": 1.2225209339563144e-05, + "loss": 0.167, + "step": 8765 + }, + { + "epoch": 0.45, + "grad_norm": 0.867224693956808, + "learning_rate": 1.2223603633707224e-05, + "loss": 0.1935, + "step": 8766 + }, + { + "epoch": 0.45, + "grad_norm": 1.339476780240116, + "learning_rate": 1.2221997867535926e-05, + "loss": 0.1887, + "step": 8767 + }, + { + "epoch": 0.45, + "grad_norm": 0.8640242341151145, + "learning_rate": 1.2220392041092813e-05, + "loss": 0.2016, + "step": 8768 + }, + { + "epoch": 0.45, + "grad_norm": 1.1395164976092698, + "learning_rate": 1.2218786154421439e-05, + "loss": 0.1965, + "step": 8769 + }, + { + "epoch": 0.45, + "grad_norm": 1.0955555345668755, + "learning_rate": 1.2217180207565365e-05, + "loss": 0.1814, + "step": 8770 + }, + { + "epoch": 0.45, + "grad_norm": 1.080909186673661, + "learning_rate": 1.2215574200568155e-05, + "loss": 0.2092, + "step": 8771 + }, + { + "epoch": 0.45, + "grad_norm": 0.868806952837568, + "learning_rate": 1.2213968133473366e-05, + "loss": 0.2031, + "step": 8772 + }, + { + "epoch": 0.45, + "grad_norm": 1.4526469492698382, + "learning_rate": 1.221236200632457e-05, + "loss": 0.1876, + "step": 8773 + }, + { + "epoch": 0.45, + "grad_norm": 2.78784617514379, + "learning_rate": 1.2210755819165325e-05, + "loss": 0.1934, + "step": 8774 + }, + { + "epoch": 0.45, + "grad_norm": 1.1377878874747358, + "learning_rate": 1.2209149572039207e-05, + "loss": 0.2003, + "step": 8775 + }, + { + "epoch": 0.45, + "grad_norm": 1.1566898421232799, + "learning_rate": 1.220754326498978e-05, + "loss": 0.188, + "step": 8776 + }, + { + "epoch": 0.45, + "grad_norm": 1.4577368626043437, + "learning_rate": 1.2205936898060619e-05, + "loss": 0.2484, + "step": 8777 + }, + { + "epoch": 0.45, + "grad_norm": 1.076111363988673, + "learning_rate": 1.2204330471295296e-05, + "loss": 0.1786, + "step": 8778 + }, + { + "epoch": 0.45, + "grad_norm": 1.2501558204678522, + "learning_rate": 1.2202723984737381e-05, + "loss": 0.1708, + "step": 8779 + }, + { + "epoch": 0.45, + "grad_norm": 0.9423711066705222, + "learning_rate": 1.2201117438430456e-05, + "loss": 0.1917, + "step": 8780 + }, + { + "epoch": 0.45, + "grad_norm": 1.069649524013828, + "learning_rate": 1.2199510832418095e-05, + "loss": 0.1663, + "step": 8781 + }, + { + "epoch": 0.45, + "grad_norm": 1.181212300392255, + "learning_rate": 1.2197904166743882e-05, + "loss": 0.1815, + "step": 8782 + }, + { + "epoch": 0.45, + "grad_norm": 0.7986369608289404, + "learning_rate": 1.2196297441451392e-05, + "loss": 0.179, + "step": 8783 + }, + { + "epoch": 0.45, + "grad_norm": 0.9746185125165827, + "learning_rate": 1.2194690656584209e-05, + "loss": 0.2062, + "step": 8784 + }, + { + "epoch": 0.45, + "grad_norm": 0.8134645721588003, + "learning_rate": 1.2193083812185919e-05, + "loss": 0.1861, + "step": 8785 + }, + { + "epoch": 0.45, + "grad_norm": 1.052563619953177, + "learning_rate": 1.2191476908300107e-05, + "loss": 0.1774, + "step": 8786 + }, + { + "epoch": 0.45, + "grad_norm": 0.9659584698227893, + "learning_rate": 1.2189869944970356e-05, + "loss": 0.1747, + "step": 8787 + }, + { + "epoch": 0.45, + "grad_norm": 0.7230070985388103, + "learning_rate": 1.2188262922240263e-05, + "loss": 0.2043, + "step": 8788 + }, + { + "epoch": 0.45, + "grad_norm": 1.0231134064007024, + "learning_rate": 1.2186655840153413e-05, + "loss": 0.1926, + "step": 8789 + }, + { + "epoch": 0.45, + "grad_norm": 0.8286619944444407, + "learning_rate": 1.2185048698753403e-05, + "loss": 0.1684, + "step": 8790 + }, + { + "epoch": 0.45, + "grad_norm": 0.9708291745003088, + "learning_rate": 1.2183441498083821e-05, + "loss": 0.1807, + "step": 8791 + }, + { + "epoch": 0.45, + "grad_norm": 0.7781997192222662, + "learning_rate": 1.2181834238188264e-05, + "loss": 0.1757, + "step": 8792 + }, + { + "epoch": 0.45, + "grad_norm": 0.9225471816554169, + "learning_rate": 1.2180226919110332e-05, + "loss": 0.1986, + "step": 8793 + }, + { + "epoch": 0.45, + "grad_norm": 0.9767934398909636, + "learning_rate": 1.2178619540893621e-05, + "loss": 0.1835, + "step": 8794 + }, + { + "epoch": 0.45, + "grad_norm": 1.0241007879543205, + "learning_rate": 1.2177012103581733e-05, + "loss": 0.1993, + "step": 8795 + }, + { + "epoch": 0.45, + "grad_norm": 0.9077009583754211, + "learning_rate": 1.2175404607218267e-05, + "loss": 0.1816, + "step": 8796 + }, + { + "epoch": 0.45, + "grad_norm": 0.7274573376917196, + "learning_rate": 1.217379705184683e-05, + "loss": 0.1782, + "step": 8797 + }, + { + "epoch": 0.45, + "grad_norm": 1.072798711913059, + "learning_rate": 1.2172189437511023e-05, + "loss": 0.2116, + "step": 8798 + }, + { + "epoch": 0.45, + "grad_norm": 1.2541852954716943, + "learning_rate": 1.2170581764254458e-05, + "loss": 0.1911, + "step": 8799 + }, + { + "epoch": 0.45, + "grad_norm": 1.2076914791500577, + "learning_rate": 1.2168974032120737e-05, + "loss": 0.2009, + "step": 8800 + }, + { + "epoch": 0.45, + "grad_norm": 0.9503860641385818, + "learning_rate": 1.2167366241153475e-05, + "loss": 0.1913, + "step": 8801 + }, + { + "epoch": 0.45, + "grad_norm": 1.0390296904907494, + "learning_rate": 1.2165758391396281e-05, + "loss": 0.1984, + "step": 8802 + }, + { + "epoch": 0.45, + "grad_norm": 1.3874804890400194, + "learning_rate": 1.2164150482892768e-05, + "loss": 0.2067, + "step": 8803 + }, + { + "epoch": 0.45, + "grad_norm": 1.6212319706782996, + "learning_rate": 1.2162542515686551e-05, + "loss": 0.1909, + "step": 8804 + }, + { + "epoch": 0.45, + "grad_norm": 0.9567611095220859, + "learning_rate": 1.2160934489821244e-05, + "loss": 0.1839, + "step": 8805 + }, + { + "epoch": 0.45, + "grad_norm": 1.2799465065164584, + "learning_rate": 1.2159326405340468e-05, + "loss": 0.1921, + "step": 8806 + }, + { + "epoch": 0.45, + "grad_norm": 1.086003363283199, + "learning_rate": 1.2157718262287841e-05, + "loss": 0.1784, + "step": 8807 + }, + { + "epoch": 0.45, + "grad_norm": 0.7401550610848462, + "learning_rate": 1.2156110060706986e-05, + "loss": 0.1891, + "step": 8808 + }, + { + "epoch": 0.45, + "grad_norm": 0.8915999284651739, + "learning_rate": 1.215450180064152e-05, + "loss": 0.1977, + "step": 8809 + }, + { + "epoch": 0.45, + "grad_norm": 1.0195758291399881, + "learning_rate": 1.2152893482135075e-05, + "loss": 0.1806, + "step": 8810 + }, + { + "epoch": 0.45, + "grad_norm": 1.1749497925696335, + "learning_rate": 1.2151285105231273e-05, + "loss": 0.2189, + "step": 8811 + }, + { + "epoch": 0.45, + "grad_norm": 0.8225497459206069, + "learning_rate": 1.214967666997374e-05, + "loss": 0.1824, + "step": 8812 + }, + { + "epoch": 0.45, + "grad_norm": 1.214799801660971, + "learning_rate": 1.2148068176406104e-05, + "loss": 0.178, + "step": 8813 + }, + { + "epoch": 0.45, + "grad_norm": 0.9197690708431241, + "learning_rate": 1.2146459624571998e-05, + "loss": 0.1859, + "step": 8814 + }, + { + "epoch": 0.45, + "grad_norm": 0.9091858741271089, + "learning_rate": 1.2144851014515055e-05, + "loss": 0.1755, + "step": 8815 + }, + { + "epoch": 0.45, + "grad_norm": 1.086043007279395, + "learning_rate": 1.2143242346278908e-05, + "loss": 0.1923, + "step": 8816 + }, + { + "epoch": 0.45, + "grad_norm": 1.3372441168269635, + "learning_rate": 1.214163361990719e-05, + "loss": 0.1931, + "step": 8817 + }, + { + "epoch": 0.45, + "grad_norm": 0.963781956668138, + "learning_rate": 1.2140024835443537e-05, + "loss": 0.2051, + "step": 8818 + }, + { + "epoch": 0.45, + "grad_norm": 1.2980605044900115, + "learning_rate": 1.213841599293159e-05, + "loss": 0.1746, + "step": 8819 + }, + { + "epoch": 0.45, + "grad_norm": 0.8386179927027012, + "learning_rate": 1.2136807092414992e-05, + "loss": 0.1855, + "step": 8820 + }, + { + "epoch": 0.45, + "grad_norm": 0.8201498327266299, + "learning_rate": 1.2135198133937381e-05, + "loss": 0.1677, + "step": 8821 + }, + { + "epoch": 0.45, + "grad_norm": 0.9651597614348006, + "learning_rate": 1.2133589117542395e-05, + "loss": 0.1907, + "step": 8822 + }, + { + "epoch": 0.45, + "grad_norm": 0.9814233612261183, + "learning_rate": 1.2131980043273685e-05, + "loss": 0.2056, + "step": 8823 + }, + { + "epoch": 0.45, + "grad_norm": 0.8653900828234407, + "learning_rate": 1.2130370911174898e-05, + "loss": 0.1729, + "step": 8824 + }, + { + "epoch": 0.45, + "grad_norm": 1.3058003760102206, + "learning_rate": 1.212876172128968e-05, + "loss": 0.1978, + "step": 8825 + }, + { + "epoch": 0.45, + "grad_norm": 1.0536411372349836, + "learning_rate": 1.2127152473661678e-05, + "loss": 0.1949, + "step": 8826 + }, + { + "epoch": 0.45, + "grad_norm": 0.9398760268893137, + "learning_rate": 1.2125543168334546e-05, + "loss": 0.1978, + "step": 8827 + }, + { + "epoch": 0.45, + "grad_norm": 0.9603591809895711, + "learning_rate": 1.2123933805351934e-05, + "loss": 0.2136, + "step": 8828 + }, + { + "epoch": 0.45, + "grad_norm": 0.8674456935843604, + "learning_rate": 1.21223243847575e-05, + "loss": 0.1757, + "step": 8829 + }, + { + "epoch": 0.45, + "grad_norm": 0.959780207158066, + "learning_rate": 1.2120714906594897e-05, + "loss": 0.2059, + "step": 8830 + }, + { + "epoch": 0.45, + "grad_norm": 1.250047070583227, + "learning_rate": 1.211910537090778e-05, + "loss": 0.1826, + "step": 8831 + }, + { + "epoch": 0.45, + "grad_norm": 0.8414534827990336, + "learning_rate": 1.2117495777739815e-05, + "loss": 0.174, + "step": 8832 + }, + { + "epoch": 0.45, + "grad_norm": 1.105316291874254, + "learning_rate": 1.2115886127134653e-05, + "loss": 0.2109, + "step": 8833 + }, + { + "epoch": 0.45, + "grad_norm": 0.8495791290530903, + "learning_rate": 1.2114276419135964e-05, + "loss": 0.1905, + "step": 8834 + }, + { + "epoch": 0.45, + "grad_norm": 1.4008403268681422, + "learning_rate": 1.2112666653787404e-05, + "loss": 0.2, + "step": 8835 + }, + { + "epoch": 0.45, + "grad_norm": 1.0376823697906254, + "learning_rate": 1.2111056831132641e-05, + "loss": 0.1758, + "step": 8836 + }, + { + "epoch": 0.45, + "grad_norm": 0.9164332739485018, + "learning_rate": 1.2109446951215347e-05, + "loss": 0.196, + "step": 8837 + }, + { + "epoch": 0.45, + "grad_norm": 1.0077455559186472, + "learning_rate": 1.2107837014079182e-05, + "loss": 0.2039, + "step": 8838 + }, + { + "epoch": 0.45, + "grad_norm": 0.8600414769402724, + "learning_rate": 1.210622701976782e-05, + "loss": 0.1759, + "step": 8839 + }, + { + "epoch": 0.45, + "grad_norm": 0.83784665366274, + "learning_rate": 1.2104616968324928e-05, + "loss": 0.1779, + "step": 8840 + }, + { + "epoch": 0.45, + "grad_norm": 1.1928432869020684, + "learning_rate": 1.2103006859794184e-05, + "loss": 0.193, + "step": 8841 + }, + { + "epoch": 0.45, + "grad_norm": 0.9301657792242488, + "learning_rate": 1.2101396694219262e-05, + "loss": 0.176, + "step": 8842 + }, + { + "epoch": 0.45, + "grad_norm": 1.1520056763985131, + "learning_rate": 1.2099786471643834e-05, + "loss": 0.1946, + "step": 8843 + }, + { + "epoch": 0.45, + "grad_norm": 1.1447231599498406, + "learning_rate": 1.2098176192111578e-05, + "loss": 0.2055, + "step": 8844 + }, + { + "epoch": 0.45, + "grad_norm": 0.8944520715875532, + "learning_rate": 1.2096565855666178e-05, + "loss": 0.2033, + "step": 8845 + }, + { + "epoch": 0.45, + "grad_norm": 1.1372848444010801, + "learning_rate": 1.209495546235131e-05, + "loss": 0.2061, + "step": 8846 + }, + { + "epoch": 0.45, + "grad_norm": 0.7647781497580564, + "learning_rate": 1.2093345012210656e-05, + "loss": 0.1932, + "step": 8847 + }, + { + "epoch": 0.45, + "grad_norm": 0.8912513425073467, + "learning_rate": 1.20917345052879e-05, + "loss": 0.1796, + "step": 8848 + }, + { + "epoch": 0.45, + "grad_norm": 1.5089662709207687, + "learning_rate": 1.2090123941626726e-05, + "loss": 0.1784, + "step": 8849 + }, + { + "epoch": 0.45, + "grad_norm": 1.0933341421872094, + "learning_rate": 1.2088513321270823e-05, + "loss": 0.1995, + "step": 8850 + }, + { + "epoch": 0.45, + "grad_norm": 0.9191939310469205, + "learning_rate": 1.2086902644263878e-05, + "loss": 0.1889, + "step": 8851 + }, + { + "epoch": 0.45, + "grad_norm": 0.8287536485776805, + "learning_rate": 1.2085291910649585e-05, + "loss": 0.1873, + "step": 8852 + }, + { + "epoch": 0.45, + "grad_norm": 0.9271405890498322, + "learning_rate": 1.2083681120471626e-05, + "loss": 0.1982, + "step": 8853 + }, + { + "epoch": 0.45, + "grad_norm": 0.9672085990082487, + "learning_rate": 1.20820702737737e-05, + "loss": 0.1924, + "step": 8854 + }, + { + "epoch": 0.45, + "grad_norm": 1.050213589768552, + "learning_rate": 1.2080459370599502e-05, + "loss": 0.184, + "step": 8855 + }, + { + "epoch": 0.45, + "grad_norm": 0.9957672183719678, + "learning_rate": 1.2078848410992726e-05, + "loss": 0.1982, + "step": 8856 + }, + { + "epoch": 0.45, + "grad_norm": 0.7800792332036594, + "learning_rate": 1.2077237394997065e-05, + "loss": 0.1702, + "step": 8857 + }, + { + "epoch": 0.45, + "grad_norm": 1.0934036596223062, + "learning_rate": 1.2075626322656227e-05, + "loss": 0.1914, + "step": 8858 + }, + { + "epoch": 0.45, + "grad_norm": 0.8329459545117248, + "learning_rate": 1.2074015194013906e-05, + "loss": 0.1869, + "step": 8859 + }, + { + "epoch": 0.45, + "grad_norm": 0.8139405261848299, + "learning_rate": 1.2072404009113808e-05, + "loss": 0.2056, + "step": 8860 + }, + { + "epoch": 0.45, + "grad_norm": 0.9862417615264362, + "learning_rate": 1.2070792767999633e-05, + "loss": 0.1837, + "step": 8861 + }, + { + "epoch": 0.45, + "grad_norm": 1.0204044344285164, + "learning_rate": 1.2069181470715083e-05, + "loss": 0.2264, + "step": 8862 + }, + { + "epoch": 0.45, + "grad_norm": 2.1435750249832166, + "learning_rate": 1.2067570117303872e-05, + "loss": 0.198, + "step": 8863 + }, + { + "epoch": 0.45, + "grad_norm": 0.9935613757091443, + "learning_rate": 1.2065958707809705e-05, + "loss": 0.2109, + "step": 8864 + }, + { + "epoch": 0.45, + "grad_norm": 1.0114797607164516, + "learning_rate": 1.2064347242276293e-05, + "loss": 0.192, + "step": 8865 + }, + { + "epoch": 0.45, + "grad_norm": 1.118130182359597, + "learning_rate": 1.2062735720747343e-05, + "loss": 0.1888, + "step": 8866 + }, + { + "epoch": 0.45, + "grad_norm": 2.3928760503882156, + "learning_rate": 1.2061124143266571e-05, + "loss": 0.1989, + "step": 8867 + }, + { + "epoch": 0.45, + "grad_norm": 1.020045704762623, + "learning_rate": 1.2059512509877691e-05, + "loss": 0.1943, + "step": 8868 + }, + { + "epoch": 0.45, + "grad_norm": 0.7282918073861949, + "learning_rate": 1.205790082062442e-05, + "loss": 0.1686, + "step": 8869 + }, + { + "epoch": 0.45, + "grad_norm": 0.8817405987482647, + "learning_rate": 1.2056289075550467e-05, + "loss": 0.1743, + "step": 8870 + }, + { + "epoch": 0.45, + "grad_norm": 1.2194815943547086, + "learning_rate": 1.205467727469956e-05, + "loss": 0.1815, + "step": 8871 + }, + { + "epoch": 0.45, + "grad_norm": 1.1515293942216966, + "learning_rate": 1.2053065418115418e-05, + "loss": 0.1901, + "step": 8872 + }, + { + "epoch": 0.45, + "grad_norm": 0.8144674738067087, + "learning_rate": 1.2051453505841757e-05, + "loss": 0.194, + "step": 8873 + }, + { + "epoch": 0.45, + "grad_norm": 0.8586989430386209, + "learning_rate": 1.2049841537922307e-05, + "loss": 0.196, + "step": 8874 + }, + { + "epoch": 0.45, + "grad_norm": 0.7018719551715215, + "learning_rate": 1.2048229514400785e-05, + "loss": 0.1723, + "step": 8875 + }, + { + "epoch": 0.45, + "grad_norm": 1.0214569381837684, + "learning_rate": 1.2046617435320924e-05, + "loss": 0.1901, + "step": 8876 + }, + { + "epoch": 0.45, + "grad_norm": 1.005899395849749, + "learning_rate": 1.2045005300726452e-05, + "loss": 0.1923, + "step": 8877 + }, + { + "epoch": 0.45, + "grad_norm": 1.5862559489013401, + "learning_rate": 1.2043393110661092e-05, + "loss": 0.1911, + "step": 8878 + }, + { + "epoch": 0.45, + "grad_norm": 1.1185356170292566, + "learning_rate": 1.2041780865168577e-05, + "loss": 0.2061, + "step": 8879 + }, + { + "epoch": 0.45, + "grad_norm": 1.0316089428982487, + "learning_rate": 1.2040168564292644e-05, + "loss": 0.19, + "step": 8880 + }, + { + "epoch": 0.45, + "grad_norm": 0.6316760066241547, + "learning_rate": 1.2038556208077026e-05, + "loss": 0.1756, + "step": 8881 + }, + { + "epoch": 0.45, + "grad_norm": 0.9262215986725295, + "learning_rate": 1.2036943796565453e-05, + "loss": 0.1677, + "step": 8882 + }, + { + "epoch": 0.45, + "grad_norm": 0.8161375418600838, + "learning_rate": 1.2035331329801663e-05, + "loss": 0.2222, + "step": 8883 + }, + { + "epoch": 0.45, + "grad_norm": 0.9862382320843694, + "learning_rate": 1.2033718807829395e-05, + "loss": 0.2013, + "step": 8884 + }, + { + "epoch": 0.45, + "grad_norm": 0.924590732113376, + "learning_rate": 1.2032106230692394e-05, + "loss": 0.1705, + "step": 8885 + }, + { + "epoch": 0.45, + "grad_norm": 0.7549429941943393, + "learning_rate": 1.2030493598434392e-05, + "loss": 0.1843, + "step": 8886 + }, + { + "epoch": 0.45, + "grad_norm": 0.8770179406545869, + "learning_rate": 1.2028880911099141e-05, + "loss": 0.199, + "step": 8887 + }, + { + "epoch": 0.45, + "grad_norm": 1.0535531729638505, + "learning_rate": 1.2027268168730378e-05, + "loss": 0.1917, + "step": 8888 + }, + { + "epoch": 0.45, + "grad_norm": 0.9816682275340497, + "learning_rate": 1.202565537137185e-05, + "loss": 0.1873, + "step": 8889 + }, + { + "epoch": 0.45, + "grad_norm": 1.111591525187513, + "learning_rate": 1.202404251906731e-05, + "loss": 0.2159, + "step": 8890 + }, + { + "epoch": 0.45, + "grad_norm": 0.9930935676272202, + "learning_rate": 1.20224296118605e-05, + "loss": 0.1959, + "step": 8891 + }, + { + "epoch": 0.45, + "grad_norm": 1.0744553729919295, + "learning_rate": 1.202081664979517e-05, + "loss": 0.1829, + "step": 8892 + }, + { + "epoch": 0.45, + "grad_norm": 1.0903638199210839, + "learning_rate": 1.2019203632915078e-05, + "loss": 0.193, + "step": 8893 + }, + { + "epoch": 0.45, + "grad_norm": 1.8433993464195626, + "learning_rate": 1.2017590561263973e-05, + "loss": 0.1989, + "step": 8894 + }, + { + "epoch": 0.45, + "grad_norm": 0.9218661790570869, + "learning_rate": 1.2015977434885608e-05, + "loss": 0.1988, + "step": 8895 + }, + { + "epoch": 0.45, + "grad_norm": 1.086711989423081, + "learning_rate": 1.2014364253823742e-05, + "loss": 0.1997, + "step": 8896 + }, + { + "epoch": 0.45, + "grad_norm": 0.9552204008080996, + "learning_rate": 1.2012751018122132e-05, + "loss": 0.1918, + "step": 8897 + }, + { + "epoch": 0.45, + "grad_norm": 1.1356827714958893, + "learning_rate": 1.2011137727824536e-05, + "loss": 0.1838, + "step": 8898 + }, + { + "epoch": 0.45, + "grad_norm": 1.1033408198461252, + "learning_rate": 1.2009524382974717e-05, + "loss": 0.2092, + "step": 8899 + }, + { + "epoch": 0.45, + "grad_norm": 1.026359269650774, + "learning_rate": 1.2007910983616435e-05, + "loss": 0.1791, + "step": 8900 + }, + { + "epoch": 0.45, + "grad_norm": 0.7595638197151642, + "learning_rate": 1.2006297529793456e-05, + "loss": 0.185, + "step": 8901 + }, + { + "epoch": 0.45, + "grad_norm": 1.8070567756184621, + "learning_rate": 1.200468402154954e-05, + "loss": 0.1862, + "step": 8902 + }, + { + "epoch": 0.45, + "grad_norm": 0.9054734943926448, + "learning_rate": 1.2003070458928458e-05, + "loss": 0.215, + "step": 8903 + }, + { + "epoch": 0.45, + "grad_norm": 3.4442105832569023, + "learning_rate": 1.200145684197398e-05, + "loss": 0.1978, + "step": 8904 + }, + { + "epoch": 0.45, + "grad_norm": 1.2466883918482339, + "learning_rate": 1.1999843170729866e-05, + "loss": 0.1729, + "step": 8905 + }, + { + "epoch": 0.45, + "grad_norm": 0.941852164338417, + "learning_rate": 1.1998229445239898e-05, + "loss": 0.1966, + "step": 8906 + }, + { + "epoch": 0.45, + "grad_norm": 1.1718913462872127, + "learning_rate": 1.1996615665547841e-05, + "loss": 0.1927, + "step": 8907 + }, + { + "epoch": 0.45, + "grad_norm": 0.994145976940982, + "learning_rate": 1.1995001831697472e-05, + "loss": 0.2004, + "step": 8908 + }, + { + "epoch": 0.45, + "grad_norm": 0.8267161858948318, + "learning_rate": 1.1993387943732567e-05, + "loss": 0.2053, + "step": 8909 + }, + { + "epoch": 0.45, + "grad_norm": 0.7850285872276042, + "learning_rate": 1.1991774001696896e-05, + "loss": 0.1807, + "step": 8910 + }, + { + "epoch": 0.45, + "grad_norm": 0.9629868160271702, + "learning_rate": 1.1990160005634248e-05, + "loss": 0.1959, + "step": 8911 + }, + { + "epoch": 0.45, + "grad_norm": 1.0503190559300086, + "learning_rate": 1.1988545955588395e-05, + "loss": 0.1844, + "step": 8912 + }, + { + "epoch": 0.45, + "grad_norm": 0.9957325838836562, + "learning_rate": 1.1986931851603122e-05, + "loss": 0.2011, + "step": 8913 + }, + { + "epoch": 0.45, + "grad_norm": 1.0639985315351588, + "learning_rate": 1.1985317693722212e-05, + "loss": 0.1996, + "step": 8914 + }, + { + "epoch": 0.45, + "grad_norm": 0.9629188523199401, + "learning_rate": 1.1983703481989443e-05, + "loss": 0.2051, + "step": 8915 + }, + { + "epoch": 0.45, + "grad_norm": 0.9569474788455568, + "learning_rate": 1.1982089216448607e-05, + "loss": 0.2102, + "step": 8916 + }, + { + "epoch": 0.45, + "grad_norm": 1.0215416358534117, + "learning_rate": 1.198047489714349e-05, + "loss": 0.2206, + "step": 8917 + }, + { + "epoch": 0.45, + "grad_norm": 1.1053432675497292, + "learning_rate": 1.197886052411788e-05, + "loss": 0.2172, + "step": 8918 + }, + { + "epoch": 0.45, + "grad_norm": 1.2538967899336353, + "learning_rate": 1.1977246097415565e-05, + "loss": 0.1804, + "step": 8919 + }, + { + "epoch": 0.45, + "grad_norm": 0.7904544707992178, + "learning_rate": 1.1975631617080339e-05, + "loss": 0.2404, + "step": 8920 + }, + { + "epoch": 0.45, + "grad_norm": 1.010512958427013, + "learning_rate": 1.1974017083155993e-05, + "loss": 0.2069, + "step": 8921 + }, + { + "epoch": 0.45, + "grad_norm": 0.9187475659348551, + "learning_rate": 1.1972402495686323e-05, + "loss": 0.195, + "step": 8922 + }, + { + "epoch": 0.45, + "grad_norm": 1.1264834567954618, + "learning_rate": 1.1970787854715123e-05, + "loss": 0.1797, + "step": 8923 + }, + { + "epoch": 0.45, + "grad_norm": 0.902873139466147, + "learning_rate": 1.1969173160286191e-05, + "loss": 0.1672, + "step": 8924 + }, + { + "epoch": 0.45, + "grad_norm": 3.8147847023995554, + "learning_rate": 1.1967558412443328e-05, + "loss": 0.2024, + "step": 8925 + }, + { + "epoch": 0.45, + "grad_norm": 1.426225608926197, + "learning_rate": 1.1965943611230331e-05, + "loss": 0.1954, + "step": 8926 + }, + { + "epoch": 0.45, + "grad_norm": 0.9450555610864556, + "learning_rate": 1.1964328756691e-05, + "loss": 0.1639, + "step": 8927 + }, + { + "epoch": 0.45, + "grad_norm": 1.239627680378109, + "learning_rate": 1.196271384886914e-05, + "loss": 0.1813, + "step": 8928 + }, + { + "epoch": 0.45, + "grad_norm": 1.110179894405812, + "learning_rate": 1.196109888780856e-05, + "loss": 0.1652, + "step": 8929 + }, + { + "epoch": 0.45, + "grad_norm": 0.8597255073498342, + "learning_rate": 1.1959483873553059e-05, + "loss": 0.1854, + "step": 8930 + }, + { + "epoch": 0.45, + "grad_norm": 1.5145937177785822, + "learning_rate": 1.1957868806146449e-05, + "loss": 0.2137, + "step": 8931 + }, + { + "epoch": 0.45, + "grad_norm": 1.1041447036829297, + "learning_rate": 1.1956253685632534e-05, + "loss": 0.1661, + "step": 8932 + }, + { + "epoch": 0.45, + "grad_norm": 0.9525616525645625, + "learning_rate": 1.1954638512055131e-05, + "loss": 0.2112, + "step": 8933 + }, + { + "epoch": 0.45, + "grad_norm": 1.3595966325094786, + "learning_rate": 1.1953023285458047e-05, + "loss": 0.1958, + "step": 8934 + }, + { + "epoch": 0.45, + "grad_norm": 0.9975591742736065, + "learning_rate": 1.1951408005885098e-05, + "loss": 0.183, + "step": 8935 + }, + { + "epoch": 0.45, + "grad_norm": 0.773323065930237, + "learning_rate": 1.1949792673380094e-05, + "loss": 0.1689, + "step": 8936 + }, + { + "epoch": 0.45, + "grad_norm": 1.0978246199548096, + "learning_rate": 1.194817728798685e-05, + "loss": 0.2031, + "step": 8937 + }, + { + "epoch": 0.45, + "grad_norm": 0.9025386810836243, + "learning_rate": 1.1946561849749192e-05, + "loss": 0.1739, + "step": 8938 + }, + { + "epoch": 0.45, + "grad_norm": 1.0924668110309403, + "learning_rate": 1.1944946358710936e-05, + "loss": 0.1892, + "step": 8939 + }, + { + "epoch": 0.45, + "grad_norm": 0.8475628728718165, + "learning_rate": 1.1943330814915897e-05, + "loss": 0.1962, + "step": 8940 + }, + { + "epoch": 0.45, + "grad_norm": 1.144332901565883, + "learning_rate": 1.1941715218407898e-05, + "loss": 0.1789, + "step": 8941 + }, + { + "epoch": 0.45, + "grad_norm": 0.9641545428843032, + "learning_rate": 1.1940099569230767e-05, + "loss": 0.2029, + "step": 8942 + }, + { + "epoch": 0.45, + "grad_norm": 1.0694630086624335, + "learning_rate": 1.1938483867428326e-05, + "loss": 0.1833, + "step": 8943 + }, + { + "epoch": 0.45, + "grad_norm": 1.6671110565068417, + "learning_rate": 1.19368681130444e-05, + "loss": 0.1973, + "step": 8944 + }, + { + "epoch": 0.45, + "grad_norm": 1.0086089261741662, + "learning_rate": 1.1935252306122812e-05, + "loss": 0.1949, + "step": 8945 + }, + { + "epoch": 0.45, + "grad_norm": 0.8401339480562876, + "learning_rate": 1.1933636446707401e-05, + "loss": 0.159, + "step": 8946 + }, + { + "epoch": 0.45, + "grad_norm": 1.1733262203357395, + "learning_rate": 1.1932020534841992e-05, + "loss": 0.1925, + "step": 8947 + }, + { + "epoch": 0.46, + "grad_norm": 1.2062640371557118, + "learning_rate": 1.1930404570570417e-05, + "loss": 0.2117, + "step": 8948 + }, + { + "epoch": 0.46, + "grad_norm": 1.58050795448441, + "learning_rate": 1.1928788553936507e-05, + "loss": 0.1997, + "step": 8949 + }, + { + "epoch": 0.46, + "grad_norm": 1.2136141398608247, + "learning_rate": 1.19271724849841e-05, + "loss": 0.2107, + "step": 8950 + }, + { + "epoch": 0.46, + "grad_norm": 1.5287755275959543, + "learning_rate": 1.192555636375703e-05, + "loss": 0.1869, + "step": 8951 + }, + { + "epoch": 0.46, + "grad_norm": 0.7695097792624204, + "learning_rate": 1.1923940190299135e-05, + "loss": 0.1838, + "step": 8952 + }, + { + "epoch": 0.46, + "grad_norm": 1.046415308205685, + "learning_rate": 1.1922323964654254e-05, + "loss": 0.1898, + "step": 8953 + }, + { + "epoch": 0.46, + "grad_norm": 0.9872118514792896, + "learning_rate": 1.1920707686866227e-05, + "loss": 0.1898, + "step": 8954 + }, + { + "epoch": 0.46, + "grad_norm": 0.9799211018153227, + "learning_rate": 1.1919091356978894e-05, + "loss": 0.1642, + "step": 8955 + }, + { + "epoch": 0.46, + "grad_norm": 1.155722364648013, + "learning_rate": 1.19174749750361e-05, + "loss": 0.1772, + "step": 8956 + }, + { + "epoch": 0.46, + "grad_norm": 1.2290325356839902, + "learning_rate": 1.1915858541081693e-05, + "loss": 0.1663, + "step": 8957 + }, + { + "epoch": 0.46, + "grad_norm": 1.30211196478441, + "learning_rate": 1.191424205515951e-05, + "loss": 0.2073, + "step": 8958 + }, + { + "epoch": 0.46, + "grad_norm": 1.0336619162587886, + "learning_rate": 1.1912625517313406e-05, + "loss": 0.2175, + "step": 8959 + }, + { + "epoch": 0.46, + "grad_norm": 1.015136790947612, + "learning_rate": 1.1911008927587224e-05, + "loss": 0.1947, + "step": 8960 + }, + { + "epoch": 0.46, + "grad_norm": 1.0525402971892104, + "learning_rate": 1.190939228602482e-05, + "loss": 0.2008, + "step": 8961 + }, + { + "epoch": 0.46, + "grad_norm": 1.0309507319611686, + "learning_rate": 1.190777559267004e-05, + "loss": 0.1712, + "step": 8962 + }, + { + "epoch": 0.46, + "grad_norm": 1.042308202194219, + "learning_rate": 1.190615884756674e-05, + "loss": 0.2174, + "step": 8963 + }, + { + "epoch": 0.46, + "grad_norm": 1.2486820476818925, + "learning_rate": 1.1904542050758774e-05, + "loss": 0.2019, + "step": 8964 + }, + { + "epoch": 0.46, + "grad_norm": 0.9681283258309303, + "learning_rate": 1.1902925202289997e-05, + "loss": 0.1748, + "step": 8965 + }, + { + "epoch": 0.46, + "grad_norm": 0.8824864392862095, + "learning_rate": 1.1901308302204267e-05, + "loss": 0.1827, + "step": 8966 + }, + { + "epoch": 0.46, + "grad_norm": 2.0792711159266326, + "learning_rate": 1.189969135054544e-05, + "loss": 0.1817, + "step": 8967 + }, + { + "epoch": 0.46, + "grad_norm": 0.9004669137625958, + "learning_rate": 1.1898074347357377e-05, + "loss": 0.1905, + "step": 8968 + }, + { + "epoch": 0.46, + "grad_norm": 0.8946191807568328, + "learning_rate": 1.1896457292683945e-05, + "loss": 0.2004, + "step": 8969 + }, + { + "epoch": 0.46, + "grad_norm": 0.9348331603869736, + "learning_rate": 1.1894840186569e-05, + "loss": 0.1854, + "step": 8970 + }, + { + "epoch": 0.46, + "grad_norm": 1.7410801074117348, + "learning_rate": 1.1893223029056406e-05, + "loss": 0.1813, + "step": 8971 + }, + { + "epoch": 0.46, + "grad_norm": 1.1059641875465824, + "learning_rate": 1.1891605820190031e-05, + "loss": 0.1687, + "step": 8972 + }, + { + "epoch": 0.46, + "grad_norm": 0.9228413836557761, + "learning_rate": 1.1889988560013741e-05, + "loss": 0.1917, + "step": 8973 + }, + { + "epoch": 0.46, + "grad_norm": 0.8623997445511834, + "learning_rate": 1.1888371248571409e-05, + "loss": 0.1745, + "step": 8974 + }, + { + "epoch": 0.46, + "grad_norm": 1.3696551052147947, + "learning_rate": 1.1886753885906895e-05, + "loss": 0.2035, + "step": 8975 + }, + { + "epoch": 0.46, + "grad_norm": 1.1247215692096408, + "learning_rate": 1.1885136472064076e-05, + "loss": 0.2072, + "step": 8976 + }, + { + "epoch": 0.46, + "grad_norm": 1.1875189683505865, + "learning_rate": 1.1883519007086824e-05, + "loss": 0.217, + "step": 8977 + }, + { + "epoch": 0.46, + "grad_norm": 1.2625623423472383, + "learning_rate": 1.1881901491019014e-05, + "loss": 0.1774, + "step": 8978 + }, + { + "epoch": 0.46, + "grad_norm": 0.9240639516641106, + "learning_rate": 1.1880283923904518e-05, + "loss": 0.192, + "step": 8979 + }, + { + "epoch": 0.46, + "grad_norm": 1.8776495632201888, + "learning_rate": 1.1878666305787214e-05, + "loss": 0.1982, + "step": 8980 + }, + { + "epoch": 0.46, + "grad_norm": 0.9658463095638202, + "learning_rate": 1.1877048636710981e-05, + "loss": 0.1716, + "step": 8981 + }, + { + "epoch": 0.46, + "grad_norm": 1.2894250472685753, + "learning_rate": 1.1875430916719698e-05, + "loss": 0.1957, + "step": 8982 + }, + { + "epoch": 0.46, + "grad_norm": 1.0529602099603186, + "learning_rate": 1.187381314585725e-05, + "loss": 0.18, + "step": 8983 + }, + { + "epoch": 0.46, + "grad_norm": 1.159065460851313, + "learning_rate": 1.1872195324167508e-05, + "loss": 0.2057, + "step": 8984 + }, + { + "epoch": 0.46, + "grad_norm": 1.0456114107643657, + "learning_rate": 1.1870577451694363e-05, + "loss": 0.1668, + "step": 8985 + }, + { + "epoch": 0.46, + "grad_norm": 0.8648956682920848, + "learning_rate": 1.18689595284817e-05, + "loss": 0.1842, + "step": 8986 + }, + { + "epoch": 0.46, + "grad_norm": 1.0062545746069842, + "learning_rate": 1.1867341554573405e-05, + "loss": 0.1752, + "step": 8987 + }, + { + "epoch": 0.46, + "grad_norm": 0.9052168926450945, + "learning_rate": 1.1865723530013367e-05, + "loss": 0.178, + "step": 8988 + }, + { + "epoch": 0.46, + "grad_norm": 1.1849884410504337, + "learning_rate": 1.1864105454845467e-05, + "loss": 0.1898, + "step": 8989 + }, + { + "epoch": 0.46, + "grad_norm": 1.1768091963894038, + "learning_rate": 1.1862487329113606e-05, + "loss": 0.2171, + "step": 8990 + }, + { + "epoch": 0.46, + "grad_norm": 1.9149338679136048, + "learning_rate": 1.186086915286167e-05, + "loss": 0.191, + "step": 8991 + }, + { + "epoch": 0.46, + "grad_norm": 0.8723532019990747, + "learning_rate": 1.1859250926133554e-05, + "loss": 0.1828, + "step": 8992 + }, + { + "epoch": 0.46, + "grad_norm": 1.4707240199120826, + "learning_rate": 1.185763264897315e-05, + "loss": 0.1637, + "step": 8993 + }, + { + "epoch": 0.46, + "grad_norm": 0.7403666829101415, + "learning_rate": 1.1856014321424356e-05, + "loss": 0.1928, + "step": 8994 + }, + { + "epoch": 0.46, + "grad_norm": 0.9394086856654486, + "learning_rate": 1.185439594353107e-05, + "loss": 0.1723, + "step": 8995 + }, + { + "epoch": 0.46, + "grad_norm": 0.7670705541749999, + "learning_rate": 1.1852777515337186e-05, + "loss": 0.1841, + "step": 8996 + }, + { + "epoch": 0.46, + "grad_norm": 1.1035336169056782, + "learning_rate": 1.185115903688661e-05, + "loss": 0.1886, + "step": 8997 + }, + { + "epoch": 0.46, + "grad_norm": 1.2146454611707889, + "learning_rate": 1.1849540508223238e-05, + "loss": 0.1969, + "step": 8998 + }, + { + "epoch": 0.46, + "grad_norm": 1.0393209712450067, + "learning_rate": 1.1847921929390977e-05, + "loss": 0.1946, + "step": 8999 + }, + { + "epoch": 0.46, + "grad_norm": 1.008870498761671, + "learning_rate": 1.184630330043373e-05, + "loss": 0.1847, + "step": 9000 + }, + { + "epoch": 0.46, + "grad_norm": 0.9349115695325397, + "learning_rate": 1.1844684621395401e-05, + "loss": 0.2054, + "step": 9001 + }, + { + "epoch": 0.46, + "grad_norm": 0.924911367937286, + "learning_rate": 1.1843065892319895e-05, + "loss": 0.1891, + "step": 9002 + }, + { + "epoch": 0.46, + "grad_norm": 1.1563745043827904, + "learning_rate": 1.1841447113251126e-05, + "loss": 0.2084, + "step": 9003 + }, + { + "epoch": 0.46, + "grad_norm": 0.8965980604277103, + "learning_rate": 1.1839828284233e-05, + "loss": 0.1612, + "step": 9004 + }, + { + "epoch": 0.46, + "grad_norm": 0.9034527880843174, + "learning_rate": 1.1838209405309427e-05, + "loss": 0.1886, + "step": 9005 + }, + { + "epoch": 0.46, + "grad_norm": 1.0077625098069756, + "learning_rate": 1.183659047652432e-05, + "loss": 0.1865, + "step": 9006 + }, + { + "epoch": 0.46, + "grad_norm": 0.9196710553785808, + "learning_rate": 1.1834971497921591e-05, + "loss": 0.1953, + "step": 9007 + }, + { + "epoch": 0.46, + "grad_norm": 0.9536834374306841, + "learning_rate": 1.1833352469545158e-05, + "loss": 0.2062, + "step": 9008 + }, + { + "epoch": 0.46, + "grad_norm": 1.2212574071626323, + "learning_rate": 1.1831733391438937e-05, + "loss": 0.1907, + "step": 9009 + }, + { + "epoch": 0.46, + "grad_norm": 1.5537835504342088, + "learning_rate": 1.1830114263646844e-05, + "loss": 0.2087, + "step": 9010 + }, + { + "epoch": 0.46, + "grad_norm": 0.8431492083254475, + "learning_rate": 1.1828495086212794e-05, + "loss": 0.1801, + "step": 9011 + }, + { + "epoch": 0.46, + "grad_norm": 1.0463466456718005, + "learning_rate": 1.1826875859180718e-05, + "loss": 0.186, + "step": 9012 + }, + { + "epoch": 0.46, + "grad_norm": 1.1162356038874017, + "learning_rate": 1.1825256582594526e-05, + "loss": 0.207, + "step": 9013 + }, + { + "epoch": 0.46, + "grad_norm": 0.9375444806124058, + "learning_rate": 1.1823637256498149e-05, + "loss": 0.2037, + "step": 9014 + }, + { + "epoch": 0.46, + "grad_norm": 1.180452055645475, + "learning_rate": 1.1822017880935507e-05, + "loss": 0.1722, + "step": 9015 + }, + { + "epoch": 0.46, + "grad_norm": 0.8274008918640854, + "learning_rate": 1.1820398455950526e-05, + "loss": 0.1889, + "step": 9016 + }, + { + "epoch": 0.46, + "grad_norm": 0.7722948608780377, + "learning_rate": 1.1818778981587135e-05, + "loss": 0.1851, + "step": 9017 + }, + { + "epoch": 0.46, + "grad_norm": 0.8623477671210378, + "learning_rate": 1.1817159457889264e-05, + "loss": 0.1758, + "step": 9018 + }, + { + "epoch": 0.46, + "grad_norm": 0.7589775449226215, + "learning_rate": 1.1815539884900837e-05, + "loss": 0.1954, + "step": 9019 + }, + { + "epoch": 0.46, + "grad_norm": 1.054810457169351, + "learning_rate": 1.1813920262665788e-05, + "loss": 0.1878, + "step": 9020 + }, + { + "epoch": 0.46, + "grad_norm": 1.293935543595433, + "learning_rate": 1.1812300591228052e-05, + "loss": 0.212, + "step": 9021 + }, + { + "epoch": 0.46, + "grad_norm": 0.8558181092026415, + "learning_rate": 1.1810680870631558e-05, + "loss": 0.176, + "step": 9022 + }, + { + "epoch": 0.46, + "grad_norm": 0.7573874055928003, + "learning_rate": 1.1809061100920245e-05, + "loss": 0.1755, + "step": 9023 + }, + { + "epoch": 0.46, + "grad_norm": 1.1342319197752257, + "learning_rate": 1.1807441282138045e-05, + "loss": 0.1882, + "step": 9024 + }, + { + "epoch": 0.46, + "grad_norm": 1.052170888869555, + "learning_rate": 1.1805821414328897e-05, + "loss": 0.1855, + "step": 9025 + }, + { + "epoch": 0.46, + "grad_norm": 0.9040062958108732, + "learning_rate": 1.1804201497536746e-05, + "loss": 0.1781, + "step": 9026 + }, + { + "epoch": 0.46, + "grad_norm": 1.0001531848845238, + "learning_rate": 1.1802581531805525e-05, + "loss": 0.1835, + "step": 9027 + }, + { + "epoch": 0.46, + "grad_norm": 0.9702189858557619, + "learning_rate": 1.1800961517179177e-05, + "loss": 0.1925, + "step": 9028 + }, + { + "epoch": 0.46, + "grad_norm": 0.9391996907219045, + "learning_rate": 1.179934145370165e-05, + "loss": 0.1968, + "step": 9029 + }, + { + "epoch": 0.46, + "grad_norm": 1.169550256959143, + "learning_rate": 1.1797721341416882e-05, + "loss": 0.2095, + "step": 9030 + }, + { + "epoch": 0.46, + "grad_norm": 1.4986948128088586, + "learning_rate": 1.179610118036882e-05, + "loss": 0.1983, + "step": 9031 + }, + { + "epoch": 0.46, + "grad_norm": 1.2257047082649992, + "learning_rate": 1.1794480970601413e-05, + "loss": 0.2061, + "step": 9032 + }, + { + "epoch": 0.46, + "grad_norm": 1.1231876572801334, + "learning_rate": 1.1792860712158608e-05, + "loss": 0.1956, + "step": 9033 + }, + { + "epoch": 0.46, + "grad_norm": 1.3543258643514862, + "learning_rate": 1.1791240405084355e-05, + "loss": 0.1994, + "step": 9034 + }, + { + "epoch": 0.46, + "grad_norm": 0.8771805946884635, + "learning_rate": 1.1789620049422603e-05, + "loss": 0.1673, + "step": 9035 + }, + { + "epoch": 0.46, + "grad_norm": 1.028897040551911, + "learning_rate": 1.1787999645217309e-05, + "loss": 0.2148, + "step": 9036 + }, + { + "epoch": 0.46, + "grad_norm": 1.0308527874745395, + "learning_rate": 1.178637919251242e-05, + "loss": 0.2075, + "step": 9037 + }, + { + "epoch": 0.46, + "grad_norm": 1.2276060915073248, + "learning_rate": 1.1784758691351898e-05, + "loss": 0.1925, + "step": 9038 + }, + { + "epoch": 0.46, + "grad_norm": 1.2425534898000836, + "learning_rate": 1.1783138141779695e-05, + "loss": 0.1499, + "step": 9039 + }, + { + "epoch": 0.46, + "grad_norm": 0.9302029211692885, + "learning_rate": 1.178151754383977e-05, + "loss": 0.1992, + "step": 9040 + }, + { + "epoch": 0.46, + "grad_norm": 1.0435356139538756, + "learning_rate": 1.177989689757608e-05, + "loss": 0.1983, + "step": 9041 + }, + { + "epoch": 0.46, + "grad_norm": 0.9754457914366035, + "learning_rate": 1.1778276203032584e-05, + "loss": 0.1784, + "step": 9042 + }, + { + "epoch": 0.46, + "grad_norm": 2.0053469624790905, + "learning_rate": 1.1776655460253248e-05, + "loss": 0.1921, + "step": 9043 + }, + { + "epoch": 0.46, + "grad_norm": 0.8778961911792674, + "learning_rate": 1.1775034669282034e-05, + "loss": 0.1955, + "step": 9044 + }, + { + "epoch": 0.46, + "grad_norm": 1.1203395252317987, + "learning_rate": 1.1773413830162902e-05, + "loss": 0.1981, + "step": 9045 + }, + { + "epoch": 0.46, + "grad_norm": 1.0830248882097275, + "learning_rate": 1.177179294293982e-05, + "loss": 0.1805, + "step": 9046 + }, + { + "epoch": 0.46, + "grad_norm": 0.9531798772402397, + "learning_rate": 1.1770172007656756e-05, + "loss": 0.195, + "step": 9047 + }, + { + "epoch": 0.46, + "grad_norm": 1.3430591008501616, + "learning_rate": 1.1768551024357673e-05, + "loss": 0.2009, + "step": 9048 + }, + { + "epoch": 0.46, + "grad_norm": 0.9824402645449075, + "learning_rate": 1.1766929993086547e-05, + "loss": 0.2128, + "step": 9049 + }, + { + "epoch": 0.46, + "grad_norm": 1.6385073345023113, + "learning_rate": 1.1765308913887341e-05, + "loss": 0.1901, + "step": 9050 + }, + { + "epoch": 0.46, + "grad_norm": 0.964258379970069, + "learning_rate": 1.1763687786804034e-05, + "loss": 0.1797, + "step": 9051 + }, + { + "epoch": 0.46, + "grad_norm": 1.1455400657359058, + "learning_rate": 1.1762066611880596e-05, + "loss": 0.1977, + "step": 9052 + }, + { + "epoch": 0.46, + "grad_norm": 0.8002007770933686, + "learning_rate": 1.1760445389161002e-05, + "loss": 0.1776, + "step": 9053 + }, + { + "epoch": 0.46, + "grad_norm": 0.943694401307601, + "learning_rate": 1.1758824118689225e-05, + "loss": 0.2099, + "step": 9054 + }, + { + "epoch": 0.46, + "grad_norm": 1.0283652595881423, + "learning_rate": 1.1757202800509245e-05, + "loss": 0.1996, + "step": 9055 + }, + { + "epoch": 0.46, + "grad_norm": 0.9931005687344918, + "learning_rate": 1.1755581434665043e-05, + "loss": 0.18, + "step": 9056 + }, + { + "epoch": 0.46, + "grad_norm": 2.238537129714373, + "learning_rate": 1.1753960021200593e-05, + "loss": 0.1984, + "step": 9057 + }, + { + "epoch": 0.46, + "grad_norm": 0.7553415966956479, + "learning_rate": 1.1752338560159878e-05, + "loss": 0.1678, + "step": 9058 + }, + { + "epoch": 0.46, + "grad_norm": 1.0408684215597828, + "learning_rate": 1.1750717051586882e-05, + "loss": 0.1895, + "step": 9059 + }, + { + "epoch": 0.46, + "grad_norm": 1.0384347110753673, + "learning_rate": 1.1749095495525584e-05, + "loss": 0.1906, + "step": 9060 + }, + { + "epoch": 0.46, + "grad_norm": 1.044673644971104, + "learning_rate": 1.1747473892019977e-05, + "loss": 0.1983, + "step": 9061 + }, + { + "epoch": 0.46, + "grad_norm": 1.109794063699578, + "learning_rate": 1.1745852241114038e-05, + "loss": 0.2028, + "step": 9062 + }, + { + "epoch": 0.46, + "grad_norm": 1.084289158802793, + "learning_rate": 1.1744230542851758e-05, + "loss": 0.1999, + "step": 9063 + }, + { + "epoch": 0.46, + "grad_norm": 0.777145804319658, + "learning_rate": 1.174260879727713e-05, + "loss": 0.1711, + "step": 9064 + }, + { + "epoch": 0.46, + "grad_norm": 1.0703078027644066, + "learning_rate": 1.1740987004434137e-05, + "loss": 0.1918, + "step": 9065 + }, + { + "epoch": 0.46, + "grad_norm": 0.8781926448135073, + "learning_rate": 1.1739365164366775e-05, + "loss": 0.198, + "step": 9066 + }, + { + "epoch": 0.46, + "grad_norm": 1.3489089861431531, + "learning_rate": 1.1737743277119031e-05, + "loss": 0.1865, + "step": 9067 + }, + { + "epoch": 0.46, + "grad_norm": 1.9097810356618725, + "learning_rate": 1.1736121342734903e-05, + "loss": 0.1857, + "step": 9068 + }, + { + "epoch": 0.46, + "grad_norm": 1.0893069302099345, + "learning_rate": 1.173449936125839e-05, + "loss": 0.186, + "step": 9069 + }, + { + "epoch": 0.46, + "grad_norm": 0.8277620769173616, + "learning_rate": 1.1732877332733479e-05, + "loss": 0.1812, + "step": 9070 + }, + { + "epoch": 0.46, + "grad_norm": 1.3249669025073985, + "learning_rate": 1.1731255257204174e-05, + "loss": 0.1903, + "step": 9071 + }, + { + "epoch": 0.46, + "grad_norm": 1.1042809564014204, + "learning_rate": 1.1729633134714475e-05, + "loss": 0.2004, + "step": 9072 + }, + { + "epoch": 0.46, + "grad_norm": 0.9206027756962021, + "learning_rate": 1.1728010965308377e-05, + "loss": 0.1911, + "step": 9073 + }, + { + "epoch": 0.46, + "grad_norm": 1.0423116504989947, + "learning_rate": 1.1726388749029884e-05, + "loss": 0.188, + "step": 9074 + }, + { + "epoch": 0.46, + "grad_norm": 0.7968196186056415, + "learning_rate": 1.1724766485922998e-05, + "loss": 0.1874, + "step": 9075 + }, + { + "epoch": 0.46, + "grad_norm": 0.9243988557355096, + "learning_rate": 1.1723144176031727e-05, + "loss": 0.1881, + "step": 9076 + }, + { + "epoch": 0.46, + "grad_norm": 1.042151183072485, + "learning_rate": 1.1721521819400068e-05, + "loss": 0.1965, + "step": 9077 + }, + { + "epoch": 0.46, + "grad_norm": 0.9206453558272957, + "learning_rate": 1.1719899416072037e-05, + "loss": 0.1782, + "step": 9078 + }, + { + "epoch": 0.46, + "grad_norm": 0.9587520930099989, + "learning_rate": 1.1718276966091638e-05, + "loss": 0.1903, + "step": 9079 + }, + { + "epoch": 0.46, + "grad_norm": 1.2943426276561494, + "learning_rate": 1.1716654469502875e-05, + "loss": 0.218, + "step": 9080 + }, + { + "epoch": 0.46, + "grad_norm": 2.124829320688035, + "learning_rate": 1.1715031926349763e-05, + "loss": 0.1913, + "step": 9081 + }, + { + "epoch": 0.46, + "grad_norm": 0.8456629905306549, + "learning_rate": 1.1713409336676313e-05, + "loss": 0.1845, + "step": 9082 + }, + { + "epoch": 0.46, + "grad_norm": 0.8710014737852795, + "learning_rate": 1.1711786700526541e-05, + "loss": 0.1871, + "step": 9083 + }, + { + "epoch": 0.46, + "grad_norm": 1.2990723070408405, + "learning_rate": 1.1710164017944456e-05, + "loss": 0.1872, + "step": 9084 + }, + { + "epoch": 0.46, + "grad_norm": 1.236419696060247, + "learning_rate": 1.1708541288974074e-05, + "loss": 0.2054, + "step": 9085 + }, + { + "epoch": 0.46, + "grad_norm": 3.0682702113309683, + "learning_rate": 1.1706918513659416e-05, + "loss": 0.1932, + "step": 9086 + }, + { + "epoch": 0.46, + "grad_norm": 0.8068307461764347, + "learning_rate": 1.1705295692044496e-05, + "loss": 0.173, + "step": 9087 + }, + { + "epoch": 0.46, + "grad_norm": 1.3880576279247103, + "learning_rate": 1.1703672824173333e-05, + "loss": 0.1704, + "step": 9088 + }, + { + "epoch": 0.46, + "grad_norm": 1.270121404719117, + "learning_rate": 1.170204991008995e-05, + "loss": 0.1879, + "step": 9089 + }, + { + "epoch": 0.46, + "grad_norm": 1.0435643904778924, + "learning_rate": 1.1700426949838364e-05, + "loss": 0.1885, + "step": 9090 + }, + { + "epoch": 0.46, + "grad_norm": 0.8568140770975008, + "learning_rate": 1.1698803943462604e-05, + "loss": 0.1848, + "step": 9091 + }, + { + "epoch": 0.46, + "grad_norm": 1.0820704614596093, + "learning_rate": 1.169718089100669e-05, + "loss": 0.212, + "step": 9092 + }, + { + "epoch": 0.46, + "grad_norm": 1.0800354792918512, + "learning_rate": 1.1695557792514648e-05, + "loss": 0.1971, + "step": 9093 + }, + { + "epoch": 0.46, + "grad_norm": 1.1750232689570579, + "learning_rate": 1.1693934648030508e-05, + "loss": 0.1914, + "step": 9094 + }, + { + "epoch": 0.46, + "grad_norm": 1.3779916093593438, + "learning_rate": 1.1692311457598291e-05, + "loss": 0.184, + "step": 9095 + }, + { + "epoch": 0.46, + "grad_norm": 0.8684209284055365, + "learning_rate": 1.1690688221262035e-05, + "loss": 0.1927, + "step": 9096 + }, + { + "epoch": 0.46, + "grad_norm": 0.697502018051808, + "learning_rate": 1.1689064939065762e-05, + "loss": 0.1879, + "step": 9097 + }, + { + "epoch": 0.46, + "grad_norm": 1.0772614966661544, + "learning_rate": 1.1687441611053505e-05, + "loss": 0.1772, + "step": 9098 + }, + { + "epoch": 0.46, + "grad_norm": 1.1014332449661124, + "learning_rate": 1.1685818237269302e-05, + "loss": 0.1901, + "step": 9099 + }, + { + "epoch": 0.46, + "grad_norm": 2.013613860724744, + "learning_rate": 1.1684194817757184e-05, + "loss": 0.2189, + "step": 9100 + }, + { + "epoch": 0.46, + "grad_norm": 1.0590001336397885, + "learning_rate": 1.1682571352561187e-05, + "loss": 0.1967, + "step": 9101 + }, + { + "epoch": 0.46, + "grad_norm": 1.1143069368763483, + "learning_rate": 1.1680947841725348e-05, + "loss": 0.1986, + "step": 9102 + }, + { + "epoch": 0.46, + "grad_norm": 0.9463708285613962, + "learning_rate": 1.1679324285293698e-05, + "loss": 0.1896, + "step": 9103 + }, + { + "epoch": 0.46, + "grad_norm": 1.0438085528108987, + "learning_rate": 1.1677700683310286e-05, + "loss": 0.1854, + "step": 9104 + }, + { + "epoch": 0.46, + "grad_norm": 0.92252065973537, + "learning_rate": 1.1676077035819148e-05, + "loss": 0.2096, + "step": 9105 + }, + { + "epoch": 0.46, + "grad_norm": 1.1214717138794612, + "learning_rate": 1.1674453342864327e-05, + "loss": 0.1925, + "step": 9106 + }, + { + "epoch": 0.46, + "grad_norm": 1.0099356972626463, + "learning_rate": 1.1672829604489864e-05, + "loss": 0.1649, + "step": 9107 + }, + { + "epoch": 0.46, + "grad_norm": 1.1449027127665157, + "learning_rate": 1.16712058207398e-05, + "loss": 0.215, + "step": 9108 + }, + { + "epoch": 0.46, + "grad_norm": 0.911336039272148, + "learning_rate": 1.1669581991658187e-05, + "loss": 0.1711, + "step": 9109 + }, + { + "epoch": 0.46, + "grad_norm": 1.4580166182319019, + "learning_rate": 1.1667958117289068e-05, + "loss": 0.2201, + "step": 9110 + }, + { + "epoch": 0.46, + "grad_norm": 1.0857664654881272, + "learning_rate": 1.1666334197676492e-05, + "loss": 0.1829, + "step": 9111 + }, + { + "epoch": 0.46, + "grad_norm": 0.8859738895387417, + "learning_rate": 1.1664710232864505e-05, + "loss": 0.1673, + "step": 9112 + }, + { + "epoch": 0.46, + "grad_norm": 0.7798264118555199, + "learning_rate": 1.1663086222897157e-05, + "loss": 0.2089, + "step": 9113 + }, + { + "epoch": 0.46, + "grad_norm": 0.9975538121325291, + "learning_rate": 1.1661462167818507e-05, + "loss": 0.1873, + "step": 9114 + }, + { + "epoch": 0.46, + "grad_norm": 1.0709575867805985, + "learning_rate": 1.16598380676726e-05, + "loss": 0.1843, + "step": 9115 + }, + { + "epoch": 0.46, + "grad_norm": 0.9233783373327914, + "learning_rate": 1.1658213922503488e-05, + "loss": 0.1582, + "step": 9116 + }, + { + "epoch": 0.46, + "grad_norm": 1.0645080907936217, + "learning_rate": 1.1656589732355233e-05, + "loss": 0.1973, + "step": 9117 + }, + { + "epoch": 0.46, + "grad_norm": 0.9392700054586304, + "learning_rate": 1.1654965497271886e-05, + "loss": 0.1929, + "step": 9118 + }, + { + "epoch": 0.46, + "grad_norm": 1.114398197170294, + "learning_rate": 1.1653341217297507e-05, + "loss": 0.1915, + "step": 9119 + }, + { + "epoch": 0.46, + "grad_norm": 0.9674687619310163, + "learning_rate": 1.1651716892476154e-05, + "loss": 0.1944, + "step": 9120 + }, + { + "epoch": 0.46, + "grad_norm": 0.9367552026693869, + "learning_rate": 1.1650092522851885e-05, + "loss": 0.1898, + "step": 9121 + }, + { + "epoch": 0.46, + "grad_norm": 1.0708496935895828, + "learning_rate": 1.1648468108468767e-05, + "loss": 0.1894, + "step": 9122 + }, + { + "epoch": 0.46, + "grad_norm": 0.8815096620043407, + "learning_rate": 1.1646843649370858e-05, + "loss": 0.1887, + "step": 9123 + }, + { + "epoch": 0.46, + "grad_norm": 0.9908797866048731, + "learning_rate": 1.164521914560222e-05, + "loss": 0.1956, + "step": 9124 + }, + { + "epoch": 0.46, + "grad_norm": 1.0955654895177573, + "learning_rate": 1.164359459720692e-05, + "loss": 0.1917, + "step": 9125 + }, + { + "epoch": 0.46, + "grad_norm": 0.9897343143053348, + "learning_rate": 1.1641970004229025e-05, + "loss": 0.1836, + "step": 9126 + }, + { + "epoch": 0.46, + "grad_norm": 0.7886372445419565, + "learning_rate": 1.16403453667126e-05, + "loss": 0.2013, + "step": 9127 + }, + { + "epoch": 0.46, + "grad_norm": 1.3670924199438759, + "learning_rate": 1.1638720684701714e-05, + "loss": 0.1719, + "step": 9128 + }, + { + "epoch": 0.46, + "grad_norm": 1.6908065901193217, + "learning_rate": 1.1637095958240439e-05, + "loss": 0.2039, + "step": 9129 + }, + { + "epoch": 0.46, + "grad_norm": 1.8688739782273087, + "learning_rate": 1.163547118737284e-05, + "loss": 0.1984, + "step": 9130 + }, + { + "epoch": 0.46, + "grad_norm": 0.8693621221208244, + "learning_rate": 1.1633846372142997e-05, + "loss": 0.1758, + "step": 9131 + }, + { + "epoch": 0.46, + "grad_norm": 1.0033795035288822, + "learning_rate": 1.1632221512594977e-05, + "loss": 0.1829, + "step": 9132 + }, + { + "epoch": 0.46, + "grad_norm": 1.203854492354105, + "learning_rate": 1.1630596608772855e-05, + "loss": 0.1991, + "step": 9133 + }, + { + "epoch": 0.46, + "grad_norm": 0.7739788349490826, + "learning_rate": 1.1628971660720707e-05, + "loss": 0.2161, + "step": 9134 + }, + { + "epoch": 0.46, + "grad_norm": 1.3333093970645196, + "learning_rate": 1.1627346668482614e-05, + "loss": 0.2002, + "step": 9135 + }, + { + "epoch": 0.46, + "grad_norm": 1.3961606707042902, + "learning_rate": 1.1625721632102649e-05, + "loss": 0.181, + "step": 9136 + }, + { + "epoch": 0.46, + "grad_norm": 0.7972776439352925, + "learning_rate": 1.1624096551624893e-05, + "loss": 0.1848, + "step": 9137 + }, + { + "epoch": 0.46, + "grad_norm": 0.8296410166518111, + "learning_rate": 1.1622471427093424e-05, + "loss": 0.1726, + "step": 9138 + }, + { + "epoch": 0.46, + "grad_norm": 0.9506650235484677, + "learning_rate": 1.162084625855233e-05, + "loss": 0.1721, + "step": 9139 + }, + { + "epoch": 0.46, + "grad_norm": 2.7818432041827315, + "learning_rate": 1.1619221046045688e-05, + "loss": 0.1942, + "step": 9140 + }, + { + "epoch": 0.46, + "grad_norm": 1.1575333574213695, + "learning_rate": 1.1617595789617585e-05, + "loss": 0.2027, + "step": 9141 + }, + { + "epoch": 0.46, + "grad_norm": 1.434177466973586, + "learning_rate": 1.1615970489312102e-05, + "loss": 0.194, + "step": 9142 + }, + { + "epoch": 0.46, + "grad_norm": 1.245727117900203, + "learning_rate": 1.1614345145173329e-05, + "loss": 0.2023, + "step": 9143 + }, + { + "epoch": 0.46, + "grad_norm": 1.4528276599772527, + "learning_rate": 1.1612719757245353e-05, + "loss": 0.1841, + "step": 9144 + }, + { + "epoch": 0.47, + "grad_norm": 0.883066228974052, + "learning_rate": 1.1611094325572263e-05, + "loss": 0.1973, + "step": 9145 + }, + { + "epoch": 0.47, + "grad_norm": 1.153096925156812, + "learning_rate": 1.1609468850198149e-05, + "loss": 0.1883, + "step": 9146 + }, + { + "epoch": 0.47, + "grad_norm": 1.2894960675682319, + "learning_rate": 1.1607843331167099e-05, + "loss": 0.2053, + "step": 9147 + }, + { + "epoch": 0.47, + "grad_norm": 0.8747914564760497, + "learning_rate": 1.160621776852321e-05, + "loss": 0.1882, + "step": 9148 + }, + { + "epoch": 0.47, + "grad_norm": 1.1131978812303405, + "learning_rate": 1.1604592162310575e-05, + "loss": 0.2121, + "step": 9149 + }, + { + "epoch": 0.47, + "grad_norm": 1.5693726179321674, + "learning_rate": 1.1602966512573286e-05, + "loss": 0.1881, + "step": 9150 + }, + { + "epoch": 0.47, + "grad_norm": 1.0216723824159144, + "learning_rate": 1.1601340819355437e-05, + "loss": 0.1912, + "step": 9151 + }, + { + "epoch": 0.47, + "grad_norm": 0.7815028235272667, + "learning_rate": 1.159971508270113e-05, + "loss": 0.1954, + "step": 9152 + }, + { + "epoch": 0.47, + "grad_norm": 0.9754311439115084, + "learning_rate": 1.159808930265446e-05, + "loss": 0.1918, + "step": 9153 + }, + { + "epoch": 0.47, + "grad_norm": 1.0278402602039418, + "learning_rate": 1.159646347925953e-05, + "loss": 0.1974, + "step": 9154 + }, + { + "epoch": 0.47, + "grad_norm": 0.9032850279132243, + "learning_rate": 1.1594837612560437e-05, + "loss": 0.1778, + "step": 9155 + }, + { + "epoch": 0.47, + "grad_norm": 0.8774455184882909, + "learning_rate": 1.159321170260128e-05, + "loss": 0.2006, + "step": 9156 + }, + { + "epoch": 0.47, + "grad_norm": 0.9127838115789007, + "learning_rate": 1.159158574942617e-05, + "loss": 0.1733, + "step": 9157 + }, + { + "epoch": 0.47, + "grad_norm": 0.8448337659172754, + "learning_rate": 1.1589959753079203e-05, + "loss": 0.1759, + "step": 9158 + }, + { + "epoch": 0.47, + "grad_norm": 1.2241639207791157, + "learning_rate": 1.1588333713604491e-05, + "loss": 0.2184, + "step": 9159 + }, + { + "epoch": 0.47, + "grad_norm": 0.7723198053308619, + "learning_rate": 1.1586707631046135e-05, + "loss": 0.1973, + "step": 9160 + }, + { + "epoch": 0.47, + "grad_norm": 0.8615544161233657, + "learning_rate": 1.1585081505448246e-05, + "loss": 0.2044, + "step": 9161 + }, + { + "epoch": 0.47, + "grad_norm": 0.8611882561608455, + "learning_rate": 1.1583455336854932e-05, + "loss": 0.1845, + "step": 9162 + }, + { + "epoch": 0.47, + "grad_norm": 1.0851258693274464, + "learning_rate": 1.1581829125310302e-05, + "loss": 0.179, + "step": 9163 + }, + { + "epoch": 0.47, + "grad_norm": 0.7737070155432831, + "learning_rate": 1.1580202870858468e-05, + "loss": 0.1768, + "step": 9164 + }, + { + "epoch": 0.47, + "grad_norm": 0.9287374921183846, + "learning_rate": 1.1578576573543541e-05, + "loss": 0.1976, + "step": 9165 + }, + { + "epoch": 0.47, + "grad_norm": 0.8834705337603755, + "learning_rate": 1.1576950233409638e-05, + "loss": 0.1852, + "step": 9166 + }, + { + "epoch": 0.47, + "grad_norm": 0.8774831452300471, + "learning_rate": 1.1575323850500868e-05, + "loss": 0.1702, + "step": 9167 + }, + { + "epoch": 0.47, + "grad_norm": 2.4809191685656273, + "learning_rate": 1.1573697424861353e-05, + "loss": 0.1834, + "step": 9168 + }, + { + "epoch": 0.47, + "grad_norm": 0.9047569129335316, + "learning_rate": 1.1572070956535202e-05, + "loss": 0.1878, + "step": 9169 + }, + { + "epoch": 0.47, + "grad_norm": 1.0114729627753714, + "learning_rate": 1.1570444445566538e-05, + "loss": 0.1841, + "step": 9170 + }, + { + "epoch": 0.47, + "grad_norm": 1.0533951490813414, + "learning_rate": 1.1568817891999482e-05, + "loss": 0.2076, + "step": 9171 + }, + { + "epoch": 0.47, + "grad_norm": 1.0908645916592143, + "learning_rate": 1.1567191295878152e-05, + "loss": 0.1858, + "step": 9172 + }, + { + "epoch": 0.47, + "grad_norm": 0.9703974422296601, + "learning_rate": 1.1565564657246667e-05, + "loss": 0.2004, + "step": 9173 + }, + { + "epoch": 0.47, + "grad_norm": 1.0118705478228984, + "learning_rate": 1.1563937976149153e-05, + "loss": 0.1953, + "step": 9174 + }, + { + "epoch": 0.47, + "grad_norm": 0.8260904591199736, + "learning_rate": 1.1562311252629736e-05, + "loss": 0.2017, + "step": 9175 + }, + { + "epoch": 0.47, + "grad_norm": 0.7339715507228588, + "learning_rate": 1.1560684486732537e-05, + "loss": 0.1916, + "step": 9176 + }, + { + "epoch": 0.47, + "grad_norm": 0.8509635990034636, + "learning_rate": 1.1559057678501682e-05, + "loss": 0.1829, + "step": 9177 + }, + { + "epoch": 0.47, + "grad_norm": 1.9624278814958298, + "learning_rate": 1.1557430827981297e-05, + "loss": 0.2154, + "step": 9178 + }, + { + "epoch": 0.47, + "grad_norm": 1.479984283272646, + "learning_rate": 1.1555803935215516e-05, + "loss": 0.2187, + "step": 9179 + }, + { + "epoch": 0.47, + "grad_norm": 0.9547459427889696, + "learning_rate": 1.1554177000248466e-05, + "loss": 0.192, + "step": 9180 + }, + { + "epoch": 0.47, + "grad_norm": 1.0713234269482474, + "learning_rate": 1.1552550023124276e-05, + "loss": 0.1993, + "step": 9181 + }, + { + "epoch": 0.47, + "grad_norm": 1.084265326853148, + "learning_rate": 1.155092300388708e-05, + "loss": 0.1896, + "step": 9182 + }, + { + "epoch": 0.47, + "grad_norm": 0.823005443489001, + "learning_rate": 1.1549295942581007e-05, + "loss": 0.1774, + "step": 9183 + }, + { + "epoch": 0.47, + "grad_norm": 1.6002085217547777, + "learning_rate": 1.1547668839250199e-05, + "loss": 0.1919, + "step": 9184 + }, + { + "epoch": 0.47, + "grad_norm": 0.8626993942068936, + "learning_rate": 1.1546041693938784e-05, + "loss": 0.192, + "step": 9185 + }, + { + "epoch": 0.47, + "grad_norm": 1.0077475214317404, + "learning_rate": 1.1544414506690897e-05, + "loss": 0.1848, + "step": 9186 + }, + { + "epoch": 0.47, + "grad_norm": 1.0757353918928187, + "learning_rate": 1.1542787277550683e-05, + "loss": 0.2179, + "step": 9187 + }, + { + "epoch": 0.47, + "grad_norm": 1.08680507436135, + "learning_rate": 1.1541160006562275e-05, + "loss": 0.1883, + "step": 9188 + }, + { + "epoch": 0.47, + "grad_norm": 0.9751665996225382, + "learning_rate": 1.1539532693769818e-05, + "loss": 0.1974, + "step": 9189 + }, + { + "epoch": 0.47, + "grad_norm": 2.48160369171706, + "learning_rate": 1.1537905339217448e-05, + "loss": 0.1883, + "step": 9190 + }, + { + "epoch": 0.47, + "grad_norm": 0.975263185759104, + "learning_rate": 1.1536277942949305e-05, + "loss": 0.2017, + "step": 9191 + }, + { + "epoch": 0.47, + "grad_norm": 0.8240373691871361, + "learning_rate": 1.1534650505009542e-05, + "loss": 0.1775, + "step": 9192 + }, + { + "epoch": 0.47, + "grad_norm": 1.3049402644744645, + "learning_rate": 1.1533023025442294e-05, + "loss": 0.2072, + "step": 9193 + }, + { + "epoch": 0.47, + "grad_norm": 0.9742705420566906, + "learning_rate": 1.1531395504291711e-05, + "loss": 0.1864, + "step": 9194 + }, + { + "epoch": 0.47, + "grad_norm": 0.9472394210632011, + "learning_rate": 1.1529767941601937e-05, + "loss": 0.1836, + "step": 9195 + }, + { + "epoch": 0.47, + "grad_norm": 0.9238803406730269, + "learning_rate": 1.1528140337417121e-05, + "loss": 0.1921, + "step": 9196 + }, + { + "epoch": 0.47, + "grad_norm": 0.8351639923254849, + "learning_rate": 1.1526512691781415e-05, + "loss": 0.1927, + "step": 9197 + }, + { + "epoch": 0.47, + "grad_norm": 1.2967717494314384, + "learning_rate": 1.1524885004738966e-05, + "loss": 0.1871, + "step": 9198 + }, + { + "epoch": 0.47, + "grad_norm": 0.915047277005369, + "learning_rate": 1.1523257276333924e-05, + "loss": 0.2058, + "step": 9199 + }, + { + "epoch": 0.47, + "grad_norm": 0.8859798604811852, + "learning_rate": 1.1521629506610439e-05, + "loss": 0.2082, + "step": 9200 + }, + { + "epoch": 0.47, + "grad_norm": 0.9162438028535452, + "learning_rate": 1.1520001695612675e-05, + "loss": 0.2077, + "step": 9201 + }, + { + "epoch": 0.47, + "grad_norm": 0.8868233681525796, + "learning_rate": 1.1518373843384774e-05, + "loss": 0.1949, + "step": 9202 + }, + { + "epoch": 0.47, + "grad_norm": 0.893103606569373, + "learning_rate": 1.1516745949970897e-05, + "loss": 0.1734, + "step": 9203 + }, + { + "epoch": 0.47, + "grad_norm": 1.0350623362067328, + "learning_rate": 1.15151180154152e-05, + "loss": 0.1957, + "step": 9204 + }, + { + "epoch": 0.47, + "grad_norm": 0.7472797149020382, + "learning_rate": 1.1513490039761843e-05, + "loss": 0.1719, + "step": 9205 + }, + { + "epoch": 0.47, + "grad_norm": 0.8943678903413873, + "learning_rate": 1.1511862023054983e-05, + "loss": 0.1849, + "step": 9206 + }, + { + "epoch": 0.47, + "grad_norm": 2.0615004595864916, + "learning_rate": 1.151023396533878e-05, + "loss": 0.1849, + "step": 9207 + }, + { + "epoch": 0.47, + "grad_norm": 1.0065551689099037, + "learning_rate": 1.1508605866657392e-05, + "loss": 0.2165, + "step": 9208 + }, + { + "epoch": 0.47, + "grad_norm": 0.8278316868482662, + "learning_rate": 1.1506977727054988e-05, + "loss": 0.1768, + "step": 9209 + }, + { + "epoch": 0.47, + "grad_norm": 2.0617148335170974, + "learning_rate": 1.1505349546575728e-05, + "loss": 0.1931, + "step": 9210 + }, + { + "epoch": 0.47, + "grad_norm": 0.8807830721570364, + "learning_rate": 1.1503721325263778e-05, + "loss": 0.2041, + "step": 9211 + }, + { + "epoch": 0.47, + "grad_norm": 1.131425982837395, + "learning_rate": 1.15020930631633e-05, + "loss": 0.1908, + "step": 9212 + }, + { + "epoch": 0.47, + "grad_norm": 1.016980414378633, + "learning_rate": 1.1500464760318462e-05, + "loss": 0.1705, + "step": 9213 + }, + { + "epoch": 0.47, + "grad_norm": 1.3114196960927185, + "learning_rate": 1.1498836416773433e-05, + "loss": 0.2042, + "step": 9214 + }, + { + "epoch": 0.47, + "grad_norm": 0.9548647672267575, + "learning_rate": 1.1497208032572385e-05, + "loss": 0.1688, + "step": 9215 + }, + { + "epoch": 0.47, + "grad_norm": 1.4053452700177105, + "learning_rate": 1.149557960775948e-05, + "loss": 0.2194, + "step": 9216 + }, + { + "epoch": 0.47, + "grad_norm": 0.9455345695137635, + "learning_rate": 1.1493951142378896e-05, + "loss": 0.174, + "step": 9217 + }, + { + "epoch": 0.47, + "grad_norm": 2.914664529750119, + "learning_rate": 1.1492322636474802e-05, + "loss": 0.1867, + "step": 9218 + }, + { + "epoch": 0.47, + "grad_norm": 0.9728464313455532, + "learning_rate": 1.1490694090091375e-05, + "loss": 0.1737, + "step": 9219 + }, + { + "epoch": 0.47, + "grad_norm": 0.8613771294384492, + "learning_rate": 1.1489065503272785e-05, + "loss": 0.1906, + "step": 9220 + }, + { + "epoch": 0.47, + "grad_norm": 0.9827950275426468, + "learning_rate": 1.1487436876063205e-05, + "loss": 0.2, + "step": 9221 + }, + { + "epoch": 0.47, + "grad_norm": 1.253797589125273, + "learning_rate": 1.148580820850682e-05, + "loss": 0.1748, + "step": 9222 + }, + { + "epoch": 0.47, + "grad_norm": 0.92996072269765, + "learning_rate": 1.1484179500647802e-05, + "loss": 0.1813, + "step": 9223 + }, + { + "epoch": 0.47, + "grad_norm": 0.9294885667105391, + "learning_rate": 1.1482550752530332e-05, + "loss": 0.1911, + "step": 9224 + }, + { + "epoch": 0.47, + "grad_norm": 1.4144234683841561, + "learning_rate": 1.148092196419859e-05, + "loss": 0.2001, + "step": 9225 + }, + { + "epoch": 0.47, + "grad_norm": 0.8613645922670157, + "learning_rate": 1.1479293135696755e-05, + "loss": 0.2162, + "step": 9226 + }, + { + "epoch": 0.47, + "grad_norm": 1.0450389190608478, + "learning_rate": 1.1477664267069009e-05, + "loss": 0.1856, + "step": 9227 + }, + { + "epoch": 0.47, + "grad_norm": 0.8930640924927837, + "learning_rate": 1.1476035358359539e-05, + "loss": 0.2097, + "step": 9228 + }, + { + "epoch": 0.47, + "grad_norm": 1.592902215469274, + "learning_rate": 1.1474406409612524e-05, + "loss": 0.1921, + "step": 9229 + }, + { + "epoch": 0.47, + "grad_norm": 0.7904460656923809, + "learning_rate": 1.1472777420872154e-05, + "loss": 0.1891, + "step": 9230 + }, + { + "epoch": 0.47, + "grad_norm": 0.8015309344030134, + "learning_rate": 1.147114839218261e-05, + "loss": 0.1601, + "step": 9231 + }, + { + "epoch": 0.47, + "grad_norm": 0.9705118301556879, + "learning_rate": 1.1469519323588085e-05, + "loss": 0.1639, + "step": 9232 + }, + { + "epoch": 0.47, + "grad_norm": 1.5770038604029533, + "learning_rate": 1.1467890215132767e-05, + "loss": 0.1919, + "step": 9233 + }, + { + "epoch": 0.47, + "grad_norm": 1.1379908559648302, + "learning_rate": 1.146626106686084e-05, + "loss": 0.1739, + "step": 9234 + }, + { + "epoch": 0.47, + "grad_norm": 1.7509321612595246, + "learning_rate": 1.1464631878816502e-05, + "loss": 0.1993, + "step": 9235 + }, + { + "epoch": 0.47, + "grad_norm": 1.1126365561109344, + "learning_rate": 1.1463002651043942e-05, + "loss": 0.1867, + "step": 9236 + }, + { + "epoch": 0.47, + "grad_norm": 0.9436631209359791, + "learning_rate": 1.146137338358735e-05, + "loss": 0.1892, + "step": 9237 + }, + { + "epoch": 0.47, + "grad_norm": 1.293770946685034, + "learning_rate": 1.1459744076490924e-05, + "loss": 0.1957, + "step": 9238 + }, + { + "epoch": 0.47, + "grad_norm": 0.9276318520781688, + "learning_rate": 1.1458114729798855e-05, + "loss": 0.1872, + "step": 9239 + }, + { + "epoch": 0.47, + "grad_norm": 0.6795751167978585, + "learning_rate": 1.1456485343555344e-05, + "loss": 0.1803, + "step": 9240 + }, + { + "epoch": 0.47, + "grad_norm": 0.9264723014647175, + "learning_rate": 1.1454855917804586e-05, + "loss": 0.1718, + "step": 9241 + }, + { + "epoch": 0.47, + "grad_norm": 1.457193211728748, + "learning_rate": 1.145322645259078e-05, + "loss": 0.1898, + "step": 9242 + }, + { + "epoch": 0.47, + "grad_norm": 0.9019050296705318, + "learning_rate": 1.1451596947958122e-05, + "loss": 0.2239, + "step": 9243 + }, + { + "epoch": 0.47, + "grad_norm": 7.865206930755326, + "learning_rate": 1.1449967403950812e-05, + "loss": 0.2017, + "step": 9244 + }, + { + "epoch": 0.47, + "grad_norm": 1.132181096344825, + "learning_rate": 1.1448337820613061e-05, + "loss": 0.1885, + "step": 9245 + }, + { + "epoch": 0.47, + "grad_norm": 1.151784318954177, + "learning_rate": 1.1446708197989061e-05, + "loss": 0.2025, + "step": 9246 + }, + { + "epoch": 0.47, + "grad_norm": 1.0029738228138243, + "learning_rate": 1.144507853612302e-05, + "loss": 0.1855, + "step": 9247 + }, + { + "epoch": 0.47, + "grad_norm": 1.0962947125468563, + "learning_rate": 1.1443448835059141e-05, + "loss": 0.1944, + "step": 9248 + }, + { + "epoch": 0.47, + "grad_norm": 0.8477426463884647, + "learning_rate": 1.144181909484163e-05, + "loss": 0.1753, + "step": 9249 + }, + { + "epoch": 0.47, + "grad_norm": 0.8817457321386621, + "learning_rate": 1.1440189315514698e-05, + "loss": 0.1827, + "step": 9250 + }, + { + "epoch": 0.47, + "grad_norm": 0.9198362005845908, + "learning_rate": 1.143855949712255e-05, + "loss": 0.1758, + "step": 9251 + }, + { + "epoch": 0.47, + "grad_norm": 0.8501168011702994, + "learning_rate": 1.143692963970939e-05, + "loss": 0.1951, + "step": 9252 + }, + { + "epoch": 0.47, + "grad_norm": 1.1709863082016263, + "learning_rate": 1.1435299743319434e-05, + "loss": 0.1998, + "step": 9253 + }, + { + "epoch": 0.47, + "grad_norm": 0.9096201448211303, + "learning_rate": 1.1433669807996894e-05, + "loss": 0.17, + "step": 9254 + }, + { + "epoch": 0.47, + "grad_norm": 1.1348872412188835, + "learning_rate": 1.1432039833785979e-05, + "loss": 0.1884, + "step": 9255 + }, + { + "epoch": 0.47, + "grad_norm": 1.1259494677848945, + "learning_rate": 1.1430409820730902e-05, + "loss": 0.1881, + "step": 9256 + }, + { + "epoch": 0.47, + "grad_norm": 1.1273228094294094, + "learning_rate": 1.1428779768875874e-05, + "loss": 0.1934, + "step": 9257 + }, + { + "epoch": 0.47, + "grad_norm": 1.1033316887349531, + "learning_rate": 1.1427149678265119e-05, + "loss": 0.2156, + "step": 9258 + }, + { + "epoch": 0.47, + "grad_norm": 1.008896724459771, + "learning_rate": 1.1425519548942847e-05, + "loss": 0.2092, + "step": 9259 + }, + { + "epoch": 0.47, + "grad_norm": 1.263669512820033, + "learning_rate": 1.1423889380953277e-05, + "loss": 0.1778, + "step": 9260 + }, + { + "epoch": 0.47, + "grad_norm": 0.9381984779225716, + "learning_rate": 1.1422259174340624e-05, + "loss": 0.1963, + "step": 9261 + }, + { + "epoch": 0.47, + "grad_norm": 1.5088037717808391, + "learning_rate": 1.1420628929149114e-05, + "loss": 0.2038, + "step": 9262 + }, + { + "epoch": 0.47, + "grad_norm": 1.0717096467954101, + "learning_rate": 1.1418998645422963e-05, + "loss": 0.1927, + "step": 9263 + }, + { + "epoch": 0.47, + "grad_norm": 1.193684945593441, + "learning_rate": 1.1417368323206395e-05, + "loss": 0.2046, + "step": 9264 + }, + { + "epoch": 0.47, + "grad_norm": 1.0533682703561398, + "learning_rate": 1.141573796254363e-05, + "loss": 0.1697, + "step": 9265 + }, + { + "epoch": 0.47, + "grad_norm": 1.1840786054105483, + "learning_rate": 1.141410756347889e-05, + "loss": 0.1822, + "step": 9266 + }, + { + "epoch": 0.47, + "grad_norm": 1.749805516924357, + "learning_rate": 1.1412477126056405e-05, + "loss": 0.1979, + "step": 9267 + }, + { + "epoch": 0.47, + "grad_norm": 1.253852439701606, + "learning_rate": 1.14108466503204e-05, + "loss": 0.2176, + "step": 9268 + }, + { + "epoch": 0.47, + "grad_norm": 1.1173364128331582, + "learning_rate": 1.1409216136315097e-05, + "loss": 0.1778, + "step": 9269 + }, + { + "epoch": 0.47, + "grad_norm": 1.035032031791532, + "learning_rate": 1.1407585584084726e-05, + "loss": 0.2161, + "step": 9270 + }, + { + "epoch": 0.47, + "grad_norm": 1.2405865114634105, + "learning_rate": 1.1405954993673522e-05, + "loss": 0.1771, + "step": 9271 + }, + { + "epoch": 0.47, + "grad_norm": 0.9961618122285781, + "learning_rate": 1.1404324365125708e-05, + "loss": 0.193, + "step": 9272 + }, + { + "epoch": 0.47, + "grad_norm": 1.6814547632964016, + "learning_rate": 1.1402693698485512e-05, + "loss": 0.1832, + "step": 9273 + }, + { + "epoch": 0.47, + "grad_norm": 1.2294370918587325, + "learning_rate": 1.1401062993797171e-05, + "loss": 0.1903, + "step": 9274 + }, + { + "epoch": 0.47, + "grad_norm": 1.415889465472221, + "learning_rate": 1.1399432251104918e-05, + "loss": 0.2137, + "step": 9275 + }, + { + "epoch": 0.47, + "grad_norm": 1.0334241329352223, + "learning_rate": 1.1397801470452987e-05, + "loss": 0.196, + "step": 9276 + }, + { + "epoch": 0.47, + "grad_norm": 0.9642912610388481, + "learning_rate": 1.1396170651885613e-05, + "loss": 0.182, + "step": 9277 + }, + { + "epoch": 0.47, + "grad_norm": 1.2648969637697525, + "learning_rate": 1.139453979544703e-05, + "loss": 0.1908, + "step": 9278 + }, + { + "epoch": 0.47, + "grad_norm": 0.8346050383865007, + "learning_rate": 1.1392908901181474e-05, + "loss": 0.176, + "step": 9279 + }, + { + "epoch": 0.47, + "grad_norm": 1.3413644643229863, + "learning_rate": 1.139127796913319e-05, + "loss": 0.1911, + "step": 9280 + }, + { + "epoch": 0.47, + "grad_norm": 0.9490728998996395, + "learning_rate": 1.138964699934641e-05, + "loss": 0.1796, + "step": 9281 + }, + { + "epoch": 0.47, + "grad_norm": 1.0848424621668546, + "learning_rate": 1.1388015991865377e-05, + "loss": 0.1773, + "step": 9282 + }, + { + "epoch": 0.47, + "grad_norm": 1.8101725755414195, + "learning_rate": 1.138638494673433e-05, + "loss": 0.2012, + "step": 9283 + }, + { + "epoch": 0.47, + "grad_norm": 1.3028282362221764, + "learning_rate": 1.1384753863997516e-05, + "loss": 0.1876, + "step": 9284 + }, + { + "epoch": 0.47, + "grad_norm": 1.1140020330220612, + "learning_rate": 1.1383122743699173e-05, + "loss": 0.2109, + "step": 9285 + }, + { + "epoch": 0.47, + "grad_norm": 1.1084320608542635, + "learning_rate": 1.1381491585883548e-05, + "loss": 0.1971, + "step": 9286 + }, + { + "epoch": 0.47, + "grad_norm": 1.3256072713621745, + "learning_rate": 1.1379860390594888e-05, + "loss": 0.2103, + "step": 9287 + }, + { + "epoch": 0.47, + "grad_norm": 0.9168074800231482, + "learning_rate": 1.1378229157877432e-05, + "loss": 0.1789, + "step": 9288 + }, + { + "epoch": 0.47, + "grad_norm": 0.8194316208116729, + "learning_rate": 1.1376597887775438e-05, + "loss": 0.1948, + "step": 9289 + }, + { + "epoch": 0.47, + "grad_norm": 0.98526556813502, + "learning_rate": 1.1374966580333147e-05, + "loss": 0.1984, + "step": 9290 + }, + { + "epoch": 0.47, + "grad_norm": 2.061805734811115, + "learning_rate": 1.1373335235594809e-05, + "loss": 0.2147, + "step": 9291 + }, + { + "epoch": 0.47, + "grad_norm": 0.952970495944534, + "learning_rate": 1.1371703853604672e-05, + "loss": 0.1879, + "step": 9292 + }, + { + "epoch": 0.47, + "grad_norm": 1.0049105878186886, + "learning_rate": 1.1370072434406993e-05, + "loss": 0.2125, + "step": 9293 + }, + { + "epoch": 0.47, + "grad_norm": 1.180288824955952, + "learning_rate": 1.1368440978046022e-05, + "loss": 0.1977, + "step": 9294 + }, + { + "epoch": 0.47, + "grad_norm": 0.9273278457059216, + "learning_rate": 1.1366809484566015e-05, + "loss": 0.1786, + "step": 9295 + }, + { + "epoch": 0.47, + "grad_norm": 0.7298925974661218, + "learning_rate": 1.136517795401122e-05, + "loss": 0.18, + "step": 9296 + }, + { + "epoch": 0.47, + "grad_norm": 0.9384320818110587, + "learning_rate": 1.1363546386425895e-05, + "loss": 0.184, + "step": 9297 + }, + { + "epoch": 0.47, + "grad_norm": 0.7859695205526979, + "learning_rate": 1.13619147818543e-05, + "loss": 0.167, + "step": 9298 + }, + { + "epoch": 0.47, + "grad_norm": 0.9421177588324476, + "learning_rate": 1.1360283140340688e-05, + "loss": 0.1688, + "step": 9299 + }, + { + "epoch": 0.47, + "grad_norm": 1.2079617951803552, + "learning_rate": 1.1358651461929319e-05, + "loss": 0.1966, + "step": 9300 + }, + { + "epoch": 0.47, + "grad_norm": 0.817762563013867, + "learning_rate": 1.1357019746664453e-05, + "loss": 0.1757, + "step": 9301 + }, + { + "epoch": 0.47, + "grad_norm": 2.1866910568283346, + "learning_rate": 1.135538799459035e-05, + "loss": 0.199, + "step": 9302 + }, + { + "epoch": 0.47, + "grad_norm": 0.9677330185793297, + "learning_rate": 1.1353756205751272e-05, + "loss": 0.2217, + "step": 9303 + }, + { + "epoch": 0.47, + "grad_norm": 1.7153143992431847, + "learning_rate": 1.1352124380191479e-05, + "loss": 0.1836, + "step": 9304 + }, + { + "epoch": 0.47, + "grad_norm": 0.9525697878726538, + "learning_rate": 1.1350492517955234e-05, + "loss": 0.1999, + "step": 9305 + }, + { + "epoch": 0.47, + "grad_norm": 0.9008507657210286, + "learning_rate": 1.1348860619086808e-05, + "loss": 0.1982, + "step": 9306 + }, + { + "epoch": 0.47, + "grad_norm": 0.9362961925883473, + "learning_rate": 1.134722868363046e-05, + "loss": 0.1963, + "step": 9307 + }, + { + "epoch": 0.47, + "grad_norm": 0.8781050347037737, + "learning_rate": 1.1345596711630456e-05, + "loss": 0.1846, + "step": 9308 + }, + { + "epoch": 0.47, + "grad_norm": 0.9572232189569388, + "learning_rate": 1.1343964703131065e-05, + "loss": 0.1866, + "step": 9309 + }, + { + "epoch": 0.47, + "grad_norm": 1.3214437628723248, + "learning_rate": 1.1342332658176556e-05, + "loss": 0.1705, + "step": 9310 + }, + { + "epoch": 0.47, + "grad_norm": 1.7267957828656673, + "learning_rate": 1.1340700576811198e-05, + "loss": 0.1847, + "step": 9311 + }, + { + "epoch": 0.47, + "grad_norm": 1.6346922718412682, + "learning_rate": 1.1339068459079262e-05, + "loss": 0.1748, + "step": 9312 + }, + { + "epoch": 0.47, + "grad_norm": 0.7569900182507847, + "learning_rate": 1.1337436305025019e-05, + "loss": 0.177, + "step": 9313 + }, + { + "epoch": 0.47, + "grad_norm": 0.955594887499051, + "learning_rate": 1.1335804114692737e-05, + "loss": 0.2005, + "step": 9314 + }, + { + "epoch": 0.47, + "grad_norm": 2.9007643934387923, + "learning_rate": 1.1334171888126698e-05, + "loss": 0.192, + "step": 9315 + }, + { + "epoch": 0.47, + "grad_norm": 1.1458182580341056, + "learning_rate": 1.1332539625371166e-05, + "loss": 0.179, + "step": 9316 + }, + { + "epoch": 0.47, + "grad_norm": 1.346045821307898, + "learning_rate": 1.1330907326470426e-05, + "loss": 0.1999, + "step": 9317 + }, + { + "epoch": 0.47, + "grad_norm": 1.1606367702267062, + "learning_rate": 1.1329274991468747e-05, + "loss": 0.1847, + "step": 9318 + }, + { + "epoch": 0.47, + "grad_norm": 1.1289045966659277, + "learning_rate": 1.1327642620410408e-05, + "loss": 0.1875, + "step": 9319 + }, + { + "epoch": 0.47, + "grad_norm": 0.8345586647548614, + "learning_rate": 1.1326010213339688e-05, + "loss": 0.1749, + "step": 9320 + }, + { + "epoch": 0.47, + "grad_norm": 1.4714817836575558, + "learning_rate": 1.132437777030087e-05, + "loss": 0.2186, + "step": 9321 + }, + { + "epoch": 0.47, + "grad_norm": 0.7114676751705088, + "learning_rate": 1.1322745291338226e-05, + "loss": 0.1805, + "step": 9322 + }, + { + "epoch": 0.47, + "grad_norm": 1.1326550840352263, + "learning_rate": 1.1321112776496042e-05, + "loss": 0.1794, + "step": 9323 + }, + { + "epoch": 0.47, + "grad_norm": 0.8232979729290985, + "learning_rate": 1.1319480225818602e-05, + "loss": 0.1973, + "step": 9324 + }, + { + "epoch": 0.47, + "grad_norm": 0.9378338038834931, + "learning_rate": 1.1317847639350186e-05, + "loss": 0.1777, + "step": 9325 + }, + { + "epoch": 0.47, + "grad_norm": 0.927375096532382, + "learning_rate": 1.1316215017135076e-05, + "loss": 0.203, + "step": 9326 + }, + { + "epoch": 0.47, + "grad_norm": 1.0456569263599929, + "learning_rate": 1.1314582359217558e-05, + "loss": 0.2025, + "step": 9327 + }, + { + "epoch": 0.47, + "grad_norm": 1.2830682932161301, + "learning_rate": 1.1312949665641923e-05, + "loss": 0.1987, + "step": 9328 + }, + { + "epoch": 0.47, + "grad_norm": 0.80481172348749, + "learning_rate": 1.1311316936452452e-05, + "loss": 0.1925, + "step": 9329 + }, + { + "epoch": 0.47, + "grad_norm": 0.9271681235067764, + "learning_rate": 1.1309684171693435e-05, + "loss": 0.1907, + "step": 9330 + }, + { + "epoch": 0.47, + "grad_norm": 0.8884154679752989, + "learning_rate": 1.1308051371409162e-05, + "loss": 0.1738, + "step": 9331 + }, + { + "epoch": 0.47, + "grad_norm": 1.2122619537768777, + "learning_rate": 1.1306418535643922e-05, + "loss": 0.1828, + "step": 9332 + }, + { + "epoch": 0.47, + "grad_norm": 1.0197330364239467, + "learning_rate": 1.1304785664442003e-05, + "loss": 0.1931, + "step": 9333 + }, + { + "epoch": 0.47, + "grad_norm": 0.9782015436799009, + "learning_rate": 1.1303152757847702e-05, + "loss": 0.2035, + "step": 9334 + }, + { + "epoch": 0.47, + "grad_norm": 0.9159995398951478, + "learning_rate": 1.1301519815905309e-05, + "loss": 0.1963, + "step": 9335 + }, + { + "epoch": 0.47, + "grad_norm": 0.9347505438490339, + "learning_rate": 1.1299886838659114e-05, + "loss": 0.1951, + "step": 9336 + }, + { + "epoch": 0.47, + "grad_norm": 1.0526098163364204, + "learning_rate": 1.1298253826153415e-05, + "loss": 0.1929, + "step": 9337 + }, + { + "epoch": 0.47, + "grad_norm": 1.193015880699807, + "learning_rate": 1.1296620778432512e-05, + "loss": 0.1911, + "step": 9338 + }, + { + "epoch": 0.47, + "grad_norm": 1.4693323716969673, + "learning_rate": 1.1294987695540695e-05, + "loss": 0.1605, + "step": 9339 + }, + { + "epoch": 0.47, + "grad_norm": 1.2914714608353897, + "learning_rate": 1.1293354577522264e-05, + "loss": 0.1874, + "step": 9340 + }, + { + "epoch": 0.48, + "grad_norm": 0.8955534658899678, + "learning_rate": 1.1291721424421518e-05, + "loss": 0.1762, + "step": 9341 + }, + { + "epoch": 0.48, + "grad_norm": 0.9457021966906232, + "learning_rate": 1.1290088236282752e-05, + "loss": 0.1945, + "step": 9342 + }, + { + "epoch": 0.48, + "grad_norm": 0.9463505398747077, + "learning_rate": 1.1288455013150275e-05, + "loss": 0.1811, + "step": 9343 + }, + { + "epoch": 0.48, + "grad_norm": 0.8537062086454555, + "learning_rate": 1.1286821755068375e-05, + "loss": 0.1776, + "step": 9344 + }, + { + "epoch": 0.48, + "grad_norm": 1.1395762597930732, + "learning_rate": 1.128518846208137e-05, + "loss": 0.1853, + "step": 9345 + }, + { + "epoch": 0.48, + "grad_norm": 0.8332298531890091, + "learning_rate": 1.128355513423355e-05, + "loss": 0.1726, + "step": 9346 + }, + { + "epoch": 0.48, + "grad_norm": 1.075619778357324, + "learning_rate": 1.1281921771569229e-05, + "loss": 0.1671, + "step": 9347 + }, + { + "epoch": 0.48, + "grad_norm": 1.1719243199926213, + "learning_rate": 1.1280288374132704e-05, + "loss": 0.1979, + "step": 9348 + }, + { + "epoch": 0.48, + "grad_norm": 1.1759049357909859, + "learning_rate": 1.1278654941968285e-05, + "loss": 0.1915, + "step": 9349 + }, + { + "epoch": 0.48, + "grad_norm": 0.8400856096808293, + "learning_rate": 1.127702147512028e-05, + "loss": 0.1709, + "step": 9350 + }, + { + "epoch": 0.48, + "grad_norm": 1.1444023227495415, + "learning_rate": 1.1275387973632994e-05, + "loss": 0.2173, + "step": 9351 + }, + { + "epoch": 0.48, + "grad_norm": 1.0254301032804167, + "learning_rate": 1.1273754437550738e-05, + "loss": 0.1826, + "step": 9352 + }, + { + "epoch": 0.48, + "grad_norm": 1.30260397154843, + "learning_rate": 1.1272120866917821e-05, + "loss": 0.1936, + "step": 9353 + }, + { + "epoch": 0.48, + "grad_norm": 0.8536684491265273, + "learning_rate": 1.1270487261778554e-05, + "loss": 0.1925, + "step": 9354 + }, + { + "epoch": 0.48, + "grad_norm": 0.9385710112540513, + "learning_rate": 1.1268853622177248e-05, + "loss": 0.1765, + "step": 9355 + }, + { + "epoch": 0.48, + "grad_norm": 9.90856099719088, + "learning_rate": 1.1267219948158215e-05, + "loss": 0.201, + "step": 9356 + }, + { + "epoch": 0.48, + "grad_norm": 0.9892179146824301, + "learning_rate": 1.1265586239765772e-05, + "loss": 0.2016, + "step": 9357 + }, + { + "epoch": 0.48, + "grad_norm": 2.2092280855303295, + "learning_rate": 1.1263952497044225e-05, + "loss": 0.1823, + "step": 9358 + }, + { + "epoch": 0.48, + "grad_norm": 1.1156350393611687, + "learning_rate": 1.1262318720037902e-05, + "loss": 0.1772, + "step": 9359 + }, + { + "epoch": 0.48, + "grad_norm": 0.9048227767853564, + "learning_rate": 1.1260684908791109e-05, + "loss": 0.1869, + "step": 9360 + }, + { + "epoch": 0.48, + "grad_norm": 1.0731084504573483, + "learning_rate": 1.1259051063348167e-05, + "loss": 0.1601, + "step": 9361 + }, + { + "epoch": 0.48, + "grad_norm": 1.4848992993314354, + "learning_rate": 1.1257417183753391e-05, + "loss": 0.1813, + "step": 9362 + }, + { + "epoch": 0.48, + "grad_norm": 0.987609588215139, + "learning_rate": 1.1255783270051105e-05, + "loss": 0.1609, + "step": 9363 + }, + { + "epoch": 0.48, + "grad_norm": 1.1217420741641009, + "learning_rate": 1.125414932228563e-05, + "loss": 0.184, + "step": 9364 + }, + { + "epoch": 0.48, + "grad_norm": 1.0911673067257806, + "learning_rate": 1.1252515340501282e-05, + "loss": 0.1864, + "step": 9365 + }, + { + "epoch": 0.48, + "grad_norm": 1.2569941304967243, + "learning_rate": 1.1250881324742382e-05, + "loss": 0.2255, + "step": 9366 + }, + { + "epoch": 0.48, + "grad_norm": 0.9116639119858985, + "learning_rate": 1.1249247275053256e-05, + "loss": 0.1844, + "step": 9367 + }, + { + "epoch": 0.48, + "grad_norm": 1.083187498682193, + "learning_rate": 1.1247613191478231e-05, + "loss": 0.181, + "step": 9368 + }, + { + "epoch": 0.48, + "grad_norm": 1.2340382247496755, + "learning_rate": 1.1245979074061623e-05, + "loss": 0.1796, + "step": 9369 + }, + { + "epoch": 0.48, + "grad_norm": 1.2310194776736283, + "learning_rate": 1.1244344922847765e-05, + "loss": 0.2008, + "step": 9370 + }, + { + "epoch": 0.48, + "grad_norm": 1.0514258589881065, + "learning_rate": 1.1242710737880979e-05, + "loss": 0.2235, + "step": 9371 + }, + { + "epoch": 0.48, + "grad_norm": 1.0733065946367741, + "learning_rate": 1.1241076519205595e-05, + "loss": 0.1793, + "step": 9372 + }, + { + "epoch": 0.48, + "grad_norm": 1.6731214337021334, + "learning_rate": 1.123944226686594e-05, + "loss": 0.2054, + "step": 9373 + }, + { + "epoch": 0.48, + "grad_norm": 1.0373545401706548, + "learning_rate": 1.1237807980906346e-05, + "loss": 0.1899, + "step": 9374 + }, + { + "epoch": 0.48, + "grad_norm": 0.8567521447141507, + "learning_rate": 1.1236173661371139e-05, + "loss": 0.1926, + "step": 9375 + }, + { + "epoch": 0.48, + "grad_norm": 1.6535204425524637, + "learning_rate": 1.123453930830465e-05, + "loss": 0.1908, + "step": 9376 + }, + { + "epoch": 0.48, + "grad_norm": 1.6849632397852876, + "learning_rate": 1.1232904921751216e-05, + "loss": 0.2385, + "step": 9377 + }, + { + "epoch": 0.48, + "grad_norm": 0.9553081200938266, + "learning_rate": 1.1231270501755162e-05, + "loss": 0.1924, + "step": 9378 + }, + { + "epoch": 0.48, + "grad_norm": 0.9561839368777103, + "learning_rate": 1.1229636048360828e-05, + "loss": 0.2014, + "step": 9379 + }, + { + "epoch": 0.48, + "grad_norm": 0.9692146046944835, + "learning_rate": 1.1228001561612547e-05, + "loss": 0.1691, + "step": 9380 + }, + { + "epoch": 0.48, + "grad_norm": 1.0657126735953355, + "learning_rate": 1.1226367041554655e-05, + "loss": 0.1835, + "step": 9381 + }, + { + "epoch": 0.48, + "grad_norm": 0.9752036307321181, + "learning_rate": 1.1224732488231487e-05, + "loss": 0.1828, + "step": 9382 + }, + { + "epoch": 0.48, + "grad_norm": 0.9659786017577724, + "learning_rate": 1.1223097901687382e-05, + "loss": 0.1909, + "step": 9383 + }, + { + "epoch": 0.48, + "grad_norm": 0.8918016838091298, + "learning_rate": 1.1221463281966673e-05, + "loss": 0.1661, + "step": 9384 + }, + { + "epoch": 0.48, + "grad_norm": 0.8987191636079584, + "learning_rate": 1.1219828629113707e-05, + "loss": 0.1765, + "step": 9385 + }, + { + "epoch": 0.48, + "grad_norm": 1.45173704218604, + "learning_rate": 1.1218193943172821e-05, + "loss": 0.2047, + "step": 9386 + }, + { + "epoch": 0.48, + "grad_norm": 1.6950586457712218, + "learning_rate": 1.1216559224188355e-05, + "loss": 0.2014, + "step": 9387 + }, + { + "epoch": 0.48, + "grad_norm": 1.0394831725837366, + "learning_rate": 1.1214924472204651e-05, + "loss": 0.1983, + "step": 9388 + }, + { + "epoch": 0.48, + "grad_norm": 1.2175066037402202, + "learning_rate": 1.1213289687266052e-05, + "loss": 0.1872, + "step": 9389 + }, + { + "epoch": 0.48, + "grad_norm": 1.3022976481634068, + "learning_rate": 1.1211654869416901e-05, + "loss": 0.2173, + "step": 9390 + }, + { + "epoch": 0.48, + "grad_norm": 1.027564229058217, + "learning_rate": 1.1210020018701546e-05, + "loss": 0.1745, + "step": 9391 + }, + { + "epoch": 0.48, + "grad_norm": 1.1945177307533967, + "learning_rate": 1.1208385135164329e-05, + "loss": 0.2103, + "step": 9392 + }, + { + "epoch": 0.48, + "grad_norm": 0.8846131409121079, + "learning_rate": 1.120675021884959e-05, + "loss": 0.1483, + "step": 9393 + }, + { + "epoch": 0.48, + "grad_norm": 1.012946214427254, + "learning_rate": 1.1205115269801695e-05, + "loss": 0.1726, + "step": 9394 + }, + { + "epoch": 0.48, + "grad_norm": 1.4632348655698826, + "learning_rate": 1.1203480288064974e-05, + "loss": 0.1836, + "step": 9395 + }, + { + "epoch": 0.48, + "grad_norm": 1.2870539131782863, + "learning_rate": 1.1201845273683782e-05, + "loss": 0.1799, + "step": 9396 + }, + { + "epoch": 0.48, + "grad_norm": 0.8808388430584856, + "learning_rate": 1.1200210226702469e-05, + "loss": 0.2151, + "step": 9397 + }, + { + "epoch": 0.48, + "grad_norm": 0.9740500347235727, + "learning_rate": 1.1198575147165384e-05, + "loss": 0.177, + "step": 9398 + }, + { + "epoch": 0.48, + "grad_norm": 1.5208257341292488, + "learning_rate": 1.1196940035116884e-05, + "loss": 0.1716, + "step": 9399 + }, + { + "epoch": 0.48, + "grad_norm": 1.035575690228776, + "learning_rate": 1.1195304890601317e-05, + "loss": 0.2063, + "step": 9400 + }, + { + "epoch": 0.48, + "grad_norm": 1.012321287351004, + "learning_rate": 1.1193669713663039e-05, + "loss": 0.1733, + "step": 9401 + }, + { + "epoch": 0.48, + "grad_norm": 1.3756052531505762, + "learning_rate": 1.1192034504346397e-05, + "loss": 0.1951, + "step": 9402 + }, + { + "epoch": 0.48, + "grad_norm": 2.4966464979842127, + "learning_rate": 1.1190399262695757e-05, + "loss": 0.1959, + "step": 9403 + }, + { + "epoch": 0.48, + "grad_norm": 0.9375503280340816, + "learning_rate": 1.1188763988755467e-05, + "loss": 0.183, + "step": 9404 + }, + { + "epoch": 0.48, + "grad_norm": 1.1260862585486986, + "learning_rate": 1.1187128682569888e-05, + "loss": 0.1945, + "step": 9405 + }, + { + "epoch": 0.48, + "grad_norm": 0.9728000538819264, + "learning_rate": 1.1185493344183375e-05, + "loss": 0.1859, + "step": 9406 + }, + { + "epoch": 0.48, + "grad_norm": 1.0834736784120405, + "learning_rate": 1.1183857973640289e-05, + "loss": 0.1855, + "step": 9407 + }, + { + "epoch": 0.48, + "grad_norm": 0.9348768976215369, + "learning_rate": 1.118222257098499e-05, + "loss": 0.1856, + "step": 9408 + }, + { + "epoch": 0.48, + "grad_norm": 0.9706361044266855, + "learning_rate": 1.1180587136261835e-05, + "loss": 0.1889, + "step": 9409 + }, + { + "epoch": 0.48, + "grad_norm": 0.9726258225414485, + "learning_rate": 1.117895166951519e-05, + "loss": 0.1734, + "step": 9410 + }, + { + "epoch": 0.48, + "grad_norm": 1.0166944674563385, + "learning_rate": 1.1177316170789412e-05, + "loss": 0.1909, + "step": 9411 + }, + { + "epoch": 0.48, + "grad_norm": 0.8494335568980105, + "learning_rate": 1.1175680640128867e-05, + "loss": 0.2172, + "step": 9412 + }, + { + "epoch": 0.48, + "grad_norm": 0.9503780665887606, + "learning_rate": 1.117404507757792e-05, + "loss": 0.1835, + "step": 9413 + }, + { + "epoch": 0.48, + "grad_norm": 1.0304138091835113, + "learning_rate": 1.1172409483180929e-05, + "loss": 0.1669, + "step": 9414 + }, + { + "epoch": 0.48, + "grad_norm": 1.3386853123357887, + "learning_rate": 1.1170773856982268e-05, + "loss": 0.1824, + "step": 9415 + }, + { + "epoch": 0.48, + "grad_norm": 2.192617945116487, + "learning_rate": 1.11691381990263e-05, + "loss": 0.1875, + "step": 9416 + }, + { + "epoch": 0.48, + "grad_norm": 1.044363310431338, + "learning_rate": 1.1167502509357393e-05, + "loss": 0.1805, + "step": 9417 + }, + { + "epoch": 0.48, + "grad_norm": 0.8158167248839474, + "learning_rate": 1.1165866788019912e-05, + "loss": 0.1921, + "step": 9418 + }, + { + "epoch": 0.48, + "grad_norm": 1.4395607267781292, + "learning_rate": 1.1164231035058228e-05, + "loss": 0.2025, + "step": 9419 + }, + { + "epoch": 0.48, + "grad_norm": 1.015506286236725, + "learning_rate": 1.1162595250516715e-05, + "loss": 0.1666, + "step": 9420 + }, + { + "epoch": 0.48, + "grad_norm": 1.5064817417365441, + "learning_rate": 1.116095943443974e-05, + "loss": 0.1904, + "step": 9421 + }, + { + "epoch": 0.48, + "grad_norm": 1.7158616883607665, + "learning_rate": 1.1159323586871673e-05, + "loss": 0.1879, + "step": 9422 + }, + { + "epoch": 0.48, + "grad_norm": 0.8696293588736635, + "learning_rate": 1.1157687707856888e-05, + "loss": 0.1676, + "step": 9423 + }, + { + "epoch": 0.48, + "grad_norm": 1.2717209787363684, + "learning_rate": 1.1156051797439757e-05, + "loss": 0.177, + "step": 9424 + }, + { + "epoch": 0.48, + "grad_norm": 1.0687536577180041, + "learning_rate": 1.1154415855664657e-05, + "loss": 0.2175, + "step": 9425 + }, + { + "epoch": 0.48, + "grad_norm": 1.487045634947814, + "learning_rate": 1.1152779882575964e-05, + "loss": 0.1979, + "step": 9426 + }, + { + "epoch": 0.48, + "grad_norm": 1.247667420377959, + "learning_rate": 1.115114387821805e-05, + "loss": 0.2063, + "step": 9427 + }, + { + "epoch": 0.48, + "grad_norm": 0.9232023822380149, + "learning_rate": 1.1149507842635293e-05, + "loss": 0.193, + "step": 9428 + }, + { + "epoch": 0.48, + "grad_norm": 1.0482885930779677, + "learning_rate": 1.1147871775872072e-05, + "loss": 0.1661, + "step": 9429 + }, + { + "epoch": 0.48, + "grad_norm": 0.9521862144910216, + "learning_rate": 1.1146235677972765e-05, + "loss": 0.1883, + "step": 9430 + }, + { + "epoch": 0.48, + "grad_norm": 0.8533427250808602, + "learning_rate": 1.1144599548981749e-05, + "loss": 0.2021, + "step": 9431 + }, + { + "epoch": 0.48, + "grad_norm": 1.0419278456030308, + "learning_rate": 1.1142963388943405e-05, + "loss": 0.1666, + "step": 9432 + }, + { + "epoch": 0.48, + "grad_norm": 0.7751955248876855, + "learning_rate": 1.1141327197902114e-05, + "loss": 0.163, + "step": 9433 + }, + { + "epoch": 0.48, + "grad_norm": 0.8975275086310701, + "learning_rate": 1.113969097590226e-05, + "loss": 0.1881, + "step": 9434 + }, + { + "epoch": 0.48, + "grad_norm": 0.92585182171884, + "learning_rate": 1.1138054722988223e-05, + "loss": 0.2046, + "step": 9435 + }, + { + "epoch": 0.48, + "grad_norm": 0.7624586845530239, + "learning_rate": 1.1136418439204388e-05, + "loss": 0.1841, + "step": 9436 + }, + { + "epoch": 0.48, + "grad_norm": 6.557712694484254, + "learning_rate": 1.1134782124595136e-05, + "loss": 0.1889, + "step": 9437 + }, + { + "epoch": 0.48, + "grad_norm": 1.0712559355250177, + "learning_rate": 1.113314577920486e-05, + "loss": 0.204, + "step": 9438 + }, + { + "epoch": 0.48, + "grad_norm": 0.9869076526938929, + "learning_rate": 1.1131509403077936e-05, + "loss": 0.1832, + "step": 9439 + }, + { + "epoch": 0.48, + "grad_norm": 1.4004004264938446, + "learning_rate": 1.1129872996258757e-05, + "loss": 0.2028, + "step": 9440 + }, + { + "epoch": 0.48, + "grad_norm": 0.8442967787105335, + "learning_rate": 1.1128236558791708e-05, + "loss": 0.1941, + "step": 9441 + }, + { + "epoch": 0.48, + "grad_norm": 0.9274354478195815, + "learning_rate": 1.112660009072118e-05, + "loss": 0.1768, + "step": 9442 + }, + { + "epoch": 0.48, + "grad_norm": 1.130228477397798, + "learning_rate": 1.1124963592091563e-05, + "loss": 0.1773, + "step": 9443 + }, + { + "epoch": 0.48, + "grad_norm": 0.9728083311241017, + "learning_rate": 1.1123327062947243e-05, + "loss": 0.2137, + "step": 9444 + }, + { + "epoch": 0.48, + "grad_norm": 0.9970451783684262, + "learning_rate": 1.1121690503332613e-05, + "loss": 0.1762, + "step": 9445 + }, + { + "epoch": 0.48, + "grad_norm": 0.8959882886040591, + "learning_rate": 1.1120053913292066e-05, + "loss": 0.1735, + "step": 9446 + }, + { + "epoch": 0.48, + "grad_norm": 1.1404202161163575, + "learning_rate": 1.1118417292869992e-05, + "loss": 0.2025, + "step": 9447 + }, + { + "epoch": 0.48, + "grad_norm": 0.9352040327694172, + "learning_rate": 1.1116780642110785e-05, + "loss": 0.1955, + "step": 9448 + }, + { + "epoch": 0.48, + "grad_norm": 1.3641165860043831, + "learning_rate": 1.1115143961058843e-05, + "loss": 0.182, + "step": 9449 + }, + { + "epoch": 0.48, + "grad_norm": 1.0398020609796403, + "learning_rate": 1.1113507249758553e-05, + "loss": 0.1765, + "step": 9450 + }, + { + "epoch": 0.48, + "grad_norm": 1.6875637408914828, + "learning_rate": 1.111187050825432e-05, + "loss": 0.1832, + "step": 9451 + }, + { + "epoch": 0.48, + "grad_norm": 1.0012695602418653, + "learning_rate": 1.1110233736590535e-05, + "loss": 0.1798, + "step": 9452 + }, + { + "epoch": 0.48, + "grad_norm": 1.137091287242088, + "learning_rate": 1.1108596934811598e-05, + "loss": 0.2062, + "step": 9453 + }, + { + "epoch": 0.48, + "grad_norm": 1.073121214044263, + "learning_rate": 1.1106960102961906e-05, + "loss": 0.1909, + "step": 9454 + }, + { + "epoch": 0.48, + "grad_norm": 0.919479504533419, + "learning_rate": 1.1105323241085856e-05, + "loss": 0.1874, + "step": 9455 + }, + { + "epoch": 0.48, + "grad_norm": 1.0339051802221684, + "learning_rate": 1.1103686349227856e-05, + "loss": 0.1888, + "step": 9456 + }, + { + "epoch": 0.48, + "grad_norm": 0.9920965654107798, + "learning_rate": 1.11020494274323e-05, + "loss": 0.1766, + "step": 9457 + }, + { + "epoch": 0.48, + "grad_norm": 1.091572426986709, + "learning_rate": 1.1100412475743589e-05, + "loss": 0.198, + "step": 9458 + }, + { + "epoch": 0.48, + "grad_norm": 0.9002278553615146, + "learning_rate": 1.1098775494206126e-05, + "loss": 0.1809, + "step": 9459 + }, + { + "epoch": 0.48, + "grad_norm": 0.9662421903169365, + "learning_rate": 1.109713848286432e-05, + "loss": 0.1869, + "step": 9460 + }, + { + "epoch": 0.48, + "grad_norm": 1.0745650451858144, + "learning_rate": 1.1095501441762568e-05, + "loss": 0.1761, + "step": 9461 + }, + { + "epoch": 0.48, + "grad_norm": 0.6982312184914814, + "learning_rate": 1.109386437094528e-05, + "loss": 0.1811, + "step": 9462 + }, + { + "epoch": 0.48, + "grad_norm": 0.8952950825627658, + "learning_rate": 1.1092227270456857e-05, + "loss": 0.1971, + "step": 9463 + }, + { + "epoch": 0.48, + "grad_norm": 1.1015031040313652, + "learning_rate": 1.1090590140341709e-05, + "loss": 0.2004, + "step": 9464 + }, + { + "epoch": 0.48, + "grad_norm": 0.8780746732320548, + "learning_rate": 1.1088952980644242e-05, + "loss": 0.1639, + "step": 9465 + }, + { + "epoch": 0.48, + "grad_norm": 1.0166800530835955, + "learning_rate": 1.1087315791408864e-05, + "loss": 0.1938, + "step": 9466 + }, + { + "epoch": 0.48, + "grad_norm": 0.893028751235322, + "learning_rate": 1.1085678572679978e-05, + "loss": 0.1791, + "step": 9467 + }, + { + "epoch": 0.48, + "grad_norm": 2.1214063958853724, + "learning_rate": 1.1084041324502006e-05, + "loss": 0.1951, + "step": 9468 + }, + { + "epoch": 0.48, + "grad_norm": 0.9458754700189468, + "learning_rate": 1.108240404691935e-05, + "loss": 0.1844, + "step": 9469 + }, + { + "epoch": 0.48, + "grad_norm": 1.4094387971337061, + "learning_rate": 1.1080766739976424e-05, + "loss": 0.2075, + "step": 9470 + }, + { + "epoch": 0.48, + "grad_norm": 0.9267791702679756, + "learning_rate": 1.1079129403717639e-05, + "loss": 0.1891, + "step": 9471 + }, + { + "epoch": 0.48, + "grad_norm": 3.480391044904194, + "learning_rate": 1.1077492038187403e-05, + "loss": 0.2007, + "step": 9472 + }, + { + "epoch": 0.48, + "grad_norm": 1.2714801697325204, + "learning_rate": 1.107585464343014e-05, + "loss": 0.1809, + "step": 9473 + }, + { + "epoch": 0.48, + "grad_norm": 1.0151298523185912, + "learning_rate": 1.1074217219490258e-05, + "loss": 0.1926, + "step": 9474 + }, + { + "epoch": 0.48, + "grad_norm": 1.4766124566730703, + "learning_rate": 1.1072579766412172e-05, + "loss": 0.2004, + "step": 9475 + }, + { + "epoch": 0.48, + "grad_norm": 1.0682720487733304, + "learning_rate": 1.10709422842403e-05, + "loss": 0.1746, + "step": 9476 + }, + { + "epoch": 0.48, + "grad_norm": 1.0623812204356664, + "learning_rate": 1.1069304773019058e-05, + "loss": 0.2052, + "step": 9477 + }, + { + "epoch": 0.48, + "grad_norm": 0.98325888769799, + "learning_rate": 1.1067667232792864e-05, + "loss": 0.1969, + "step": 9478 + }, + { + "epoch": 0.48, + "grad_norm": 1.249394726179517, + "learning_rate": 1.1066029663606138e-05, + "loss": 0.1765, + "step": 9479 + }, + { + "epoch": 0.48, + "grad_norm": 1.1215393168986563, + "learning_rate": 1.1064392065503294e-05, + "loss": 0.2001, + "step": 9480 + }, + { + "epoch": 0.48, + "grad_norm": 0.9950671645201333, + "learning_rate": 1.1062754438528758e-05, + "loss": 0.1866, + "step": 9481 + }, + { + "epoch": 0.48, + "grad_norm": 1.4252768997024146, + "learning_rate": 1.1061116782726947e-05, + "loss": 0.2047, + "step": 9482 + }, + { + "epoch": 0.48, + "grad_norm": 1.6916831687929794, + "learning_rate": 1.1059479098142281e-05, + "loss": 0.1886, + "step": 9483 + }, + { + "epoch": 0.48, + "grad_norm": 1.298294711202152, + "learning_rate": 1.1057841384819185e-05, + "loss": 0.1986, + "step": 9484 + }, + { + "epoch": 0.48, + "grad_norm": 0.9317317237057114, + "learning_rate": 1.1056203642802081e-05, + "loss": 0.1926, + "step": 9485 + }, + { + "epoch": 0.48, + "grad_norm": 0.8458105659558617, + "learning_rate": 1.1054565872135397e-05, + "loss": 0.189, + "step": 9486 + }, + { + "epoch": 0.48, + "grad_norm": 1.2018879195347907, + "learning_rate": 1.1052928072863552e-05, + "loss": 0.1777, + "step": 9487 + }, + { + "epoch": 0.48, + "grad_norm": 0.894558397914112, + "learning_rate": 1.1051290245030975e-05, + "loss": 0.1987, + "step": 9488 + }, + { + "epoch": 0.48, + "grad_norm": 1.1740428506485563, + "learning_rate": 1.1049652388682088e-05, + "loss": 0.1954, + "step": 9489 + }, + { + "epoch": 0.48, + "grad_norm": 0.8405196629069694, + "learning_rate": 1.1048014503861321e-05, + "loss": 0.1678, + "step": 9490 + }, + { + "epoch": 0.48, + "grad_norm": 1.025401225748899, + "learning_rate": 1.1046376590613103e-05, + "loss": 0.1766, + "step": 9491 + }, + { + "epoch": 0.48, + "grad_norm": 1.2048673707118842, + "learning_rate": 1.104473864898186e-05, + "loss": 0.2014, + "step": 9492 + }, + { + "epoch": 0.48, + "grad_norm": 1.2694804452467194, + "learning_rate": 1.1043100679012025e-05, + "loss": 0.1935, + "step": 9493 + }, + { + "epoch": 0.48, + "grad_norm": 0.8965625555710108, + "learning_rate": 1.104146268074802e-05, + "loss": 0.1854, + "step": 9494 + }, + { + "epoch": 0.48, + "grad_norm": 0.9878495481703905, + "learning_rate": 1.1039824654234286e-05, + "loss": 0.1748, + "step": 9495 + }, + { + "epoch": 0.48, + "grad_norm": 1.3167225063683237, + "learning_rate": 1.1038186599515247e-05, + "loss": 0.2079, + "step": 9496 + }, + { + "epoch": 0.48, + "grad_norm": 1.148398782309424, + "learning_rate": 1.1036548516635339e-05, + "loss": 0.1933, + "step": 9497 + }, + { + "epoch": 0.48, + "grad_norm": 0.9509034949490187, + "learning_rate": 1.1034910405638992e-05, + "loss": 0.1888, + "step": 9498 + }, + { + "epoch": 0.48, + "grad_norm": 1.0496257154764759, + "learning_rate": 1.1033272266570645e-05, + "loss": 0.1887, + "step": 9499 + }, + { + "epoch": 0.48, + "grad_norm": 1.035097016626563, + "learning_rate": 1.1031634099474727e-05, + "loss": 0.1916, + "step": 9500 + }, + { + "epoch": 0.48, + "grad_norm": 1.1664136233350906, + "learning_rate": 1.1029995904395676e-05, + "loss": 0.2019, + "step": 9501 + }, + { + "epoch": 0.48, + "grad_norm": 1.132179506013568, + "learning_rate": 1.1028357681377928e-05, + "loss": 0.179, + "step": 9502 + }, + { + "epoch": 0.48, + "grad_norm": 0.9801111290950743, + "learning_rate": 1.1026719430465919e-05, + "loss": 0.2018, + "step": 9503 + }, + { + "epoch": 0.48, + "grad_norm": 1.116829993968936, + "learning_rate": 1.1025081151704089e-05, + "loss": 0.171, + "step": 9504 + }, + { + "epoch": 0.48, + "grad_norm": 0.9850572439318865, + "learning_rate": 1.1023442845136874e-05, + "loss": 0.2012, + "step": 9505 + }, + { + "epoch": 0.48, + "grad_norm": 1.1544031189586472, + "learning_rate": 1.1021804510808715e-05, + "loss": 0.1966, + "step": 9506 + }, + { + "epoch": 0.48, + "grad_norm": 1.1640617416122763, + "learning_rate": 1.102016614876405e-05, + "loss": 0.1639, + "step": 9507 + }, + { + "epoch": 0.48, + "grad_norm": 1.7324786741544342, + "learning_rate": 1.1018527759047319e-05, + "loss": 0.1758, + "step": 9508 + }, + { + "epoch": 0.48, + "grad_norm": 0.9896374147248223, + "learning_rate": 1.1016889341702968e-05, + "loss": 0.1674, + "step": 9509 + }, + { + "epoch": 0.48, + "grad_norm": 1.0471666833332736, + "learning_rate": 1.1015250896775436e-05, + "loss": 0.1844, + "step": 9510 + }, + { + "epoch": 0.48, + "grad_norm": 0.9148938137694808, + "learning_rate": 1.1013612424309163e-05, + "loss": 0.1826, + "step": 9511 + }, + { + "epoch": 0.48, + "grad_norm": 1.0928563897735315, + "learning_rate": 1.1011973924348599e-05, + "loss": 0.2285, + "step": 9512 + }, + { + "epoch": 0.48, + "grad_norm": 1.2076467126569248, + "learning_rate": 1.1010335396938183e-05, + "loss": 0.2117, + "step": 9513 + }, + { + "epoch": 0.48, + "grad_norm": 2.7580399000340794, + "learning_rate": 1.1008696842122364e-05, + "loss": 0.1968, + "step": 9514 + }, + { + "epoch": 0.48, + "grad_norm": 0.9902090228799503, + "learning_rate": 1.1007058259945584e-05, + "loss": 0.1701, + "step": 9515 + }, + { + "epoch": 0.48, + "grad_norm": 0.8934607040812377, + "learning_rate": 1.1005419650452294e-05, + "loss": 0.2102, + "step": 9516 + }, + { + "epoch": 0.48, + "grad_norm": 1.5274316465729678, + "learning_rate": 1.1003781013686939e-05, + "loss": 0.192, + "step": 9517 + }, + { + "epoch": 0.48, + "grad_norm": 1.1934406110291276, + "learning_rate": 1.1002142349693967e-05, + "loss": 0.1596, + "step": 9518 + }, + { + "epoch": 0.48, + "grad_norm": 1.8946649186923104, + "learning_rate": 1.1000503658517827e-05, + "loss": 0.1874, + "step": 9519 + }, + { + "epoch": 0.48, + "grad_norm": 2.2912572338837025, + "learning_rate": 1.0998864940202967e-05, + "loss": 0.2044, + "step": 9520 + }, + { + "epoch": 0.48, + "grad_norm": 1.1351873568499273, + "learning_rate": 1.0997226194793842e-05, + "loss": 0.1764, + "step": 9521 + }, + { + "epoch": 0.48, + "grad_norm": 1.1338616587599282, + "learning_rate": 1.09955874223349e-05, + "loss": 0.1884, + "step": 9522 + }, + { + "epoch": 0.48, + "grad_norm": 0.9169417601362231, + "learning_rate": 1.099394862287059e-05, + "loss": 0.1832, + "step": 9523 + }, + { + "epoch": 0.48, + "grad_norm": 0.8400343124098275, + "learning_rate": 1.099230979644537e-05, + "loss": 0.1825, + "step": 9524 + }, + { + "epoch": 0.48, + "grad_norm": 1.2284716183910207, + "learning_rate": 1.0990670943103688e-05, + "loss": 0.1828, + "step": 9525 + }, + { + "epoch": 0.48, + "grad_norm": 1.2249751854792137, + "learning_rate": 1.0989032062890004e-05, + "loss": 0.1851, + "step": 9526 + }, + { + "epoch": 0.48, + "grad_norm": 1.1007619155064332, + "learning_rate": 1.0987393155848767e-05, + "loss": 0.1899, + "step": 9527 + }, + { + "epoch": 0.48, + "grad_norm": 1.3302949936435415, + "learning_rate": 1.0985754222024437e-05, + "loss": 0.1778, + "step": 9528 + }, + { + "epoch": 0.48, + "grad_norm": 0.7935309274750862, + "learning_rate": 1.0984115261461466e-05, + "loss": 0.1679, + "step": 9529 + }, + { + "epoch": 0.48, + "grad_norm": 0.8740483493541972, + "learning_rate": 1.0982476274204314e-05, + "loss": 0.1912, + "step": 9530 + }, + { + "epoch": 0.48, + "grad_norm": 0.9544592534379195, + "learning_rate": 1.0980837260297437e-05, + "loss": 0.1756, + "step": 9531 + }, + { + "epoch": 0.48, + "grad_norm": 1.4647054361443923, + "learning_rate": 1.0979198219785296e-05, + "loss": 0.1668, + "step": 9532 + }, + { + "epoch": 0.48, + "grad_norm": 2.0763420008421867, + "learning_rate": 1.0977559152712347e-05, + "loss": 0.1942, + "step": 9533 + }, + { + "epoch": 0.48, + "grad_norm": 1.3867599307183982, + "learning_rate": 1.0975920059123051e-05, + "loss": 0.1943, + "step": 9534 + }, + { + "epoch": 0.48, + "grad_norm": 1.4807281378852024, + "learning_rate": 1.0974280939061867e-05, + "loss": 0.1769, + "step": 9535 + }, + { + "epoch": 0.48, + "grad_norm": 1.7477775660421317, + "learning_rate": 1.0972641792573258e-05, + "loss": 0.1705, + "step": 9536 + }, + { + "epoch": 0.48, + "grad_norm": 1.0163715440655499, + "learning_rate": 1.0971002619701682e-05, + "loss": 0.1845, + "step": 9537 + }, + { + "epoch": 0.49, + "grad_norm": 1.0746521485497653, + "learning_rate": 1.096936342049161e-05, + "loss": 0.2001, + "step": 9538 + }, + { + "epoch": 0.49, + "grad_norm": 0.8781367171305492, + "learning_rate": 1.0967724194987498e-05, + "loss": 0.1904, + "step": 9539 + }, + { + "epoch": 0.49, + "grad_norm": 0.765734891397578, + "learning_rate": 1.0966084943233818e-05, + "loss": 0.1884, + "step": 9540 + }, + { + "epoch": 0.49, + "grad_norm": 1.13823561612025, + "learning_rate": 1.0964445665275023e-05, + "loss": 0.1873, + "step": 9541 + }, + { + "epoch": 0.49, + "grad_norm": 0.9539136937544149, + "learning_rate": 1.0962806361155585e-05, + "loss": 0.1784, + "step": 9542 + }, + { + "epoch": 0.49, + "grad_norm": 1.0034189361586376, + "learning_rate": 1.0961167030919973e-05, + "loss": 0.1868, + "step": 9543 + }, + { + "epoch": 0.49, + "grad_norm": 1.2143927037480549, + "learning_rate": 1.095952767461265e-05, + "loss": 0.1821, + "step": 9544 + }, + { + "epoch": 0.49, + "grad_norm": 1.54044962525446, + "learning_rate": 1.0957888292278084e-05, + "loss": 0.1871, + "step": 9545 + }, + { + "epoch": 0.49, + "grad_norm": 0.9402634558591103, + "learning_rate": 1.0956248883960744e-05, + "loss": 0.1947, + "step": 9546 + }, + { + "epoch": 0.49, + "grad_norm": 0.9362298716995432, + "learning_rate": 1.0954609449705097e-05, + "loss": 0.1812, + "step": 9547 + }, + { + "epoch": 0.49, + "grad_norm": 1.1257224446570795, + "learning_rate": 1.095296998955562e-05, + "loss": 0.1759, + "step": 9548 + }, + { + "epoch": 0.49, + "grad_norm": 1.7555764622968162, + "learning_rate": 1.0951330503556776e-05, + "loss": 0.2133, + "step": 9549 + }, + { + "epoch": 0.49, + "grad_norm": 0.8595309424350507, + "learning_rate": 1.0949690991753036e-05, + "loss": 0.1849, + "step": 9550 + }, + { + "epoch": 0.49, + "grad_norm": 1.2556944727905175, + "learning_rate": 1.0948051454188877e-05, + "loss": 0.1932, + "step": 9551 + }, + { + "epoch": 0.49, + "grad_norm": 1.0799262374997667, + "learning_rate": 1.094641189090877e-05, + "loss": 0.1774, + "step": 9552 + }, + { + "epoch": 0.49, + "grad_norm": 1.2042809302790212, + "learning_rate": 1.0944772301957185e-05, + "loss": 0.1714, + "step": 9553 + }, + { + "epoch": 0.49, + "grad_norm": 0.8870239839138336, + "learning_rate": 1.0943132687378597e-05, + "loss": 0.2022, + "step": 9554 + }, + { + "epoch": 0.49, + "grad_norm": 0.7968798872828126, + "learning_rate": 1.0941493047217482e-05, + "loss": 0.1526, + "step": 9555 + }, + { + "epoch": 0.49, + "grad_norm": 0.9582730119266584, + "learning_rate": 1.0939853381518315e-05, + "loss": 0.1779, + "step": 9556 + }, + { + "epoch": 0.49, + "grad_norm": 0.8933870974205196, + "learning_rate": 1.0938213690325572e-05, + "loss": 0.1911, + "step": 9557 + }, + { + "epoch": 0.49, + "grad_norm": 0.8760223525297648, + "learning_rate": 1.093657397368373e-05, + "loss": 0.1833, + "step": 9558 + }, + { + "epoch": 0.49, + "grad_norm": 0.8660757127061413, + "learning_rate": 1.0934934231637267e-05, + "loss": 0.2105, + "step": 9559 + }, + { + "epoch": 0.49, + "grad_norm": 1.0198198870011186, + "learning_rate": 1.0933294464230657e-05, + "loss": 0.1763, + "step": 9560 + }, + { + "epoch": 0.49, + "grad_norm": 0.8730812713789345, + "learning_rate": 1.0931654671508384e-05, + "loss": 0.203, + "step": 9561 + }, + { + "epoch": 0.49, + "grad_norm": 0.8240497451748958, + "learning_rate": 1.0930014853514925e-05, + "loss": 0.1809, + "step": 9562 + }, + { + "epoch": 0.49, + "grad_norm": 1.3428577217343693, + "learning_rate": 1.0928375010294762e-05, + "loss": 0.189, + "step": 9563 + }, + { + "epoch": 0.49, + "grad_norm": 0.9102713113626955, + "learning_rate": 1.092673514189237e-05, + "loss": 0.194, + "step": 9564 + }, + { + "epoch": 0.49, + "grad_norm": 0.8800419886753066, + "learning_rate": 1.092509524835224e-05, + "loss": 0.1863, + "step": 9565 + }, + { + "epoch": 0.49, + "grad_norm": 0.8915238012081395, + "learning_rate": 1.0923455329718849e-05, + "loss": 0.1711, + "step": 9566 + }, + { + "epoch": 0.49, + "grad_norm": 0.8247002058985641, + "learning_rate": 1.0921815386036679e-05, + "loss": 0.1808, + "step": 9567 + }, + { + "epoch": 0.49, + "grad_norm": 0.9421256090529048, + "learning_rate": 1.0920175417350214e-05, + "loss": 0.1706, + "step": 9568 + }, + { + "epoch": 0.49, + "grad_norm": 1.5488011925565883, + "learning_rate": 1.091853542370394e-05, + "loss": 0.1864, + "step": 9569 + }, + { + "epoch": 0.49, + "grad_norm": 1.0941063553037418, + "learning_rate": 1.0916895405142339e-05, + "loss": 0.1973, + "step": 9570 + }, + { + "epoch": 0.49, + "grad_norm": 0.933533004750533, + "learning_rate": 1.09152553617099e-05, + "loss": 0.1759, + "step": 9571 + }, + { + "epoch": 0.49, + "grad_norm": 1.116679573487381, + "learning_rate": 1.0913615293451105e-05, + "loss": 0.1764, + "step": 9572 + }, + { + "epoch": 0.49, + "grad_norm": 1.3262860299047816, + "learning_rate": 1.0911975200410445e-05, + "loss": 0.2024, + "step": 9573 + }, + { + "epoch": 0.49, + "grad_norm": 1.0543549657524613, + "learning_rate": 1.0910335082632406e-05, + "loss": 0.2182, + "step": 9574 + }, + { + "epoch": 0.49, + "grad_norm": 1.1044120478533699, + "learning_rate": 1.0908694940161477e-05, + "loss": 0.1796, + "step": 9575 + }, + { + "epoch": 0.49, + "grad_norm": 0.8634574691411527, + "learning_rate": 1.0907054773042148e-05, + "loss": 0.1866, + "step": 9576 + }, + { + "epoch": 0.49, + "grad_norm": 0.8953783284154978, + "learning_rate": 1.0905414581318902e-05, + "loss": 0.1815, + "step": 9577 + }, + { + "epoch": 0.49, + "grad_norm": 1.0168979420911626, + "learning_rate": 1.090377436503624e-05, + "loss": 0.1585, + "step": 9578 + }, + { + "epoch": 0.49, + "grad_norm": 2.3193192319875138, + "learning_rate": 1.0902134124238644e-05, + "loss": 0.1978, + "step": 9579 + }, + { + "epoch": 0.49, + "grad_norm": 0.9542646773015464, + "learning_rate": 1.090049385897061e-05, + "loss": 0.2019, + "step": 9580 + }, + { + "epoch": 0.49, + "grad_norm": 1.2211461636864696, + "learning_rate": 1.089885356927663e-05, + "loss": 0.1991, + "step": 9581 + }, + { + "epoch": 0.49, + "grad_norm": 1.4789441101008454, + "learning_rate": 1.0897213255201193e-05, + "loss": 0.1985, + "step": 9582 + }, + { + "epoch": 0.49, + "grad_norm": 1.2913650894369069, + "learning_rate": 1.0895572916788799e-05, + "loss": 0.1891, + "step": 9583 + }, + { + "epoch": 0.49, + "grad_norm": 0.8862058384341132, + "learning_rate": 1.089393255408394e-05, + "loss": 0.1904, + "step": 9584 + }, + { + "epoch": 0.49, + "grad_norm": 0.9609850270437007, + "learning_rate": 1.0892292167131107e-05, + "loss": 0.2003, + "step": 9585 + }, + { + "epoch": 0.49, + "grad_norm": 1.068301573582356, + "learning_rate": 1.08906517559748e-05, + "loss": 0.1921, + "step": 9586 + }, + { + "epoch": 0.49, + "grad_norm": 0.9769987088158, + "learning_rate": 1.0889011320659513e-05, + "loss": 0.1679, + "step": 9587 + }, + { + "epoch": 0.49, + "grad_norm": 1.3124652154756535, + "learning_rate": 1.0887370861229744e-05, + "loss": 0.1971, + "step": 9588 + }, + { + "epoch": 0.49, + "grad_norm": 1.2458187192822827, + "learning_rate": 1.0885730377729993e-05, + "loss": 0.1789, + "step": 9589 + }, + { + "epoch": 0.49, + "grad_norm": 0.9722915506603313, + "learning_rate": 1.0884089870204751e-05, + "loss": 0.1861, + "step": 9590 + }, + { + "epoch": 0.49, + "grad_norm": 1.0798640423811054, + "learning_rate": 1.0882449338698521e-05, + "loss": 0.1671, + "step": 9591 + }, + { + "epoch": 0.49, + "grad_norm": 0.9778314506386806, + "learning_rate": 1.0880808783255808e-05, + "loss": 0.185, + "step": 9592 + }, + { + "epoch": 0.49, + "grad_norm": 1.1909601832238668, + "learning_rate": 1.0879168203921105e-05, + "loss": 0.1663, + "step": 9593 + }, + { + "epoch": 0.49, + "grad_norm": 1.354804048495785, + "learning_rate": 1.0877527600738913e-05, + "loss": 0.2059, + "step": 9594 + }, + { + "epoch": 0.49, + "grad_norm": 2.2460384170334757, + "learning_rate": 1.0875886973753735e-05, + "loss": 0.1811, + "step": 9595 + }, + { + "epoch": 0.49, + "grad_norm": 1.4932185049543425, + "learning_rate": 1.0874246323010074e-05, + "loss": 0.2045, + "step": 9596 + }, + { + "epoch": 0.49, + "grad_norm": 1.298711194215064, + "learning_rate": 1.0872605648552435e-05, + "loss": 0.1746, + "step": 9597 + }, + { + "epoch": 0.49, + "grad_norm": 1.322696201426117, + "learning_rate": 1.0870964950425315e-05, + "loss": 0.1721, + "step": 9598 + }, + { + "epoch": 0.49, + "grad_norm": 1.0958001072962307, + "learning_rate": 1.0869324228673222e-05, + "loss": 0.1864, + "step": 9599 + }, + { + "epoch": 0.49, + "grad_norm": 2.75974717231408, + "learning_rate": 1.086768348334066e-05, + "loss": 0.204, + "step": 9600 + }, + { + "epoch": 0.49, + "grad_norm": 0.966607763752332, + "learning_rate": 1.0866042714472136e-05, + "loss": 0.2106, + "step": 9601 + }, + { + "epoch": 0.49, + "grad_norm": 1.0667990702956014, + "learning_rate": 1.0864401922112155e-05, + "loss": 0.1905, + "step": 9602 + }, + { + "epoch": 0.49, + "grad_norm": 0.9604517721025619, + "learning_rate": 1.0862761106305222e-05, + "loss": 0.1987, + "step": 9603 + }, + { + "epoch": 0.49, + "grad_norm": 1.0185280849062157, + "learning_rate": 1.0861120267095846e-05, + "loss": 0.2039, + "step": 9604 + }, + { + "epoch": 0.49, + "grad_norm": 1.1082943502475084, + "learning_rate": 1.0859479404528532e-05, + "loss": 0.1873, + "step": 9605 + }, + { + "epoch": 0.49, + "grad_norm": 1.1452453729289234, + "learning_rate": 1.0857838518647794e-05, + "loss": 0.2176, + "step": 9606 + }, + { + "epoch": 0.49, + "grad_norm": 3.148710275207547, + "learning_rate": 1.0856197609498135e-05, + "loss": 0.1744, + "step": 9607 + }, + { + "epoch": 0.49, + "grad_norm": 1.209646023184379, + "learning_rate": 1.0854556677124066e-05, + "loss": 0.1627, + "step": 9608 + }, + { + "epoch": 0.49, + "grad_norm": 1.1650388011788848, + "learning_rate": 1.08529157215701e-05, + "loss": 0.1956, + "step": 9609 + }, + { + "epoch": 0.49, + "grad_norm": 1.157072205528815, + "learning_rate": 1.085127474288075e-05, + "loss": 0.1939, + "step": 9610 + }, + { + "epoch": 0.49, + "grad_norm": 2.648764821811985, + "learning_rate": 1.0849633741100522e-05, + "loss": 0.1991, + "step": 9611 + }, + { + "epoch": 0.49, + "grad_norm": 1.0288598778364175, + "learning_rate": 1.084799271627393e-05, + "loss": 0.2053, + "step": 9612 + }, + { + "epoch": 0.49, + "grad_norm": 1.1278542729437095, + "learning_rate": 1.0846351668445489e-05, + "loss": 0.1999, + "step": 9613 + }, + { + "epoch": 0.49, + "grad_norm": 1.457359559733764, + "learning_rate": 1.084471059765971e-05, + "loss": 0.2182, + "step": 9614 + }, + { + "epoch": 0.49, + "grad_norm": 1.185454539256383, + "learning_rate": 1.0843069503961112e-05, + "loss": 0.1964, + "step": 9615 + }, + { + "epoch": 0.49, + "grad_norm": 1.1001727164465225, + "learning_rate": 1.0841428387394204e-05, + "loss": 0.1753, + "step": 9616 + }, + { + "epoch": 0.49, + "grad_norm": 1.468981106103501, + "learning_rate": 1.0839787248003499e-05, + "loss": 0.2171, + "step": 9617 + }, + { + "epoch": 0.49, + "grad_norm": 1.2066098512363195, + "learning_rate": 1.0838146085833523e-05, + "loss": 0.1775, + "step": 9618 + }, + { + "epoch": 0.49, + "grad_norm": 1.3883135625315148, + "learning_rate": 1.0836504900928786e-05, + "loss": 0.1777, + "step": 9619 + }, + { + "epoch": 0.49, + "grad_norm": 1.1582429998119048, + "learning_rate": 1.0834863693333805e-05, + "loss": 0.1803, + "step": 9620 + }, + { + "epoch": 0.49, + "grad_norm": 1.076354165273277, + "learning_rate": 1.08332224630931e-05, + "loss": 0.1923, + "step": 9621 + }, + { + "epoch": 0.49, + "grad_norm": 0.9869213870487541, + "learning_rate": 1.083158121025119e-05, + "loss": 0.1957, + "step": 9622 + }, + { + "epoch": 0.49, + "grad_norm": 1.4017559780579356, + "learning_rate": 1.082993993485259e-05, + "loss": 0.2127, + "step": 9623 + }, + { + "epoch": 0.49, + "grad_norm": 0.9471222448724727, + "learning_rate": 1.0828298636941826e-05, + "loss": 0.1712, + "step": 9624 + }, + { + "epoch": 0.49, + "grad_norm": 1.3935679546422635, + "learning_rate": 1.0826657316563412e-05, + "loss": 0.1822, + "step": 9625 + }, + { + "epoch": 0.49, + "grad_norm": 1.1490294573199775, + "learning_rate": 1.082501597376187e-05, + "loss": 0.19, + "step": 9626 + }, + { + "epoch": 0.49, + "grad_norm": 1.1764376635965907, + "learning_rate": 1.0823374608581727e-05, + "loss": 0.1984, + "step": 9627 + }, + { + "epoch": 0.49, + "grad_norm": 1.6176490098619052, + "learning_rate": 1.0821733221067499e-05, + "loss": 0.1924, + "step": 9628 + }, + { + "epoch": 0.49, + "grad_norm": 1.1701847629385655, + "learning_rate": 1.082009181126371e-05, + "loss": 0.1885, + "step": 9629 + }, + { + "epoch": 0.49, + "grad_norm": 1.4873077019656005, + "learning_rate": 1.0818450379214887e-05, + "loss": 0.1864, + "step": 9630 + }, + { + "epoch": 0.49, + "grad_norm": 1.0821228362386008, + "learning_rate": 1.081680892496555e-05, + "loss": 0.1884, + "step": 9631 + }, + { + "epoch": 0.49, + "grad_norm": 1.0979545228674528, + "learning_rate": 1.0815167448560225e-05, + "loss": 0.1843, + "step": 9632 + }, + { + "epoch": 0.49, + "grad_norm": 1.2455347475804985, + "learning_rate": 1.0813525950043435e-05, + "loss": 0.1813, + "step": 9633 + }, + { + "epoch": 0.49, + "grad_norm": 3.978559329533385, + "learning_rate": 1.0811884429459708e-05, + "loss": 0.1883, + "step": 9634 + }, + { + "epoch": 0.49, + "grad_norm": 0.9707061312525472, + "learning_rate": 1.0810242886853572e-05, + "loss": 0.1923, + "step": 9635 + }, + { + "epoch": 0.49, + "grad_norm": 1.4835241358193412, + "learning_rate": 1.0808601322269553e-05, + "loss": 0.1855, + "step": 9636 + }, + { + "epoch": 0.49, + "grad_norm": 1.0247533098985855, + "learning_rate": 1.0806959735752174e-05, + "loss": 0.1938, + "step": 9637 + }, + { + "epoch": 0.49, + "grad_norm": 0.9173282273112893, + "learning_rate": 1.0805318127345968e-05, + "loss": 0.1907, + "step": 9638 + }, + { + "epoch": 0.49, + "grad_norm": 1.2942874387981254, + "learning_rate": 1.0803676497095463e-05, + "loss": 0.1993, + "step": 9639 + }, + { + "epoch": 0.49, + "grad_norm": 1.103891088845872, + "learning_rate": 1.0802034845045189e-05, + "loss": 0.1806, + "step": 9640 + }, + { + "epoch": 0.49, + "grad_norm": 0.8294224003058488, + "learning_rate": 1.0800393171239672e-05, + "loss": 0.1798, + "step": 9641 + }, + { + "epoch": 0.49, + "grad_norm": 1.6864174989057856, + "learning_rate": 1.0798751475723446e-05, + "loss": 0.1904, + "step": 9642 + }, + { + "epoch": 0.49, + "grad_norm": 1.3092171275756168, + "learning_rate": 1.0797109758541038e-05, + "loss": 0.201, + "step": 9643 + }, + { + "epoch": 0.49, + "grad_norm": 0.9974374522277218, + "learning_rate": 1.0795468019736988e-05, + "loss": 0.2134, + "step": 9644 + }, + { + "epoch": 0.49, + "grad_norm": 1.2165807216297406, + "learning_rate": 1.079382625935582e-05, + "loss": 0.175, + "step": 9645 + }, + { + "epoch": 0.49, + "grad_norm": 0.9718849350489902, + "learning_rate": 1.0792184477442072e-05, + "loss": 0.1899, + "step": 9646 + }, + { + "epoch": 0.49, + "grad_norm": 1.4698470200530558, + "learning_rate": 1.079054267404027e-05, + "loss": 0.1963, + "step": 9647 + }, + { + "epoch": 0.49, + "grad_norm": 1.1466382693672115, + "learning_rate": 1.078890084919496e-05, + "loss": 0.181, + "step": 9648 + }, + { + "epoch": 0.49, + "grad_norm": 1.25333634486384, + "learning_rate": 1.0787259002950665e-05, + "loss": 0.2133, + "step": 9649 + }, + { + "epoch": 0.49, + "grad_norm": 1.012534747933004, + "learning_rate": 1.0785617135351927e-05, + "loss": 0.1924, + "step": 9650 + }, + { + "epoch": 0.49, + "grad_norm": 1.1474468277559202, + "learning_rate": 1.0783975246443281e-05, + "loss": 0.209, + "step": 9651 + }, + { + "epoch": 0.49, + "grad_norm": 0.9345414739937334, + "learning_rate": 1.078233333626926e-05, + "loss": 0.1717, + "step": 9652 + }, + { + "epoch": 0.49, + "grad_norm": 1.5527860447119741, + "learning_rate": 1.0780691404874404e-05, + "loss": 0.1913, + "step": 9653 + }, + { + "epoch": 0.49, + "grad_norm": 0.9515322340897505, + "learning_rate": 1.077904945230325e-05, + "loss": 0.1982, + "step": 9654 + }, + { + "epoch": 0.49, + "grad_norm": 0.9235823339102696, + "learning_rate": 1.0777407478600334e-05, + "loss": 0.2162, + "step": 9655 + }, + { + "epoch": 0.49, + "grad_norm": 0.7539908144802698, + "learning_rate": 1.0775765483810199e-05, + "loss": 0.1876, + "step": 9656 + }, + { + "epoch": 0.49, + "grad_norm": 0.9304968353210401, + "learning_rate": 1.0774123467977379e-05, + "loss": 0.1866, + "step": 9657 + }, + { + "epoch": 0.49, + "grad_norm": 1.2604049295529116, + "learning_rate": 1.077248143114642e-05, + "loss": 0.1925, + "step": 9658 + }, + { + "epoch": 0.49, + "grad_norm": 0.9924552531913771, + "learning_rate": 1.0770839373361854e-05, + "loss": 0.1633, + "step": 9659 + }, + { + "epoch": 0.49, + "grad_norm": 4.364045262513066, + "learning_rate": 1.0769197294668228e-05, + "loss": 0.1889, + "step": 9660 + }, + { + "epoch": 0.49, + "grad_norm": 0.9236530495839659, + "learning_rate": 1.0767555195110082e-05, + "loss": 0.204, + "step": 9661 + }, + { + "epoch": 0.49, + "grad_norm": 1.0476753106491055, + "learning_rate": 1.0765913074731957e-05, + "loss": 0.175, + "step": 9662 + }, + { + "epoch": 0.49, + "grad_norm": 1.0026805236194896, + "learning_rate": 1.07642709335784e-05, + "loss": 0.1936, + "step": 9663 + }, + { + "epoch": 0.49, + "grad_norm": 0.9422222095805597, + "learning_rate": 1.0762628771693948e-05, + "loss": 0.2034, + "step": 9664 + }, + { + "epoch": 0.49, + "grad_norm": 0.9433156465199231, + "learning_rate": 1.0760986589123145e-05, + "loss": 0.2009, + "step": 9665 + }, + { + "epoch": 0.49, + "grad_norm": 1.2291693414472664, + "learning_rate": 1.0759344385910541e-05, + "loss": 0.1703, + "step": 9666 + }, + { + "epoch": 0.49, + "grad_norm": 1.0785063042674297, + "learning_rate": 1.0757702162100679e-05, + "loss": 0.1848, + "step": 9667 + }, + { + "epoch": 0.49, + "grad_norm": 0.949233765859445, + "learning_rate": 1.0756059917738102e-05, + "loss": 0.1871, + "step": 9668 + }, + { + "epoch": 0.49, + "grad_norm": 1.0232043255187737, + "learning_rate": 1.0754417652867357e-05, + "loss": 0.1785, + "step": 9669 + }, + { + "epoch": 0.49, + "grad_norm": 0.9992934331116861, + "learning_rate": 1.0752775367532988e-05, + "loss": 0.2028, + "step": 9670 + }, + { + "epoch": 0.49, + "grad_norm": 1.4295122213944318, + "learning_rate": 1.0751133061779545e-05, + "loss": 0.1792, + "step": 9671 + }, + { + "epoch": 0.49, + "grad_norm": 1.18761065388684, + "learning_rate": 1.074949073565158e-05, + "loss": 0.1577, + "step": 9672 + }, + { + "epoch": 0.49, + "grad_norm": 1.7470259053038004, + "learning_rate": 1.0747848389193633e-05, + "loss": 0.1993, + "step": 9673 + }, + { + "epoch": 0.49, + "grad_norm": 0.9904275822503751, + "learning_rate": 1.0746206022450256e-05, + "loss": 0.1931, + "step": 9674 + }, + { + "epoch": 0.49, + "grad_norm": 0.7834337706150454, + "learning_rate": 1.0744563635466e-05, + "loss": 0.1788, + "step": 9675 + }, + { + "epoch": 0.49, + "grad_norm": 0.7895005401380205, + "learning_rate": 1.0742921228285412e-05, + "loss": 0.1897, + "step": 9676 + }, + { + "epoch": 0.49, + "grad_norm": 1.283230030469254, + "learning_rate": 1.0741278800953045e-05, + "loss": 0.2225, + "step": 9677 + }, + { + "epoch": 0.49, + "grad_norm": 1.12162867780618, + "learning_rate": 1.0739636353513446e-05, + "loss": 0.1907, + "step": 9678 + }, + { + "epoch": 0.49, + "grad_norm": 1.0680028023000427, + "learning_rate": 1.0737993886011171e-05, + "loss": 0.2474, + "step": 9679 + }, + { + "epoch": 0.49, + "grad_norm": 0.8863078026342777, + "learning_rate": 1.0736351398490772e-05, + "loss": 0.1842, + "step": 9680 + }, + { + "epoch": 0.49, + "grad_norm": 1.1023494855573912, + "learning_rate": 1.0734708890996797e-05, + "loss": 0.1818, + "step": 9681 + }, + { + "epoch": 0.49, + "grad_norm": 0.9156047317575732, + "learning_rate": 1.0733066363573803e-05, + "loss": 0.1983, + "step": 9682 + }, + { + "epoch": 0.49, + "grad_norm": 1.105766073750055, + "learning_rate": 1.073142381626634e-05, + "loss": 0.1703, + "step": 9683 + }, + { + "epoch": 0.49, + "grad_norm": 1.2012940202707465, + "learning_rate": 1.0729781249118966e-05, + "loss": 0.2002, + "step": 9684 + }, + { + "epoch": 0.49, + "grad_norm": 1.162556841011124, + "learning_rate": 1.0728138662176237e-05, + "loss": 0.2084, + "step": 9685 + }, + { + "epoch": 0.49, + "grad_norm": 0.9256523144338665, + "learning_rate": 1.0726496055482705e-05, + "loss": 0.2, + "step": 9686 + }, + { + "epoch": 0.49, + "grad_norm": 0.9945108882251645, + "learning_rate": 1.0724853429082923e-05, + "loss": 0.187, + "step": 9687 + }, + { + "epoch": 0.49, + "grad_norm": 0.8823018088973252, + "learning_rate": 1.0723210783021454e-05, + "loss": 0.209, + "step": 9688 + }, + { + "epoch": 0.49, + "grad_norm": 1.0012462162192681, + "learning_rate": 1.072156811734285e-05, + "loss": 0.1664, + "step": 9689 + }, + { + "epoch": 0.49, + "grad_norm": 0.9969096264391915, + "learning_rate": 1.0719925432091671e-05, + "loss": 0.207, + "step": 9690 + }, + { + "epoch": 0.49, + "grad_norm": 1.8109754142384766, + "learning_rate": 1.0718282727312475e-05, + "loss": 0.1923, + "step": 9691 + }, + { + "epoch": 0.49, + "grad_norm": 1.2853994775292084, + "learning_rate": 1.0716640003049818e-05, + "loss": 0.1762, + "step": 9692 + }, + { + "epoch": 0.49, + "grad_norm": 1.1245990847029432, + "learning_rate": 1.0714997259348261e-05, + "loss": 0.1988, + "step": 9693 + }, + { + "epoch": 0.49, + "grad_norm": 0.8821098044855635, + "learning_rate": 1.0713354496252364e-05, + "loss": 0.1747, + "step": 9694 + }, + { + "epoch": 0.49, + "grad_norm": 1.3418193495044213, + "learning_rate": 1.0711711713806684e-05, + "loss": 0.2024, + "step": 9695 + }, + { + "epoch": 0.49, + "grad_norm": 1.121844576126626, + "learning_rate": 1.0710068912055784e-05, + "loss": 0.1763, + "step": 9696 + }, + { + "epoch": 0.49, + "grad_norm": 1.469846337002135, + "learning_rate": 1.0708426091044224e-05, + "loss": 0.1865, + "step": 9697 + }, + { + "epoch": 0.49, + "grad_norm": 1.127361301705819, + "learning_rate": 1.0706783250816568e-05, + "loss": 0.1752, + "step": 9698 + }, + { + "epoch": 0.49, + "grad_norm": 1.371137664811588, + "learning_rate": 1.0705140391417377e-05, + "loss": 0.1794, + "step": 9699 + }, + { + "epoch": 0.49, + "grad_norm": 0.9201500615344242, + "learning_rate": 1.070349751289121e-05, + "loss": 0.1792, + "step": 9700 + }, + { + "epoch": 0.49, + "grad_norm": 0.9205893271689509, + "learning_rate": 1.0701854615282635e-05, + "loss": 0.1859, + "step": 9701 + }, + { + "epoch": 0.49, + "grad_norm": 2.3951584715649386, + "learning_rate": 1.0700211698636214e-05, + "loss": 0.1859, + "step": 9702 + }, + { + "epoch": 0.49, + "grad_norm": 1.2693974342116394, + "learning_rate": 1.069856876299651e-05, + "loss": 0.1972, + "step": 9703 + }, + { + "epoch": 0.49, + "grad_norm": 0.9274877290317544, + "learning_rate": 1.0696925808408092e-05, + "loss": 0.1689, + "step": 9704 + }, + { + "epoch": 0.49, + "grad_norm": 1.2619010784277056, + "learning_rate": 1.0695282834915517e-05, + "loss": 0.1971, + "step": 9705 + }, + { + "epoch": 0.49, + "grad_norm": 1.1154770717828937, + "learning_rate": 1.069363984256336e-05, + "loss": 0.195, + "step": 9706 + }, + { + "epoch": 0.49, + "grad_norm": 0.8760721779934151, + "learning_rate": 1.0691996831396181e-05, + "loss": 0.1914, + "step": 9707 + }, + { + "epoch": 0.49, + "grad_norm": 1.1540281001625392, + "learning_rate": 1.0690353801458551e-05, + "loss": 0.2161, + "step": 9708 + }, + { + "epoch": 0.49, + "grad_norm": 1.0251076150045788, + "learning_rate": 1.0688710752795033e-05, + "loss": 0.1751, + "step": 9709 + }, + { + "epoch": 0.49, + "grad_norm": 0.8244986961316821, + "learning_rate": 1.0687067685450199e-05, + "loss": 0.1642, + "step": 9710 + }, + { + "epoch": 0.49, + "grad_norm": 0.814014152897162, + "learning_rate": 1.0685424599468615e-05, + "loss": 0.181, + "step": 9711 + }, + { + "epoch": 0.49, + "grad_norm": 1.2640957549475418, + "learning_rate": 1.068378149489485e-05, + "loss": 0.1815, + "step": 9712 + }, + { + "epoch": 0.49, + "grad_norm": 0.8867530504394332, + "learning_rate": 1.068213837177347e-05, + "loss": 0.1818, + "step": 9713 + }, + { + "epoch": 0.49, + "grad_norm": 0.8642166809082094, + "learning_rate": 1.068049523014905e-05, + "loss": 0.1707, + "step": 9714 + }, + { + "epoch": 0.49, + "grad_norm": 1.013203360223355, + "learning_rate": 1.067885207006616e-05, + "loss": 0.1836, + "step": 9715 + }, + { + "epoch": 0.49, + "grad_norm": 1.2534834176344625, + "learning_rate": 1.0677208891569366e-05, + "loss": 0.1988, + "step": 9716 + }, + { + "epoch": 0.49, + "grad_norm": 1.3915590820409383, + "learning_rate": 1.0675565694703248e-05, + "loss": 0.1985, + "step": 9717 + }, + { + "epoch": 0.49, + "grad_norm": 0.9889276453661713, + "learning_rate": 1.0673922479512366e-05, + "loss": 0.1814, + "step": 9718 + }, + { + "epoch": 0.49, + "grad_norm": 1.1748920823909086, + "learning_rate": 1.0672279246041301e-05, + "loss": 0.1878, + "step": 9719 + }, + { + "epoch": 0.49, + "grad_norm": 1.4392217921353778, + "learning_rate": 1.0670635994334626e-05, + "loss": 0.1757, + "step": 9720 + }, + { + "epoch": 0.49, + "grad_norm": 0.7471315556478759, + "learning_rate": 1.066899272443691e-05, + "loss": 0.1693, + "step": 9721 + }, + { + "epoch": 0.49, + "grad_norm": 1.1259472114454028, + "learning_rate": 1.0667349436392727e-05, + "loss": 0.1794, + "step": 9722 + }, + { + "epoch": 0.49, + "grad_norm": 1.1501304500378, + "learning_rate": 1.0665706130246654e-05, + "loss": 0.2131, + "step": 9723 + }, + { + "epoch": 0.49, + "grad_norm": 1.0239004877819975, + "learning_rate": 1.0664062806043266e-05, + "loss": 0.1724, + "step": 9724 + }, + { + "epoch": 0.49, + "grad_norm": 1.0414264832020326, + "learning_rate": 1.0662419463827136e-05, + "loss": 0.201, + "step": 9725 + }, + { + "epoch": 0.49, + "grad_norm": 1.3522003124483102, + "learning_rate": 1.066077610364284e-05, + "loss": 0.1844, + "step": 9726 + }, + { + "epoch": 0.49, + "grad_norm": 1.1639536779282773, + "learning_rate": 1.0659132725534958e-05, + "loss": 0.1931, + "step": 9727 + }, + { + "epoch": 0.49, + "grad_norm": 0.9436860780724251, + "learning_rate": 1.065748932954806e-05, + "loss": 0.1929, + "step": 9728 + }, + { + "epoch": 0.49, + "grad_norm": 1.273126733844216, + "learning_rate": 1.0655845915726728e-05, + "loss": 0.1848, + "step": 9729 + }, + { + "epoch": 0.49, + "grad_norm": 1.2985323865422416, + "learning_rate": 1.065420248411554e-05, + "loss": 0.1935, + "step": 9730 + }, + { + "epoch": 0.49, + "grad_norm": 1.039552057235806, + "learning_rate": 1.0652559034759069e-05, + "loss": 0.195, + "step": 9731 + }, + { + "epoch": 0.49, + "grad_norm": 0.9724179390813914, + "learning_rate": 1.0650915567701897e-05, + "loss": 0.1773, + "step": 9732 + }, + { + "epoch": 0.49, + "grad_norm": 2.31261878897922, + "learning_rate": 1.0649272082988609e-05, + "loss": 0.2152, + "step": 9733 + }, + { + "epoch": 0.49, + "grad_norm": 1.0956625730563676, + "learning_rate": 1.0647628580663775e-05, + "loss": 0.1902, + "step": 9734 + }, + { + "epoch": 0.5, + "grad_norm": 0.9542691101221298, + "learning_rate": 1.0645985060771978e-05, + "loss": 0.1743, + "step": 9735 + }, + { + "epoch": 0.5, + "grad_norm": 1.1992225194268242, + "learning_rate": 1.0644341523357802e-05, + "loss": 0.1901, + "step": 9736 + }, + { + "epoch": 0.5, + "grad_norm": 0.8999673120258931, + "learning_rate": 1.0642697968465827e-05, + "loss": 0.1845, + "step": 9737 + }, + { + "epoch": 0.5, + "grad_norm": 1.0087462698811394, + "learning_rate": 1.0641054396140631e-05, + "loss": 0.1856, + "step": 9738 + }, + { + "epoch": 0.5, + "grad_norm": 0.8581094685422574, + "learning_rate": 1.06394108064268e-05, + "loss": 0.2202, + "step": 9739 + }, + { + "epoch": 0.5, + "grad_norm": 1.1336111707300922, + "learning_rate": 1.0637767199368911e-05, + "loss": 0.1673, + "step": 9740 + }, + { + "epoch": 0.5, + "grad_norm": 0.814053132649836, + "learning_rate": 1.0636123575011555e-05, + "loss": 0.1984, + "step": 9741 + }, + { + "epoch": 0.5, + "grad_norm": 1.2560613817757404, + "learning_rate": 1.063447993339931e-05, + "loss": 0.1903, + "step": 9742 + }, + { + "epoch": 0.5, + "grad_norm": 1.1206558307710335, + "learning_rate": 1.0632836274576761e-05, + "loss": 0.1904, + "step": 9743 + }, + { + "epoch": 0.5, + "grad_norm": 0.7950750967837613, + "learning_rate": 1.0631192598588493e-05, + "loss": 0.1781, + "step": 9744 + }, + { + "epoch": 0.5, + "grad_norm": 0.7829207386631539, + "learning_rate": 1.062954890547909e-05, + "loss": 0.1681, + "step": 9745 + }, + { + "epoch": 0.5, + "grad_norm": 0.9252872638635755, + "learning_rate": 1.0627905195293135e-05, + "loss": 0.1822, + "step": 9746 + }, + { + "epoch": 0.5, + "grad_norm": 1.1785591768775445, + "learning_rate": 1.0626261468075218e-05, + "loss": 0.1863, + "step": 9747 + }, + { + "epoch": 0.5, + "grad_norm": 0.6476250713917159, + "learning_rate": 1.0624617723869921e-05, + "loss": 0.191, + "step": 9748 + }, + { + "epoch": 0.5, + "grad_norm": 1.0225638341945646, + "learning_rate": 1.0622973962721836e-05, + "loss": 0.1719, + "step": 9749 + }, + { + "epoch": 0.5, + "grad_norm": 1.2016612896236973, + "learning_rate": 1.0621330184675547e-05, + "loss": 0.202, + "step": 9750 + }, + { + "epoch": 0.5, + "grad_norm": 0.9244176686589038, + "learning_rate": 1.061968638977564e-05, + "loss": 0.192, + "step": 9751 + }, + { + "epoch": 0.5, + "grad_norm": 2.185460765223053, + "learning_rate": 1.0618042578066707e-05, + "loss": 0.1725, + "step": 9752 + }, + { + "epoch": 0.5, + "grad_norm": 1.0514921292392578, + "learning_rate": 1.0616398749593331e-05, + "loss": 0.2072, + "step": 9753 + }, + { + "epoch": 0.5, + "grad_norm": 1.1355402752532482, + "learning_rate": 1.0614754904400105e-05, + "loss": 0.1859, + "step": 9754 + }, + { + "epoch": 0.5, + "grad_norm": 0.8676669752768543, + "learning_rate": 1.0613111042531618e-05, + "loss": 0.1521, + "step": 9755 + }, + { + "epoch": 0.5, + "grad_norm": 1.9336257746652679, + "learning_rate": 1.061146716403246e-05, + "loss": 0.1879, + "step": 9756 + }, + { + "epoch": 0.5, + "grad_norm": 1.2460212844758167, + "learning_rate": 1.0609823268947219e-05, + "loss": 0.2039, + "step": 9757 + }, + { + "epoch": 0.5, + "grad_norm": 3.30362751558046, + "learning_rate": 1.0608179357320487e-05, + "loss": 0.2029, + "step": 9758 + }, + { + "epoch": 0.5, + "grad_norm": 1.1747846489666585, + "learning_rate": 1.0606535429196858e-05, + "loss": 0.2088, + "step": 9759 + }, + { + "epoch": 0.5, + "grad_norm": 0.9912826968645417, + "learning_rate": 1.060489148462092e-05, + "loss": 0.203, + "step": 9760 + }, + { + "epoch": 0.5, + "grad_norm": 2.554812461103568, + "learning_rate": 1.0603247523637268e-05, + "loss": 0.1879, + "step": 9761 + }, + { + "epoch": 0.5, + "grad_norm": 1.1857395296609805, + "learning_rate": 1.0601603546290491e-05, + "loss": 0.1882, + "step": 9762 + }, + { + "epoch": 0.5, + "grad_norm": 1.0603146410098505, + "learning_rate": 1.0599959552625186e-05, + "loss": 0.1918, + "step": 9763 + }, + { + "epoch": 0.5, + "grad_norm": 0.896561227828809, + "learning_rate": 1.0598315542685941e-05, + "loss": 0.1835, + "step": 9764 + }, + { + "epoch": 0.5, + "grad_norm": 0.9037701781341697, + "learning_rate": 1.0596671516517356e-05, + "loss": 0.1953, + "step": 9765 + }, + { + "epoch": 0.5, + "grad_norm": 1.0873238212270744, + "learning_rate": 1.059502747416402e-05, + "loss": 0.1801, + "step": 9766 + }, + { + "epoch": 0.5, + "grad_norm": 0.7932512597856124, + "learning_rate": 1.059338341567053e-05, + "loss": 0.1721, + "step": 9767 + }, + { + "epoch": 0.5, + "grad_norm": 0.9210138857139931, + "learning_rate": 1.0591739341081485e-05, + "loss": 0.1906, + "step": 9768 + }, + { + "epoch": 0.5, + "grad_norm": 1.224460682832701, + "learning_rate": 1.0590095250441473e-05, + "loss": 0.181, + "step": 9769 + }, + { + "epoch": 0.5, + "grad_norm": 0.9226760646689801, + "learning_rate": 1.0588451143795093e-05, + "loss": 0.181, + "step": 9770 + }, + { + "epoch": 0.5, + "grad_norm": 0.8404512315313271, + "learning_rate": 1.0586807021186946e-05, + "loss": 0.1965, + "step": 9771 + }, + { + "epoch": 0.5, + "grad_norm": 0.7582090020659907, + "learning_rate": 1.0585162882661624e-05, + "loss": 0.1869, + "step": 9772 + }, + { + "epoch": 0.5, + "grad_norm": 0.8639559963409217, + "learning_rate": 1.0583518728263726e-05, + "loss": 0.1993, + "step": 9773 + }, + { + "epoch": 0.5, + "grad_norm": 0.8684827736925508, + "learning_rate": 1.058187455803785e-05, + "loss": 0.1904, + "step": 9774 + }, + { + "epoch": 0.5, + "grad_norm": 6.543355549440279, + "learning_rate": 1.0580230372028593e-05, + "loss": 0.1883, + "step": 9775 + }, + { + "epoch": 0.5, + "grad_norm": 1.1117833572882903, + "learning_rate": 1.0578586170280554e-05, + "loss": 0.1982, + "step": 9776 + }, + { + "epoch": 0.5, + "grad_norm": 0.9471809713270811, + "learning_rate": 1.0576941952838334e-05, + "loss": 0.2013, + "step": 9777 + }, + { + "epoch": 0.5, + "grad_norm": 1.2033368512094136, + "learning_rate": 1.0575297719746533e-05, + "loss": 0.1684, + "step": 9778 + }, + { + "epoch": 0.5, + "grad_norm": 1.0747091930537156, + "learning_rate": 1.0573653471049745e-05, + "loss": 0.1869, + "step": 9779 + }, + { + "epoch": 0.5, + "grad_norm": 0.9710104972552251, + "learning_rate": 1.0572009206792575e-05, + "loss": 0.1758, + "step": 9780 + }, + { + "epoch": 0.5, + "grad_norm": 1.2635121948298544, + "learning_rate": 1.0570364927019623e-05, + "loss": 0.1872, + "step": 9781 + }, + { + "epoch": 0.5, + "grad_norm": 0.7568544286952503, + "learning_rate": 1.0568720631775491e-05, + "loss": 0.1767, + "step": 9782 + }, + { + "epoch": 0.5, + "grad_norm": 0.8659534324188721, + "learning_rate": 1.0567076321104776e-05, + "loss": 0.1758, + "step": 9783 + }, + { + "epoch": 0.5, + "grad_norm": 1.0559891149766256, + "learning_rate": 1.0565431995052089e-05, + "loss": 0.1908, + "step": 9784 + }, + { + "epoch": 0.5, + "grad_norm": 1.057283250817933, + "learning_rate": 1.0563787653662025e-05, + "loss": 0.2087, + "step": 9785 + }, + { + "epoch": 0.5, + "grad_norm": 0.8860916224318578, + "learning_rate": 1.0562143296979188e-05, + "loss": 0.1736, + "step": 9786 + }, + { + "epoch": 0.5, + "grad_norm": 0.859785432898717, + "learning_rate": 1.0560498925048186e-05, + "loss": 0.1838, + "step": 9787 + }, + { + "epoch": 0.5, + "grad_norm": 1.385780047711686, + "learning_rate": 1.0558854537913614e-05, + "loss": 0.1905, + "step": 9788 + }, + { + "epoch": 0.5, + "grad_norm": 0.9099122713281608, + "learning_rate": 1.0557210135620084e-05, + "loss": 0.1958, + "step": 9789 + }, + { + "epoch": 0.5, + "grad_norm": 1.0201497510398987, + "learning_rate": 1.0555565718212198e-05, + "loss": 0.1751, + "step": 9790 + }, + { + "epoch": 0.5, + "grad_norm": 1.0445946424152273, + "learning_rate": 1.0553921285734559e-05, + "loss": 0.1961, + "step": 9791 + }, + { + "epoch": 0.5, + "grad_norm": 0.7918993501345337, + "learning_rate": 1.0552276838231773e-05, + "loss": 0.166, + "step": 9792 + }, + { + "epoch": 0.5, + "grad_norm": 0.9163235676527428, + "learning_rate": 1.0550632375748448e-05, + "loss": 0.1755, + "step": 9793 + }, + { + "epoch": 0.5, + "grad_norm": 1.0610958812797693, + "learning_rate": 1.0548987898329188e-05, + "loss": 0.2183, + "step": 9794 + }, + { + "epoch": 0.5, + "grad_norm": 1.1659132117607722, + "learning_rate": 1.0547343406018602e-05, + "loss": 0.2157, + "step": 9795 + }, + { + "epoch": 0.5, + "grad_norm": 0.828123363788774, + "learning_rate": 1.0545698898861293e-05, + "loss": 0.1848, + "step": 9796 + }, + { + "epoch": 0.5, + "grad_norm": 0.8330494134593698, + "learning_rate": 1.0544054376901872e-05, + "loss": 0.1919, + "step": 9797 + }, + { + "epoch": 0.5, + "grad_norm": 0.9986865906064578, + "learning_rate": 1.0542409840184946e-05, + "loss": 0.1824, + "step": 9798 + }, + { + "epoch": 0.5, + "grad_norm": 2.431778286746509, + "learning_rate": 1.0540765288755124e-05, + "loss": 0.171, + "step": 9799 + }, + { + "epoch": 0.5, + "grad_norm": 1.3598909405812594, + "learning_rate": 1.053912072265701e-05, + "loss": 0.174, + "step": 9800 + }, + { + "epoch": 0.5, + "grad_norm": 0.8439246989389632, + "learning_rate": 1.0537476141935215e-05, + "loss": 0.1882, + "step": 9801 + }, + { + "epoch": 0.5, + "grad_norm": 1.1056060930525182, + "learning_rate": 1.053583154663435e-05, + "loss": 0.1848, + "step": 9802 + }, + { + "epoch": 0.5, + "grad_norm": 0.8568831388430186, + "learning_rate": 1.0534186936799024e-05, + "loss": 0.19, + "step": 9803 + }, + { + "epoch": 0.5, + "grad_norm": 0.881329670015039, + "learning_rate": 1.053254231247385e-05, + "loss": 0.1883, + "step": 9804 + }, + { + "epoch": 0.5, + "grad_norm": 0.8986303767656837, + "learning_rate": 1.0530897673703431e-05, + "loss": 0.1952, + "step": 9805 + }, + { + "epoch": 0.5, + "grad_norm": 1.053070481720899, + "learning_rate": 1.0529253020532386e-05, + "loss": 0.2114, + "step": 9806 + }, + { + "epoch": 0.5, + "grad_norm": 0.8058192743448597, + "learning_rate": 1.0527608353005324e-05, + "loss": 0.1872, + "step": 9807 + }, + { + "epoch": 0.5, + "grad_norm": 1.0561154218791682, + "learning_rate": 1.0525963671166852e-05, + "loss": 0.1821, + "step": 9808 + }, + { + "epoch": 0.5, + "grad_norm": 0.950268019639858, + "learning_rate": 1.0524318975061589e-05, + "loss": 0.1885, + "step": 9809 + }, + { + "epoch": 0.5, + "grad_norm": 1.0711119481698064, + "learning_rate": 1.0522674264734141e-05, + "loss": 0.1874, + "step": 9810 + }, + { + "epoch": 0.5, + "grad_norm": 1.4547965340786408, + "learning_rate": 1.0521029540229126e-05, + "loss": 0.1782, + "step": 9811 + }, + { + "epoch": 0.5, + "grad_norm": 0.8882553205810025, + "learning_rate": 1.0519384801591155e-05, + "loss": 0.1625, + "step": 9812 + }, + { + "epoch": 0.5, + "grad_norm": 0.9386707906152143, + "learning_rate": 1.0517740048864843e-05, + "loss": 0.2197, + "step": 9813 + }, + { + "epoch": 0.5, + "grad_norm": 1.481738485759149, + "learning_rate": 1.05160952820948e-05, + "loss": 0.1855, + "step": 9814 + }, + { + "epoch": 0.5, + "grad_norm": 1.3158177722380133, + "learning_rate": 1.0514450501325646e-05, + "loss": 0.1794, + "step": 9815 + }, + { + "epoch": 0.5, + "grad_norm": 0.8984572182708918, + "learning_rate": 1.0512805706601994e-05, + "loss": 0.173, + "step": 9816 + }, + { + "epoch": 0.5, + "grad_norm": 1.2354073521381275, + "learning_rate": 1.0511160897968456e-05, + "loss": 0.1829, + "step": 9817 + }, + { + "epoch": 0.5, + "grad_norm": 1.2088642384857178, + "learning_rate": 1.0509516075469648e-05, + "loss": 0.1863, + "step": 9818 + }, + { + "epoch": 0.5, + "grad_norm": 0.9717684108168861, + "learning_rate": 1.0507871239150192e-05, + "loss": 0.1976, + "step": 9819 + }, + { + "epoch": 0.5, + "grad_norm": 1.1378371016917381, + "learning_rate": 1.0506226389054697e-05, + "loss": 0.1918, + "step": 9820 + }, + { + "epoch": 0.5, + "grad_norm": 1.1337727830864175, + "learning_rate": 1.0504581525227784e-05, + "loss": 0.1701, + "step": 9821 + }, + { + "epoch": 0.5, + "grad_norm": 0.9734523957238299, + "learning_rate": 1.0502936647714068e-05, + "loss": 0.1675, + "step": 9822 + }, + { + "epoch": 0.5, + "grad_norm": 0.8085856071065838, + "learning_rate": 1.0501291756558166e-05, + "loss": 0.2105, + "step": 9823 + }, + { + "epoch": 0.5, + "grad_norm": 1.3614299113261774, + "learning_rate": 1.0499646851804698e-05, + "loss": 0.1949, + "step": 9824 + }, + { + "epoch": 0.5, + "grad_norm": 1.0225480331788612, + "learning_rate": 1.049800193349828e-05, + "loss": 0.1803, + "step": 9825 + }, + { + "epoch": 0.5, + "grad_norm": 1.1063594496079652, + "learning_rate": 1.0496357001683535e-05, + "loss": 0.2012, + "step": 9826 + }, + { + "epoch": 0.5, + "grad_norm": 0.8650630444598459, + "learning_rate": 1.0494712056405077e-05, + "loss": 0.1975, + "step": 9827 + }, + { + "epoch": 0.5, + "grad_norm": 1.2071697382974003, + "learning_rate": 1.0493067097707521e-05, + "loss": 0.173, + "step": 9828 + }, + { + "epoch": 0.5, + "grad_norm": 0.8907759542377793, + "learning_rate": 1.0491422125635497e-05, + "loss": 0.1606, + "step": 9829 + }, + { + "epoch": 0.5, + "grad_norm": 1.1407485010753793, + "learning_rate": 1.0489777140233619e-05, + "loss": 0.2085, + "step": 9830 + }, + { + "epoch": 0.5, + "grad_norm": 1.2018164434360048, + "learning_rate": 1.048813214154651e-05, + "loss": 0.1777, + "step": 9831 + }, + { + "epoch": 0.5, + "grad_norm": 0.8739314910355729, + "learning_rate": 1.0486487129618787e-05, + "loss": 0.1964, + "step": 9832 + }, + { + "epoch": 0.5, + "grad_norm": 1.2347078940200111, + "learning_rate": 1.0484842104495077e-05, + "loss": 0.1826, + "step": 9833 + }, + { + "epoch": 0.5, + "grad_norm": 0.9880699813876799, + "learning_rate": 1.0483197066219994e-05, + "loss": 0.1739, + "step": 9834 + }, + { + "epoch": 0.5, + "grad_norm": 1.5484746841825274, + "learning_rate": 1.0481552014838164e-05, + "loss": 0.2093, + "step": 9835 + }, + { + "epoch": 0.5, + "grad_norm": 0.8828003341975913, + "learning_rate": 1.0479906950394205e-05, + "loss": 0.174, + "step": 9836 + }, + { + "epoch": 0.5, + "grad_norm": 0.8714267237009181, + "learning_rate": 1.0478261872932747e-05, + "loss": 0.1865, + "step": 9837 + }, + { + "epoch": 0.5, + "grad_norm": 0.9416663230472567, + "learning_rate": 1.0476616782498408e-05, + "loss": 0.1954, + "step": 9838 + }, + { + "epoch": 0.5, + "grad_norm": 0.8907966044512139, + "learning_rate": 1.0474971679135812e-05, + "loss": 0.18, + "step": 9839 + }, + { + "epoch": 0.5, + "grad_norm": 1.0428955435248508, + "learning_rate": 1.0473326562889583e-05, + "loss": 0.2001, + "step": 9840 + }, + { + "epoch": 0.5, + "grad_norm": 0.8931635815421982, + "learning_rate": 1.047168143380434e-05, + "loss": 0.2256, + "step": 9841 + }, + { + "epoch": 0.5, + "grad_norm": 1.230855516835648, + "learning_rate": 1.0470036291924716e-05, + "loss": 0.1949, + "step": 9842 + }, + { + "epoch": 0.5, + "grad_norm": 0.9159697293692711, + "learning_rate": 1.046839113729533e-05, + "loss": 0.1648, + "step": 9843 + }, + { + "epoch": 0.5, + "grad_norm": 1.0587328554788622, + "learning_rate": 1.0466745969960808e-05, + "loss": 0.1693, + "step": 9844 + }, + { + "epoch": 0.5, + "grad_norm": 1.0236897380683918, + "learning_rate": 1.0465100789965774e-05, + "loss": 0.1741, + "step": 9845 + }, + { + "epoch": 0.5, + "grad_norm": 0.8364371860688766, + "learning_rate": 1.0463455597354857e-05, + "loss": 0.1745, + "step": 9846 + }, + { + "epoch": 0.5, + "grad_norm": 1.2590298353872946, + "learning_rate": 1.0461810392172678e-05, + "loss": 0.1973, + "step": 9847 + }, + { + "epoch": 0.5, + "grad_norm": 1.55764474316235, + "learning_rate": 1.046016517446387e-05, + "loss": 0.2304, + "step": 9848 + }, + { + "epoch": 0.5, + "grad_norm": 1.3950511436078774, + "learning_rate": 1.0458519944273051e-05, + "loss": 0.2017, + "step": 9849 + }, + { + "epoch": 0.5, + "grad_norm": 2.075205875836515, + "learning_rate": 1.0456874701644857e-05, + "loss": 0.1854, + "step": 9850 + }, + { + "epoch": 0.5, + "grad_norm": 1.1534536502690333, + "learning_rate": 1.0455229446623909e-05, + "loss": 0.188, + "step": 9851 + }, + { + "epoch": 0.5, + "grad_norm": 1.105654838126189, + "learning_rate": 1.0453584179254837e-05, + "loss": 0.2011, + "step": 9852 + }, + { + "epoch": 0.5, + "grad_norm": 1.2759741028043035, + "learning_rate": 1.0451938899582263e-05, + "loss": 0.2026, + "step": 9853 + }, + { + "epoch": 0.5, + "grad_norm": 1.3135892552693105, + "learning_rate": 1.0450293607650828e-05, + "loss": 0.1803, + "step": 9854 + }, + { + "epoch": 0.5, + "grad_norm": 1.4361511876030737, + "learning_rate": 1.044864830350515e-05, + "loss": 0.1854, + "step": 9855 + }, + { + "epoch": 0.5, + "grad_norm": 0.7257367038897319, + "learning_rate": 1.0447002987189863e-05, + "loss": 0.1817, + "step": 9856 + }, + { + "epoch": 0.5, + "grad_norm": 1.0461617761421993, + "learning_rate": 1.0445357658749596e-05, + "loss": 0.1761, + "step": 9857 + }, + { + "epoch": 0.5, + "grad_norm": 0.9720243915830117, + "learning_rate": 1.0443712318228973e-05, + "loss": 0.1935, + "step": 9858 + }, + { + "epoch": 0.5, + "grad_norm": 1.109979195338577, + "learning_rate": 1.044206696567263e-05, + "loss": 0.184, + "step": 9859 + }, + { + "epoch": 0.5, + "grad_norm": 1.952114468244844, + "learning_rate": 1.0440421601125196e-05, + "loss": 0.1996, + "step": 9860 + }, + { + "epoch": 0.5, + "grad_norm": 0.832335857365281, + "learning_rate": 1.0438776224631305e-05, + "loss": 0.1811, + "step": 9861 + }, + { + "epoch": 0.5, + "grad_norm": 1.04682268647991, + "learning_rate": 1.043713083623558e-05, + "loss": 0.1811, + "step": 9862 + }, + { + "epoch": 0.5, + "grad_norm": 2.700625011153412, + "learning_rate": 1.0435485435982655e-05, + "loss": 0.1839, + "step": 9863 + }, + { + "epoch": 0.5, + "grad_norm": 0.8833745933497529, + "learning_rate": 1.0433840023917166e-05, + "loss": 0.2041, + "step": 9864 + }, + { + "epoch": 0.5, + "grad_norm": 1.0260558036137535, + "learning_rate": 1.043219460008374e-05, + "loss": 0.1931, + "step": 9865 + }, + { + "epoch": 0.5, + "grad_norm": 1.158223230321419, + "learning_rate": 1.0430549164527012e-05, + "loss": 0.1954, + "step": 9866 + }, + { + "epoch": 0.5, + "grad_norm": 4.840651535607204, + "learning_rate": 1.0428903717291614e-05, + "loss": 0.1628, + "step": 9867 + }, + { + "epoch": 0.5, + "grad_norm": 0.8215057895386293, + "learning_rate": 1.0427258258422179e-05, + "loss": 0.1621, + "step": 9868 + }, + { + "epoch": 0.5, + "grad_norm": 0.9297959300807954, + "learning_rate": 1.042561278796334e-05, + "loss": 0.2035, + "step": 9869 + }, + { + "epoch": 0.5, + "grad_norm": 1.0375582587651102, + "learning_rate": 1.0423967305959727e-05, + "loss": 0.2245, + "step": 9870 + }, + { + "epoch": 0.5, + "grad_norm": 0.8712326630395105, + "learning_rate": 1.0422321812455977e-05, + "loss": 0.1941, + "step": 9871 + }, + { + "epoch": 0.5, + "grad_norm": 1.2359447371119525, + "learning_rate": 1.0420676307496727e-05, + "loss": 0.1969, + "step": 9872 + }, + { + "epoch": 0.5, + "grad_norm": 0.9577458563174123, + "learning_rate": 1.0419030791126608e-05, + "loss": 0.2073, + "step": 9873 + }, + { + "epoch": 0.5, + "grad_norm": 0.9015402021284342, + "learning_rate": 1.0417385263390253e-05, + "loss": 0.1743, + "step": 9874 + }, + { + "epoch": 0.5, + "grad_norm": 0.8146898383568317, + "learning_rate": 1.0415739724332301e-05, + "loss": 0.1706, + "step": 9875 + }, + { + "epoch": 0.5, + "grad_norm": 0.7583665868297698, + "learning_rate": 1.0414094173997382e-05, + "loss": 0.1644, + "step": 9876 + }, + { + "epoch": 0.5, + "grad_norm": 0.7536084868690823, + "learning_rate": 1.0412448612430139e-05, + "loss": 0.1932, + "step": 9877 + }, + { + "epoch": 0.5, + "grad_norm": 0.8179173485214218, + "learning_rate": 1.0410803039675203e-05, + "loss": 0.1711, + "step": 9878 + }, + { + "epoch": 0.5, + "grad_norm": 0.7474447497140159, + "learning_rate": 1.0409157455777212e-05, + "loss": 0.1742, + "step": 9879 + }, + { + "epoch": 0.5, + "grad_norm": 1.3573782407500192, + "learning_rate": 1.0407511860780798e-05, + "loss": 0.2061, + "step": 9880 + }, + { + "epoch": 0.5, + "grad_norm": 1.1693626671955326, + "learning_rate": 1.0405866254730607e-05, + "loss": 0.1809, + "step": 9881 + }, + { + "epoch": 0.5, + "grad_norm": 1.121973631958896, + "learning_rate": 1.0404220637671269e-05, + "loss": 0.2006, + "step": 9882 + }, + { + "epoch": 0.5, + "grad_norm": 0.9838023394438137, + "learning_rate": 1.0402575009647427e-05, + "loss": 0.184, + "step": 9883 + }, + { + "epoch": 0.5, + "grad_norm": 0.8474963997760555, + "learning_rate": 1.040092937070371e-05, + "loss": 0.1793, + "step": 9884 + }, + { + "epoch": 0.5, + "grad_norm": 1.436673796146176, + "learning_rate": 1.0399283720884761e-05, + "loss": 0.1649, + "step": 9885 + }, + { + "epoch": 0.5, + "grad_norm": 0.9966040144415655, + "learning_rate": 1.039763806023522e-05, + "loss": 0.1769, + "step": 9886 + }, + { + "epoch": 0.5, + "grad_norm": 1.6373730020474027, + "learning_rate": 1.0395992388799725e-05, + "loss": 0.1877, + "step": 9887 + }, + { + "epoch": 0.5, + "grad_norm": 2.883731825593106, + "learning_rate": 1.0394346706622915e-05, + "loss": 0.1512, + "step": 9888 + }, + { + "epoch": 0.5, + "grad_norm": 1.316877146392383, + "learning_rate": 1.0392701013749424e-05, + "loss": 0.211, + "step": 9889 + }, + { + "epoch": 0.5, + "grad_norm": 0.9359088500850488, + "learning_rate": 1.0391055310223899e-05, + "loss": 0.1746, + "step": 9890 + }, + { + "epoch": 0.5, + "grad_norm": 1.0825036335565394, + "learning_rate": 1.0389409596090975e-05, + "loss": 0.1844, + "step": 9891 + }, + { + "epoch": 0.5, + "grad_norm": 0.8675212267076646, + "learning_rate": 1.0387763871395298e-05, + "loss": 0.1681, + "step": 9892 + }, + { + "epoch": 0.5, + "grad_norm": 1.252490369140081, + "learning_rate": 1.0386118136181498e-05, + "loss": 0.1764, + "step": 9893 + }, + { + "epoch": 0.5, + "grad_norm": 0.8267953666195103, + "learning_rate": 1.0384472390494225e-05, + "loss": 0.1704, + "step": 9894 + }, + { + "epoch": 0.5, + "grad_norm": 1.1896296090305618, + "learning_rate": 1.0382826634378115e-05, + "loss": 0.1895, + "step": 9895 + }, + { + "epoch": 0.5, + "grad_norm": 0.8499202270123422, + "learning_rate": 1.0381180867877813e-05, + "loss": 0.1709, + "step": 9896 + }, + { + "epoch": 0.5, + "grad_norm": 1.6666646990052674, + "learning_rate": 1.037953509103796e-05, + "loss": 0.1928, + "step": 9897 + }, + { + "epoch": 0.5, + "grad_norm": 1.2907050050197117, + "learning_rate": 1.037788930390319e-05, + "loss": 0.1838, + "step": 9898 + }, + { + "epoch": 0.5, + "grad_norm": 0.9548489444956723, + "learning_rate": 1.0376243506518157e-05, + "loss": 0.1835, + "step": 9899 + }, + { + "epoch": 0.5, + "grad_norm": 1.2427355712972166, + "learning_rate": 1.0374597698927496e-05, + "loss": 0.1887, + "step": 9900 + }, + { + "epoch": 0.5, + "grad_norm": 1.0923649933816408, + "learning_rate": 1.0372951881175854e-05, + "loss": 0.2256, + "step": 9901 + }, + { + "epoch": 0.5, + "grad_norm": 1.4007054267078818, + "learning_rate": 1.0371306053307866e-05, + "loss": 0.1744, + "step": 9902 + }, + { + "epoch": 0.5, + "grad_norm": 0.8349455748931837, + "learning_rate": 1.0369660215368182e-05, + "loss": 0.1902, + "step": 9903 + }, + { + "epoch": 0.5, + "grad_norm": 1.160864993415897, + "learning_rate": 1.0368014367401447e-05, + "loss": 0.1748, + "step": 9904 + }, + { + "epoch": 0.5, + "grad_norm": 1.100514323505627, + "learning_rate": 1.0366368509452298e-05, + "loss": 0.1795, + "step": 9905 + }, + { + "epoch": 0.5, + "grad_norm": 1.3266089140201034, + "learning_rate": 1.0364722641565381e-05, + "loss": 0.1917, + "step": 9906 + }, + { + "epoch": 0.5, + "grad_norm": 1.6110809904091798, + "learning_rate": 1.0363076763785345e-05, + "loss": 0.1889, + "step": 9907 + }, + { + "epoch": 0.5, + "grad_norm": 0.9414698726238333, + "learning_rate": 1.0361430876156831e-05, + "loss": 0.1756, + "step": 9908 + }, + { + "epoch": 0.5, + "grad_norm": 1.2947285498046557, + "learning_rate": 1.0359784978724483e-05, + "loss": 0.2214, + "step": 9909 + }, + { + "epoch": 0.5, + "grad_norm": 1.014018441865324, + "learning_rate": 1.0358139071532949e-05, + "loss": 0.1785, + "step": 9910 + }, + { + "epoch": 0.5, + "grad_norm": 1.1017582642136696, + "learning_rate": 1.0356493154626868e-05, + "loss": 0.1778, + "step": 9911 + }, + { + "epoch": 0.5, + "grad_norm": 1.5116573330006802, + "learning_rate": 1.0354847228050895e-05, + "loss": 0.1851, + "step": 9912 + }, + { + "epoch": 0.5, + "grad_norm": 1.1288381460495436, + "learning_rate": 1.0353201291849668e-05, + "loss": 0.2002, + "step": 9913 + }, + { + "epoch": 0.5, + "grad_norm": 0.8981263323620646, + "learning_rate": 1.0351555346067836e-05, + "loss": 0.1919, + "step": 9914 + }, + { + "epoch": 0.5, + "grad_norm": 0.9193426516734726, + "learning_rate": 1.0349909390750046e-05, + "loss": 0.1826, + "step": 9915 + }, + { + "epoch": 0.5, + "grad_norm": 0.9227102536487921, + "learning_rate": 1.0348263425940945e-05, + "loss": 0.1691, + "step": 9916 + }, + { + "epoch": 0.5, + "grad_norm": 1.1041781716681, + "learning_rate": 1.034661745168518e-05, + "loss": 0.1606, + "step": 9917 + }, + { + "epoch": 0.5, + "grad_norm": 1.2902489109492576, + "learning_rate": 1.0344971468027397e-05, + "loss": 0.1863, + "step": 9918 + }, + { + "epoch": 0.5, + "grad_norm": 0.9399133950621475, + "learning_rate": 1.034332547501224e-05, + "loss": 0.1891, + "step": 9919 + }, + { + "epoch": 0.5, + "grad_norm": 1.2562837223830832, + "learning_rate": 1.034167947268436e-05, + "loss": 0.1842, + "step": 9920 + }, + { + "epoch": 0.5, + "grad_norm": 0.8729064282047535, + "learning_rate": 1.0340033461088408e-05, + "loss": 0.1671, + "step": 9921 + }, + { + "epoch": 0.5, + "grad_norm": 0.8439636188899999, + "learning_rate": 1.0338387440269029e-05, + "loss": 0.1943, + "step": 9922 + }, + { + "epoch": 0.5, + "grad_norm": 1.3023244038905142, + "learning_rate": 1.0336741410270872e-05, + "loss": 0.1724, + "step": 9923 + }, + { + "epoch": 0.5, + "grad_norm": 0.8552753076216781, + "learning_rate": 1.0335095371138582e-05, + "loss": 0.1997, + "step": 9924 + }, + { + "epoch": 0.5, + "grad_norm": 0.9254039051965883, + "learning_rate": 1.0333449322916812e-05, + "loss": 0.2074, + "step": 9925 + }, + { + "epoch": 0.5, + "grad_norm": 0.9900810976488515, + "learning_rate": 1.0331803265650212e-05, + "loss": 0.1736, + "step": 9926 + }, + { + "epoch": 0.5, + "grad_norm": 1.2135277999895162, + "learning_rate": 1.0330157199383428e-05, + "loss": 0.1773, + "step": 9927 + }, + { + "epoch": 0.5, + "grad_norm": 1.1341813241407668, + "learning_rate": 1.0328511124161111e-05, + "loss": 0.1741, + "step": 9928 + }, + { + "epoch": 0.5, + "grad_norm": 1.0080976235152903, + "learning_rate": 1.0326865040027914e-05, + "loss": 0.1836, + "step": 9929 + }, + { + "epoch": 0.5, + "grad_norm": 0.9764000986623692, + "learning_rate": 1.0325218947028483e-05, + "loss": 0.2, + "step": 9930 + }, + { + "epoch": 0.51, + "grad_norm": 1.0610741734776916, + "learning_rate": 1.032357284520747e-05, + "loss": 0.2032, + "step": 9931 + }, + { + "epoch": 0.51, + "grad_norm": 1.168931745220606, + "learning_rate": 1.0321926734609525e-05, + "loss": 0.1873, + "step": 9932 + }, + { + "epoch": 0.51, + "grad_norm": 1.2369314036165708, + "learning_rate": 1.0320280615279297e-05, + "loss": 0.1899, + "step": 9933 + }, + { + "epoch": 0.51, + "grad_norm": 1.7069155450925078, + "learning_rate": 1.031863448726144e-05, + "loss": 0.1916, + "step": 9934 + }, + { + "epoch": 0.51, + "grad_norm": 1.1104411774982685, + "learning_rate": 1.0316988350600608e-05, + "loss": 0.1847, + "step": 9935 + }, + { + "epoch": 0.51, + "grad_norm": 1.295191707646736, + "learning_rate": 1.0315342205341448e-05, + "loss": 0.2246, + "step": 9936 + }, + { + "epoch": 0.51, + "grad_norm": 0.9595382596028376, + "learning_rate": 1.031369605152861e-05, + "loss": 0.1805, + "step": 9937 + }, + { + "epoch": 0.51, + "grad_norm": 1.3007792169988164, + "learning_rate": 1.031204988920675e-05, + "loss": 0.1858, + "step": 9938 + }, + { + "epoch": 0.51, + "grad_norm": 1.3047610591721563, + "learning_rate": 1.031040371842052e-05, + "loss": 0.1952, + "step": 9939 + }, + { + "epoch": 0.51, + "grad_norm": 2.094211112565609, + "learning_rate": 1.0308757539214573e-05, + "loss": 0.1874, + "step": 9940 + }, + { + "epoch": 0.51, + "grad_norm": 2.831448138141734, + "learning_rate": 1.0307111351633556e-05, + "loss": 0.1948, + "step": 9941 + }, + { + "epoch": 0.51, + "grad_norm": 1.1926966402724417, + "learning_rate": 1.030546515572213e-05, + "loss": 0.1982, + "step": 9942 + }, + { + "epoch": 0.51, + "grad_norm": 1.4148961719797137, + "learning_rate": 1.0303818951524941e-05, + "loss": 0.1655, + "step": 9943 + }, + { + "epoch": 0.51, + "grad_norm": 2.1868873442922836, + "learning_rate": 1.0302172739086651e-05, + "loss": 0.193, + "step": 9944 + }, + { + "epoch": 0.51, + "grad_norm": 1.0994668253712192, + "learning_rate": 1.0300526518451906e-05, + "loss": 0.2119, + "step": 9945 + }, + { + "epoch": 0.51, + "grad_norm": 0.7749159270480126, + "learning_rate": 1.0298880289665359e-05, + "loss": 0.1655, + "step": 9946 + }, + { + "epoch": 0.51, + "grad_norm": 1.2181962641100348, + "learning_rate": 1.029723405277167e-05, + "loss": 0.1956, + "step": 9947 + }, + { + "epoch": 0.51, + "grad_norm": 1.3402104542718751, + "learning_rate": 1.029558780781549e-05, + "loss": 0.1954, + "step": 9948 + }, + { + "epoch": 0.51, + "grad_norm": 1.0498104781581756, + "learning_rate": 1.0293941554841475e-05, + "loss": 0.1621, + "step": 9949 + }, + { + "epoch": 0.51, + "grad_norm": 0.8955502875619714, + "learning_rate": 1.0292295293894279e-05, + "loss": 0.1815, + "step": 9950 + }, + { + "epoch": 0.51, + "grad_norm": 1.4261451905653821, + "learning_rate": 1.0290649025018553e-05, + "loss": 0.1885, + "step": 9951 + }, + { + "epoch": 0.51, + "grad_norm": 1.1050297705764032, + "learning_rate": 1.0289002748258961e-05, + "loss": 0.1985, + "step": 9952 + }, + { + "epoch": 0.51, + "grad_norm": 0.9725277146089406, + "learning_rate": 1.0287356463660152e-05, + "loss": 0.1828, + "step": 9953 + }, + { + "epoch": 0.51, + "grad_norm": 1.2985263149088095, + "learning_rate": 1.0285710171266778e-05, + "loss": 0.1933, + "step": 9954 + }, + { + "epoch": 0.51, + "grad_norm": 0.8226489022264653, + "learning_rate": 1.0284063871123504e-05, + "loss": 0.1784, + "step": 9955 + }, + { + "epoch": 0.51, + "grad_norm": 2.5280430717726388, + "learning_rate": 1.0282417563274982e-05, + "loss": 0.1858, + "step": 9956 + }, + { + "epoch": 0.51, + "grad_norm": 1.4197035876160895, + "learning_rate": 1.0280771247765865e-05, + "loss": 0.1906, + "step": 9957 + }, + { + "epoch": 0.51, + "grad_norm": 1.1898992638116423, + "learning_rate": 1.0279124924640813e-05, + "loss": 0.1845, + "step": 9958 + }, + { + "epoch": 0.51, + "grad_norm": 1.301609927287, + "learning_rate": 1.027747859394448e-05, + "loss": 0.1846, + "step": 9959 + }, + { + "epoch": 0.51, + "grad_norm": 1.03226421368799, + "learning_rate": 1.0275832255721527e-05, + "loss": 0.1687, + "step": 9960 + }, + { + "epoch": 0.51, + "grad_norm": 1.1475592463345812, + "learning_rate": 1.0274185910016608e-05, + "loss": 0.2076, + "step": 9961 + }, + { + "epoch": 0.51, + "grad_norm": 0.857957466398941, + "learning_rate": 1.0272539556874381e-05, + "loss": 0.1826, + "step": 9962 + }, + { + "epoch": 0.51, + "grad_norm": 0.8451749170398627, + "learning_rate": 1.0270893196339499e-05, + "loss": 0.1783, + "step": 9963 + }, + { + "epoch": 0.51, + "grad_norm": 1.0211113370636866, + "learning_rate": 1.026924682845663e-05, + "loss": 0.1852, + "step": 9964 + }, + { + "epoch": 0.51, + "grad_norm": 1.3551642322630637, + "learning_rate": 1.0267600453270422e-05, + "loss": 0.1822, + "step": 9965 + }, + { + "epoch": 0.51, + "grad_norm": 0.9282330658954546, + "learning_rate": 1.0265954070825536e-05, + "loss": 0.1837, + "step": 9966 + }, + { + "epoch": 0.51, + "grad_norm": 1.555358636297375, + "learning_rate": 1.0264307681166634e-05, + "loss": 0.1748, + "step": 9967 + }, + { + "epoch": 0.51, + "grad_norm": 1.5972130118339416, + "learning_rate": 1.0262661284338367e-05, + "loss": 0.1762, + "step": 9968 + }, + { + "epoch": 0.51, + "grad_norm": 1.001103276739541, + "learning_rate": 1.02610148803854e-05, + "loss": 0.1952, + "step": 9969 + }, + { + "epoch": 0.51, + "grad_norm": 1.1031313216687382, + "learning_rate": 1.025936846935239e-05, + "loss": 0.2099, + "step": 9970 + }, + { + "epoch": 0.51, + "grad_norm": 1.2314792299673958, + "learning_rate": 1.0257722051283998e-05, + "loss": 0.1816, + "step": 9971 + }, + { + "epoch": 0.51, + "grad_norm": 1.061943164826423, + "learning_rate": 1.0256075626224876e-05, + "loss": 0.1715, + "step": 9972 + }, + { + "epoch": 0.51, + "grad_norm": 0.9307765106478149, + "learning_rate": 1.0254429194219694e-05, + "loss": 0.1957, + "step": 9973 + }, + { + "epoch": 0.51, + "grad_norm": 0.9256565304503027, + "learning_rate": 1.02527827553131e-05, + "loss": 0.2083, + "step": 9974 + }, + { + "epoch": 0.51, + "grad_norm": 1.4219555269572766, + "learning_rate": 1.0251136309549764e-05, + "loss": 0.1665, + "step": 9975 + }, + { + "epoch": 0.51, + "grad_norm": 1.1394062648630405, + "learning_rate": 1.0249489856974335e-05, + "loss": 0.1743, + "step": 9976 + }, + { + "epoch": 0.51, + "grad_norm": 0.8884799606782805, + "learning_rate": 1.0247843397631485e-05, + "loss": 0.2081, + "step": 9977 + }, + { + "epoch": 0.51, + "grad_norm": 1.0280662741511235, + "learning_rate": 1.0246196931565869e-05, + "loss": 0.1822, + "step": 9978 + }, + { + "epoch": 0.51, + "grad_norm": 2.1389930656547747, + "learning_rate": 1.0244550458822145e-05, + "loss": 0.1817, + "step": 9979 + }, + { + "epoch": 0.51, + "grad_norm": 0.8778846451344755, + "learning_rate": 1.0242903979444976e-05, + "loss": 0.1585, + "step": 9980 + }, + { + "epoch": 0.51, + "grad_norm": 0.9142638715956067, + "learning_rate": 1.0241257493479022e-05, + "loss": 0.2058, + "step": 9981 + }, + { + "epoch": 0.51, + "grad_norm": 1.18773644917128, + "learning_rate": 1.0239611000968948e-05, + "loss": 0.2521, + "step": 9982 + }, + { + "epoch": 0.51, + "grad_norm": 1.0356077541986568, + "learning_rate": 1.023796450195941e-05, + "loss": 0.1737, + "step": 9983 + }, + { + "epoch": 0.51, + "grad_norm": 1.1727486639573697, + "learning_rate": 1.0236317996495074e-05, + "loss": 0.1765, + "step": 9984 + }, + { + "epoch": 0.51, + "grad_norm": 0.9419796126948489, + "learning_rate": 1.0234671484620595e-05, + "loss": 0.1902, + "step": 9985 + }, + { + "epoch": 0.51, + "grad_norm": 0.7801012010400616, + "learning_rate": 1.023302496638064e-05, + "loss": 0.1929, + "step": 9986 + }, + { + "epoch": 0.51, + "grad_norm": 1.013663718762519, + "learning_rate": 1.023137844181987e-05, + "loss": 0.1936, + "step": 9987 + }, + { + "epoch": 0.51, + "grad_norm": 1.386473113559732, + "learning_rate": 1.022973191098295e-05, + "loss": 0.1757, + "step": 9988 + }, + { + "epoch": 0.51, + "grad_norm": 0.9898436861099714, + "learning_rate": 1.0228085373914534e-05, + "loss": 0.1833, + "step": 9989 + }, + { + "epoch": 0.51, + "grad_norm": 0.8509169739451241, + "learning_rate": 1.022643883065929e-05, + "loss": 0.1759, + "step": 9990 + }, + { + "epoch": 0.51, + "grad_norm": 0.8572529282667672, + "learning_rate": 1.0224792281261883e-05, + "loss": 0.1923, + "step": 9991 + }, + { + "epoch": 0.51, + "grad_norm": 2.52936893410723, + "learning_rate": 1.0223145725766972e-05, + "loss": 0.215, + "step": 9992 + }, + { + "epoch": 0.51, + "grad_norm": 1.7402502741709267, + "learning_rate": 1.022149916421922e-05, + "loss": 0.1867, + "step": 9993 + }, + { + "epoch": 0.51, + "grad_norm": 1.4300609361053354, + "learning_rate": 1.0219852596663287e-05, + "loss": 0.1933, + "step": 9994 + }, + { + "epoch": 0.51, + "grad_norm": 1.0778884603823187, + "learning_rate": 1.0218206023143843e-05, + "loss": 0.1901, + "step": 9995 + }, + { + "epoch": 0.51, + "grad_norm": 1.0155397155807473, + "learning_rate": 1.0216559443705549e-05, + "loss": 0.1654, + "step": 9996 + }, + { + "epoch": 0.51, + "grad_norm": 1.2284066582722302, + "learning_rate": 1.0214912858393069e-05, + "loss": 0.182, + "step": 9997 + }, + { + "epoch": 0.51, + "grad_norm": 1.0145416710485446, + "learning_rate": 1.0213266267251063e-05, + "loss": 0.1879, + "step": 9998 + }, + { + "epoch": 0.51, + "grad_norm": 1.19001894105872, + "learning_rate": 1.0211619670324196e-05, + "loss": 0.1792, + "step": 9999 + }, + { + "epoch": 0.51, + "grad_norm": 0.970080459098825, + "learning_rate": 1.0209973067657138e-05, + "loss": 0.1814, + "step": 10000 + }, + { + "epoch": 0.51, + "grad_norm": 1.0951474234469627, + "learning_rate": 1.0208326459294544e-05, + "loss": 0.189, + "step": 10001 + }, + { + "epoch": 0.51, + "grad_norm": 0.8388440522398718, + "learning_rate": 1.0206679845281086e-05, + "loss": 0.2007, + "step": 10002 + }, + { + "epoch": 0.51, + "grad_norm": 1.0514825151087455, + "learning_rate": 1.0205033225661425e-05, + "loss": 0.1738, + "step": 10003 + }, + { + "epoch": 0.51, + "grad_norm": 0.9795950508541672, + "learning_rate": 1.0203386600480225e-05, + "loss": 0.1931, + "step": 10004 + }, + { + "epoch": 0.51, + "grad_norm": 3.335975532830077, + "learning_rate": 1.0201739969782154e-05, + "loss": 0.1905, + "step": 10005 + }, + { + "epoch": 0.51, + "grad_norm": 0.9444491493237841, + "learning_rate": 1.0200093333611877e-05, + "loss": 0.202, + "step": 10006 + }, + { + "epoch": 0.51, + "grad_norm": 1.8810868878895413, + "learning_rate": 1.0198446692014052e-05, + "loss": 0.1995, + "step": 10007 + }, + { + "epoch": 0.51, + "grad_norm": 0.9449853646973742, + "learning_rate": 1.019680004503335e-05, + "loss": 0.1927, + "step": 10008 + }, + { + "epoch": 0.51, + "grad_norm": 3.285770560807395, + "learning_rate": 1.0195153392714439e-05, + "loss": 0.2011, + "step": 10009 + }, + { + "epoch": 0.51, + "grad_norm": 1.4409911905410049, + "learning_rate": 1.019350673510198e-05, + "loss": 0.1609, + "step": 10010 + }, + { + "epoch": 0.51, + "grad_norm": 1.1450798008075653, + "learning_rate": 1.0191860072240638e-05, + "loss": 0.1909, + "step": 10011 + }, + { + "epoch": 0.51, + "grad_norm": 1.2757295834893168, + "learning_rate": 1.019021340417508e-05, + "loss": 0.1918, + "step": 10012 + }, + { + "epoch": 0.51, + "grad_norm": 0.9584243701572146, + "learning_rate": 1.0188566730949977e-05, + "loss": 0.1855, + "step": 10013 + }, + { + "epoch": 0.51, + "grad_norm": 1.1818591305464108, + "learning_rate": 1.0186920052609988e-05, + "loss": 0.1811, + "step": 10014 + }, + { + "epoch": 0.51, + "grad_norm": 1.3379051075950539, + "learning_rate": 1.0185273369199781e-05, + "loss": 0.2011, + "step": 10015 + }, + { + "epoch": 0.51, + "grad_norm": 0.7753559912686392, + "learning_rate": 1.0183626680764023e-05, + "loss": 0.1962, + "step": 10016 + }, + { + "epoch": 0.51, + "grad_norm": 0.9930898004506734, + "learning_rate": 1.0181979987347383e-05, + "loss": 0.178, + "step": 10017 + }, + { + "epoch": 0.51, + "grad_norm": 0.9906395873333926, + "learning_rate": 1.0180333288994526e-05, + "loss": 0.1679, + "step": 10018 + }, + { + "epoch": 0.51, + "grad_norm": 1.1841086133641656, + "learning_rate": 1.0178686585750117e-05, + "loss": 0.2037, + "step": 10019 + }, + { + "epoch": 0.51, + "grad_norm": 0.9662665276637551, + "learning_rate": 1.0177039877658825e-05, + "loss": 0.1944, + "step": 10020 + }, + { + "epoch": 0.51, + "grad_norm": 1.092277086848292, + "learning_rate": 1.0175393164765315e-05, + "loss": 0.1799, + "step": 10021 + }, + { + "epoch": 0.51, + "grad_norm": 0.9675325735824981, + "learning_rate": 1.0173746447114257e-05, + "loss": 0.1645, + "step": 10022 + }, + { + "epoch": 0.51, + "grad_norm": 1.1363807412961018, + "learning_rate": 1.017209972475032e-05, + "loss": 0.1859, + "step": 10023 + }, + { + "epoch": 0.51, + "grad_norm": 1.1286346321271528, + "learning_rate": 1.0170452997718161e-05, + "loss": 0.1827, + "step": 10024 + }, + { + "epoch": 0.51, + "grad_norm": 1.0493532996425716, + "learning_rate": 1.0168806266062459e-05, + "loss": 0.1824, + "step": 10025 + }, + { + "epoch": 0.51, + "grad_norm": 2.5678128545444037, + "learning_rate": 1.0167159529827876e-05, + "loss": 0.1938, + "step": 10026 + }, + { + "epoch": 0.51, + "grad_norm": 0.9157850060664193, + "learning_rate": 1.0165512789059084e-05, + "loss": 0.2014, + "step": 10027 + }, + { + "epoch": 0.51, + "grad_norm": 1.1309156120661967, + "learning_rate": 1.0163866043800748e-05, + "loss": 0.2063, + "step": 10028 + }, + { + "epoch": 0.51, + "grad_norm": 1.25804053030035, + "learning_rate": 1.0162219294097531e-05, + "loss": 0.1726, + "step": 10029 + }, + { + "epoch": 0.51, + "grad_norm": 1.3146522587286733, + "learning_rate": 1.0160572539994111e-05, + "loss": 0.1736, + "step": 10030 + }, + { + "epoch": 0.51, + "grad_norm": 1.140177136353688, + "learning_rate": 1.015892578153515e-05, + "loss": 0.1942, + "step": 10031 + }, + { + "epoch": 0.51, + "grad_norm": 0.9510605440053564, + "learning_rate": 1.015727901876532e-05, + "loss": 0.1731, + "step": 10032 + }, + { + "epoch": 0.51, + "grad_norm": 0.9945450978518271, + "learning_rate": 1.0155632251729289e-05, + "loss": 0.2, + "step": 10033 + }, + { + "epoch": 0.51, + "grad_norm": 0.894224695698637, + "learning_rate": 1.015398548047172e-05, + "loss": 0.1977, + "step": 10034 + }, + { + "epoch": 0.51, + "grad_norm": 1.142454133947765, + "learning_rate": 1.0152338705037288e-05, + "loss": 0.1869, + "step": 10035 + }, + { + "epoch": 0.51, + "grad_norm": 0.976074582797507, + "learning_rate": 1.0150691925470661e-05, + "loss": 0.1678, + "step": 10036 + }, + { + "epoch": 0.51, + "grad_norm": 1.110742325414474, + "learning_rate": 1.0149045141816507e-05, + "loss": 0.1991, + "step": 10037 + }, + { + "epoch": 0.51, + "grad_norm": 2.5073131663612216, + "learning_rate": 1.0147398354119493e-05, + "loss": 0.2022, + "step": 10038 + }, + { + "epoch": 0.51, + "grad_norm": 1.1136933742481399, + "learning_rate": 1.0145751562424293e-05, + "loss": 0.1838, + "step": 10039 + }, + { + "epoch": 0.51, + "grad_norm": 0.8396311826471067, + "learning_rate": 1.0144104766775574e-05, + "loss": 0.1648, + "step": 10040 + }, + { + "epoch": 0.51, + "grad_norm": 1.5889142219711996, + "learning_rate": 1.0142457967218004e-05, + "loss": 0.1527, + "step": 10041 + }, + { + "epoch": 0.51, + "grad_norm": 1.5526245240475558, + "learning_rate": 1.0140811163796251e-05, + "loss": 0.1834, + "step": 10042 + }, + { + "epoch": 0.51, + "grad_norm": 1.4435183341553837, + "learning_rate": 1.0139164356554991e-05, + "loss": 0.18, + "step": 10043 + }, + { + "epoch": 0.51, + "grad_norm": 0.8552306532750134, + "learning_rate": 1.0137517545538889e-05, + "loss": 0.2122, + "step": 10044 + }, + { + "epoch": 0.51, + "grad_norm": 1.0819198266646604, + "learning_rate": 1.0135870730792614e-05, + "loss": 0.1571, + "step": 10045 + }, + { + "epoch": 0.51, + "grad_norm": 1.2645867726469782, + "learning_rate": 1.0134223912360841e-05, + "loss": 0.1843, + "step": 10046 + }, + { + "epoch": 0.51, + "grad_norm": 0.9493612997745103, + "learning_rate": 1.013257709028823e-05, + "loss": 0.1805, + "step": 10047 + }, + { + "epoch": 0.51, + "grad_norm": 0.9482335058184064, + "learning_rate": 1.0130930264619464e-05, + "loss": 0.1744, + "step": 10048 + }, + { + "epoch": 0.51, + "grad_norm": 1.0100789781159738, + "learning_rate": 1.0129283435399209e-05, + "loss": 0.2131, + "step": 10049 + }, + { + "epoch": 0.51, + "grad_norm": 1.210160492224197, + "learning_rate": 1.0127636602672129e-05, + "loss": 0.1801, + "step": 10050 + }, + { + "epoch": 0.51, + "grad_norm": 1.0121260518070379, + "learning_rate": 1.01259897664829e-05, + "loss": 0.1851, + "step": 10051 + }, + { + "epoch": 0.51, + "grad_norm": 1.0048530916729317, + "learning_rate": 1.0124342926876191e-05, + "loss": 0.193, + "step": 10052 + }, + { + "epoch": 0.51, + "grad_norm": 1.5781470869665752, + "learning_rate": 1.0122696083896675e-05, + "loss": 0.1884, + "step": 10053 + }, + { + "epoch": 0.51, + "grad_norm": 1.3276050210602262, + "learning_rate": 1.012104923758902e-05, + "loss": 0.178, + "step": 10054 + }, + { + "epoch": 0.51, + "grad_norm": 1.7500154198232776, + "learning_rate": 1.0119402387997896e-05, + "loss": 0.1751, + "step": 10055 + }, + { + "epoch": 0.51, + "grad_norm": 1.1186441647606769, + "learning_rate": 1.0117755535167976e-05, + "loss": 0.1866, + "step": 10056 + }, + { + "epoch": 0.51, + "grad_norm": 1.0804238144692095, + "learning_rate": 1.0116108679143932e-05, + "loss": 0.2126, + "step": 10057 + }, + { + "epoch": 0.51, + "grad_norm": 0.8679185570128787, + "learning_rate": 1.0114461819970435e-05, + "loss": 0.1635, + "step": 10058 + }, + { + "epoch": 0.51, + "grad_norm": 2.012499264568714, + "learning_rate": 1.0112814957692151e-05, + "loss": 0.1922, + "step": 10059 + }, + { + "epoch": 0.51, + "grad_norm": 1.2950140590211783, + "learning_rate": 1.0111168092353755e-05, + "loss": 0.1999, + "step": 10060 + }, + { + "epoch": 0.51, + "grad_norm": 1.4073424409836586, + "learning_rate": 1.010952122399992e-05, + "loss": 0.1851, + "step": 10061 + }, + { + "epoch": 0.51, + "grad_norm": 0.9568381123647998, + "learning_rate": 1.0107874352675318e-05, + "loss": 0.1821, + "step": 10062 + }, + { + "epoch": 0.51, + "grad_norm": 1.262395675575379, + "learning_rate": 1.0106227478424616e-05, + "loss": 0.2034, + "step": 10063 + }, + { + "epoch": 0.51, + "grad_norm": 0.9072944589352504, + "learning_rate": 1.0104580601292484e-05, + "loss": 0.1892, + "step": 10064 + }, + { + "epoch": 0.51, + "grad_norm": 0.9383135405528775, + "learning_rate": 1.01029337213236e-05, + "loss": 0.1723, + "step": 10065 + }, + { + "epoch": 0.51, + "grad_norm": 1.0190912617277366, + "learning_rate": 1.0101286838562634e-05, + "loss": 0.1561, + "step": 10066 + }, + { + "epoch": 0.51, + "grad_norm": 1.0876522637918589, + "learning_rate": 1.0099639953054256e-05, + "loss": 0.2164, + "step": 10067 + }, + { + "epoch": 0.51, + "grad_norm": 1.1657087864947426, + "learning_rate": 1.0097993064843138e-05, + "loss": 0.1862, + "step": 10068 + }, + { + "epoch": 0.51, + "grad_norm": 1.273406700391757, + "learning_rate": 1.0096346173973951e-05, + "loss": 0.2154, + "step": 10069 + }, + { + "epoch": 0.51, + "grad_norm": 1.1349999528339405, + "learning_rate": 1.0094699280491371e-05, + "loss": 0.1691, + "step": 10070 + }, + { + "epoch": 0.51, + "grad_norm": 1.8161335476433862, + "learning_rate": 1.009305238444007e-05, + "loss": 0.1859, + "step": 10071 + }, + { + "epoch": 0.51, + "grad_norm": 1.1468136125885018, + "learning_rate": 1.0091405485864714e-05, + "loss": 0.2021, + "step": 10072 + }, + { + "epoch": 0.51, + "grad_norm": 2.5859126246115993, + "learning_rate": 1.008975858480998e-05, + "loss": 0.1969, + "step": 10073 + }, + { + "epoch": 0.51, + "grad_norm": 1.07772020244796, + "learning_rate": 1.0088111681320539e-05, + "loss": 0.17, + "step": 10074 + }, + { + "epoch": 0.51, + "grad_norm": 1.367254327526161, + "learning_rate": 1.0086464775441064e-05, + "loss": 0.1844, + "step": 10075 + }, + { + "epoch": 0.51, + "grad_norm": 1.5431365797665315, + "learning_rate": 1.008481786721623e-05, + "loss": 0.2131, + "step": 10076 + }, + { + "epoch": 0.51, + "grad_norm": 1.2134258991202571, + "learning_rate": 1.0083170956690702e-05, + "loss": 0.1739, + "step": 10077 + }, + { + "epoch": 0.51, + "grad_norm": 1.1734516351523943, + "learning_rate": 1.008152404390916e-05, + "loss": 0.2067, + "step": 10078 + }, + { + "epoch": 0.51, + "grad_norm": 1.3435819692760993, + "learning_rate": 1.0079877128916274e-05, + "loss": 0.1844, + "step": 10079 + }, + { + "epoch": 0.51, + "grad_norm": 1.1947122193617454, + "learning_rate": 1.0078230211756714e-05, + "loss": 0.2014, + "step": 10080 + }, + { + "epoch": 0.51, + "grad_norm": 1.1594082609931562, + "learning_rate": 1.0076583292475157e-05, + "loss": 0.1865, + "step": 10081 + }, + { + "epoch": 0.51, + "grad_norm": 1.0904335945044985, + "learning_rate": 1.007493637111627e-05, + "loss": 0.1975, + "step": 10082 + }, + { + "epoch": 0.51, + "grad_norm": 0.9743470552569026, + "learning_rate": 1.0073289447724735e-05, + "loss": 0.202, + "step": 10083 + }, + { + "epoch": 0.51, + "grad_norm": 0.9636925739935388, + "learning_rate": 1.0071642522345217e-05, + "loss": 0.1914, + "step": 10084 + }, + { + "epoch": 0.51, + "grad_norm": 1.1185462840749154, + "learning_rate": 1.0069995595022393e-05, + "loss": 0.1728, + "step": 10085 + }, + { + "epoch": 0.51, + "grad_norm": 1.2161215893688821, + "learning_rate": 1.006834866580093e-05, + "loss": 0.1995, + "step": 10086 + }, + { + "epoch": 0.51, + "grad_norm": 1.5347187922548848, + "learning_rate": 1.006670173472551e-05, + "loss": 0.195, + "step": 10087 + }, + { + "epoch": 0.51, + "grad_norm": 0.8850629354087978, + "learning_rate": 1.00650548018408e-05, + "loss": 0.1853, + "step": 10088 + }, + { + "epoch": 0.51, + "grad_norm": 1.2386088666481518, + "learning_rate": 1.0063407867191478e-05, + "loss": 0.2176, + "step": 10089 + }, + { + "epoch": 0.51, + "grad_norm": 1.797387745492623, + "learning_rate": 1.006176093082221e-05, + "loss": 0.1852, + "step": 10090 + }, + { + "epoch": 0.51, + "grad_norm": 1.150179340161504, + "learning_rate": 1.0060113992777674e-05, + "loss": 0.1699, + "step": 10091 + }, + { + "epoch": 0.51, + "grad_norm": 1.0130362875047425, + "learning_rate": 1.0058467053102544e-05, + "loss": 0.2271, + "step": 10092 + }, + { + "epoch": 0.51, + "grad_norm": 1.491247780734237, + "learning_rate": 1.0056820111841495e-05, + "loss": 0.1868, + "step": 10093 + }, + { + "epoch": 0.51, + "grad_norm": 0.9510571415665885, + "learning_rate": 1.0055173169039192e-05, + "loss": 0.2148, + "step": 10094 + }, + { + "epoch": 0.51, + "grad_norm": 1.4031686577019125, + "learning_rate": 1.0053526224740313e-05, + "loss": 0.1746, + "step": 10095 + }, + { + "epoch": 0.51, + "grad_norm": 0.783505650323615, + "learning_rate": 1.0051879278989536e-05, + "loss": 0.1739, + "step": 10096 + }, + { + "epoch": 0.51, + "grad_norm": 1.7477764660744681, + "learning_rate": 1.0050232331831528e-05, + "loss": 0.1789, + "step": 10097 + }, + { + "epoch": 0.51, + "grad_norm": 1.8995038115576395, + "learning_rate": 1.0048585383310967e-05, + "loss": 0.2128, + "step": 10098 + }, + { + "epoch": 0.51, + "grad_norm": 1.8440069820748402, + "learning_rate": 1.0046938433472522e-05, + "loss": 0.1614, + "step": 10099 + }, + { + "epoch": 0.51, + "grad_norm": 1.0316875891218211, + "learning_rate": 1.0045291482360871e-05, + "loss": 0.1923, + "step": 10100 + }, + { + "epoch": 0.51, + "grad_norm": 1.3853684601406886, + "learning_rate": 1.0043644530020686e-05, + "loss": 0.1897, + "step": 10101 + }, + { + "epoch": 0.51, + "grad_norm": 1.0621006982820664, + "learning_rate": 1.0041997576496643e-05, + "loss": 0.1831, + "step": 10102 + }, + { + "epoch": 0.51, + "grad_norm": 1.0929803339188837, + "learning_rate": 1.004035062183341e-05, + "loss": 0.1871, + "step": 10103 + }, + { + "epoch": 0.51, + "grad_norm": 1.5969371607412692, + "learning_rate": 1.0038703666075665e-05, + "loss": 0.1802, + "step": 10104 + }, + { + "epoch": 0.51, + "grad_norm": 1.2339497000753874, + "learning_rate": 1.003705670926808e-05, + "loss": 0.1894, + "step": 10105 + }, + { + "epoch": 0.51, + "grad_norm": 1.408984839071695, + "learning_rate": 1.0035409751455332e-05, + "loss": 0.1927, + "step": 10106 + }, + { + "epoch": 0.51, + "grad_norm": 0.9577016281366301, + "learning_rate": 1.0033762792682092e-05, + "loss": 0.1802, + "step": 10107 + }, + { + "epoch": 0.51, + "grad_norm": 1.0593468957036303, + "learning_rate": 1.0032115832993032e-05, + "loss": 0.1736, + "step": 10108 + }, + { + "epoch": 0.51, + "grad_norm": 1.1893297903859361, + "learning_rate": 1.003046887243283e-05, + "loss": 0.1927, + "step": 10109 + }, + { + "epoch": 0.51, + "grad_norm": 0.7841198800976856, + "learning_rate": 1.0028821911046158e-05, + "loss": 0.1733, + "step": 10110 + }, + { + "epoch": 0.51, + "grad_norm": 0.9635994815974888, + "learning_rate": 1.0027174948877692e-05, + "loss": 0.1769, + "step": 10111 + }, + { + "epoch": 0.51, + "grad_norm": 1.4976506517758972, + "learning_rate": 1.0025527985972102e-05, + "loss": 0.1903, + "step": 10112 + }, + { + "epoch": 0.51, + "grad_norm": 0.9258277904885904, + "learning_rate": 1.0023881022374062e-05, + "loss": 0.181, + "step": 10113 + }, + { + "epoch": 0.51, + "grad_norm": 1.1322482761758057, + "learning_rate": 1.0022234058128251e-05, + "loss": 0.185, + "step": 10114 + }, + { + "epoch": 0.51, + "grad_norm": 1.697952043975562, + "learning_rate": 1.0020587093279339e-05, + "loss": 0.1945, + "step": 10115 + }, + { + "epoch": 0.51, + "grad_norm": 1.0221196595404018, + "learning_rate": 1.0018940127872001e-05, + "loss": 0.173, + "step": 10116 + }, + { + "epoch": 0.51, + "grad_norm": 2.525650165385635, + "learning_rate": 1.001729316195091e-05, + "loss": 0.196, + "step": 10117 + }, + { + "epoch": 0.51, + "grad_norm": 0.9281390262973999, + "learning_rate": 1.001564619556074e-05, + "loss": 0.1758, + "step": 10118 + }, + { + "epoch": 0.51, + "grad_norm": 1.5021408002983778, + "learning_rate": 1.001399922874617e-05, + "loss": 0.2149, + "step": 10119 + }, + { + "epoch": 0.51, + "grad_norm": 1.338027793563955, + "learning_rate": 1.0012352261551868e-05, + "loss": 0.1987, + "step": 10120 + }, + { + "epoch": 0.51, + "grad_norm": 1.034500217978134, + "learning_rate": 1.001070529402251e-05, + "loss": 0.1678, + "step": 10121 + }, + { + "epoch": 0.51, + "grad_norm": 0.9757847773691674, + "learning_rate": 1.0009058326202768e-05, + "loss": 0.1821, + "step": 10122 + }, + { + "epoch": 0.51, + "grad_norm": 0.9283672375689633, + "learning_rate": 1.000741135813732e-05, + "loss": 0.2065, + "step": 10123 + }, + { + "epoch": 0.51, + "grad_norm": 1.0360793799716828, + "learning_rate": 1.000576438987084e-05, + "loss": 0.188, + "step": 10124 + }, + { + "epoch": 0.51, + "grad_norm": 0.9095239168298347, + "learning_rate": 1.0004117421448e-05, + "loss": 0.1953, + "step": 10125 + }, + { + "epoch": 0.51, + "grad_norm": 1.0856311188231922, + "learning_rate": 1.0002470452913473e-05, + "loss": 0.184, + "step": 10126 + }, + { + "epoch": 0.51, + "grad_norm": 1.0522283892013424, + "learning_rate": 1.0000823484311937e-05, + "loss": 0.1797, + "step": 10127 + }, + { + "epoch": 0.52, + "grad_norm": 0.8729040385760526, + "learning_rate": 9.999176515688066e-06, + "loss": 0.1783, + "step": 10128 + }, + { + "epoch": 0.52, + "grad_norm": 1.5443301439658077, + "learning_rate": 9.997529547086527e-06, + "loss": 0.1928, + "step": 10129 + }, + { + "epoch": 0.52, + "grad_norm": 1.0450069976328333, + "learning_rate": 9.995882578552002e-06, + "loss": 0.1769, + "step": 10130 + }, + { + "epoch": 0.52, + "grad_norm": 2.594677740582822, + "learning_rate": 9.99423561012916e-06, + "loss": 0.1859, + "step": 10131 + }, + { + "epoch": 0.52, + "grad_norm": 0.9298369881994603, + "learning_rate": 9.992588641862682e-06, + "loss": 0.183, + "step": 10132 + }, + { + "epoch": 0.52, + "grad_norm": 1.1359824341091553, + "learning_rate": 9.990941673797234e-06, + "loss": 0.2131, + "step": 10133 + }, + { + "epoch": 0.52, + "grad_norm": 1.1726229751635113, + "learning_rate": 9.989294705977494e-06, + "loss": 0.2187, + "step": 10134 + }, + { + "epoch": 0.52, + "grad_norm": 1.0697105168059207, + "learning_rate": 9.987647738448134e-06, + "loss": 0.1792, + "step": 10135 + }, + { + "epoch": 0.52, + "grad_norm": 1.0446155555654533, + "learning_rate": 9.986000771253835e-06, + "loss": 0.1842, + "step": 10136 + }, + { + "epoch": 0.52, + "grad_norm": 1.5252893644381011, + "learning_rate": 9.984353804439264e-06, + "loss": 0.2099, + "step": 10137 + }, + { + "epoch": 0.52, + "grad_norm": 0.991401095277735, + "learning_rate": 9.982706838049094e-06, + "loss": 0.1812, + "step": 10138 + }, + { + "epoch": 0.52, + "grad_norm": 0.9456998564950463, + "learning_rate": 9.981059872128004e-06, + "loss": 0.1749, + "step": 10139 + }, + { + "epoch": 0.52, + "grad_norm": 1.141972747906058, + "learning_rate": 9.979412906720663e-06, + "loss": 0.1747, + "step": 10140 + }, + { + "epoch": 0.52, + "grad_norm": 0.9011930047903997, + "learning_rate": 9.977765941871754e-06, + "loss": 0.1748, + "step": 10141 + }, + { + "epoch": 0.52, + "grad_norm": 1.135559019401279, + "learning_rate": 9.976118977625941e-06, + "loss": 0.1878, + "step": 10142 + }, + { + "epoch": 0.52, + "grad_norm": 1.4727396649384408, + "learning_rate": 9.974472014027903e-06, + "loss": 0.1737, + "step": 10143 + }, + { + "epoch": 0.52, + "grad_norm": 0.8580717686639748, + "learning_rate": 9.97282505112231e-06, + "loss": 0.21, + "step": 10144 + }, + { + "epoch": 0.52, + "grad_norm": 1.09660598116125, + "learning_rate": 9.971178088953845e-06, + "loss": 0.1681, + "step": 10145 + }, + { + "epoch": 0.52, + "grad_norm": 0.8839707672338685, + "learning_rate": 9.969531127567172e-06, + "loss": 0.1792, + "step": 10146 + }, + { + "epoch": 0.52, + "grad_norm": 0.970215974749392, + "learning_rate": 9.96788416700697e-06, + "loss": 0.177, + "step": 10147 + }, + { + "epoch": 0.52, + "grad_norm": 1.26868888668654, + "learning_rate": 9.96623720731791e-06, + "loss": 0.1784, + "step": 10148 + }, + { + "epoch": 0.52, + "grad_norm": 2.656729429434053, + "learning_rate": 9.964590248544671e-06, + "loss": 0.2014, + "step": 10149 + }, + { + "epoch": 0.52, + "grad_norm": 0.93392320181895, + "learning_rate": 9.96294329073192e-06, + "loss": 0.1887, + "step": 10150 + }, + { + "epoch": 0.52, + "grad_norm": 0.8589463763497676, + "learning_rate": 9.961296333924338e-06, + "loss": 0.1678, + "step": 10151 + }, + { + "epoch": 0.52, + "grad_norm": 1.0833683363223623, + "learning_rate": 9.959649378166593e-06, + "loss": 0.1936, + "step": 10152 + }, + { + "epoch": 0.52, + "grad_norm": 0.9254068495035073, + "learning_rate": 9.95800242350336e-06, + "loss": 0.1791, + "step": 10153 + }, + { + "epoch": 0.52, + "grad_norm": 1.2301348039871198, + "learning_rate": 9.95635546997932e-06, + "loss": 0.1916, + "step": 10154 + }, + { + "epoch": 0.52, + "grad_norm": 0.8856938637562976, + "learning_rate": 9.95470851763913e-06, + "loss": 0.1649, + "step": 10155 + }, + { + "epoch": 0.52, + "grad_norm": 2.540285019478901, + "learning_rate": 9.953061566527481e-06, + "loss": 0.1785, + "step": 10156 + }, + { + "epoch": 0.52, + "grad_norm": 0.9488229772162609, + "learning_rate": 9.951414616689037e-06, + "loss": 0.1971, + "step": 10157 + }, + { + "epoch": 0.52, + "grad_norm": 1.4195672508913053, + "learning_rate": 9.949767668168477e-06, + "loss": 0.1716, + "step": 10158 + }, + { + "epoch": 0.52, + "grad_norm": 0.9390328203362874, + "learning_rate": 9.948120721010467e-06, + "loss": 0.1847, + "step": 10159 + }, + { + "epoch": 0.52, + "grad_norm": 0.9321828642929739, + "learning_rate": 9.94647377525969e-06, + "loss": 0.1757, + "step": 10160 + }, + { + "epoch": 0.52, + "grad_norm": 1.4279965604940652, + "learning_rate": 9.94482683096081e-06, + "loss": 0.201, + "step": 10161 + }, + { + "epoch": 0.52, + "grad_norm": 1.0650861327001324, + "learning_rate": 9.943179888158512e-06, + "loss": 0.1886, + "step": 10162 + }, + { + "epoch": 0.52, + "grad_norm": 2.259419552091082, + "learning_rate": 9.941532946897456e-06, + "loss": 0.1867, + "step": 10163 + }, + { + "epoch": 0.52, + "grad_norm": 1.0238322413879728, + "learning_rate": 9.93988600722233e-06, + "loss": 0.1822, + "step": 10164 + }, + { + "epoch": 0.52, + "grad_norm": 2.9864417236171614, + "learning_rate": 9.938239069177792e-06, + "loss": 0.1855, + "step": 10165 + }, + { + "epoch": 0.52, + "grad_norm": 0.8899000559027274, + "learning_rate": 9.936592132808526e-06, + "loss": 0.1858, + "step": 10166 + }, + { + "epoch": 0.52, + "grad_norm": 1.0665160725544522, + "learning_rate": 9.9349451981592e-06, + "loss": 0.1728, + "step": 10167 + }, + { + "epoch": 0.52, + "grad_norm": 1.0003805091844538, + "learning_rate": 9.933298265274493e-06, + "loss": 0.1756, + "step": 10168 + }, + { + "epoch": 0.52, + "grad_norm": 0.7563077112028692, + "learning_rate": 9.93165133419907e-06, + "loss": 0.1651, + "step": 10169 + }, + { + "epoch": 0.52, + "grad_norm": 0.9976909236680422, + "learning_rate": 9.93000440497761e-06, + "loss": 0.2197, + "step": 10170 + }, + { + "epoch": 0.52, + "grad_norm": 1.0635056769523115, + "learning_rate": 9.928357477654783e-06, + "loss": 0.1755, + "step": 10171 + }, + { + "epoch": 0.52, + "grad_norm": 0.9026074106361105, + "learning_rate": 9.926710552275268e-06, + "loss": 0.1995, + "step": 10172 + }, + { + "epoch": 0.52, + "grad_norm": 0.8673749012582641, + "learning_rate": 9.925063628883731e-06, + "loss": 0.1817, + "step": 10173 + }, + { + "epoch": 0.52, + "grad_norm": 0.9998411376931166, + "learning_rate": 9.923416707524845e-06, + "loss": 0.205, + "step": 10174 + }, + { + "epoch": 0.52, + "grad_norm": 1.334736412034857, + "learning_rate": 9.921769788243291e-06, + "loss": 0.1721, + "step": 10175 + }, + { + "epoch": 0.52, + "grad_norm": 0.853685111169905, + "learning_rate": 9.92012287108373e-06, + "loss": 0.1825, + "step": 10176 + }, + { + "epoch": 0.52, + "grad_norm": 1.5061052824171897, + "learning_rate": 9.918475956090845e-06, + "loss": 0.1907, + "step": 10177 + }, + { + "epoch": 0.52, + "grad_norm": 1.649723100823968, + "learning_rate": 9.9168290433093e-06, + "loss": 0.1581, + "step": 10178 + }, + { + "epoch": 0.52, + "grad_norm": 0.9177755922432216, + "learning_rate": 9.915182132783773e-06, + "loss": 0.1698, + "step": 10179 + }, + { + "epoch": 0.52, + "grad_norm": 1.171004729759711, + "learning_rate": 9.913535224558936e-06, + "loss": 0.1862, + "step": 10180 + }, + { + "epoch": 0.52, + "grad_norm": 1.1815972737473774, + "learning_rate": 9.911888318679463e-06, + "loss": 0.2017, + "step": 10181 + }, + { + "epoch": 0.52, + "grad_norm": 0.9401379104944637, + "learning_rate": 9.910241415190022e-06, + "loss": 0.1664, + "step": 10182 + }, + { + "epoch": 0.52, + "grad_norm": 1.086417801036447, + "learning_rate": 9.908594514135288e-06, + "loss": 0.2047, + "step": 10183 + }, + { + "epoch": 0.52, + "grad_norm": 1.1246348855935053, + "learning_rate": 9.906947615559932e-06, + "loss": 0.182, + "step": 10184 + }, + { + "epoch": 0.52, + "grad_norm": 1.5114210520282993, + "learning_rate": 9.90530071950863e-06, + "loss": 0.2235, + "step": 10185 + }, + { + "epoch": 0.52, + "grad_norm": 1.7888776269891093, + "learning_rate": 9.903653826026049e-06, + "loss": 0.1705, + "step": 10186 + }, + { + "epoch": 0.52, + "grad_norm": 1.3258985075992884, + "learning_rate": 9.902006935156863e-06, + "loss": 0.1849, + "step": 10187 + }, + { + "epoch": 0.52, + "grad_norm": 1.954590283479778, + "learning_rate": 9.900360046945746e-06, + "loss": 0.184, + "step": 10188 + }, + { + "epoch": 0.52, + "grad_norm": 0.9657271524800654, + "learning_rate": 9.89871316143737e-06, + "loss": 0.1725, + "step": 10189 + }, + { + "epoch": 0.52, + "grad_norm": 0.910098204363335, + "learning_rate": 9.897066278676405e-06, + "loss": 0.2054, + "step": 10190 + }, + { + "epoch": 0.52, + "grad_norm": 0.980882661006259, + "learning_rate": 9.89541939870752e-06, + "loss": 0.169, + "step": 10191 + }, + { + "epoch": 0.52, + "grad_norm": 1.3256139950041805, + "learning_rate": 9.893772521575391e-06, + "loss": 0.1984, + "step": 10192 + }, + { + "epoch": 0.52, + "grad_norm": 1.2102877874991573, + "learning_rate": 9.892125647324686e-06, + "loss": 0.1972, + "step": 10193 + }, + { + "epoch": 0.52, + "grad_norm": 1.2624640998421532, + "learning_rate": 9.890478776000084e-06, + "loss": 0.1975, + "step": 10194 + }, + { + "epoch": 0.52, + "grad_norm": 1.6185245911520465, + "learning_rate": 9.888831907646246e-06, + "loss": 0.1702, + "step": 10195 + }, + { + "epoch": 0.52, + "grad_norm": 1.3108016567585763, + "learning_rate": 9.887185042307852e-06, + "loss": 0.1929, + "step": 10196 + }, + { + "epoch": 0.52, + "grad_norm": 1.1164020898665021, + "learning_rate": 9.885538180029568e-06, + "loss": 0.1819, + "step": 10197 + }, + { + "epoch": 0.52, + "grad_norm": 1.3538769640540862, + "learning_rate": 9.883891320856071e-06, + "loss": 0.2121, + "step": 10198 + }, + { + "epoch": 0.52, + "grad_norm": 2.724159505917361, + "learning_rate": 9.882244464832026e-06, + "loss": 0.1805, + "step": 10199 + }, + { + "epoch": 0.52, + "grad_norm": 1.2116009207840774, + "learning_rate": 9.880597612002106e-06, + "loss": 0.1754, + "step": 10200 + }, + { + "epoch": 0.52, + "grad_norm": 1.0692218955637767, + "learning_rate": 9.878950762410981e-06, + "loss": 0.1719, + "step": 10201 + }, + { + "epoch": 0.52, + "grad_norm": 1.1384005316524284, + "learning_rate": 9.877303916103328e-06, + "loss": 0.2003, + "step": 10202 + }, + { + "epoch": 0.52, + "grad_norm": 1.7804404480331262, + "learning_rate": 9.87565707312381e-06, + "loss": 0.1939, + "step": 10203 + }, + { + "epoch": 0.52, + "grad_norm": 1.1253730710614271, + "learning_rate": 9.874010233517103e-06, + "loss": 0.1829, + "step": 10204 + }, + { + "epoch": 0.52, + "grad_norm": 1.0966039460231016, + "learning_rate": 9.872363397327873e-06, + "loss": 0.1832, + "step": 10205 + }, + { + "epoch": 0.52, + "grad_norm": 0.9516794217543744, + "learning_rate": 9.870716564600796e-06, + "loss": 0.1649, + "step": 10206 + }, + { + "epoch": 0.52, + "grad_norm": 1.4001621379783895, + "learning_rate": 9.869069735380539e-06, + "loss": 0.2184, + "step": 10207 + }, + { + "epoch": 0.52, + "grad_norm": 0.9562249033797356, + "learning_rate": 9.86742290971177e-06, + "loss": 0.1564, + "step": 10208 + }, + { + "epoch": 0.52, + "grad_norm": 1.1783580116509145, + "learning_rate": 9.865776087639166e-06, + "loss": 0.1931, + "step": 10209 + }, + { + "epoch": 0.52, + "grad_norm": 0.9452027253631481, + "learning_rate": 9.864129269207388e-06, + "loss": 0.1644, + "step": 10210 + }, + { + "epoch": 0.52, + "grad_norm": 0.7783206095949607, + "learning_rate": 9.862482454461116e-06, + "loss": 0.1727, + "step": 10211 + }, + { + "epoch": 0.52, + "grad_norm": 1.2040093366723683, + "learning_rate": 9.860835643445012e-06, + "loss": 0.1946, + "step": 10212 + }, + { + "epoch": 0.52, + "grad_norm": 1.0919123989202537, + "learning_rate": 9.85918883620375e-06, + "loss": 0.1927, + "step": 10213 + }, + { + "epoch": 0.52, + "grad_norm": 1.2210304943276447, + "learning_rate": 9.857542032781998e-06, + "loss": 0.1658, + "step": 10214 + }, + { + "epoch": 0.52, + "grad_norm": 1.403600247600213, + "learning_rate": 9.855895233224431e-06, + "loss": 0.1764, + "step": 10215 + }, + { + "epoch": 0.52, + "grad_norm": 2.6703789557349276, + "learning_rate": 9.854248437575709e-06, + "loss": 0.1786, + "step": 10216 + }, + { + "epoch": 0.52, + "grad_norm": 1.2725040333955278, + "learning_rate": 9.852601645880509e-06, + "loss": 0.1944, + "step": 10217 + }, + { + "epoch": 0.52, + "grad_norm": 1.2734027828974808, + "learning_rate": 9.850954858183496e-06, + "loss": 0.1755, + "step": 10218 + }, + { + "epoch": 0.52, + "grad_norm": 1.5556163072912323, + "learning_rate": 9.84930807452934e-06, + "loss": 0.2188, + "step": 10219 + }, + { + "epoch": 0.52, + "grad_norm": 0.8223797292168257, + "learning_rate": 9.847661294962712e-06, + "loss": 0.1837, + "step": 10220 + }, + { + "epoch": 0.52, + "grad_norm": 0.9725737606009888, + "learning_rate": 9.846014519528284e-06, + "loss": 0.1765, + "step": 10221 + }, + { + "epoch": 0.52, + "grad_norm": 0.9305960044566103, + "learning_rate": 9.844367748270715e-06, + "loss": 0.1787, + "step": 10222 + }, + { + "epoch": 0.52, + "grad_norm": 0.9975408276525014, + "learning_rate": 9.842720981234682e-06, + "loss": 0.1891, + "step": 10223 + }, + { + "epoch": 0.52, + "grad_norm": 1.8328704239311866, + "learning_rate": 9.841074218464852e-06, + "loss": 0.2045, + "step": 10224 + }, + { + "epoch": 0.52, + "grad_norm": 0.8011822959442132, + "learning_rate": 9.839427460005892e-06, + "loss": 0.1606, + "step": 10225 + }, + { + "epoch": 0.52, + "grad_norm": 1.1040336320983504, + "learning_rate": 9.83778070590247e-06, + "loss": 0.1841, + "step": 10226 + }, + { + "epoch": 0.52, + "grad_norm": 1.6536014897279836, + "learning_rate": 9.836133956199256e-06, + "loss": 0.2025, + "step": 10227 + }, + { + "epoch": 0.52, + "grad_norm": 1.1487464384278228, + "learning_rate": 9.834487210940921e-06, + "loss": 0.1803, + "step": 10228 + }, + { + "epoch": 0.52, + "grad_norm": 1.167115620085271, + "learning_rate": 9.832840470172125e-06, + "loss": 0.169, + "step": 10229 + }, + { + "epoch": 0.52, + "grad_norm": 1.1368296793900705, + "learning_rate": 9.831193733937546e-06, + "loss": 0.1966, + "step": 10230 + }, + { + "epoch": 0.52, + "grad_norm": 2.248366853893879, + "learning_rate": 9.829547002281842e-06, + "loss": 0.1962, + "step": 10231 + }, + { + "epoch": 0.52, + "grad_norm": 1.2296604069185608, + "learning_rate": 9.827900275249686e-06, + "loss": 0.1698, + "step": 10232 + }, + { + "epoch": 0.52, + "grad_norm": 1.0772480915238765, + "learning_rate": 9.826253552885744e-06, + "loss": 0.1702, + "step": 10233 + }, + { + "epoch": 0.52, + "grad_norm": 1.1237664542437968, + "learning_rate": 9.824606835234689e-06, + "loss": 0.1807, + "step": 10234 + }, + { + "epoch": 0.52, + "grad_norm": 1.1427706105579902, + "learning_rate": 9.822960122341178e-06, + "loss": 0.162, + "step": 10235 + }, + { + "epoch": 0.52, + "grad_norm": 1.1936009566155101, + "learning_rate": 9.821313414249885e-06, + "loss": 0.1701, + "step": 10236 + }, + { + "epoch": 0.52, + "grad_norm": 0.858311831063239, + "learning_rate": 9.819666711005475e-06, + "loss": 0.1671, + "step": 10237 + }, + { + "epoch": 0.52, + "grad_norm": 0.9974203917111358, + "learning_rate": 9.818020012652619e-06, + "loss": 0.1792, + "step": 10238 + }, + { + "epoch": 0.52, + "grad_norm": 1.048164162370106, + "learning_rate": 9.816373319235978e-06, + "loss": 0.1773, + "step": 10239 + }, + { + "epoch": 0.52, + "grad_norm": 1.1626993038494857, + "learning_rate": 9.81472663080022e-06, + "loss": 0.2068, + "step": 10240 + }, + { + "epoch": 0.52, + "grad_norm": 1.1444195275217797, + "learning_rate": 9.813079947390014e-06, + "loss": 0.176, + "step": 10241 + }, + { + "epoch": 0.52, + "grad_norm": 2.142794849853422, + "learning_rate": 9.811433269050028e-06, + "loss": 0.1936, + "step": 10242 + }, + { + "epoch": 0.52, + "grad_norm": 1.73344991210697, + "learning_rate": 9.809786595824922e-06, + "loss": 0.1756, + "step": 10243 + }, + { + "epoch": 0.52, + "grad_norm": 0.9409173951911256, + "learning_rate": 9.808139927759363e-06, + "loss": 0.1816, + "step": 10244 + }, + { + "epoch": 0.52, + "grad_norm": 1.0067367796209783, + "learning_rate": 9.806493264898025e-06, + "loss": 0.1731, + "step": 10245 + }, + { + "epoch": 0.52, + "grad_norm": 1.0266266779009294, + "learning_rate": 9.804846607285564e-06, + "loss": 0.1842, + "step": 10246 + }, + { + "epoch": 0.52, + "grad_norm": 1.1987923485278358, + "learning_rate": 9.803199954966653e-06, + "loss": 0.1718, + "step": 10247 + }, + { + "epoch": 0.52, + "grad_norm": 1.6993453303103312, + "learning_rate": 9.801553307985951e-06, + "loss": 0.1612, + "step": 10248 + }, + { + "epoch": 0.52, + "grad_norm": 1.1169837375895257, + "learning_rate": 9.799906666388129e-06, + "loss": 0.2179, + "step": 10249 + }, + { + "epoch": 0.52, + "grad_norm": 1.2233213875080469, + "learning_rate": 9.798260030217846e-06, + "loss": 0.1806, + "step": 10250 + }, + { + "epoch": 0.52, + "grad_norm": 0.8535516549948462, + "learning_rate": 9.796613399519777e-06, + "loss": 0.1985, + "step": 10251 + }, + { + "epoch": 0.52, + "grad_norm": 1.042384382904542, + "learning_rate": 9.794966774338576e-06, + "loss": 0.1733, + "step": 10252 + }, + { + "epoch": 0.52, + "grad_norm": 1.2915414484488394, + "learning_rate": 9.793320154718916e-06, + "loss": 0.2085, + "step": 10253 + }, + { + "epoch": 0.52, + "grad_norm": 1.0400857471966334, + "learning_rate": 9.791673540705455e-06, + "loss": 0.187, + "step": 10254 + }, + { + "epoch": 0.52, + "grad_norm": 1.0354907674864533, + "learning_rate": 9.790026932342867e-06, + "loss": 0.1905, + "step": 10255 + }, + { + "epoch": 0.52, + "grad_norm": 1.156361371396123, + "learning_rate": 9.788380329675804e-06, + "loss": 0.1737, + "step": 10256 + }, + { + "epoch": 0.52, + "grad_norm": 1.130435978970632, + "learning_rate": 9.78673373274894e-06, + "loss": 0.1822, + "step": 10257 + }, + { + "epoch": 0.52, + "grad_norm": 1.005638120384302, + "learning_rate": 9.785087141606933e-06, + "loss": 0.1739, + "step": 10258 + }, + { + "epoch": 0.52, + "grad_norm": 1.1380722013661921, + "learning_rate": 9.783440556294453e-06, + "loss": 0.1632, + "step": 10259 + }, + { + "epoch": 0.52, + "grad_norm": 0.9394729324116381, + "learning_rate": 9.781793976856162e-06, + "loss": 0.1935, + "step": 10260 + }, + { + "epoch": 0.52, + "grad_norm": 1.0403813908247483, + "learning_rate": 9.780147403336715e-06, + "loss": 0.1996, + "step": 10261 + }, + { + "epoch": 0.52, + "grad_norm": 1.0395031212258359, + "learning_rate": 9.778500835780787e-06, + "loss": 0.1701, + "step": 10262 + }, + { + "epoch": 0.52, + "grad_norm": 1.4688923849207338, + "learning_rate": 9.776854274233033e-06, + "loss": 0.216, + "step": 10263 + }, + { + "epoch": 0.52, + "grad_norm": 0.9303142358262178, + "learning_rate": 9.775207718738122e-06, + "loss": 0.1751, + "step": 10264 + }, + { + "epoch": 0.52, + "grad_norm": 0.9180433229028818, + "learning_rate": 9.773561169340711e-06, + "loss": 0.1701, + "step": 10265 + }, + { + "epoch": 0.52, + "grad_norm": 4.3860890328689495, + "learning_rate": 9.771914626085469e-06, + "loss": 0.182, + "step": 10266 + }, + { + "epoch": 0.52, + "grad_norm": 0.9648158711671928, + "learning_rate": 9.770268089017053e-06, + "loss": 0.1796, + "step": 10267 + }, + { + "epoch": 0.52, + "grad_norm": 1.2075022464405123, + "learning_rate": 9.768621558180132e-06, + "loss": 0.1843, + "step": 10268 + }, + { + "epoch": 0.52, + "grad_norm": 1.0368552296049038, + "learning_rate": 9.766975033619361e-06, + "loss": 0.2178, + "step": 10269 + }, + { + "epoch": 0.52, + "grad_norm": 0.9632831147646977, + "learning_rate": 9.765328515379407e-06, + "loss": 0.192, + "step": 10270 + }, + { + "epoch": 0.52, + "grad_norm": 1.0815652516872132, + "learning_rate": 9.763682003504928e-06, + "loss": 0.2028, + "step": 10271 + }, + { + "epoch": 0.52, + "grad_norm": 1.0663126245248298, + "learning_rate": 9.762035498040594e-06, + "loss": 0.1806, + "step": 10272 + }, + { + "epoch": 0.52, + "grad_norm": 1.0079893330839749, + "learning_rate": 9.760388999031052e-06, + "loss": 0.1645, + "step": 10273 + }, + { + "epoch": 0.52, + "grad_norm": 0.8802844478953326, + "learning_rate": 9.758742506520981e-06, + "loss": 0.2002, + "step": 10274 + }, + { + "epoch": 0.52, + "grad_norm": 1.0872710320559473, + "learning_rate": 9.757096020555026e-06, + "loss": 0.1872, + "step": 10275 + }, + { + "epoch": 0.52, + "grad_norm": 1.3758322155268397, + "learning_rate": 9.755449541177858e-06, + "loss": 0.1975, + "step": 10276 + }, + { + "epoch": 0.52, + "grad_norm": 0.8157296114345972, + "learning_rate": 9.753803068434138e-06, + "loss": 0.1757, + "step": 10277 + }, + { + "epoch": 0.52, + "grad_norm": 0.8569053998828822, + "learning_rate": 9.752156602368518e-06, + "loss": 0.1921, + "step": 10278 + }, + { + "epoch": 0.52, + "grad_norm": 1.086718255160991, + "learning_rate": 9.750510143025667e-06, + "loss": 0.183, + "step": 10279 + }, + { + "epoch": 0.52, + "grad_norm": 0.8547794002110998, + "learning_rate": 9.74886369045024e-06, + "loss": 0.1925, + "step": 10280 + }, + { + "epoch": 0.52, + "grad_norm": 0.9958419750913367, + "learning_rate": 9.747217244686904e-06, + "loss": 0.1836, + "step": 10281 + }, + { + "epoch": 0.52, + "grad_norm": 0.9492194733753996, + "learning_rate": 9.745570805780312e-06, + "loss": 0.1913, + "step": 10282 + }, + { + "epoch": 0.52, + "grad_norm": 1.0321429448228026, + "learning_rate": 9.743924373775125e-06, + "loss": 0.1981, + "step": 10283 + }, + { + "epoch": 0.52, + "grad_norm": 1.2903368325471654, + "learning_rate": 9.742277948716004e-06, + "loss": 0.1851, + "step": 10284 + }, + { + "epoch": 0.52, + "grad_norm": 0.9680758698483248, + "learning_rate": 9.740631530647611e-06, + "loss": 0.1756, + "step": 10285 + }, + { + "epoch": 0.52, + "grad_norm": 1.1418268403416956, + "learning_rate": 9.7389851196146e-06, + "loss": 0.1651, + "step": 10286 + }, + { + "epoch": 0.52, + "grad_norm": 1.0744824585428467, + "learning_rate": 9.737338715661635e-06, + "loss": 0.1815, + "step": 10287 + }, + { + "epoch": 0.52, + "grad_norm": 0.9499836138535693, + "learning_rate": 9.735692318833368e-06, + "loss": 0.1643, + "step": 10288 + }, + { + "epoch": 0.52, + "grad_norm": 1.033330852504397, + "learning_rate": 9.734045929174465e-06, + "loss": 0.1745, + "step": 10289 + }, + { + "epoch": 0.52, + "grad_norm": 0.9317581030504224, + "learning_rate": 9.732399546729578e-06, + "loss": 0.181, + "step": 10290 + }, + { + "epoch": 0.52, + "grad_norm": 0.936147738733036, + "learning_rate": 9.730753171543374e-06, + "loss": 0.1947, + "step": 10291 + }, + { + "epoch": 0.52, + "grad_norm": 1.4341538894034087, + "learning_rate": 9.729106803660501e-06, + "loss": 0.1622, + "step": 10292 + }, + { + "epoch": 0.52, + "grad_norm": 1.0503544146668324, + "learning_rate": 9.727460443125622e-06, + "loss": 0.1868, + "step": 10293 + }, + { + "epoch": 0.52, + "grad_norm": 0.9132717870250818, + "learning_rate": 9.725814089983398e-06, + "loss": 0.1837, + "step": 10294 + }, + { + "epoch": 0.52, + "grad_norm": 1.2074972029352926, + "learning_rate": 9.724167744278475e-06, + "loss": 0.1876, + "step": 10295 + }, + { + "epoch": 0.52, + "grad_norm": 1.166491148740018, + "learning_rate": 9.722521406055521e-06, + "loss": 0.2105, + "step": 10296 + }, + { + "epoch": 0.52, + "grad_norm": 1.1476688881483905, + "learning_rate": 9.720875075359188e-06, + "loss": 0.201, + "step": 10297 + }, + { + "epoch": 0.52, + "grad_norm": 1.0005395496337184, + "learning_rate": 9.71922875223414e-06, + "loss": 0.1653, + "step": 10298 + }, + { + "epoch": 0.52, + "grad_norm": 1.045700555276377, + "learning_rate": 9.717582436725021e-06, + "loss": 0.1933, + "step": 10299 + }, + { + "epoch": 0.52, + "grad_norm": 1.1122857347006354, + "learning_rate": 9.715936128876501e-06, + "loss": 0.2016, + "step": 10300 + }, + { + "epoch": 0.52, + "grad_norm": 0.8552952996571777, + "learning_rate": 9.714289828733223e-06, + "loss": 0.1983, + "step": 10301 + }, + { + "epoch": 0.52, + "grad_norm": 0.9366334544382264, + "learning_rate": 9.712643536339853e-06, + "loss": 0.1673, + "step": 10302 + }, + { + "epoch": 0.52, + "grad_norm": 0.8599452455983897, + "learning_rate": 9.71099725174104e-06, + "loss": 0.177, + "step": 10303 + }, + { + "epoch": 0.52, + "grad_norm": 1.4254728191793864, + "learning_rate": 9.709350974981449e-06, + "loss": 0.1746, + "step": 10304 + }, + { + "epoch": 0.52, + "grad_norm": 1.2572506240427659, + "learning_rate": 9.707704706105724e-06, + "loss": 0.2239, + "step": 10305 + }, + { + "epoch": 0.52, + "grad_norm": 0.8240594846353297, + "learning_rate": 9.706058445158527e-06, + "loss": 0.162, + "step": 10306 + }, + { + "epoch": 0.52, + "grad_norm": 1.3421053745298535, + "learning_rate": 9.704412192184511e-06, + "loss": 0.1903, + "step": 10307 + }, + { + "epoch": 0.52, + "grad_norm": 0.707243750628533, + "learning_rate": 9.702765947228333e-06, + "loss": 0.1605, + "step": 10308 + }, + { + "epoch": 0.52, + "grad_norm": 1.4382999861183035, + "learning_rate": 9.701119710334641e-06, + "loss": 0.2078, + "step": 10309 + }, + { + "epoch": 0.52, + "grad_norm": 0.9283704283606212, + "learning_rate": 9.699473481548097e-06, + "loss": 0.2208, + "step": 10310 + }, + { + "epoch": 0.52, + "grad_norm": 0.9548298747791736, + "learning_rate": 9.69782726091335e-06, + "loss": 0.185, + "step": 10311 + }, + { + "epoch": 0.52, + "grad_norm": 1.2026902730032376, + "learning_rate": 9.69618104847506e-06, + "loss": 0.2108, + "step": 10312 + }, + { + "epoch": 0.52, + "grad_norm": 1.2489949232747564, + "learning_rate": 9.694534844277876e-06, + "loss": 0.1893, + "step": 10313 + }, + { + "epoch": 0.52, + "grad_norm": 0.9972214462379292, + "learning_rate": 9.692888648366447e-06, + "loss": 0.21, + "step": 10314 + }, + { + "epoch": 0.52, + "grad_norm": 1.1213588878823344, + "learning_rate": 9.691242460785433e-06, + "loss": 0.1822, + "step": 10315 + }, + { + "epoch": 0.52, + "grad_norm": 0.9237740150032636, + "learning_rate": 9.689596281579481e-06, + "loss": 0.1859, + "step": 10316 + }, + { + "epoch": 0.52, + "grad_norm": 1.0459244960203853, + "learning_rate": 9.687950110793254e-06, + "loss": 0.1757, + "step": 10317 + }, + { + "epoch": 0.52, + "grad_norm": 0.9527148976547666, + "learning_rate": 9.686303948471393e-06, + "loss": 0.1902, + "step": 10318 + }, + { + "epoch": 0.52, + "grad_norm": 1.399354081487811, + "learning_rate": 9.684657794658557e-06, + "loss": 0.1742, + "step": 10319 + }, + { + "epoch": 0.52, + "grad_norm": 1.2006200307552037, + "learning_rate": 9.683011649399393e-06, + "loss": 0.1801, + "step": 10320 + }, + { + "epoch": 0.52, + "grad_norm": 0.8273043295743692, + "learning_rate": 9.681365512738561e-06, + "loss": 0.2077, + "step": 10321 + }, + { + "epoch": 0.52, + "grad_norm": 0.9695511459683863, + "learning_rate": 9.679719384720705e-06, + "loss": 0.1791, + "step": 10322 + }, + { + "epoch": 0.52, + "grad_norm": 1.4961861053556897, + "learning_rate": 9.678073265390478e-06, + "loss": 0.1728, + "step": 10323 + }, + { + "epoch": 0.52, + "grad_norm": 1.1475693874767454, + "learning_rate": 9.676427154792532e-06, + "loss": 0.1927, + "step": 10324 + }, + { + "epoch": 0.53, + "grad_norm": 1.1376021528687463, + "learning_rate": 9.67478105297152e-06, + "loss": 0.1805, + "step": 10325 + }, + { + "epoch": 0.53, + "grad_norm": 1.4852948187095072, + "learning_rate": 9.673134959972087e-06, + "loss": 0.1786, + "step": 10326 + }, + { + "epoch": 0.53, + "grad_norm": 1.3038353556527462, + "learning_rate": 9.671488875838892e-06, + "loss": 0.1658, + "step": 10327 + }, + { + "epoch": 0.53, + "grad_norm": 1.2454552539516948, + "learning_rate": 9.669842800616573e-06, + "loss": 0.1997, + "step": 10328 + }, + { + "epoch": 0.53, + "grad_norm": 1.1513530264433791, + "learning_rate": 9.66819673434979e-06, + "loss": 0.176, + "step": 10329 + }, + { + "epoch": 0.53, + "grad_norm": 1.0618655964419974, + "learning_rate": 9.666550677083193e-06, + "loss": 0.1856, + "step": 10330 + }, + { + "epoch": 0.53, + "grad_norm": 1.8207853176957545, + "learning_rate": 9.664904628861423e-06, + "loss": 0.165, + "step": 10331 + }, + { + "epoch": 0.53, + "grad_norm": 1.4835936664284424, + "learning_rate": 9.663258589729133e-06, + "loss": 0.1818, + "step": 10332 + }, + { + "epoch": 0.53, + "grad_norm": 1.148356495795242, + "learning_rate": 9.661612559730974e-06, + "loss": 0.1886, + "step": 10333 + }, + { + "epoch": 0.53, + "grad_norm": 0.977431153226433, + "learning_rate": 9.659966538911597e-06, + "loss": 0.1764, + "step": 10334 + }, + { + "epoch": 0.53, + "grad_norm": 1.3184864457409595, + "learning_rate": 9.658320527315642e-06, + "loss": 0.1875, + "step": 10335 + }, + { + "epoch": 0.53, + "grad_norm": 1.0881433063606767, + "learning_rate": 9.656674524987764e-06, + "loss": 0.1858, + "step": 10336 + }, + { + "epoch": 0.53, + "grad_norm": 0.9988224231568393, + "learning_rate": 9.655028531972607e-06, + "loss": 0.2006, + "step": 10337 + }, + { + "epoch": 0.53, + "grad_norm": 1.0873306012414692, + "learning_rate": 9.653382548314824e-06, + "loss": 0.1902, + "step": 10338 + }, + { + "epoch": 0.53, + "grad_norm": 1.6293296419133951, + "learning_rate": 9.651736574059056e-06, + "loss": 0.1736, + "step": 10339 + }, + { + "epoch": 0.53, + "grad_norm": 0.9422063308430143, + "learning_rate": 9.650090609249957e-06, + "loss": 0.1897, + "step": 10340 + }, + { + "epoch": 0.53, + "grad_norm": 1.098181699439507, + "learning_rate": 9.648444653932166e-06, + "loss": 0.2025, + "step": 10341 + }, + { + "epoch": 0.53, + "grad_norm": 1.043494769008611, + "learning_rate": 9.646798708150335e-06, + "loss": 0.1811, + "step": 10342 + }, + { + "epoch": 0.53, + "grad_norm": 1.4141343490160387, + "learning_rate": 9.645152771949107e-06, + "loss": 0.1732, + "step": 10343 + }, + { + "epoch": 0.53, + "grad_norm": 0.9833433273445842, + "learning_rate": 9.643506845373134e-06, + "loss": 0.1736, + "step": 10344 + }, + { + "epoch": 0.53, + "grad_norm": 1.4078488952573975, + "learning_rate": 9.641860928467054e-06, + "loss": 0.1871, + "step": 10345 + }, + { + "epoch": 0.53, + "grad_norm": 1.254584100159465, + "learning_rate": 9.64021502127552e-06, + "loss": 0.2344, + "step": 10346 + }, + { + "epoch": 0.53, + "grad_norm": 0.9609613365024053, + "learning_rate": 9.638569123843174e-06, + "loss": 0.1756, + "step": 10347 + }, + { + "epoch": 0.53, + "grad_norm": 0.863047354823533, + "learning_rate": 9.636923236214658e-06, + "loss": 0.1634, + "step": 10348 + }, + { + "epoch": 0.53, + "grad_norm": 0.9746403408105337, + "learning_rate": 9.635277358434622e-06, + "loss": 0.191, + "step": 10349 + }, + { + "epoch": 0.53, + "grad_norm": 0.9928457396463447, + "learning_rate": 9.633631490547705e-06, + "loss": 0.1771, + "step": 10350 + }, + { + "epoch": 0.53, + "grad_norm": 1.1389667439345696, + "learning_rate": 9.63198563259856e-06, + "loss": 0.1779, + "step": 10351 + }, + { + "epoch": 0.53, + "grad_norm": 0.8675214941337062, + "learning_rate": 9.63033978463182e-06, + "loss": 0.1908, + "step": 10352 + }, + { + "epoch": 0.53, + "grad_norm": 1.8474049487338804, + "learning_rate": 9.628693946692137e-06, + "loss": 0.1869, + "step": 10353 + }, + { + "epoch": 0.53, + "grad_norm": 0.9039438034001152, + "learning_rate": 9.62704811882415e-06, + "loss": 0.198, + "step": 10354 + }, + { + "epoch": 0.53, + "grad_norm": 1.2251513400924823, + "learning_rate": 9.625402301072508e-06, + "loss": 0.1837, + "step": 10355 + }, + { + "epoch": 0.53, + "grad_norm": 0.7946568640432952, + "learning_rate": 9.623756493481845e-06, + "loss": 0.169, + "step": 10356 + }, + { + "epoch": 0.53, + "grad_norm": 1.0808439840321342, + "learning_rate": 9.622110696096812e-06, + "loss": 0.2229, + "step": 10357 + }, + { + "epoch": 0.53, + "grad_norm": 1.1578573480507568, + "learning_rate": 9.620464908962044e-06, + "loss": 0.2092, + "step": 10358 + }, + { + "epoch": 0.53, + "grad_norm": 1.2304337056025627, + "learning_rate": 9.618819132122188e-06, + "loss": 0.1985, + "step": 10359 + }, + { + "epoch": 0.53, + "grad_norm": 0.9198798137925053, + "learning_rate": 9.617173365621885e-06, + "loss": 0.2165, + "step": 10360 + }, + { + "epoch": 0.53, + "grad_norm": 1.3088003472110072, + "learning_rate": 9.615527609505778e-06, + "loss": 0.2178, + "step": 10361 + }, + { + "epoch": 0.53, + "grad_norm": 1.1875781433128594, + "learning_rate": 9.613881863818504e-06, + "loss": 0.2038, + "step": 10362 + }, + { + "epoch": 0.53, + "grad_norm": 0.7732911230002019, + "learning_rate": 9.612236128604707e-06, + "loss": 0.1597, + "step": 10363 + }, + { + "epoch": 0.53, + "grad_norm": 0.8891492803152726, + "learning_rate": 9.610590403909028e-06, + "loss": 0.1673, + "step": 10364 + }, + { + "epoch": 0.53, + "grad_norm": 1.4303661276989428, + "learning_rate": 9.608944689776104e-06, + "loss": 0.2045, + "step": 10365 + }, + { + "epoch": 0.53, + "grad_norm": 1.1698005064544776, + "learning_rate": 9.607298986250578e-06, + "loss": 0.1826, + "step": 10366 + }, + { + "epoch": 0.53, + "grad_norm": 0.9063023615216049, + "learning_rate": 9.605653293377088e-06, + "loss": 0.1959, + "step": 10367 + }, + { + "epoch": 0.53, + "grad_norm": 0.9339563685741792, + "learning_rate": 9.60400761120028e-06, + "loss": 0.1899, + "step": 10368 + }, + { + "epoch": 0.53, + "grad_norm": 0.9848466830667983, + "learning_rate": 9.60236193976478e-06, + "loss": 0.1742, + "step": 10369 + }, + { + "epoch": 0.53, + "grad_norm": 1.9967791172095308, + "learning_rate": 9.600716279115244e-06, + "loss": 0.182, + "step": 10370 + }, + { + "epoch": 0.53, + "grad_norm": 1.1842568174382346, + "learning_rate": 9.599070629296295e-06, + "loss": 0.1643, + "step": 10371 + }, + { + "epoch": 0.53, + "grad_norm": 1.6531054022617186, + "learning_rate": 9.597424990352578e-06, + "loss": 0.2011, + "step": 10372 + }, + { + "epoch": 0.53, + "grad_norm": 0.7400670830777677, + "learning_rate": 9.595779362328731e-06, + "loss": 0.156, + "step": 10373 + }, + { + "epoch": 0.53, + "grad_norm": 0.9571451657640488, + "learning_rate": 9.594133745269396e-06, + "loss": 0.1706, + "step": 10374 + }, + { + "epoch": 0.53, + "grad_norm": 1.1804292443357665, + "learning_rate": 9.5924881392192e-06, + "loss": 0.1502, + "step": 10375 + }, + { + "epoch": 0.53, + "grad_norm": 2.3734033967550605, + "learning_rate": 9.59084254422279e-06, + "loss": 0.1662, + "step": 10376 + }, + { + "epoch": 0.53, + "grad_norm": 1.1294070061228543, + "learning_rate": 9.589196960324797e-06, + "loss": 0.2031, + "step": 10377 + }, + { + "epoch": 0.53, + "grad_norm": 1.1415644059540622, + "learning_rate": 9.587551387569863e-06, + "loss": 0.1832, + "step": 10378 + }, + { + "epoch": 0.53, + "grad_norm": 0.9893916259689024, + "learning_rate": 9.585905826002618e-06, + "loss": 0.1943, + "step": 10379 + }, + { + "epoch": 0.53, + "grad_norm": 0.9395348895068868, + "learning_rate": 9.584260275667702e-06, + "loss": 0.1786, + "step": 10380 + }, + { + "epoch": 0.53, + "grad_norm": 0.9656296619115283, + "learning_rate": 9.582614736609746e-06, + "loss": 0.1878, + "step": 10381 + }, + { + "epoch": 0.53, + "grad_norm": 0.9396833635719322, + "learning_rate": 9.580969208873396e-06, + "loss": 0.1971, + "step": 10382 + }, + { + "epoch": 0.53, + "grad_norm": 0.850830522329186, + "learning_rate": 9.579323692503278e-06, + "loss": 0.1804, + "step": 10383 + }, + { + "epoch": 0.53, + "grad_norm": 0.9138329218506234, + "learning_rate": 9.577678187544024e-06, + "loss": 0.1811, + "step": 10384 + }, + { + "epoch": 0.53, + "grad_norm": 0.895827065989639, + "learning_rate": 9.576032694040278e-06, + "loss": 0.19, + "step": 10385 + }, + { + "epoch": 0.53, + "grad_norm": 0.9258072443873068, + "learning_rate": 9.574387212036664e-06, + "loss": 0.1925, + "step": 10386 + }, + { + "epoch": 0.53, + "grad_norm": 0.8878547805058776, + "learning_rate": 9.572741741577826e-06, + "loss": 0.2078, + "step": 10387 + }, + { + "epoch": 0.53, + "grad_norm": 1.1455362422134288, + "learning_rate": 9.571096282708388e-06, + "loss": 0.1865, + "step": 10388 + }, + { + "epoch": 0.53, + "grad_norm": 1.6733140804840556, + "learning_rate": 9.56945083547299e-06, + "loss": 0.1918, + "step": 10389 + }, + { + "epoch": 0.53, + "grad_norm": 1.1029868972832708, + "learning_rate": 9.56780539991626e-06, + "loss": 0.184, + "step": 10390 + }, + { + "epoch": 0.53, + "grad_norm": 1.2399755222533273, + "learning_rate": 9.566159976082838e-06, + "loss": 0.1819, + "step": 10391 + }, + { + "epoch": 0.53, + "grad_norm": 0.95713725716782, + "learning_rate": 9.564514564017345e-06, + "loss": 0.1873, + "step": 10392 + }, + { + "epoch": 0.53, + "grad_norm": 1.241490567498554, + "learning_rate": 9.562869163764423e-06, + "loss": 0.2063, + "step": 10393 + }, + { + "epoch": 0.53, + "grad_norm": 1.162189024921771, + "learning_rate": 9.561223775368697e-06, + "loss": 0.1756, + "step": 10394 + }, + { + "epoch": 0.53, + "grad_norm": 0.9746192496771294, + "learning_rate": 9.559578398874805e-06, + "loss": 0.189, + "step": 10395 + }, + { + "epoch": 0.53, + "grad_norm": 0.997149034861936, + "learning_rate": 9.55793303432737e-06, + "loss": 0.1811, + "step": 10396 + }, + { + "epoch": 0.53, + "grad_norm": 0.8424462531241881, + "learning_rate": 9.55628768177103e-06, + "loss": 0.1777, + "step": 10397 + }, + { + "epoch": 0.53, + "grad_norm": 0.8116625462094226, + "learning_rate": 9.554642341250408e-06, + "loss": 0.1795, + "step": 10398 + }, + { + "epoch": 0.53, + "grad_norm": 0.9758897036252532, + "learning_rate": 9.552997012810138e-06, + "loss": 0.1524, + "step": 10399 + }, + { + "epoch": 0.53, + "grad_norm": 1.7726637556307456, + "learning_rate": 9.551351696494854e-06, + "loss": 0.1866, + "step": 10400 + }, + { + "epoch": 0.53, + "grad_norm": 1.4435038850679889, + "learning_rate": 9.549706392349175e-06, + "loss": 0.1796, + "step": 10401 + }, + { + "epoch": 0.53, + "grad_norm": 1.0780415380104225, + "learning_rate": 9.548061100417739e-06, + "loss": 0.1979, + "step": 10402 + }, + { + "epoch": 0.53, + "grad_norm": 1.0693027700309574, + "learning_rate": 9.546415820745168e-06, + "loss": 0.1951, + "step": 10403 + }, + { + "epoch": 0.53, + "grad_norm": 0.9908854052260064, + "learning_rate": 9.544770553376098e-06, + "loss": 0.1904, + "step": 10404 + }, + { + "epoch": 0.53, + "grad_norm": 1.6473184583869245, + "learning_rate": 9.543125298355147e-06, + "loss": 0.1923, + "step": 10405 + }, + { + "epoch": 0.53, + "grad_norm": 1.3924433832198697, + "learning_rate": 9.54148005572695e-06, + "loss": 0.1897, + "step": 10406 + }, + { + "epoch": 0.53, + "grad_norm": 1.111984621097495, + "learning_rate": 9.539834825536131e-06, + "loss": 0.1845, + "step": 10407 + }, + { + "epoch": 0.53, + "grad_norm": 1.1519537417134282, + "learning_rate": 9.538189607827324e-06, + "loss": 0.1888, + "step": 10408 + }, + { + "epoch": 0.53, + "grad_norm": 1.07458036048325, + "learning_rate": 9.536544402645144e-06, + "loss": 0.1847, + "step": 10409 + }, + { + "epoch": 0.53, + "grad_norm": 1.5093378512311653, + "learning_rate": 9.53489921003423e-06, + "loss": 0.1835, + "step": 10410 + }, + { + "epoch": 0.53, + "grad_norm": 1.0330251895942397, + "learning_rate": 9.533254030039193e-06, + "loss": 0.1596, + "step": 10411 + }, + { + "epoch": 0.53, + "grad_norm": 1.6626992170533568, + "learning_rate": 9.531608862704672e-06, + "loss": 0.2001, + "step": 10412 + }, + { + "epoch": 0.53, + "grad_norm": 1.496480785595012, + "learning_rate": 9.529963708075284e-06, + "loss": 0.1882, + "step": 10413 + }, + { + "epoch": 0.53, + "grad_norm": 0.9989953762051174, + "learning_rate": 9.528318566195661e-06, + "loss": 0.1707, + "step": 10414 + }, + { + "epoch": 0.53, + "grad_norm": 1.35912861959528, + "learning_rate": 9.52667343711042e-06, + "loss": 0.2268, + "step": 10415 + }, + { + "epoch": 0.53, + "grad_norm": 1.3312356143026387, + "learning_rate": 9.525028320864191e-06, + "loss": 0.1828, + "step": 10416 + }, + { + "epoch": 0.53, + "grad_norm": 0.9451334639642954, + "learning_rate": 9.523383217501596e-06, + "loss": 0.1897, + "step": 10417 + }, + { + "epoch": 0.53, + "grad_norm": 1.2835826239197037, + "learning_rate": 9.521738127067254e-06, + "loss": 0.1893, + "step": 10418 + }, + { + "epoch": 0.53, + "grad_norm": 1.9477317307320914, + "learning_rate": 9.520093049605796e-06, + "loss": 0.1738, + "step": 10419 + }, + { + "epoch": 0.53, + "grad_norm": 0.9035321903439242, + "learning_rate": 9.51844798516184e-06, + "loss": 0.1919, + "step": 10420 + }, + { + "epoch": 0.53, + "grad_norm": 0.7218146957856297, + "learning_rate": 9.516802933780011e-06, + "loss": 0.1565, + "step": 10421 + }, + { + "epoch": 0.53, + "grad_norm": 1.0801580770780828, + "learning_rate": 9.515157895504927e-06, + "loss": 0.192, + "step": 10422 + }, + { + "epoch": 0.53, + "grad_norm": 0.9582296167426243, + "learning_rate": 9.513512870381216e-06, + "loss": 0.1733, + "step": 10423 + }, + { + "epoch": 0.53, + "grad_norm": 1.0867462991915926, + "learning_rate": 9.511867858453493e-06, + "loss": 0.2014, + "step": 10424 + }, + { + "epoch": 0.53, + "grad_norm": 0.7490081315040854, + "learning_rate": 9.510222859766383e-06, + "loss": 0.1687, + "step": 10425 + }, + { + "epoch": 0.53, + "grad_norm": 0.9441063005416657, + "learning_rate": 9.508577874364503e-06, + "loss": 0.1992, + "step": 10426 + }, + { + "epoch": 0.53, + "grad_norm": 2.211252729767083, + "learning_rate": 9.506932902292482e-06, + "loss": 0.182, + "step": 10427 + }, + { + "epoch": 0.53, + "grad_norm": 1.384841114501458, + "learning_rate": 9.505287943594928e-06, + "loss": 0.175, + "step": 10428 + }, + { + "epoch": 0.53, + "grad_norm": 1.2205290375607247, + "learning_rate": 9.503642998316469e-06, + "loss": 0.1751, + "step": 10429 + }, + { + "epoch": 0.53, + "grad_norm": 0.8274716728200241, + "learning_rate": 9.501998066501718e-06, + "loss": 0.1955, + "step": 10430 + }, + { + "epoch": 0.53, + "grad_norm": 1.0254602672883348, + "learning_rate": 9.500353148195305e-06, + "loss": 0.2029, + "step": 10431 + }, + { + "epoch": 0.53, + "grad_norm": 0.7968841718287084, + "learning_rate": 9.498708243441834e-06, + "loss": 0.1899, + "step": 10432 + }, + { + "epoch": 0.53, + "grad_norm": 1.0826837399121725, + "learning_rate": 9.497063352285934e-06, + "loss": 0.1851, + "step": 10433 + }, + { + "epoch": 0.53, + "grad_norm": 0.9069755430051439, + "learning_rate": 9.495418474772221e-06, + "loss": 0.2015, + "step": 10434 + }, + { + "epoch": 0.53, + "grad_norm": 1.271043067333602, + "learning_rate": 9.493773610945305e-06, + "loss": 0.194, + "step": 10435 + }, + { + "epoch": 0.53, + "grad_norm": 1.1972613171697342, + "learning_rate": 9.492128760849813e-06, + "loss": 0.2168, + "step": 10436 + }, + { + "epoch": 0.53, + "grad_norm": 1.0838621602061793, + "learning_rate": 9.490483924530353e-06, + "loss": 0.1731, + "step": 10437 + }, + { + "epoch": 0.53, + "grad_norm": 0.7529980716882423, + "learning_rate": 9.488839102031549e-06, + "loss": 0.1725, + "step": 10438 + }, + { + "epoch": 0.53, + "grad_norm": 0.8989904055035346, + "learning_rate": 9.48719429339801e-06, + "loss": 0.1859, + "step": 10439 + }, + { + "epoch": 0.53, + "grad_norm": 0.9278387486832459, + "learning_rate": 9.485549498674357e-06, + "loss": 0.191, + "step": 10440 + }, + { + "epoch": 0.53, + "grad_norm": 1.4556072385973486, + "learning_rate": 9.483904717905202e-06, + "loss": 0.1897, + "step": 10441 + }, + { + "epoch": 0.53, + "grad_norm": 0.9878938604682453, + "learning_rate": 9.48225995113516e-06, + "loss": 0.1697, + "step": 10442 + }, + { + "epoch": 0.53, + "grad_norm": 1.2287388076396861, + "learning_rate": 9.480615198408846e-06, + "loss": 0.1862, + "step": 10443 + }, + { + "epoch": 0.53, + "grad_norm": 0.8831594096054374, + "learning_rate": 9.478970459770878e-06, + "loss": 0.201, + "step": 10444 + }, + { + "epoch": 0.53, + "grad_norm": 0.9367663375726483, + "learning_rate": 9.47732573526586e-06, + "loss": 0.1834, + "step": 10445 + }, + { + "epoch": 0.53, + "grad_norm": 0.7777173866626321, + "learning_rate": 9.475681024938415e-06, + "loss": 0.2014, + "step": 10446 + }, + { + "epoch": 0.53, + "grad_norm": 1.17639472112245, + "learning_rate": 9.474036328833148e-06, + "loss": 0.1915, + "step": 10447 + }, + { + "epoch": 0.53, + "grad_norm": 0.910047139114023, + "learning_rate": 9.472391646994681e-06, + "loss": 0.1648, + "step": 10448 + }, + { + "epoch": 0.53, + "grad_norm": 1.044503529177657, + "learning_rate": 9.470746979467614e-06, + "loss": 0.1794, + "step": 10449 + }, + { + "epoch": 0.53, + "grad_norm": 0.9275408983661686, + "learning_rate": 9.46910232629657e-06, + "loss": 0.1785, + "step": 10450 + }, + { + "epoch": 0.53, + "grad_norm": 1.0062888448818028, + "learning_rate": 9.467457687526156e-06, + "loss": 0.213, + "step": 10451 + }, + { + "epoch": 0.53, + "grad_norm": 1.0758056837608503, + "learning_rate": 9.465813063200978e-06, + "loss": 0.1883, + "step": 10452 + }, + { + "epoch": 0.53, + "grad_norm": 1.200194759153906, + "learning_rate": 9.464168453365655e-06, + "loss": 0.1687, + "step": 10453 + }, + { + "epoch": 0.53, + "grad_norm": 0.8561814641014569, + "learning_rate": 9.462523858064788e-06, + "loss": 0.1801, + "step": 10454 + }, + { + "epoch": 0.53, + "grad_norm": 2.4085649439814016, + "learning_rate": 9.460879277342995e-06, + "loss": 0.1891, + "step": 10455 + }, + { + "epoch": 0.53, + "grad_norm": 1.072865594506947, + "learning_rate": 9.459234711244881e-06, + "loss": 0.1901, + "step": 10456 + }, + { + "epoch": 0.53, + "grad_norm": 1.4323014820101265, + "learning_rate": 9.457590159815058e-06, + "loss": 0.1858, + "step": 10457 + }, + { + "epoch": 0.53, + "grad_norm": 1.4154831275527877, + "learning_rate": 9.45594562309813e-06, + "loss": 0.2139, + "step": 10458 + }, + { + "epoch": 0.53, + "grad_norm": 1.0001665481648387, + "learning_rate": 9.454301101138708e-06, + "loss": 0.1876, + "step": 10459 + }, + { + "epoch": 0.53, + "grad_norm": 1.9027308619514212, + "learning_rate": 9.452656593981398e-06, + "loss": 0.1831, + "step": 10460 + }, + { + "epoch": 0.53, + "grad_norm": 0.9316900980594415, + "learning_rate": 9.451012101670814e-06, + "loss": 0.1858, + "step": 10461 + }, + { + "epoch": 0.53, + "grad_norm": 0.7836745774854093, + "learning_rate": 9.449367624251554e-06, + "loss": 0.1624, + "step": 10462 + }, + { + "epoch": 0.53, + "grad_norm": 1.230737222455105, + "learning_rate": 9.447723161768228e-06, + "loss": 0.1664, + "step": 10463 + }, + { + "epoch": 0.53, + "grad_norm": 0.8627584376712686, + "learning_rate": 9.446078714265441e-06, + "loss": 0.1978, + "step": 10464 + }, + { + "epoch": 0.53, + "grad_norm": 1.0735143941794252, + "learning_rate": 9.444434281787806e-06, + "loss": 0.1821, + "step": 10465 + }, + { + "epoch": 0.53, + "grad_norm": 1.113359953970941, + "learning_rate": 9.442789864379918e-06, + "loss": 0.1847, + "step": 10466 + }, + { + "epoch": 0.53, + "grad_norm": 1.2387280858722214, + "learning_rate": 9.44114546208639e-06, + "loss": 0.1765, + "step": 10467 + }, + { + "epoch": 0.53, + "grad_norm": 0.8166548705579885, + "learning_rate": 9.439501074951817e-06, + "loss": 0.1841, + "step": 10468 + }, + { + "epoch": 0.53, + "grad_norm": 0.9275349732677751, + "learning_rate": 9.437856703020813e-06, + "loss": 0.1949, + "step": 10469 + }, + { + "epoch": 0.53, + "grad_norm": 0.9684713113188496, + "learning_rate": 9.436212346337981e-06, + "loss": 0.2019, + "step": 10470 + }, + { + "epoch": 0.53, + "grad_norm": 2.752915393134308, + "learning_rate": 9.434568004947914e-06, + "loss": 0.1811, + "step": 10471 + }, + { + "epoch": 0.53, + "grad_norm": 1.0686608805981916, + "learning_rate": 9.432923678895225e-06, + "loss": 0.1788, + "step": 10472 + }, + { + "epoch": 0.53, + "grad_norm": 1.3131991830901886, + "learning_rate": 9.431279368224512e-06, + "loss": 0.1957, + "step": 10473 + }, + { + "epoch": 0.53, + "grad_norm": 0.8220619801982747, + "learning_rate": 9.429635072980382e-06, + "loss": 0.1781, + "step": 10474 + }, + { + "epoch": 0.53, + "grad_norm": 0.7944177343908115, + "learning_rate": 9.427990793207428e-06, + "loss": 0.1783, + "step": 10475 + }, + { + "epoch": 0.53, + "grad_norm": 0.8865612068771639, + "learning_rate": 9.426346528950258e-06, + "loss": 0.1681, + "step": 10476 + }, + { + "epoch": 0.53, + "grad_norm": 1.0013291908495037, + "learning_rate": 9.42470228025347e-06, + "loss": 0.1789, + "step": 10477 + }, + { + "epoch": 0.53, + "grad_norm": 1.1030495223038712, + "learning_rate": 9.423058047161668e-06, + "loss": 0.1759, + "step": 10478 + }, + { + "epoch": 0.53, + "grad_norm": 1.2086008136565232, + "learning_rate": 9.421413829719446e-06, + "loss": 0.191, + "step": 10479 + }, + { + "epoch": 0.53, + "grad_norm": 1.0555368232810498, + "learning_rate": 9.41976962797141e-06, + "loss": 0.2003, + "step": 10480 + }, + { + "epoch": 0.53, + "grad_norm": 0.8284304837222289, + "learning_rate": 9.418125441962151e-06, + "loss": 0.1916, + "step": 10481 + }, + { + "epoch": 0.53, + "grad_norm": 1.1149723921087935, + "learning_rate": 9.416481271736276e-06, + "loss": 0.178, + "step": 10482 + }, + { + "epoch": 0.53, + "grad_norm": 1.2590214185530384, + "learning_rate": 9.414837117338376e-06, + "loss": 0.2095, + "step": 10483 + }, + { + "epoch": 0.53, + "grad_norm": 0.9191664131662469, + "learning_rate": 9.413192978813057e-06, + "loss": 0.183, + "step": 10484 + }, + { + "epoch": 0.53, + "grad_norm": 1.0947807191380714, + "learning_rate": 9.411548856204907e-06, + "loss": 0.1784, + "step": 10485 + }, + { + "epoch": 0.53, + "grad_norm": 0.8485057552267062, + "learning_rate": 9.409904749558529e-06, + "loss": 0.1741, + "step": 10486 + }, + { + "epoch": 0.53, + "grad_norm": 1.2353579300556639, + "learning_rate": 9.408260658918522e-06, + "loss": 0.1711, + "step": 10487 + }, + { + "epoch": 0.53, + "grad_norm": 0.9047841484421405, + "learning_rate": 9.406616584329473e-06, + "loss": 0.1886, + "step": 10488 + }, + { + "epoch": 0.53, + "grad_norm": 1.0537218642384858, + "learning_rate": 9.404972525835984e-06, + "loss": 0.17, + "step": 10489 + }, + { + "epoch": 0.53, + "grad_norm": 2.490393840000147, + "learning_rate": 9.403328483482647e-06, + "loss": 0.1835, + "step": 10490 + }, + { + "epoch": 0.53, + "grad_norm": 0.9113168576797979, + "learning_rate": 9.401684457314064e-06, + "loss": 0.1985, + "step": 10491 + }, + { + "epoch": 0.53, + "grad_norm": 0.8407452044189376, + "learning_rate": 9.400040447374818e-06, + "loss": 0.1784, + "step": 10492 + }, + { + "epoch": 0.53, + "grad_norm": 1.0300144394706507, + "learning_rate": 9.398396453709514e-06, + "loss": 0.1661, + "step": 10493 + }, + { + "epoch": 0.53, + "grad_norm": 1.0476008663127399, + "learning_rate": 9.396752476362735e-06, + "loss": 0.1865, + "step": 10494 + }, + { + "epoch": 0.53, + "grad_norm": 1.6707348934781523, + "learning_rate": 9.395108515379082e-06, + "loss": 0.2134, + "step": 10495 + }, + { + "epoch": 0.53, + "grad_norm": 2.4262919869218256, + "learning_rate": 9.393464570803142e-06, + "loss": 0.2053, + "step": 10496 + }, + { + "epoch": 0.53, + "grad_norm": 1.1980144717024208, + "learning_rate": 9.391820642679515e-06, + "loss": 0.1852, + "step": 10497 + }, + { + "epoch": 0.53, + "grad_norm": 1.0619071454147688, + "learning_rate": 9.390176731052783e-06, + "loss": 0.1835, + "step": 10498 + }, + { + "epoch": 0.53, + "grad_norm": 0.9156702670758646, + "learning_rate": 9.388532835967543e-06, + "loss": 0.1979, + "step": 10499 + }, + { + "epoch": 0.53, + "grad_norm": 1.1991320317620187, + "learning_rate": 9.386888957468383e-06, + "loss": 0.1743, + "step": 10500 + }, + { + "epoch": 0.53, + "grad_norm": 1.0832346660178478, + "learning_rate": 9.385245095599897e-06, + "loss": 0.1608, + "step": 10501 + }, + { + "epoch": 0.53, + "grad_norm": 1.1136661258607057, + "learning_rate": 9.38360125040667e-06, + "loss": 0.1903, + "step": 10502 + }, + { + "epoch": 0.53, + "grad_norm": 1.9346059659450856, + "learning_rate": 9.381957421933296e-06, + "loss": 0.1833, + "step": 10503 + }, + { + "epoch": 0.53, + "grad_norm": 1.0498713030587017, + "learning_rate": 9.380313610224364e-06, + "loss": 0.1831, + "step": 10504 + }, + { + "epoch": 0.53, + "grad_norm": 1.1507233448862686, + "learning_rate": 9.378669815324456e-06, + "loss": 0.1679, + "step": 10505 + }, + { + "epoch": 0.53, + "grad_norm": 0.7585534369008222, + "learning_rate": 9.377026037278169e-06, + "loss": 0.1746, + "step": 10506 + }, + { + "epoch": 0.53, + "grad_norm": 1.1627401646562339, + "learning_rate": 9.37538227613008e-06, + "loss": 0.1868, + "step": 10507 + }, + { + "epoch": 0.53, + "grad_norm": 0.9497167071824306, + "learning_rate": 9.373738531924787e-06, + "loss": 0.1733, + "step": 10508 + }, + { + "epoch": 0.53, + "grad_norm": 0.9222271876528074, + "learning_rate": 9.372094804706867e-06, + "loss": 0.1955, + "step": 10509 + }, + { + "epoch": 0.53, + "grad_norm": 0.982110484488253, + "learning_rate": 9.370451094520915e-06, + "loss": 0.1801, + "step": 10510 + }, + { + "epoch": 0.53, + "grad_norm": 1.0602814609731621, + "learning_rate": 9.36880740141151e-06, + "loss": 0.1606, + "step": 10511 + }, + { + "epoch": 0.53, + "grad_norm": 1.2839696823036988, + "learning_rate": 9.367163725423242e-06, + "loss": 0.177, + "step": 10512 + }, + { + "epoch": 0.53, + "grad_norm": 1.2200605917065472, + "learning_rate": 9.365520066600691e-06, + "loss": 0.1706, + "step": 10513 + }, + { + "epoch": 0.53, + "grad_norm": 0.8684543483198988, + "learning_rate": 9.363876424988447e-06, + "loss": 0.19, + "step": 10514 + }, + { + "epoch": 0.53, + "grad_norm": 1.4733424165761082, + "learning_rate": 9.362232800631087e-06, + "loss": 0.1808, + "step": 10515 + }, + { + "epoch": 0.53, + "grad_norm": 1.332430221388429, + "learning_rate": 9.360589193573203e-06, + "loss": 0.1703, + "step": 10516 + }, + { + "epoch": 0.53, + "grad_norm": 1.0983837973789459, + "learning_rate": 9.358945603859369e-06, + "loss": 0.1682, + "step": 10517 + }, + { + "epoch": 0.53, + "grad_norm": 0.9061642429714712, + "learning_rate": 9.357302031534178e-06, + "loss": 0.1752, + "step": 10518 + }, + { + "epoch": 0.53, + "grad_norm": 1.0453790873171436, + "learning_rate": 9.355658476642198e-06, + "loss": 0.191, + "step": 10519 + }, + { + "epoch": 0.53, + "grad_norm": 0.8765708013616752, + "learning_rate": 9.354014939228024e-06, + "loss": 0.1748, + "step": 10520 + }, + { + "epoch": 0.54, + "grad_norm": 0.890012437695401, + "learning_rate": 9.35237141933623e-06, + "loss": 0.2308, + "step": 10521 + }, + { + "epoch": 0.54, + "grad_norm": 0.9096191796737242, + "learning_rate": 9.350727917011395e-06, + "loss": 0.1793, + "step": 10522 + }, + { + "epoch": 0.54, + "grad_norm": 1.5613106494417954, + "learning_rate": 9.349084432298106e-06, + "loss": 0.1811, + "step": 10523 + }, + { + "epoch": 0.54, + "grad_norm": 1.0860020915804065, + "learning_rate": 9.347440965240934e-06, + "loss": 0.2029, + "step": 10524 + }, + { + "epoch": 0.54, + "grad_norm": 0.8887090919405828, + "learning_rate": 9.345797515884466e-06, + "loss": 0.174, + "step": 10525 + }, + { + "epoch": 0.54, + "grad_norm": 0.8486337984116933, + "learning_rate": 9.344154084273275e-06, + "loss": 0.1791, + "step": 10526 + }, + { + "epoch": 0.54, + "grad_norm": 1.006827054252546, + "learning_rate": 9.342510670451944e-06, + "loss": 0.1654, + "step": 10527 + }, + { + "epoch": 0.54, + "grad_norm": 0.9545599629514354, + "learning_rate": 9.340867274465047e-06, + "loss": 0.1948, + "step": 10528 + }, + { + "epoch": 0.54, + "grad_norm": 1.047854515192047, + "learning_rate": 9.339223896357163e-06, + "loss": 0.1808, + "step": 10529 + }, + { + "epoch": 0.54, + "grad_norm": 1.1177400791380077, + "learning_rate": 9.337580536172864e-06, + "loss": 0.1798, + "step": 10530 + }, + { + "epoch": 0.54, + "grad_norm": 0.8060106213647832, + "learning_rate": 9.335937193956736e-06, + "loss": 0.1807, + "step": 10531 + }, + { + "epoch": 0.54, + "grad_norm": 0.884147369338201, + "learning_rate": 9.334293869753346e-06, + "loss": 0.1824, + "step": 10532 + }, + { + "epoch": 0.54, + "grad_norm": 1.082017160418502, + "learning_rate": 9.332650563607276e-06, + "loss": 0.205, + "step": 10533 + }, + { + "epoch": 0.54, + "grad_norm": 0.8124867757454877, + "learning_rate": 9.331007275563093e-06, + "loss": 0.1854, + "step": 10534 + }, + { + "epoch": 0.54, + "grad_norm": 0.8725530897469803, + "learning_rate": 9.329364005665377e-06, + "loss": 0.1858, + "step": 10535 + }, + { + "epoch": 0.54, + "grad_norm": 0.8761616764100726, + "learning_rate": 9.327720753958699e-06, + "loss": 0.2076, + "step": 10536 + }, + { + "epoch": 0.54, + "grad_norm": 1.0565332273224135, + "learning_rate": 9.326077520487637e-06, + "loss": 0.2011, + "step": 10537 + }, + { + "epoch": 0.54, + "grad_norm": 1.5342916385910432, + "learning_rate": 9.324434305296757e-06, + "loss": 0.1785, + "step": 10538 + }, + { + "epoch": 0.54, + "grad_norm": 0.9888427341066531, + "learning_rate": 9.322791108430636e-06, + "loss": 0.2056, + "step": 10539 + }, + { + "epoch": 0.54, + "grad_norm": 0.8702642528267842, + "learning_rate": 9.321147929933847e-06, + "loss": 0.1675, + "step": 10540 + }, + { + "epoch": 0.54, + "grad_norm": 1.2638876117441158, + "learning_rate": 9.319504769850953e-06, + "loss": 0.1782, + "step": 10541 + }, + { + "epoch": 0.54, + "grad_norm": 0.9326731432920613, + "learning_rate": 9.317861628226535e-06, + "loss": 0.1902, + "step": 10542 + }, + { + "epoch": 0.54, + "grad_norm": 1.0811358970036962, + "learning_rate": 9.316218505105155e-06, + "loss": 0.1943, + "step": 10543 + }, + { + "epoch": 0.54, + "grad_norm": 0.9911777388698443, + "learning_rate": 9.314575400531391e-06, + "loss": 0.2022, + "step": 10544 + }, + { + "epoch": 0.54, + "grad_norm": 1.1818692996375626, + "learning_rate": 9.312932314549804e-06, + "loss": 0.1933, + "step": 10545 + }, + { + "epoch": 0.54, + "grad_norm": 0.9087683719340895, + "learning_rate": 9.311289247204972e-06, + "loss": 0.187, + "step": 10546 + }, + { + "epoch": 0.54, + "grad_norm": 1.2647180730801626, + "learning_rate": 9.309646198541454e-06, + "loss": 0.1762, + "step": 10547 + }, + { + "epoch": 0.54, + "grad_norm": 1.0285760495995913, + "learning_rate": 9.308003168603822e-06, + "loss": 0.1709, + "step": 10548 + }, + { + "epoch": 0.54, + "grad_norm": 1.4024991467330072, + "learning_rate": 9.306360157436642e-06, + "loss": 0.1795, + "step": 10549 + }, + { + "epoch": 0.54, + "grad_norm": 0.9153367260426736, + "learning_rate": 9.304717165084486e-06, + "loss": 0.1668, + "step": 10550 + }, + { + "epoch": 0.54, + "grad_norm": 0.9666848999268505, + "learning_rate": 9.303074191591912e-06, + "loss": 0.2107, + "step": 10551 + }, + { + "epoch": 0.54, + "grad_norm": 1.443604240907049, + "learning_rate": 9.301431237003492e-06, + "loss": 0.1799, + "step": 10552 + }, + { + "epoch": 0.54, + "grad_norm": 1.1412227625228761, + "learning_rate": 9.299788301363786e-06, + "loss": 0.1912, + "step": 10553 + }, + { + "epoch": 0.54, + "grad_norm": 1.074255400598678, + "learning_rate": 9.298145384717369e-06, + "loss": 0.1648, + "step": 10554 + }, + { + "epoch": 0.54, + "grad_norm": 1.278035111588236, + "learning_rate": 9.296502487108792e-06, + "loss": 0.2043, + "step": 10555 + }, + { + "epoch": 0.54, + "grad_norm": 0.9894725590022767, + "learning_rate": 9.294859608582626e-06, + "loss": 0.1744, + "step": 10556 + }, + { + "epoch": 0.54, + "grad_norm": 0.9126589381137777, + "learning_rate": 9.293216749183437e-06, + "loss": 0.1606, + "step": 10557 + }, + { + "epoch": 0.54, + "grad_norm": 0.9971284910584742, + "learning_rate": 9.291573908955777e-06, + "loss": 0.1872, + "step": 10558 + }, + { + "epoch": 0.54, + "grad_norm": 0.9356317290539621, + "learning_rate": 9.289931087944221e-06, + "loss": 0.1674, + "step": 10559 + }, + { + "epoch": 0.54, + "grad_norm": 0.9180737699979528, + "learning_rate": 9.28828828619332e-06, + "loss": 0.1911, + "step": 10560 + }, + { + "epoch": 0.54, + "grad_norm": 1.3102951476006044, + "learning_rate": 9.286645503747641e-06, + "loss": 0.1943, + "step": 10561 + }, + { + "epoch": 0.54, + "grad_norm": 1.0486085362353716, + "learning_rate": 9.28500274065174e-06, + "loss": 0.172, + "step": 10562 + }, + { + "epoch": 0.54, + "grad_norm": 1.0143367843255702, + "learning_rate": 9.283359996950187e-06, + "loss": 0.1923, + "step": 10563 + }, + { + "epoch": 0.54, + "grad_norm": 1.0226800184238658, + "learning_rate": 9.281717272687527e-06, + "loss": 0.1905, + "step": 10564 + }, + { + "epoch": 0.54, + "grad_norm": 1.0633593771594727, + "learning_rate": 9.28007456790833e-06, + "loss": 0.1949, + "step": 10565 + }, + { + "epoch": 0.54, + "grad_norm": 0.9299341580114174, + "learning_rate": 9.27843188265715e-06, + "loss": 0.1804, + "step": 10566 + }, + { + "epoch": 0.54, + "grad_norm": 1.169669224468857, + "learning_rate": 9.276789216978549e-06, + "loss": 0.2227, + "step": 10567 + }, + { + "epoch": 0.54, + "grad_norm": 0.9097009835886758, + "learning_rate": 9.275146570917077e-06, + "loss": 0.1884, + "step": 10568 + }, + { + "epoch": 0.54, + "grad_norm": 0.9145517605087363, + "learning_rate": 9.273503944517298e-06, + "loss": 0.1675, + "step": 10569 + }, + { + "epoch": 0.54, + "grad_norm": 0.8695056976150293, + "learning_rate": 9.271861337823763e-06, + "loss": 0.1663, + "step": 10570 + }, + { + "epoch": 0.54, + "grad_norm": 0.912319748951526, + "learning_rate": 9.270218750881036e-06, + "loss": 0.1891, + "step": 10571 + }, + { + "epoch": 0.54, + "grad_norm": 1.1769578406569654, + "learning_rate": 9.26857618373366e-06, + "loss": 0.2083, + "step": 10572 + }, + { + "epoch": 0.54, + "grad_norm": 1.1584981841543633, + "learning_rate": 9.2669336364262e-06, + "loss": 0.1728, + "step": 10573 + }, + { + "epoch": 0.54, + "grad_norm": 0.7834513467328376, + "learning_rate": 9.265291109003208e-06, + "loss": 0.1634, + "step": 10574 + }, + { + "epoch": 0.54, + "grad_norm": 1.5024126986383335, + "learning_rate": 9.263648601509231e-06, + "loss": 0.1773, + "step": 10575 + }, + { + "epoch": 0.54, + "grad_norm": 0.9605455781620283, + "learning_rate": 9.262006113988832e-06, + "loss": 0.1835, + "step": 10576 + }, + { + "epoch": 0.54, + "grad_norm": 0.7979609875885121, + "learning_rate": 9.260363646486556e-06, + "loss": 0.185, + "step": 10577 + }, + { + "epoch": 0.54, + "grad_norm": 1.0930640857959548, + "learning_rate": 9.25872119904696e-06, + "loss": 0.1911, + "step": 10578 + }, + { + "epoch": 0.54, + "grad_norm": 1.5730311189725499, + "learning_rate": 9.257078771714591e-06, + "loss": 0.1897, + "step": 10579 + }, + { + "epoch": 0.54, + "grad_norm": 1.00160813983151, + "learning_rate": 9.255436364534005e-06, + "loss": 0.2044, + "step": 10580 + }, + { + "epoch": 0.54, + "grad_norm": 0.907234305672958, + "learning_rate": 9.253793977549747e-06, + "loss": 0.1529, + "step": 10581 + }, + { + "epoch": 0.54, + "grad_norm": 1.7767435462070758, + "learning_rate": 9.25215161080637e-06, + "loss": 0.1958, + "step": 10582 + }, + { + "epoch": 0.54, + "grad_norm": 0.9855417730617082, + "learning_rate": 9.250509264348422e-06, + "loss": 0.2006, + "step": 10583 + }, + { + "epoch": 0.54, + "grad_norm": 0.8226800135543453, + "learning_rate": 9.248866938220456e-06, + "loss": 0.1796, + "step": 10584 + }, + { + "epoch": 0.54, + "grad_norm": 0.8673771835954791, + "learning_rate": 9.247224632467014e-06, + "loss": 0.1813, + "step": 10585 + }, + { + "epoch": 0.54, + "grad_norm": 1.6398020717643, + "learning_rate": 9.245582347132646e-06, + "loss": 0.1856, + "step": 10586 + }, + { + "epoch": 0.54, + "grad_norm": 0.9801202936924129, + "learning_rate": 9.2439400822619e-06, + "loss": 0.1629, + "step": 10587 + }, + { + "epoch": 0.54, + "grad_norm": 1.09510375553041, + "learning_rate": 9.242297837899325e-06, + "loss": 0.1768, + "step": 10588 + }, + { + "epoch": 0.54, + "grad_norm": 1.005381026821629, + "learning_rate": 9.240655614089459e-06, + "loss": 0.2058, + "step": 10589 + }, + { + "epoch": 0.54, + "grad_norm": 1.3320793772483235, + "learning_rate": 9.239013410876856e-06, + "loss": 0.1891, + "step": 10590 + }, + { + "epoch": 0.54, + "grad_norm": 1.1738236438179857, + "learning_rate": 9.237371228306057e-06, + "loss": 0.202, + "step": 10591 + }, + { + "epoch": 0.54, + "grad_norm": 1.1494644621728747, + "learning_rate": 9.235729066421604e-06, + "loss": 0.1738, + "step": 10592 + }, + { + "epoch": 0.54, + "grad_norm": 0.9827935431430217, + "learning_rate": 9.234086925268046e-06, + "loss": 0.1813, + "step": 10593 + }, + { + "epoch": 0.54, + "grad_norm": 1.1216711618319328, + "learning_rate": 9.232444804889921e-06, + "loss": 0.1901, + "step": 10594 + }, + { + "epoch": 0.54, + "grad_norm": 1.3244857919711657, + "learning_rate": 9.230802705331776e-06, + "loss": 0.184, + "step": 10595 + }, + { + "epoch": 0.54, + "grad_norm": 0.9137910573447837, + "learning_rate": 9.229160626638148e-06, + "loss": 0.1817, + "step": 10596 + }, + { + "epoch": 0.54, + "grad_norm": 0.9236684179786859, + "learning_rate": 9.227518568853587e-06, + "loss": 0.1825, + "step": 10597 + }, + { + "epoch": 0.54, + "grad_norm": 1.135558348639019, + "learning_rate": 9.225876532022623e-06, + "loss": 0.1815, + "step": 10598 + }, + { + "epoch": 0.54, + "grad_norm": 0.9099245816546617, + "learning_rate": 9.224234516189803e-06, + "loss": 0.1802, + "step": 10599 + }, + { + "epoch": 0.54, + "grad_norm": 0.936960906132101, + "learning_rate": 9.222592521399666e-06, + "loss": 0.1844, + "step": 10600 + }, + { + "epoch": 0.54, + "grad_norm": 1.196191487570183, + "learning_rate": 9.220950547696754e-06, + "loss": 0.1801, + "step": 10601 + }, + { + "epoch": 0.54, + "grad_norm": 1.1492432241366455, + "learning_rate": 9.219308595125598e-06, + "loss": 0.1677, + "step": 10602 + }, + { + "epoch": 0.54, + "grad_norm": 1.2051869978190075, + "learning_rate": 9.217666663730744e-06, + "loss": 0.1814, + "step": 10603 + }, + { + "epoch": 0.54, + "grad_norm": 1.0243526918755632, + "learning_rate": 9.216024753556722e-06, + "loss": 0.1958, + "step": 10604 + }, + { + "epoch": 0.54, + "grad_norm": 1.084683646176466, + "learning_rate": 9.214382864648075e-06, + "loss": 0.1562, + "step": 10605 + }, + { + "epoch": 0.54, + "grad_norm": 1.080449899563939, + "learning_rate": 9.212740997049335e-06, + "loss": 0.2075, + "step": 10606 + }, + { + "epoch": 0.54, + "grad_norm": 1.2832964653669452, + "learning_rate": 9.211099150805046e-06, + "loss": 0.1761, + "step": 10607 + }, + { + "epoch": 0.54, + "grad_norm": 1.7528709193932426, + "learning_rate": 9.209457325959731e-06, + "loss": 0.1813, + "step": 10608 + }, + { + "epoch": 0.54, + "grad_norm": 0.8444403300450413, + "learning_rate": 9.207815522557932e-06, + "loss": 0.1839, + "step": 10609 + }, + { + "epoch": 0.54, + "grad_norm": 0.9972227451408476, + "learning_rate": 9.206173740644186e-06, + "loss": 0.1934, + "step": 10610 + }, + { + "epoch": 0.54, + "grad_norm": 3.8715618480349923, + "learning_rate": 9.204531980263017e-06, + "loss": 0.191, + "step": 10611 + }, + { + "epoch": 0.54, + "grad_norm": 1.2879592708261964, + "learning_rate": 9.202890241458963e-06, + "loss": 0.1668, + "step": 10612 + }, + { + "epoch": 0.54, + "grad_norm": 0.815992453516756, + "learning_rate": 9.201248524276557e-06, + "loss": 0.1838, + "step": 10613 + }, + { + "epoch": 0.54, + "grad_norm": 1.3697672435436143, + "learning_rate": 9.199606828760333e-06, + "loss": 0.2018, + "step": 10614 + }, + { + "epoch": 0.54, + "grad_norm": 1.0653803940723892, + "learning_rate": 9.197965154954815e-06, + "loss": 0.1773, + "step": 10615 + }, + { + "epoch": 0.54, + "grad_norm": 0.7669012746424213, + "learning_rate": 9.196323502904542e-06, + "loss": 0.1711, + "step": 10616 + }, + { + "epoch": 0.54, + "grad_norm": 1.2937877619242906, + "learning_rate": 9.194681872654034e-06, + "loss": 0.1799, + "step": 10617 + }, + { + "epoch": 0.54, + "grad_norm": 1.490122996501818, + "learning_rate": 9.19304026424783e-06, + "loss": 0.1531, + "step": 10618 + }, + { + "epoch": 0.54, + "grad_norm": 1.3258244978870801, + "learning_rate": 9.191398677730449e-06, + "loss": 0.1871, + "step": 10619 + }, + { + "epoch": 0.54, + "grad_norm": 0.9765260196003005, + "learning_rate": 9.189757113146431e-06, + "loss": 0.1832, + "step": 10620 + }, + { + "epoch": 0.54, + "grad_norm": 1.1123303040511696, + "learning_rate": 9.188115570540292e-06, + "loss": 0.1659, + "step": 10621 + }, + { + "epoch": 0.54, + "grad_norm": 1.132217710576086, + "learning_rate": 9.186474049956568e-06, + "loss": 0.1827, + "step": 10622 + }, + { + "epoch": 0.54, + "grad_norm": 1.0677326613521958, + "learning_rate": 9.184832551439777e-06, + "loss": 0.1999, + "step": 10623 + }, + { + "epoch": 0.54, + "grad_norm": 0.8733102154009217, + "learning_rate": 9.183191075034455e-06, + "loss": 0.1573, + "step": 10624 + }, + { + "epoch": 0.54, + "grad_norm": 1.4879165809895192, + "learning_rate": 9.181549620785115e-06, + "loss": 0.1958, + "step": 10625 + }, + { + "epoch": 0.54, + "grad_norm": 0.989982301785905, + "learning_rate": 9.179908188736291e-06, + "loss": 0.1876, + "step": 10626 + }, + { + "epoch": 0.54, + "grad_norm": 0.9870998274556386, + "learning_rate": 9.178266778932506e-06, + "loss": 0.1604, + "step": 10627 + }, + { + "epoch": 0.54, + "grad_norm": 1.0754699021323235, + "learning_rate": 9.176625391418277e-06, + "loss": 0.1792, + "step": 10628 + }, + { + "epoch": 0.54, + "grad_norm": 0.8355366004814154, + "learning_rate": 9.174984026238134e-06, + "loss": 0.1895, + "step": 10629 + }, + { + "epoch": 0.54, + "grad_norm": 1.4989380803523193, + "learning_rate": 9.173342683436593e-06, + "loss": 0.2049, + "step": 10630 + }, + { + "epoch": 0.54, + "grad_norm": 0.7914223255584208, + "learning_rate": 9.171701363058179e-06, + "loss": 0.1691, + "step": 10631 + }, + { + "epoch": 0.54, + "grad_norm": 0.9393068625759057, + "learning_rate": 9.17006006514741e-06, + "loss": 0.1749, + "step": 10632 + }, + { + "epoch": 0.54, + "grad_norm": 1.1462009659205226, + "learning_rate": 9.168418789748815e-06, + "loss": 0.1781, + "step": 10633 + }, + { + "epoch": 0.54, + "grad_norm": 1.3736892729869854, + "learning_rate": 9.166777536906901e-06, + "loss": 0.1726, + "step": 10634 + }, + { + "epoch": 0.54, + "grad_norm": 1.1734054679774246, + "learning_rate": 9.165136306666196e-06, + "loss": 0.1598, + "step": 10635 + }, + { + "epoch": 0.54, + "grad_norm": 0.8581227962415139, + "learning_rate": 9.163495099071214e-06, + "loss": 0.1763, + "step": 10636 + }, + { + "epoch": 0.54, + "grad_norm": 0.9346363358837523, + "learning_rate": 9.161853914166479e-06, + "loss": 0.2053, + "step": 10637 + }, + { + "epoch": 0.54, + "grad_norm": 1.0907314457043735, + "learning_rate": 9.1602127519965e-06, + "loss": 0.1876, + "step": 10638 + }, + { + "epoch": 0.54, + "grad_norm": 0.9318701442009211, + "learning_rate": 9.1585716126058e-06, + "loss": 0.1759, + "step": 10639 + }, + { + "epoch": 0.54, + "grad_norm": 0.9744826376394506, + "learning_rate": 9.15693049603889e-06, + "loss": 0.2064, + "step": 10640 + }, + { + "epoch": 0.54, + "grad_norm": 1.0207950872373368, + "learning_rate": 9.155289402340291e-06, + "loss": 0.1799, + "step": 10641 + }, + { + "epoch": 0.54, + "grad_norm": 0.9647089007127575, + "learning_rate": 9.153648331554511e-06, + "loss": 0.1964, + "step": 10642 + }, + { + "epoch": 0.54, + "grad_norm": 1.1810311399011428, + "learning_rate": 9.152007283726073e-06, + "loss": 0.1655, + "step": 10643 + }, + { + "epoch": 0.54, + "grad_norm": 1.67131220083229, + "learning_rate": 9.150366258899483e-06, + "loss": 0.2026, + "step": 10644 + }, + { + "epoch": 0.54, + "grad_norm": 1.0390141748363309, + "learning_rate": 9.148725257119253e-06, + "loss": 0.1978, + "step": 10645 + }, + { + "epoch": 0.54, + "grad_norm": 2.4274642279887897, + "learning_rate": 9.147084278429903e-06, + "loss": 0.1734, + "step": 10646 + }, + { + "epoch": 0.54, + "grad_norm": 1.1536905388237146, + "learning_rate": 9.145443322875937e-06, + "loss": 0.1664, + "step": 10647 + }, + { + "epoch": 0.54, + "grad_norm": 1.0672186751727026, + "learning_rate": 9.14380239050187e-06, + "loss": 0.1857, + "step": 10648 + }, + { + "epoch": 0.54, + "grad_norm": 1.1406716880450503, + "learning_rate": 9.14216148135221e-06, + "loss": 0.1912, + "step": 10649 + }, + { + "epoch": 0.54, + "grad_norm": 1.6124381866654078, + "learning_rate": 9.140520595471473e-06, + "loss": 0.1919, + "step": 10650 + }, + { + "epoch": 0.54, + "grad_norm": 0.9507612008810503, + "learning_rate": 9.138879732904157e-06, + "loss": 0.2101, + "step": 10651 + }, + { + "epoch": 0.54, + "grad_norm": 0.8238385961207768, + "learning_rate": 9.137238893694781e-06, + "loss": 0.2056, + "step": 10652 + }, + { + "epoch": 0.54, + "grad_norm": 0.9904799908904333, + "learning_rate": 9.135598077887846e-06, + "loss": 0.192, + "step": 10653 + }, + { + "epoch": 0.54, + "grad_norm": 1.0593204896708142, + "learning_rate": 9.133957285527868e-06, + "loss": 0.1807, + "step": 10654 + }, + { + "epoch": 0.54, + "grad_norm": 0.8334129482294383, + "learning_rate": 9.132316516659341e-06, + "loss": 0.1912, + "step": 10655 + }, + { + "epoch": 0.54, + "grad_norm": 2.878014703805649, + "learning_rate": 9.130675771326783e-06, + "loss": 0.1927, + "step": 10656 + }, + { + "epoch": 0.54, + "grad_norm": 1.0126816393076499, + "learning_rate": 9.129035049574688e-06, + "loss": 0.2007, + "step": 10657 + }, + { + "epoch": 0.54, + "grad_norm": 1.280017082713366, + "learning_rate": 9.12739435144757e-06, + "loss": 0.1823, + "step": 10658 + }, + { + "epoch": 0.54, + "grad_norm": 1.2509973026115213, + "learning_rate": 9.125753676989926e-06, + "loss": 0.1979, + "step": 10659 + }, + { + "epoch": 0.54, + "grad_norm": 1.090298495463318, + "learning_rate": 9.124113026246268e-06, + "loss": 0.1854, + "step": 10660 + }, + { + "epoch": 0.54, + "grad_norm": 1.5921016493384281, + "learning_rate": 9.122472399261092e-06, + "loss": 0.1974, + "step": 10661 + }, + { + "epoch": 0.54, + "grad_norm": 0.9171169378239742, + "learning_rate": 9.1208317960789e-06, + "loss": 0.197, + "step": 10662 + }, + { + "epoch": 0.54, + "grad_norm": 0.8770555727852963, + "learning_rate": 9.119191216744197e-06, + "loss": 0.2032, + "step": 10663 + }, + { + "epoch": 0.54, + "grad_norm": 1.3314627171471414, + "learning_rate": 9.11755066130148e-06, + "loss": 0.1925, + "step": 10664 + }, + { + "epoch": 0.54, + "grad_norm": 0.8738649746145589, + "learning_rate": 9.115910129795252e-06, + "loss": 0.184, + "step": 10665 + }, + { + "epoch": 0.54, + "grad_norm": 0.9780393428276124, + "learning_rate": 9.11426962227001e-06, + "loss": 0.1919, + "step": 10666 + }, + { + "epoch": 0.54, + "grad_norm": 0.9004068467150715, + "learning_rate": 9.112629138770259e-06, + "loss": 0.1904, + "step": 10667 + }, + { + "epoch": 0.54, + "grad_norm": 1.0289144423469088, + "learning_rate": 9.110988679340488e-06, + "loss": 0.1721, + "step": 10668 + }, + { + "epoch": 0.54, + "grad_norm": 1.1838230125708527, + "learning_rate": 9.109348244025204e-06, + "loss": 0.1929, + "step": 10669 + }, + { + "epoch": 0.54, + "grad_norm": 0.9743941893483662, + "learning_rate": 9.107707832868896e-06, + "loss": 0.1949, + "step": 10670 + }, + { + "epoch": 0.54, + "grad_norm": 0.8473092924176169, + "learning_rate": 9.106067445916064e-06, + "loss": 0.1724, + "step": 10671 + }, + { + "epoch": 0.54, + "grad_norm": 0.957361674100075, + "learning_rate": 9.104427083211201e-06, + "loss": 0.1669, + "step": 10672 + }, + { + "epoch": 0.54, + "grad_norm": 0.7641459772145157, + "learning_rate": 9.10278674479881e-06, + "loss": 0.166, + "step": 10673 + }, + { + "epoch": 0.54, + "grad_norm": 0.9649873289264107, + "learning_rate": 9.101146430723373e-06, + "loss": 0.157, + "step": 10674 + }, + { + "epoch": 0.54, + "grad_norm": 1.695599698503498, + "learning_rate": 9.099506141029393e-06, + "loss": 0.1863, + "step": 10675 + }, + { + "epoch": 0.54, + "grad_norm": 1.041754790008622, + "learning_rate": 9.097865875761356e-06, + "loss": 0.1821, + "step": 10676 + }, + { + "epoch": 0.54, + "grad_norm": 1.4290400240424552, + "learning_rate": 9.096225634963764e-06, + "loss": 0.1835, + "step": 10677 + }, + { + "epoch": 0.54, + "grad_norm": 1.2286969932742677, + "learning_rate": 9.094585418681098e-06, + "loss": 0.1774, + "step": 10678 + }, + { + "epoch": 0.54, + "grad_norm": 0.8841463674321048, + "learning_rate": 9.092945226957856e-06, + "loss": 0.1698, + "step": 10679 + }, + { + "epoch": 0.54, + "grad_norm": 1.219517297991463, + "learning_rate": 9.091305059838528e-06, + "loss": 0.1917, + "step": 10680 + }, + { + "epoch": 0.54, + "grad_norm": 1.2140141811496525, + "learning_rate": 9.089664917367597e-06, + "loss": 0.1842, + "step": 10681 + }, + { + "epoch": 0.54, + "grad_norm": 1.2792334097251197, + "learning_rate": 9.088024799589557e-06, + "loss": 0.2023, + "step": 10682 + }, + { + "epoch": 0.54, + "grad_norm": 1.0618023545637925, + "learning_rate": 9.086384706548897e-06, + "loss": 0.1664, + "step": 10683 + }, + { + "epoch": 0.54, + "grad_norm": 4.191493755751196, + "learning_rate": 9.084744638290105e-06, + "loss": 0.1846, + "step": 10684 + }, + { + "epoch": 0.54, + "grad_norm": 0.9933607568905328, + "learning_rate": 9.083104594857663e-06, + "loss": 0.1936, + "step": 10685 + }, + { + "epoch": 0.54, + "grad_norm": 2.8200920801777767, + "learning_rate": 9.081464576296066e-06, + "loss": 0.1865, + "step": 10686 + }, + { + "epoch": 0.54, + "grad_norm": 0.971900461440888, + "learning_rate": 9.079824582649788e-06, + "loss": 0.1748, + "step": 10687 + }, + { + "epoch": 0.54, + "grad_norm": 0.8686920317236623, + "learning_rate": 9.078184613963324e-06, + "loss": 0.1749, + "step": 10688 + }, + { + "epoch": 0.54, + "grad_norm": 1.0556078077071391, + "learning_rate": 9.076544670281153e-06, + "loss": 0.1644, + "step": 10689 + }, + { + "epoch": 0.54, + "grad_norm": 0.8479512987307262, + "learning_rate": 9.074904751647762e-06, + "loss": 0.1655, + "step": 10690 + }, + { + "epoch": 0.54, + "grad_norm": 2.9580535758655278, + "learning_rate": 9.073264858107628e-06, + "loss": 0.2055, + "step": 10691 + }, + { + "epoch": 0.54, + "grad_norm": 1.0938096694712256, + "learning_rate": 9.071624989705241e-06, + "loss": 0.1681, + "step": 10692 + }, + { + "epoch": 0.54, + "grad_norm": 1.4343843624625687, + "learning_rate": 9.069985146485075e-06, + "loss": 0.1951, + "step": 10693 + }, + { + "epoch": 0.54, + "grad_norm": 0.9902205037582185, + "learning_rate": 9.068345328491619e-06, + "loss": 0.1865, + "step": 10694 + }, + { + "epoch": 0.54, + "grad_norm": 0.9372712378232515, + "learning_rate": 9.066705535769345e-06, + "loss": 0.1802, + "step": 10695 + }, + { + "epoch": 0.54, + "grad_norm": 1.4848712846231706, + "learning_rate": 9.065065768362736e-06, + "loss": 0.1754, + "step": 10696 + }, + { + "epoch": 0.54, + "grad_norm": 1.5807359672516952, + "learning_rate": 9.063426026316275e-06, + "loss": 0.1838, + "step": 10697 + }, + { + "epoch": 0.54, + "grad_norm": 1.059155620869402, + "learning_rate": 9.061786309674431e-06, + "loss": 0.1754, + "step": 10698 + }, + { + "epoch": 0.54, + "grad_norm": 0.9541798586769945, + "learning_rate": 9.060146618481692e-06, + "loss": 0.1656, + "step": 10699 + }, + { + "epoch": 0.54, + "grad_norm": 1.81118910909562, + "learning_rate": 9.058506952782522e-06, + "loss": 0.2161, + "step": 10700 + }, + { + "epoch": 0.54, + "grad_norm": 1.0132204443059734, + "learning_rate": 9.056867312621408e-06, + "loss": 0.1715, + "step": 10701 + }, + { + "epoch": 0.54, + "grad_norm": 1.0257858478307955, + "learning_rate": 9.05522769804282e-06, + "loss": 0.193, + "step": 10702 + }, + { + "epoch": 0.54, + "grad_norm": 0.8758821720127123, + "learning_rate": 9.053588109091236e-06, + "loss": 0.1795, + "step": 10703 + }, + { + "epoch": 0.54, + "grad_norm": 1.459217006890036, + "learning_rate": 9.051948545811125e-06, + "loss": 0.1789, + "step": 10704 + }, + { + "epoch": 0.54, + "grad_norm": 1.0812146827278828, + "learning_rate": 9.050309008246965e-06, + "loss": 0.2045, + "step": 10705 + }, + { + "epoch": 0.54, + "grad_norm": 2.7067803049987202, + "learning_rate": 9.048669496443226e-06, + "loss": 0.197, + "step": 10706 + }, + { + "epoch": 0.54, + "grad_norm": 1.0480730399539124, + "learning_rate": 9.047030010444382e-06, + "loss": 0.1772, + "step": 10707 + }, + { + "epoch": 0.54, + "grad_norm": 0.9990838162286354, + "learning_rate": 9.045390550294901e-06, + "loss": 0.1904, + "step": 10708 + }, + { + "epoch": 0.54, + "grad_norm": 0.8913607947233958, + "learning_rate": 9.043751116039257e-06, + "loss": 0.182, + "step": 10709 + }, + { + "epoch": 0.54, + "grad_norm": 1.0511457664452766, + "learning_rate": 9.042111707721916e-06, + "loss": 0.1594, + "step": 10710 + }, + { + "epoch": 0.54, + "grad_norm": 1.0732583932499873, + "learning_rate": 9.040472325387352e-06, + "loss": 0.2012, + "step": 10711 + }, + { + "epoch": 0.54, + "grad_norm": 1.903939596274699, + "learning_rate": 9.038832969080029e-06, + "loss": 0.1879, + "step": 10712 + }, + { + "epoch": 0.54, + "grad_norm": 2.200570819981873, + "learning_rate": 9.037193638844417e-06, + "loss": 0.1818, + "step": 10713 + }, + { + "epoch": 0.54, + "grad_norm": 0.7451402376048245, + "learning_rate": 9.035554334724982e-06, + "loss": 0.1894, + "step": 10714 + }, + { + "epoch": 0.54, + "grad_norm": 1.0144393292155092, + "learning_rate": 9.033915056766187e-06, + "loss": 0.1624, + "step": 10715 + }, + { + "epoch": 0.54, + "grad_norm": 1.1598482461764674, + "learning_rate": 9.032275805012505e-06, + "loss": 0.187, + "step": 10716 + }, + { + "epoch": 0.54, + "grad_norm": 1.4088383790140577, + "learning_rate": 9.030636579508392e-06, + "loss": 0.1993, + "step": 10717 + }, + { + "epoch": 0.55, + "grad_norm": 0.9323936061752953, + "learning_rate": 9.02899738029832e-06, + "loss": 0.1877, + "step": 10718 + }, + { + "epoch": 0.55, + "grad_norm": 1.6577111888773488, + "learning_rate": 9.027358207426745e-06, + "loss": 0.2049, + "step": 10719 + }, + { + "epoch": 0.55, + "grad_norm": 0.9143010624235737, + "learning_rate": 9.025719060938138e-06, + "loss": 0.1708, + "step": 10720 + }, + { + "epoch": 0.55, + "grad_norm": 0.8855990663781108, + "learning_rate": 9.024079940876954e-06, + "loss": 0.199, + "step": 10721 + }, + { + "epoch": 0.55, + "grad_norm": 0.9750804804995725, + "learning_rate": 9.022440847287656e-06, + "loss": 0.2279, + "step": 10722 + }, + { + "epoch": 0.55, + "grad_norm": 0.8803501296190209, + "learning_rate": 9.020801780214705e-06, + "loss": 0.1813, + "step": 10723 + }, + { + "epoch": 0.55, + "grad_norm": 0.9742924036790803, + "learning_rate": 9.019162739702564e-06, + "loss": 0.1951, + "step": 10724 + }, + { + "epoch": 0.55, + "grad_norm": 1.0582094217135745, + "learning_rate": 9.017523725795688e-06, + "loss": 0.1769, + "step": 10725 + }, + { + "epoch": 0.55, + "grad_norm": 0.9670077934281341, + "learning_rate": 9.015884738538537e-06, + "loss": 0.1735, + "step": 10726 + }, + { + "epoch": 0.55, + "grad_norm": 0.9456609790651568, + "learning_rate": 9.014245777975565e-06, + "loss": 0.1664, + "step": 10727 + }, + { + "epoch": 0.55, + "grad_norm": 1.5450763527086255, + "learning_rate": 9.012606844151235e-06, + "loss": 0.2163, + "step": 10728 + }, + { + "epoch": 0.55, + "grad_norm": 0.8390513214365491, + "learning_rate": 9.010967937109997e-06, + "loss": 0.1823, + "step": 10729 + }, + { + "epoch": 0.55, + "grad_norm": 1.0167610566569403, + "learning_rate": 9.009329056896314e-06, + "loss": 0.2059, + "step": 10730 + }, + { + "epoch": 0.55, + "grad_norm": 0.8110446302914665, + "learning_rate": 9.007690203554636e-06, + "loss": 0.1828, + "step": 10731 + }, + { + "epoch": 0.55, + "grad_norm": 0.9537201974786012, + "learning_rate": 9.006051377129412e-06, + "loss": 0.1662, + "step": 10732 + }, + { + "epoch": 0.55, + "grad_norm": 0.8364630993085369, + "learning_rate": 9.004412577665107e-06, + "loss": 0.166, + "step": 10733 + }, + { + "epoch": 0.55, + "grad_norm": 0.9991791474471617, + "learning_rate": 9.002773805206161e-06, + "loss": 0.198, + "step": 10734 + }, + { + "epoch": 0.55, + "grad_norm": 1.328538844649469, + "learning_rate": 9.001135059797036e-06, + "loss": 0.1678, + "step": 10735 + }, + { + "epoch": 0.55, + "grad_norm": 1.333354113863571, + "learning_rate": 8.999496341482176e-06, + "loss": 0.1917, + "step": 10736 + }, + { + "epoch": 0.55, + "grad_norm": 1.1206005428077606, + "learning_rate": 8.997857650306038e-06, + "loss": 0.209, + "step": 10737 + }, + { + "epoch": 0.55, + "grad_norm": 1.1991656192292084, + "learning_rate": 8.996218986313063e-06, + "loss": 0.2033, + "step": 10738 + }, + { + "epoch": 0.55, + "grad_norm": 11.08974914382019, + "learning_rate": 8.994580349547711e-06, + "loss": 0.1793, + "step": 10739 + }, + { + "epoch": 0.55, + "grad_norm": 0.8497449718022965, + "learning_rate": 8.992941740054418e-06, + "loss": 0.1951, + "step": 10740 + }, + { + "epoch": 0.55, + "grad_norm": 1.2854953153089266, + "learning_rate": 8.99130315787764e-06, + "loss": 0.1944, + "step": 10741 + }, + { + "epoch": 0.55, + "grad_norm": 1.316795993709575, + "learning_rate": 8.989664603061818e-06, + "loss": 0.1785, + "step": 10742 + }, + { + "epoch": 0.55, + "grad_norm": 0.858883046960564, + "learning_rate": 8.988026075651406e-06, + "loss": 0.1789, + "step": 10743 + }, + { + "epoch": 0.55, + "grad_norm": 0.7604592648249685, + "learning_rate": 8.986387575690839e-06, + "loss": 0.1655, + "step": 10744 + }, + { + "epoch": 0.55, + "grad_norm": 0.9078061178716357, + "learning_rate": 8.984749103224568e-06, + "loss": 0.2136, + "step": 10745 + }, + { + "epoch": 0.55, + "grad_norm": 0.7417197418190933, + "learning_rate": 8.983110658297033e-06, + "loss": 0.1641, + "step": 10746 + }, + { + "epoch": 0.55, + "grad_norm": 1.0208350376886899, + "learning_rate": 8.981472240952683e-06, + "loss": 0.1941, + "step": 10747 + }, + { + "epoch": 0.55, + "grad_norm": 1.0770493927619436, + "learning_rate": 8.979833851235952e-06, + "loss": 0.181, + "step": 10748 + }, + { + "epoch": 0.55, + "grad_norm": 1.2738105750757516, + "learning_rate": 8.978195489191288e-06, + "loss": 0.1891, + "step": 10749 + }, + { + "epoch": 0.55, + "grad_norm": 1.3508481124158724, + "learning_rate": 8.97655715486313e-06, + "loss": 0.1934, + "step": 10750 + }, + { + "epoch": 0.55, + "grad_norm": 0.9695221805853889, + "learning_rate": 8.974918848295914e-06, + "loss": 0.1796, + "step": 10751 + }, + { + "epoch": 0.55, + "grad_norm": 1.2229861774074742, + "learning_rate": 8.973280569534086e-06, + "loss": 0.1776, + "step": 10752 + }, + { + "epoch": 0.55, + "grad_norm": 0.9182531427775426, + "learning_rate": 8.971642318622076e-06, + "loss": 0.1921, + "step": 10753 + }, + { + "epoch": 0.55, + "grad_norm": 0.8634715877601445, + "learning_rate": 8.970004095604329e-06, + "loss": 0.177, + "step": 10754 + }, + { + "epoch": 0.55, + "grad_norm": 0.8793938254743472, + "learning_rate": 8.968365900525275e-06, + "loss": 0.1857, + "step": 10755 + }, + { + "epoch": 0.55, + "grad_norm": 0.9715431874467615, + "learning_rate": 8.966727733429362e-06, + "loss": 0.1889, + "step": 10756 + }, + { + "epoch": 0.55, + "grad_norm": 0.9027070073199415, + "learning_rate": 8.96508959436101e-06, + "loss": 0.1761, + "step": 10757 + }, + { + "epoch": 0.55, + "grad_norm": 1.1654803110367036, + "learning_rate": 8.963451483364665e-06, + "loss": 0.1877, + "step": 10758 + }, + { + "epoch": 0.55, + "grad_norm": 1.002974639763267, + "learning_rate": 8.961813400484753e-06, + "loss": 0.186, + "step": 10759 + }, + { + "epoch": 0.55, + "grad_norm": 0.9182051316480477, + "learning_rate": 8.960175345765718e-06, + "loss": 0.1783, + "step": 10760 + }, + { + "epoch": 0.55, + "grad_norm": 1.1405988075873255, + "learning_rate": 8.95853731925198e-06, + "loss": 0.1648, + "step": 10761 + }, + { + "epoch": 0.55, + "grad_norm": 0.7103698934238166, + "learning_rate": 8.956899320987977e-06, + "loss": 0.1845, + "step": 10762 + }, + { + "epoch": 0.55, + "grad_norm": 0.9802152870441576, + "learning_rate": 8.955261351018138e-06, + "loss": 0.1906, + "step": 10763 + }, + { + "epoch": 0.55, + "grad_norm": 1.1017980194807244, + "learning_rate": 8.953623409386898e-06, + "loss": 0.1808, + "step": 10764 + }, + { + "epoch": 0.55, + "grad_norm": 0.8795915647215737, + "learning_rate": 8.951985496138679e-06, + "loss": 0.1646, + "step": 10765 + }, + { + "epoch": 0.55, + "grad_norm": 0.8785209703760559, + "learning_rate": 8.950347611317915e-06, + "loss": 0.2085, + "step": 10766 + }, + { + "epoch": 0.55, + "grad_norm": 0.8129306072891198, + "learning_rate": 8.94870975496903e-06, + "loss": 0.1803, + "step": 10767 + }, + { + "epoch": 0.55, + "grad_norm": 3.5239210463731796, + "learning_rate": 8.94707192713645e-06, + "loss": 0.1723, + "step": 10768 + }, + { + "epoch": 0.55, + "grad_norm": 0.9476611032628168, + "learning_rate": 8.945434127864608e-06, + "loss": 0.1719, + "step": 10769 + }, + { + "epoch": 0.55, + "grad_norm": 0.7272317358149221, + "learning_rate": 8.94379635719792e-06, + "loss": 0.1572, + "step": 10770 + }, + { + "epoch": 0.55, + "grad_norm": 4.484549886429762, + "learning_rate": 8.942158615180818e-06, + "loss": 0.1828, + "step": 10771 + }, + { + "epoch": 0.55, + "grad_norm": 1.1519687675896289, + "learning_rate": 8.940520901857722e-06, + "loss": 0.1835, + "step": 10772 + }, + { + "epoch": 0.55, + "grad_norm": 0.9810758635443829, + "learning_rate": 8.93888321727306e-06, + "loss": 0.1785, + "step": 10773 + }, + { + "epoch": 0.55, + "grad_norm": 0.7676471358143412, + "learning_rate": 8.937245561471247e-06, + "loss": 0.1815, + "step": 10774 + }, + { + "epoch": 0.55, + "grad_norm": 1.3673114707940612, + "learning_rate": 8.935607934496708e-06, + "loss": 0.1937, + "step": 10775 + }, + { + "epoch": 0.55, + "grad_norm": 1.047631526141729, + "learning_rate": 8.933970336393864e-06, + "loss": 0.1663, + "step": 10776 + }, + { + "epoch": 0.55, + "grad_norm": 1.0940264402701274, + "learning_rate": 8.932332767207138e-06, + "loss": 0.1758, + "step": 10777 + }, + { + "epoch": 0.55, + "grad_norm": 0.7164913330577038, + "learning_rate": 8.930695226980942e-06, + "loss": 0.1674, + "step": 10778 + }, + { + "epoch": 0.55, + "grad_norm": 1.2698096854172805, + "learning_rate": 8.929057715759703e-06, + "loss": 0.1714, + "step": 10779 + }, + { + "epoch": 0.55, + "grad_norm": 0.939943834510151, + "learning_rate": 8.92742023358783e-06, + "loss": 0.1808, + "step": 10780 + }, + { + "epoch": 0.55, + "grad_norm": 5.2860861831418395, + "learning_rate": 8.925782780509745e-06, + "loss": 0.1863, + "step": 10781 + }, + { + "epoch": 0.55, + "grad_norm": 3.6427009088574174, + "learning_rate": 8.92414535656986e-06, + "loss": 0.1753, + "step": 10782 + }, + { + "epoch": 0.55, + "grad_norm": 1.307681135247615, + "learning_rate": 8.922507961812599e-06, + "loss": 0.1795, + "step": 10783 + }, + { + "epoch": 0.55, + "grad_norm": 1.0184505253136333, + "learning_rate": 8.920870596282368e-06, + "loss": 0.1885, + "step": 10784 + }, + { + "epoch": 0.55, + "grad_norm": 1.040277807833205, + "learning_rate": 8.91923326002358e-06, + "loss": 0.1727, + "step": 10785 + }, + { + "epoch": 0.55, + "grad_norm": 1.0954583122249455, + "learning_rate": 8.917595953080656e-06, + "loss": 0.1815, + "step": 10786 + }, + { + "epoch": 0.55, + "grad_norm": 0.9721497354157779, + "learning_rate": 8.915958675497997e-06, + "loss": 0.1791, + "step": 10787 + }, + { + "epoch": 0.55, + "grad_norm": 0.9360953372361804, + "learning_rate": 8.914321427320024e-06, + "loss": 0.1773, + "step": 10788 + }, + { + "epoch": 0.55, + "grad_norm": 1.4024322278896943, + "learning_rate": 8.91268420859114e-06, + "loss": 0.1794, + "step": 10789 + }, + { + "epoch": 0.55, + "grad_norm": 0.9425720861101381, + "learning_rate": 8.911047019355763e-06, + "loss": 0.1712, + "step": 10790 + }, + { + "epoch": 0.55, + "grad_norm": 1.639149419515541, + "learning_rate": 8.909409859658293e-06, + "loss": 0.1579, + "step": 10791 + }, + { + "epoch": 0.55, + "grad_norm": 1.0818029150587538, + "learning_rate": 8.907772729543145e-06, + "loss": 0.2021, + "step": 10792 + }, + { + "epoch": 0.55, + "grad_norm": 1.409368473961371, + "learning_rate": 8.90613562905472e-06, + "loss": 0.17, + "step": 10793 + }, + { + "epoch": 0.55, + "grad_norm": 0.9131587173741051, + "learning_rate": 8.904498558237434e-06, + "loss": 0.1664, + "step": 10794 + }, + { + "epoch": 0.55, + "grad_norm": 0.9603316547360553, + "learning_rate": 8.902861517135682e-06, + "loss": 0.1652, + "step": 10795 + }, + { + "epoch": 0.55, + "grad_norm": 1.1837257062991657, + "learning_rate": 8.901224505793876e-06, + "loss": 0.1775, + "step": 10796 + }, + { + "epoch": 0.55, + "grad_norm": 0.8485619861393137, + "learning_rate": 8.899587524256414e-06, + "loss": 0.1721, + "step": 10797 + }, + { + "epoch": 0.55, + "grad_norm": 1.0226003384133822, + "learning_rate": 8.897950572567704e-06, + "loss": 0.1849, + "step": 10798 + }, + { + "epoch": 0.55, + "grad_norm": 0.9032907278532869, + "learning_rate": 8.896313650772144e-06, + "loss": 0.1971, + "step": 10799 + }, + { + "epoch": 0.55, + "grad_norm": 1.1700811482162825, + "learning_rate": 8.894676758914145e-06, + "loss": 0.1821, + "step": 10800 + }, + { + "epoch": 0.55, + "grad_norm": 1.3275915038076012, + "learning_rate": 8.893039897038097e-06, + "loss": 0.2052, + "step": 10801 + }, + { + "epoch": 0.55, + "grad_norm": 1.1586253791165513, + "learning_rate": 8.891403065188404e-06, + "loss": 0.1714, + "step": 10802 + }, + { + "epoch": 0.55, + "grad_norm": 1.0172789986237956, + "learning_rate": 8.88976626340947e-06, + "loss": 0.1748, + "step": 10803 + }, + { + "epoch": 0.55, + "grad_norm": 0.9277282769430485, + "learning_rate": 8.888129491745684e-06, + "loss": 0.1624, + "step": 10804 + }, + { + "epoch": 0.55, + "grad_norm": 1.7708823030251715, + "learning_rate": 8.88649275024145e-06, + "loss": 0.2086, + "step": 10805 + }, + { + "epoch": 0.55, + "grad_norm": 1.458422843658293, + "learning_rate": 8.88485603894116e-06, + "loss": 0.2087, + "step": 10806 + }, + { + "epoch": 0.55, + "grad_norm": 3.906221351091835, + "learning_rate": 8.883219357889218e-06, + "loss": 0.1811, + "step": 10807 + }, + { + "epoch": 0.55, + "grad_norm": 0.9681472452182767, + "learning_rate": 8.88158270713001e-06, + "loss": 0.1948, + "step": 10808 + }, + { + "epoch": 0.55, + "grad_norm": 1.0084461542273477, + "learning_rate": 8.87994608670794e-06, + "loss": 0.1857, + "step": 10809 + }, + { + "epoch": 0.55, + "grad_norm": 1.0272427474370724, + "learning_rate": 8.87830949666739e-06, + "loss": 0.1802, + "step": 10810 + }, + { + "epoch": 0.55, + "grad_norm": 1.0135686742121741, + "learning_rate": 8.87667293705276e-06, + "loss": 0.2147, + "step": 10811 + }, + { + "epoch": 0.55, + "grad_norm": 0.9696081518257766, + "learning_rate": 8.875036407908439e-06, + "loss": 0.1948, + "step": 10812 + }, + { + "epoch": 0.55, + "grad_norm": 1.0369553923642154, + "learning_rate": 8.873399909278821e-06, + "loss": 0.2007, + "step": 10813 + }, + { + "epoch": 0.55, + "grad_norm": 1.083568254291667, + "learning_rate": 8.871763441208292e-06, + "loss": 0.1672, + "step": 10814 + }, + { + "epoch": 0.55, + "grad_norm": 0.9319992047044777, + "learning_rate": 8.870127003741245e-06, + "loss": 0.2072, + "step": 10815 + }, + { + "epoch": 0.55, + "grad_norm": 1.0894873647954664, + "learning_rate": 8.868490596922064e-06, + "loss": 0.1849, + "step": 10816 + }, + { + "epoch": 0.55, + "grad_norm": 0.8786214420077622, + "learning_rate": 8.866854220795144e-06, + "loss": 0.1663, + "step": 10817 + }, + { + "epoch": 0.55, + "grad_norm": 0.7965908011134126, + "learning_rate": 8.865217875404864e-06, + "loss": 0.1566, + "step": 10818 + }, + { + "epoch": 0.55, + "grad_norm": 1.122771993981457, + "learning_rate": 8.863581560795614e-06, + "loss": 0.1663, + "step": 10819 + }, + { + "epoch": 0.55, + "grad_norm": 1.5355491727641055, + "learning_rate": 8.861945277011782e-06, + "loss": 0.1811, + "step": 10820 + }, + { + "epoch": 0.55, + "grad_norm": 1.0149352880199478, + "learning_rate": 8.860309024097744e-06, + "loss": 0.1896, + "step": 10821 + }, + { + "epoch": 0.55, + "grad_norm": 1.15441128359665, + "learning_rate": 8.85867280209789e-06, + "loss": 0.19, + "step": 10822 + }, + { + "epoch": 0.55, + "grad_norm": 1.0242201683658139, + "learning_rate": 8.857036611056599e-06, + "loss": 0.1854, + "step": 10823 + }, + { + "epoch": 0.55, + "grad_norm": 1.0740247347675482, + "learning_rate": 8.855400451018256e-06, + "loss": 0.2088, + "step": 10824 + }, + { + "epoch": 0.55, + "grad_norm": 1.7181041405240665, + "learning_rate": 8.853764322027239e-06, + "loss": 0.1843, + "step": 10825 + }, + { + "epoch": 0.55, + "grad_norm": 1.096691445469932, + "learning_rate": 8.852128224127931e-06, + "loss": 0.1879, + "step": 10826 + }, + { + "epoch": 0.55, + "grad_norm": 1.0328893655865132, + "learning_rate": 8.850492157364709e-06, + "loss": 0.1928, + "step": 10827 + }, + { + "epoch": 0.55, + "grad_norm": 1.1989050714425344, + "learning_rate": 8.848856121781953e-06, + "loss": 0.1722, + "step": 10828 + }, + { + "epoch": 0.55, + "grad_norm": 0.9972744783098172, + "learning_rate": 8.847220117424035e-06, + "loss": 0.1822, + "step": 10829 + }, + { + "epoch": 0.55, + "grad_norm": 1.022753486245912, + "learning_rate": 8.845584144335344e-06, + "loss": 0.1798, + "step": 10830 + }, + { + "epoch": 0.55, + "grad_norm": 1.1575173533564185, + "learning_rate": 8.843948202560243e-06, + "loss": 0.1672, + "step": 10831 + }, + { + "epoch": 0.55, + "grad_norm": 1.1022591065507295, + "learning_rate": 8.842312292143115e-06, + "loss": 0.1953, + "step": 10832 + }, + { + "epoch": 0.55, + "grad_norm": 1.074404331661858, + "learning_rate": 8.840676413128329e-06, + "loss": 0.2035, + "step": 10833 + }, + { + "epoch": 0.55, + "grad_norm": 0.9233624295011634, + "learning_rate": 8.839040565560264e-06, + "loss": 0.178, + "step": 10834 + }, + { + "epoch": 0.55, + "grad_norm": 1.1496136891813986, + "learning_rate": 8.837404749483285e-06, + "loss": 0.1718, + "step": 10835 + }, + { + "epoch": 0.55, + "grad_norm": 0.9570721283259703, + "learning_rate": 8.835768964941773e-06, + "loss": 0.194, + "step": 10836 + }, + { + "epoch": 0.55, + "grad_norm": 1.000049498068047, + "learning_rate": 8.834133211980091e-06, + "loss": 0.1732, + "step": 10837 + }, + { + "epoch": 0.55, + "grad_norm": 0.9731055950131926, + "learning_rate": 8.83249749064261e-06, + "loss": 0.1644, + "step": 10838 + }, + { + "epoch": 0.55, + "grad_norm": 0.8763466169368689, + "learning_rate": 8.830861800973705e-06, + "loss": 0.1772, + "step": 10839 + }, + { + "epoch": 0.55, + "grad_norm": 0.9514424877296412, + "learning_rate": 8.829226143017735e-06, + "loss": 0.1578, + "step": 10840 + }, + { + "epoch": 0.55, + "grad_norm": 1.072765787949337, + "learning_rate": 8.827590516819073e-06, + "loss": 0.1659, + "step": 10841 + }, + { + "epoch": 0.55, + "grad_norm": 1.076068814527915, + "learning_rate": 8.825954922422084e-06, + "loss": 0.1831, + "step": 10842 + }, + { + "epoch": 0.55, + "grad_norm": 1.0256703193394343, + "learning_rate": 8.824319359871138e-06, + "loss": 0.1759, + "step": 10843 + }, + { + "epoch": 0.55, + "grad_norm": 0.9485220998311064, + "learning_rate": 8.822683829210591e-06, + "loss": 0.1959, + "step": 10844 + }, + { + "epoch": 0.55, + "grad_norm": 1.8247569049585812, + "learning_rate": 8.821048330484814e-06, + "loss": 0.1611, + "step": 10845 + }, + { + "epoch": 0.55, + "grad_norm": 0.9920482972212759, + "learning_rate": 8.819412863738165e-06, + "loss": 0.1807, + "step": 10846 + }, + { + "epoch": 0.55, + "grad_norm": 1.1803199349485702, + "learning_rate": 8.817777429015013e-06, + "loss": 0.1949, + "step": 10847 + }, + { + "epoch": 0.55, + "grad_norm": 1.8361111010536968, + "learning_rate": 8.816142026359711e-06, + "loss": 0.1904, + "step": 10848 + }, + { + "epoch": 0.55, + "grad_norm": 1.0204752253006832, + "learning_rate": 8.814506655816628e-06, + "loss": 0.2107, + "step": 10849 + }, + { + "epoch": 0.55, + "grad_norm": 0.8683096109613782, + "learning_rate": 8.812871317430114e-06, + "loss": 0.1856, + "step": 10850 + }, + { + "epoch": 0.55, + "grad_norm": 1.1192019964063356, + "learning_rate": 8.811236011244535e-06, + "loss": 0.1803, + "step": 10851 + }, + { + "epoch": 0.55, + "grad_norm": 0.8915818411611265, + "learning_rate": 8.809600737304245e-06, + "loss": 0.1713, + "step": 10852 + }, + { + "epoch": 0.55, + "grad_norm": 1.2922478281929404, + "learning_rate": 8.807965495653605e-06, + "loss": 0.1756, + "step": 10853 + }, + { + "epoch": 0.55, + "grad_norm": 1.1438370119106243, + "learning_rate": 8.806330286336966e-06, + "loss": 0.1737, + "step": 10854 + }, + { + "epoch": 0.55, + "grad_norm": 0.9674863155197395, + "learning_rate": 8.804695109398686e-06, + "loss": 0.1739, + "step": 10855 + }, + { + "epoch": 0.55, + "grad_norm": 0.8716670976435485, + "learning_rate": 8.803059964883121e-06, + "loss": 0.185, + "step": 10856 + }, + { + "epoch": 0.55, + "grad_norm": 0.8751877818085911, + "learning_rate": 8.801424852834617e-06, + "loss": 0.1864, + "step": 10857 + }, + { + "epoch": 0.55, + "grad_norm": 1.2810384916662814, + "learning_rate": 8.799789773297536e-06, + "loss": 0.2092, + "step": 10858 + }, + { + "epoch": 0.55, + "grad_norm": 1.715194154870013, + "learning_rate": 8.79815472631622e-06, + "loss": 0.1817, + "step": 10859 + }, + { + "epoch": 0.55, + "grad_norm": 0.957211967304257, + "learning_rate": 8.796519711935032e-06, + "loss": 0.1881, + "step": 10860 + }, + { + "epoch": 0.55, + "grad_norm": 1.204919477838909, + "learning_rate": 8.794884730198309e-06, + "loss": 0.1823, + "step": 10861 + }, + { + "epoch": 0.55, + "grad_norm": 1.1796530871354738, + "learning_rate": 8.793249781150411e-06, + "loss": 0.216, + "step": 10862 + }, + { + "epoch": 0.55, + "grad_norm": 1.4713416941020465, + "learning_rate": 8.791614864835676e-06, + "loss": 0.1788, + "step": 10863 + }, + { + "epoch": 0.55, + "grad_norm": 0.9662566873754121, + "learning_rate": 8.789979981298457e-06, + "loss": 0.1884, + "step": 10864 + }, + { + "epoch": 0.55, + "grad_norm": 1.356337540589256, + "learning_rate": 8.788345130583099e-06, + "loss": 0.1663, + "step": 10865 + }, + { + "epoch": 0.55, + "grad_norm": 1.7815317896632867, + "learning_rate": 8.786710312733952e-06, + "loss": 0.1656, + "step": 10866 + }, + { + "epoch": 0.55, + "grad_norm": 1.3871753218471037, + "learning_rate": 8.78507552779535e-06, + "loss": 0.1613, + "step": 10867 + }, + { + "epoch": 0.55, + "grad_norm": 1.5491548784204685, + "learning_rate": 8.783440775811646e-06, + "loss": 0.1772, + "step": 10868 + }, + { + "epoch": 0.55, + "grad_norm": 1.3689465680987378, + "learning_rate": 8.781806056827179e-06, + "loss": 0.1688, + "step": 10869 + }, + { + "epoch": 0.55, + "grad_norm": 1.2208364624605414, + "learning_rate": 8.780171370886295e-06, + "loss": 0.1632, + "step": 10870 + }, + { + "epoch": 0.55, + "grad_norm": 1.1374577018670873, + "learning_rate": 8.778536718033329e-06, + "loss": 0.2031, + "step": 10871 + }, + { + "epoch": 0.55, + "grad_norm": 1.064865021837396, + "learning_rate": 8.776902098312622e-06, + "loss": 0.1848, + "step": 10872 + }, + { + "epoch": 0.55, + "grad_norm": 1.1104820656639132, + "learning_rate": 8.775267511768518e-06, + "loss": 0.1866, + "step": 10873 + }, + { + "epoch": 0.55, + "grad_norm": 0.9836540178033396, + "learning_rate": 8.773632958445348e-06, + "loss": 0.182, + "step": 10874 + }, + { + "epoch": 0.55, + "grad_norm": 0.7715029190728462, + "learning_rate": 8.771998438387458e-06, + "loss": 0.1629, + "step": 10875 + }, + { + "epoch": 0.55, + "grad_norm": 0.9027018301738213, + "learning_rate": 8.770363951639175e-06, + "loss": 0.1903, + "step": 10876 + }, + { + "epoch": 0.55, + "grad_norm": 1.0277633203112935, + "learning_rate": 8.768729498244841e-06, + "loss": 0.1688, + "step": 10877 + }, + { + "epoch": 0.55, + "grad_norm": 1.2082709822081161, + "learning_rate": 8.767095078248788e-06, + "loss": 0.1884, + "step": 10878 + }, + { + "epoch": 0.55, + "grad_norm": 1.4968536355503077, + "learning_rate": 8.765460691695353e-06, + "loss": 0.1846, + "step": 10879 + }, + { + "epoch": 0.55, + "grad_norm": 0.8723340832563421, + "learning_rate": 8.763826338628865e-06, + "loss": 0.1659, + "step": 10880 + }, + { + "epoch": 0.55, + "grad_norm": 1.0983506797438705, + "learning_rate": 8.762192019093658e-06, + "loss": 0.1557, + "step": 10881 + }, + { + "epoch": 0.55, + "grad_norm": 2.665668563682617, + "learning_rate": 8.76055773313406e-06, + "loss": 0.1948, + "step": 10882 + }, + { + "epoch": 0.55, + "grad_norm": 1.1360990318938495, + "learning_rate": 8.758923480794407e-06, + "loss": 0.1858, + "step": 10883 + }, + { + "epoch": 0.55, + "grad_norm": 1.0668625114928212, + "learning_rate": 8.757289262119021e-06, + "loss": 0.1843, + "step": 10884 + }, + { + "epoch": 0.55, + "grad_norm": 1.5359141009129476, + "learning_rate": 8.755655077152237e-06, + "loss": 0.1967, + "step": 10885 + }, + { + "epoch": 0.55, + "grad_norm": 1.0503386454467547, + "learning_rate": 8.754020925938375e-06, + "loss": 0.1732, + "step": 10886 + }, + { + "epoch": 0.55, + "grad_norm": 1.0101469154598328, + "learning_rate": 8.752386808521774e-06, + "loss": 0.1775, + "step": 10887 + }, + { + "epoch": 0.55, + "grad_norm": 1.1936552339879511, + "learning_rate": 8.750752724946744e-06, + "loss": 0.1859, + "step": 10888 + }, + { + "epoch": 0.55, + "grad_norm": 1.083771152185385, + "learning_rate": 8.749118675257622e-06, + "loss": 0.1906, + "step": 10889 + }, + { + "epoch": 0.55, + "grad_norm": 1.1874420465891897, + "learning_rate": 8.747484659498725e-06, + "loss": 0.1761, + "step": 10890 + }, + { + "epoch": 0.55, + "grad_norm": 0.9178432426616299, + "learning_rate": 8.745850677714373e-06, + "loss": 0.1913, + "step": 10891 + }, + { + "epoch": 0.55, + "grad_norm": 0.9437458995593613, + "learning_rate": 8.744216729948898e-06, + "loss": 0.1849, + "step": 10892 + }, + { + "epoch": 0.55, + "grad_norm": 1.0195680018807618, + "learning_rate": 8.74258281624661e-06, + "loss": 0.1661, + "step": 10893 + }, + { + "epoch": 0.55, + "grad_norm": 0.9040885233318499, + "learning_rate": 8.740948936651838e-06, + "loss": 0.1879, + "step": 10894 + }, + { + "epoch": 0.55, + "grad_norm": 1.2327915094808635, + "learning_rate": 8.739315091208893e-06, + "loss": 0.1977, + "step": 10895 + }, + { + "epoch": 0.55, + "grad_norm": 1.2229661061013206, + "learning_rate": 8.737681279962103e-06, + "loss": 0.1607, + "step": 10896 + }, + { + "epoch": 0.55, + "grad_norm": 1.8743363590350692, + "learning_rate": 8.736047502955776e-06, + "loss": 0.1775, + "step": 10897 + }, + { + "epoch": 0.55, + "grad_norm": 1.1094227460806871, + "learning_rate": 8.734413760234233e-06, + "loss": 0.1764, + "step": 10898 + }, + { + "epoch": 0.55, + "grad_norm": 1.3084970668744595, + "learning_rate": 8.732780051841785e-06, + "loss": 0.1815, + "step": 10899 + }, + { + "epoch": 0.55, + "grad_norm": 1.6092401166985617, + "learning_rate": 8.731146377822755e-06, + "loss": 0.1936, + "step": 10900 + }, + { + "epoch": 0.55, + "grad_norm": 1.0475620352949029, + "learning_rate": 8.729512738221448e-06, + "loss": 0.2011, + "step": 10901 + }, + { + "epoch": 0.55, + "grad_norm": 0.9645061349451889, + "learning_rate": 8.72787913308218e-06, + "loss": 0.1767, + "step": 10902 + }, + { + "epoch": 0.55, + "grad_norm": 1.0792427949313164, + "learning_rate": 8.726245562449261e-06, + "loss": 0.1862, + "step": 10903 + }, + { + "epoch": 0.55, + "grad_norm": 1.1064164071055667, + "learning_rate": 8.724612026367008e-06, + "loss": 0.188, + "step": 10904 + }, + { + "epoch": 0.55, + "grad_norm": 2.2077951842578907, + "learning_rate": 8.72297852487972e-06, + "loss": 0.2033, + "step": 10905 + }, + { + "epoch": 0.55, + "grad_norm": 0.9631462825847468, + "learning_rate": 8.721345058031718e-06, + "loss": 0.1773, + "step": 10906 + }, + { + "epoch": 0.55, + "grad_norm": 1.0126229462231389, + "learning_rate": 8.7197116258673e-06, + "loss": 0.1764, + "step": 10907 + }, + { + "epoch": 0.55, + "grad_norm": 1.111625593594014, + "learning_rate": 8.718078228430775e-06, + "loss": 0.1739, + "step": 10908 + }, + { + "epoch": 0.55, + "grad_norm": 1.136754878674941, + "learning_rate": 8.716444865766454e-06, + "loss": 0.1699, + "step": 10909 + }, + { + "epoch": 0.55, + "grad_norm": 3.0117881332904743, + "learning_rate": 8.714811537918634e-06, + "loss": 0.2098, + "step": 10910 + }, + { + "epoch": 0.55, + "grad_norm": 0.9200787097045166, + "learning_rate": 8.713178244931626e-06, + "loss": 0.1763, + "step": 10911 + }, + { + "epoch": 0.55, + "grad_norm": 1.1680809124305545, + "learning_rate": 8.71154498684973e-06, + "loss": 0.1767, + "step": 10912 + }, + { + "epoch": 0.55, + "grad_norm": 0.994348721844077, + "learning_rate": 8.709911763717251e-06, + "loss": 0.1888, + "step": 10913 + }, + { + "epoch": 0.55, + "grad_norm": 0.848085721687801, + "learning_rate": 8.708278575578485e-06, + "loss": 0.158, + "step": 10914 + }, + { + "epoch": 0.56, + "grad_norm": 1.2588476989472734, + "learning_rate": 8.706645422477739e-06, + "loss": 0.1978, + "step": 10915 + }, + { + "epoch": 0.56, + "grad_norm": 1.8154000665521712, + "learning_rate": 8.705012304459306e-06, + "loss": 0.1759, + "step": 10916 + }, + { + "epoch": 0.56, + "grad_norm": 1.0771918217834016, + "learning_rate": 8.703379221567491e-06, + "loss": 0.1771, + "step": 10917 + }, + { + "epoch": 0.56, + "grad_norm": 1.1485676136008025, + "learning_rate": 8.701746173846583e-06, + "loss": 0.1665, + "step": 10918 + }, + { + "epoch": 0.56, + "grad_norm": 1.4594060433552434, + "learning_rate": 8.70011316134089e-06, + "loss": 0.2013, + "step": 10919 + }, + { + "epoch": 0.56, + "grad_norm": 1.1035150344192732, + "learning_rate": 8.698480184094695e-06, + "loss": 0.1896, + "step": 10920 + }, + { + "epoch": 0.56, + "grad_norm": 0.79687252374784, + "learning_rate": 8.696847242152301e-06, + "loss": 0.1588, + "step": 10921 + }, + { + "epoch": 0.56, + "grad_norm": 1.8751652874434868, + "learning_rate": 8.695214335557997e-06, + "loss": 0.1703, + "step": 10922 + }, + { + "epoch": 0.56, + "grad_norm": 2.283108327990949, + "learning_rate": 8.693581464356083e-06, + "loss": 0.1891, + "step": 10923 + }, + { + "epoch": 0.56, + "grad_norm": 1.30940533827719, + "learning_rate": 8.691948628590841e-06, + "loss": 0.1774, + "step": 10924 + }, + { + "epoch": 0.56, + "grad_norm": 1.2612497366281696, + "learning_rate": 8.690315828306566e-06, + "loss": 0.1899, + "step": 10925 + }, + { + "epoch": 0.56, + "grad_norm": 1.1632980454401722, + "learning_rate": 8.688683063547551e-06, + "loss": 0.1785, + "step": 10926 + }, + { + "epoch": 0.56, + "grad_norm": 1.2054669236060747, + "learning_rate": 8.68705033435808e-06, + "loss": 0.18, + "step": 10927 + }, + { + "epoch": 0.56, + "grad_norm": 0.8880601277908344, + "learning_rate": 8.685417640782444e-06, + "loss": 0.1753, + "step": 10928 + }, + { + "epoch": 0.56, + "grad_norm": 1.2596056413723435, + "learning_rate": 8.683784982864925e-06, + "loss": 0.1873, + "step": 10929 + }, + { + "epoch": 0.56, + "grad_norm": 1.0722207155737384, + "learning_rate": 8.682152360649819e-06, + "loss": 0.174, + "step": 10930 + }, + { + "epoch": 0.56, + "grad_norm": 1.0243942383285416, + "learning_rate": 8.6805197741814e-06, + "loss": 0.1703, + "step": 10931 + }, + { + "epoch": 0.56, + "grad_norm": 1.04686861248303, + "learning_rate": 8.678887223503962e-06, + "loss": 0.1897, + "step": 10932 + }, + { + "epoch": 0.56, + "grad_norm": 0.99105527675413, + "learning_rate": 8.677254708661775e-06, + "loss": 0.1561, + "step": 10933 + }, + { + "epoch": 0.56, + "grad_norm": 1.2436201373640272, + "learning_rate": 8.675622229699134e-06, + "loss": 0.1735, + "step": 10934 + }, + { + "epoch": 0.56, + "grad_norm": 1.0608032324133017, + "learning_rate": 8.67398978666031e-06, + "loss": 0.1764, + "step": 10935 + }, + { + "epoch": 0.56, + "grad_norm": 1.0845256537351942, + "learning_rate": 8.672357379589595e-06, + "loss": 0.1746, + "step": 10936 + }, + { + "epoch": 0.56, + "grad_norm": 1.1726740536790237, + "learning_rate": 8.670725008531255e-06, + "loss": 0.203, + "step": 10937 + }, + { + "epoch": 0.56, + "grad_norm": 0.8982383138239131, + "learning_rate": 8.669092673529577e-06, + "loss": 0.1936, + "step": 10938 + }, + { + "epoch": 0.56, + "grad_norm": 1.1467761252995048, + "learning_rate": 8.667460374628834e-06, + "loss": 0.2081, + "step": 10939 + }, + { + "epoch": 0.56, + "grad_norm": 1.279402231030666, + "learning_rate": 8.665828111873307e-06, + "loss": 0.1743, + "step": 10940 + }, + { + "epoch": 0.56, + "grad_norm": 1.3727098270038447, + "learning_rate": 8.664195885307265e-06, + "loss": 0.1823, + "step": 10941 + }, + { + "epoch": 0.56, + "grad_norm": 1.6401924818960485, + "learning_rate": 8.662563694974983e-06, + "loss": 0.1907, + "step": 10942 + }, + { + "epoch": 0.56, + "grad_norm": 1.123455051565522, + "learning_rate": 8.660931540920743e-06, + "loss": 0.1908, + "step": 10943 + }, + { + "epoch": 0.56, + "grad_norm": 0.838274020948395, + "learning_rate": 8.659299423188803e-06, + "loss": 0.1799, + "step": 10944 + }, + { + "epoch": 0.56, + "grad_norm": 1.0452364543364976, + "learning_rate": 8.657667341823449e-06, + "loss": 0.1792, + "step": 10945 + }, + { + "epoch": 0.56, + "grad_norm": 0.8190999827722713, + "learning_rate": 8.656035296868938e-06, + "loss": 0.1693, + "step": 10946 + }, + { + "epoch": 0.56, + "grad_norm": 1.4522244007392222, + "learning_rate": 8.654403288369548e-06, + "loss": 0.166, + "step": 10947 + }, + { + "epoch": 0.56, + "grad_norm": 1.3287989010272494, + "learning_rate": 8.652771316369544e-06, + "loss": 0.1847, + "step": 10948 + }, + { + "epoch": 0.56, + "grad_norm": 1.0890163794301415, + "learning_rate": 8.651139380913197e-06, + "loss": 0.1977, + "step": 10949 + }, + { + "epoch": 0.56, + "grad_norm": 1.0666571494962853, + "learning_rate": 8.649507482044766e-06, + "loss": 0.1751, + "step": 10950 + }, + { + "epoch": 0.56, + "grad_norm": 0.8103740471153004, + "learning_rate": 8.647875619808523e-06, + "loss": 0.1765, + "step": 10951 + }, + { + "epoch": 0.56, + "grad_norm": 1.0313607099358908, + "learning_rate": 8.64624379424873e-06, + "loss": 0.1665, + "step": 10952 + }, + { + "epoch": 0.56, + "grad_norm": 1.0468097800952842, + "learning_rate": 8.644612005409654e-06, + "loss": 0.1795, + "step": 10953 + }, + { + "epoch": 0.56, + "grad_norm": 1.1057397636613135, + "learning_rate": 8.642980253335547e-06, + "loss": 0.2016, + "step": 10954 + }, + { + "epoch": 0.56, + "grad_norm": 0.8451430566992679, + "learning_rate": 8.641348538070683e-06, + "loss": 0.175, + "step": 10955 + }, + { + "epoch": 0.56, + "grad_norm": 0.7806195927151729, + "learning_rate": 8.639716859659312e-06, + "loss": 0.1901, + "step": 10956 + }, + { + "epoch": 0.56, + "grad_norm": 0.8798299824358545, + "learning_rate": 8.638085218145704e-06, + "loss": 0.1769, + "step": 10957 + }, + { + "epoch": 0.56, + "grad_norm": 1.5650750606191544, + "learning_rate": 8.63645361357411e-06, + "loss": 0.1877, + "step": 10958 + }, + { + "epoch": 0.56, + "grad_norm": 1.1279481372195475, + "learning_rate": 8.634822045988784e-06, + "loss": 0.1831, + "step": 10959 + }, + { + "epoch": 0.56, + "grad_norm": 1.2830148866132574, + "learning_rate": 8.633190515433992e-06, + "loss": 0.1901, + "step": 10960 + }, + { + "epoch": 0.56, + "grad_norm": 1.2855953567844727, + "learning_rate": 8.63155902195398e-06, + "loss": 0.1865, + "step": 10961 + }, + { + "epoch": 0.56, + "grad_norm": 0.8777387129501347, + "learning_rate": 8.62992756559301e-06, + "loss": 0.1688, + "step": 10962 + }, + { + "epoch": 0.56, + "grad_norm": 0.8700630299582908, + "learning_rate": 8.628296146395331e-06, + "loss": 0.1704, + "step": 10963 + }, + { + "epoch": 0.56, + "grad_norm": 1.103315427349816, + "learning_rate": 8.626664764405196e-06, + "loss": 0.1724, + "step": 10964 + }, + { + "epoch": 0.56, + "grad_norm": 1.0764734465086485, + "learning_rate": 8.625033419666856e-06, + "loss": 0.1882, + "step": 10965 + }, + { + "epoch": 0.56, + "grad_norm": 1.0634970577767235, + "learning_rate": 8.623402112224567e-06, + "loss": 0.1972, + "step": 10966 + }, + { + "epoch": 0.56, + "grad_norm": 1.0115594755172888, + "learning_rate": 8.621770842122569e-06, + "loss": 0.2047, + "step": 10967 + }, + { + "epoch": 0.56, + "grad_norm": 1.0603757146365727, + "learning_rate": 8.620139609405115e-06, + "loss": 0.1647, + "step": 10968 + }, + { + "epoch": 0.56, + "grad_norm": 1.0003635283977106, + "learning_rate": 8.618508414116452e-06, + "loss": 0.1721, + "step": 10969 + }, + { + "epoch": 0.56, + "grad_norm": 1.1566118633153082, + "learning_rate": 8.616877256300829e-06, + "loss": 0.1855, + "step": 10970 + }, + { + "epoch": 0.56, + "grad_norm": 0.8767666422857754, + "learning_rate": 8.615246136002486e-06, + "loss": 0.1422, + "step": 10971 + }, + { + "epoch": 0.56, + "grad_norm": 0.9756034260032039, + "learning_rate": 8.613615053265673e-06, + "loss": 0.1666, + "step": 10972 + }, + { + "epoch": 0.56, + "grad_norm": 1.054608091695435, + "learning_rate": 8.611984008134626e-06, + "loss": 0.2045, + "step": 10973 + }, + { + "epoch": 0.56, + "grad_norm": 1.0026363374092324, + "learning_rate": 8.610353000653592e-06, + "loss": 0.1857, + "step": 10974 + }, + { + "epoch": 0.56, + "grad_norm": 1.0371375238714557, + "learning_rate": 8.608722030866812e-06, + "loss": 0.1849, + "step": 10975 + }, + { + "epoch": 0.56, + "grad_norm": 0.9701005160752207, + "learning_rate": 8.607091098818528e-06, + "loss": 0.1735, + "step": 10976 + }, + { + "epoch": 0.56, + "grad_norm": 1.0497538225267664, + "learning_rate": 8.605460204552975e-06, + "loss": 0.1926, + "step": 10977 + }, + { + "epoch": 0.56, + "grad_norm": 0.9234896721451861, + "learning_rate": 8.60382934811439e-06, + "loss": 0.1928, + "step": 10978 + }, + { + "epoch": 0.56, + "grad_norm": 0.9119449965995741, + "learning_rate": 8.602198529547016e-06, + "loss": 0.1795, + "step": 10979 + }, + { + "epoch": 0.56, + "grad_norm": 0.98527998422671, + "learning_rate": 8.600567748895083e-06, + "loss": 0.1869, + "step": 10980 + }, + { + "epoch": 0.56, + "grad_norm": 1.1053717143578772, + "learning_rate": 8.598937006202832e-06, + "loss": 0.1964, + "step": 10981 + }, + { + "epoch": 0.56, + "grad_norm": 1.468621052173339, + "learning_rate": 8.59730630151449e-06, + "loss": 0.1734, + "step": 10982 + }, + { + "epoch": 0.56, + "grad_norm": 0.952452351528307, + "learning_rate": 8.595675634874299e-06, + "loss": 0.1841, + "step": 10983 + }, + { + "epoch": 0.56, + "grad_norm": 1.1959355986594147, + "learning_rate": 8.594045006326481e-06, + "loss": 0.194, + "step": 10984 + }, + { + "epoch": 0.56, + "grad_norm": 0.9707998389931917, + "learning_rate": 8.592414415915275e-06, + "loss": 0.1864, + "step": 10985 + }, + { + "epoch": 0.56, + "grad_norm": 1.382033789392686, + "learning_rate": 8.590783863684904e-06, + "loss": 0.2151, + "step": 10986 + }, + { + "epoch": 0.56, + "grad_norm": 1.5081807414030657, + "learning_rate": 8.589153349679602e-06, + "loss": 0.1716, + "step": 10987 + }, + { + "epoch": 0.56, + "grad_norm": 1.261638494448365, + "learning_rate": 8.587522873943595e-06, + "loss": 0.1997, + "step": 10988 + }, + { + "epoch": 0.56, + "grad_norm": 1.462716210009533, + "learning_rate": 8.585892436521113e-06, + "loss": 0.188, + "step": 10989 + }, + { + "epoch": 0.56, + "grad_norm": 1.4971491727761224, + "learning_rate": 8.584262037456374e-06, + "loss": 0.1831, + "step": 10990 + }, + { + "epoch": 0.56, + "grad_norm": 0.9349659575444129, + "learning_rate": 8.582631676793609e-06, + "loss": 0.1834, + "step": 10991 + }, + { + "epoch": 0.56, + "grad_norm": 0.9905856958238987, + "learning_rate": 8.581001354577037e-06, + "loss": 0.1975, + "step": 10992 + }, + { + "epoch": 0.56, + "grad_norm": 0.9028068531467937, + "learning_rate": 8.57937107085089e-06, + "loss": 0.1784, + "step": 10993 + }, + { + "epoch": 0.56, + "grad_norm": 0.9150736772460405, + "learning_rate": 8.577740825659379e-06, + "loss": 0.1729, + "step": 10994 + }, + { + "epoch": 0.56, + "grad_norm": 0.9936047580820794, + "learning_rate": 8.576110619046726e-06, + "loss": 0.2087, + "step": 10995 + }, + { + "epoch": 0.56, + "grad_norm": 1.2040788857862377, + "learning_rate": 8.574480451057158e-06, + "loss": 0.1729, + "step": 10996 + }, + { + "epoch": 0.56, + "grad_norm": 0.8647923487542004, + "learning_rate": 8.572850321734884e-06, + "loss": 0.1611, + "step": 10997 + }, + { + "epoch": 0.56, + "grad_norm": 1.0258661585027546, + "learning_rate": 8.571220231124129e-06, + "loss": 0.1889, + "step": 10998 + }, + { + "epoch": 0.56, + "grad_norm": 0.9404572152675427, + "learning_rate": 8.569590179269103e-06, + "loss": 0.1859, + "step": 10999 + }, + { + "epoch": 0.56, + "grad_norm": 1.3241112379396596, + "learning_rate": 8.567960166214026e-06, + "loss": 0.1805, + "step": 11000 + }, + { + "epoch": 0.56, + "grad_norm": 1.2502395125362873, + "learning_rate": 8.566330192003108e-06, + "loss": 0.1694, + "step": 11001 + }, + { + "epoch": 0.56, + "grad_norm": 1.3002683332018317, + "learning_rate": 8.564700256680568e-06, + "loss": 0.168, + "step": 11002 + }, + { + "epoch": 0.56, + "grad_norm": 1.159540772474298, + "learning_rate": 8.563070360290611e-06, + "loss": 0.1874, + "step": 11003 + }, + { + "epoch": 0.56, + "grad_norm": 0.9030555139116054, + "learning_rate": 8.561440502877454e-06, + "loss": 0.1761, + "step": 11004 + }, + { + "epoch": 0.56, + "grad_norm": 0.89022901095595, + "learning_rate": 8.559810684485302e-06, + "loss": 0.1863, + "step": 11005 + }, + { + "epoch": 0.56, + "grad_norm": 1.2859747477095844, + "learning_rate": 8.558180905158372e-06, + "loss": 0.1866, + "step": 11006 + }, + { + "epoch": 0.56, + "grad_norm": 2.1070866647981865, + "learning_rate": 8.55655116494086e-06, + "loss": 0.2022, + "step": 11007 + }, + { + "epoch": 0.56, + "grad_norm": 1.0914719993337787, + "learning_rate": 8.554921463876983e-06, + "loss": 0.1655, + "step": 11008 + }, + { + "epoch": 0.56, + "grad_norm": 1.2357096687659934, + "learning_rate": 8.55329180201094e-06, + "loss": 0.203, + "step": 11009 + }, + { + "epoch": 0.56, + "grad_norm": 1.1989549010584921, + "learning_rate": 8.551662179386944e-06, + "loss": 0.1722, + "step": 11010 + }, + { + "epoch": 0.56, + "grad_norm": 2.088420664118368, + "learning_rate": 8.55003259604919e-06, + "loss": 0.1696, + "step": 11011 + }, + { + "epoch": 0.56, + "grad_norm": 1.0460955577335986, + "learning_rate": 8.54840305204188e-06, + "loss": 0.1639, + "step": 11012 + }, + { + "epoch": 0.56, + "grad_norm": 0.8399521974271892, + "learning_rate": 8.546773547409227e-06, + "loss": 0.168, + "step": 11013 + }, + { + "epoch": 0.56, + "grad_norm": 1.1979374294097083, + "learning_rate": 8.545144082195417e-06, + "loss": 0.1817, + "step": 11014 + }, + { + "epoch": 0.56, + "grad_norm": 1.3236855265368384, + "learning_rate": 8.54351465644466e-06, + "loss": 0.1872, + "step": 11015 + }, + { + "epoch": 0.56, + "grad_norm": 2.563503817505762, + "learning_rate": 8.541885270201146e-06, + "loss": 0.1858, + "step": 11016 + }, + { + "epoch": 0.56, + "grad_norm": 0.9908741381061342, + "learning_rate": 8.540255923509081e-06, + "loss": 0.2116, + "step": 11017 + }, + { + "epoch": 0.56, + "grad_norm": 1.184467292305344, + "learning_rate": 8.538626616412651e-06, + "loss": 0.1809, + "step": 11018 + }, + { + "epoch": 0.56, + "grad_norm": 2.2502518651558443, + "learning_rate": 8.536997348956065e-06, + "loss": 0.1994, + "step": 11019 + }, + { + "epoch": 0.56, + "grad_norm": 1.1074915593033396, + "learning_rate": 8.5353681211835e-06, + "loss": 0.1774, + "step": 11020 + }, + { + "epoch": 0.56, + "grad_norm": 1.3377825391510998, + "learning_rate": 8.533738933139162e-06, + "loss": 0.2249, + "step": 11021 + }, + { + "epoch": 0.56, + "grad_norm": 0.8265730358711466, + "learning_rate": 8.532109784867235e-06, + "loss": 0.1889, + "step": 11022 + }, + { + "epoch": 0.56, + "grad_norm": 1.147024717062173, + "learning_rate": 8.530480676411919e-06, + "loss": 0.1922, + "step": 11023 + }, + { + "epoch": 0.56, + "grad_norm": 0.9714762838124623, + "learning_rate": 8.52885160781739e-06, + "loss": 0.1623, + "step": 11024 + }, + { + "epoch": 0.56, + "grad_norm": 1.4244966383242854, + "learning_rate": 8.52722257912785e-06, + "loss": 0.1986, + "step": 11025 + }, + { + "epoch": 0.56, + "grad_norm": 1.406073801878377, + "learning_rate": 8.525593590387476e-06, + "loss": 0.1853, + "step": 11026 + }, + { + "epoch": 0.56, + "grad_norm": 1.9625221752761264, + "learning_rate": 8.523964641640466e-06, + "loss": 0.1741, + "step": 11027 + }, + { + "epoch": 0.56, + "grad_norm": 1.6320938086216492, + "learning_rate": 8.522335732930996e-06, + "loss": 0.1879, + "step": 11028 + }, + { + "epoch": 0.56, + "grad_norm": 0.8104481171462272, + "learning_rate": 8.52070686430325e-06, + "loss": 0.167, + "step": 11029 + }, + { + "epoch": 0.56, + "grad_norm": 2.0081096113409695, + "learning_rate": 8.519078035801415e-06, + "loss": 0.1695, + "step": 11030 + }, + { + "epoch": 0.56, + "grad_norm": 0.8971930904927773, + "learning_rate": 8.51744924746967e-06, + "loss": 0.1832, + "step": 11031 + }, + { + "epoch": 0.56, + "grad_norm": 0.9465995537342011, + "learning_rate": 8.515820499352203e-06, + "loss": 0.1698, + "step": 11032 + }, + { + "epoch": 0.56, + "grad_norm": 1.2279426320906355, + "learning_rate": 8.514191791493183e-06, + "loss": 0.1648, + "step": 11033 + }, + { + "epoch": 0.56, + "grad_norm": 1.096622128478326, + "learning_rate": 8.512563123936796e-06, + "loss": 0.1885, + "step": 11034 + }, + { + "epoch": 0.56, + "grad_norm": 0.9072936887908979, + "learning_rate": 8.510934496727218e-06, + "loss": 0.186, + "step": 11035 + }, + { + "epoch": 0.56, + "grad_norm": 1.2281823546434798, + "learning_rate": 8.50930590990863e-06, + "loss": 0.1806, + "step": 11036 + }, + { + "epoch": 0.56, + "grad_norm": 0.7826850654708606, + "learning_rate": 8.5076773635252e-06, + "loss": 0.1985, + "step": 11037 + }, + { + "epoch": 0.56, + "grad_norm": 0.7724179175255349, + "learning_rate": 8.506048857621106e-06, + "loss": 0.1779, + "step": 11038 + }, + { + "epoch": 0.56, + "grad_norm": 1.1463623013180657, + "learning_rate": 8.50442039224052e-06, + "loss": 0.1776, + "step": 11039 + }, + { + "epoch": 0.56, + "grad_norm": 0.8442033414257292, + "learning_rate": 8.50279196742762e-06, + "loss": 0.1574, + "step": 11040 + }, + { + "epoch": 0.56, + "grad_norm": 1.7464494523210246, + "learning_rate": 8.501163583226567e-06, + "loss": 0.1837, + "step": 11041 + }, + { + "epoch": 0.56, + "grad_norm": 2.03783109296676, + "learning_rate": 8.499535239681541e-06, + "loss": 0.1846, + "step": 11042 + }, + { + "epoch": 0.56, + "grad_norm": 1.186059856182854, + "learning_rate": 8.497906936836704e-06, + "loss": 0.1718, + "step": 11043 + }, + { + "epoch": 0.56, + "grad_norm": 1.6469341544249463, + "learning_rate": 8.496278674736226e-06, + "loss": 0.1484, + "step": 11044 + }, + { + "epoch": 0.56, + "grad_norm": 1.039933904642872, + "learning_rate": 8.494650453424272e-06, + "loss": 0.2039, + "step": 11045 + }, + { + "epoch": 0.56, + "grad_norm": 1.0120314170011517, + "learning_rate": 8.493022272945014e-06, + "loss": 0.1794, + "step": 11046 + }, + { + "epoch": 0.56, + "grad_norm": 1.0318104615833152, + "learning_rate": 8.49139413334261e-06, + "loss": 0.1917, + "step": 11047 + }, + { + "epoch": 0.56, + "grad_norm": 1.042770661802242, + "learning_rate": 8.489766034661225e-06, + "loss": 0.2006, + "step": 11048 + }, + { + "epoch": 0.56, + "grad_norm": 1.0643753590812406, + "learning_rate": 8.488137976945023e-06, + "loss": 0.2061, + "step": 11049 + }, + { + "epoch": 0.56, + "grad_norm": 0.9171635729586125, + "learning_rate": 8.48650996023816e-06, + "loss": 0.1804, + "step": 11050 + }, + { + "epoch": 0.56, + "grad_norm": 0.8872809410093716, + "learning_rate": 8.484881984584803e-06, + "loss": 0.1678, + "step": 11051 + }, + { + "epoch": 0.56, + "grad_norm": 0.8891633300766486, + "learning_rate": 8.483254050029105e-06, + "loss": 0.1966, + "step": 11052 + }, + { + "epoch": 0.56, + "grad_norm": 1.0598293338182732, + "learning_rate": 8.481626156615231e-06, + "loss": 0.1981, + "step": 11053 + }, + { + "epoch": 0.56, + "grad_norm": 1.0567734225104917, + "learning_rate": 8.479998304387329e-06, + "loss": 0.1826, + "step": 11054 + }, + { + "epoch": 0.56, + "grad_norm": 0.7260869610454411, + "learning_rate": 8.478370493389563e-06, + "loss": 0.184, + "step": 11055 + }, + { + "epoch": 0.56, + "grad_norm": 1.1388976808393028, + "learning_rate": 8.47674272366608e-06, + "loss": 0.1711, + "step": 11056 + }, + { + "epoch": 0.56, + "grad_norm": 0.9298321738607229, + "learning_rate": 8.475114995261038e-06, + "loss": 0.1799, + "step": 11057 + }, + { + "epoch": 0.56, + "grad_norm": 1.6325973917833867, + "learning_rate": 8.473487308218585e-06, + "loss": 0.2003, + "step": 11058 + }, + { + "epoch": 0.56, + "grad_norm": 0.8499000723973356, + "learning_rate": 8.47185966258288e-06, + "loss": 0.1757, + "step": 11059 + }, + { + "epoch": 0.56, + "grad_norm": 0.8913225991349657, + "learning_rate": 8.470232058398063e-06, + "loss": 0.1657, + "step": 11060 + }, + { + "epoch": 0.56, + "grad_norm": 1.1076538002812075, + "learning_rate": 8.468604495708292e-06, + "loss": 0.1799, + "step": 11061 + }, + { + "epoch": 0.56, + "grad_norm": 0.9903752364490356, + "learning_rate": 8.466976974557706e-06, + "loss": 0.1719, + "step": 11062 + }, + { + "epoch": 0.56, + "grad_norm": 0.8559784269414673, + "learning_rate": 8.465349494990461e-06, + "loss": 0.1761, + "step": 11063 + }, + { + "epoch": 0.56, + "grad_norm": 1.3156245006771254, + "learning_rate": 8.463722057050696e-06, + "loss": 0.1677, + "step": 11064 + }, + { + "epoch": 0.56, + "grad_norm": 0.943210871044513, + "learning_rate": 8.462094660782555e-06, + "loss": 0.1644, + "step": 11065 + }, + { + "epoch": 0.56, + "grad_norm": 0.9269504655641544, + "learning_rate": 8.460467306230187e-06, + "loss": 0.2061, + "step": 11066 + }, + { + "epoch": 0.56, + "grad_norm": 0.8738119366074828, + "learning_rate": 8.458839993437726e-06, + "loss": 0.2071, + "step": 11067 + }, + { + "epoch": 0.56, + "grad_norm": 0.8343929887803908, + "learning_rate": 8.457212722449322e-06, + "loss": 0.1685, + "step": 11068 + }, + { + "epoch": 0.56, + "grad_norm": 0.7622864170582623, + "learning_rate": 8.455585493309107e-06, + "loss": 0.1819, + "step": 11069 + }, + { + "epoch": 0.56, + "grad_norm": 1.8684169737198935, + "learning_rate": 8.453958306061223e-06, + "loss": 0.1858, + "step": 11070 + }, + { + "epoch": 0.56, + "grad_norm": 1.294786211809637, + "learning_rate": 8.452331160749804e-06, + "loss": 0.1853, + "step": 11071 + }, + { + "epoch": 0.56, + "grad_norm": 0.8926249284110674, + "learning_rate": 8.450704057418996e-06, + "loss": 0.206, + "step": 11072 + }, + { + "epoch": 0.56, + "grad_norm": 1.3117790446383297, + "learning_rate": 8.449076996112924e-06, + "loss": 0.1691, + "step": 11073 + }, + { + "epoch": 0.56, + "grad_norm": 0.869685388091908, + "learning_rate": 8.447449976875726e-06, + "loss": 0.1864, + "step": 11074 + }, + { + "epoch": 0.56, + "grad_norm": 0.7568117156894011, + "learning_rate": 8.445822999751536e-06, + "loss": 0.1803, + "step": 11075 + }, + { + "epoch": 0.56, + "grad_norm": 0.7174381127014311, + "learning_rate": 8.444196064784487e-06, + "loss": 0.1794, + "step": 11076 + }, + { + "epoch": 0.56, + "grad_norm": 0.9146953380622976, + "learning_rate": 8.442569172018703e-06, + "loss": 0.2146, + "step": 11077 + }, + { + "epoch": 0.56, + "grad_norm": 1.3439061263436667, + "learning_rate": 8.440942321498322e-06, + "loss": 0.1911, + "step": 11078 + }, + { + "epoch": 0.56, + "grad_norm": 0.794186945195168, + "learning_rate": 8.439315513267465e-06, + "loss": 0.1677, + "step": 11079 + }, + { + "epoch": 0.56, + "grad_norm": 1.1085470147484742, + "learning_rate": 8.437688747370267e-06, + "loss": 0.1719, + "step": 11080 + }, + { + "epoch": 0.56, + "grad_norm": 0.7518099221003346, + "learning_rate": 8.43606202385085e-06, + "loss": 0.1733, + "step": 11081 + }, + { + "epoch": 0.56, + "grad_norm": 0.9856665916024756, + "learning_rate": 8.434435342753335e-06, + "loss": 0.1834, + "step": 11082 + }, + { + "epoch": 0.56, + "grad_norm": 4.82839232903437, + "learning_rate": 8.432808704121854e-06, + "loss": 0.207, + "step": 11083 + }, + { + "epoch": 0.56, + "grad_norm": 1.3434290434642995, + "learning_rate": 8.431182108000522e-06, + "loss": 0.1664, + "step": 11084 + }, + { + "epoch": 0.56, + "grad_norm": 1.0161057228711095, + "learning_rate": 8.429555554433466e-06, + "loss": 0.1824, + "step": 11085 + }, + { + "epoch": 0.56, + "grad_norm": 0.8666489308505075, + "learning_rate": 8.427929043464802e-06, + "loss": 0.1797, + "step": 11086 + }, + { + "epoch": 0.56, + "grad_norm": 1.0791313722788873, + "learning_rate": 8.426302575138652e-06, + "loss": 0.1925, + "step": 11087 + }, + { + "epoch": 0.56, + "grad_norm": 0.9458821418898986, + "learning_rate": 8.424676149499133e-06, + "loss": 0.1704, + "step": 11088 + }, + { + "epoch": 0.56, + "grad_norm": 1.266642190112638, + "learning_rate": 8.423049766590368e-06, + "loss": 0.1862, + "step": 11089 + }, + { + "epoch": 0.56, + "grad_norm": 1.0161348196522118, + "learning_rate": 8.42142342645646e-06, + "loss": 0.1985, + "step": 11090 + }, + { + "epoch": 0.56, + "grad_norm": 0.8030952803424762, + "learning_rate": 8.419797129141535e-06, + "loss": 0.174, + "step": 11091 + }, + { + "epoch": 0.56, + "grad_norm": 1.9562934368345417, + "learning_rate": 8.418170874689698e-06, + "loss": 0.1998, + "step": 11092 + }, + { + "epoch": 0.56, + "grad_norm": 0.7236144403554686, + "learning_rate": 8.416544663145073e-06, + "loss": 0.1681, + "step": 11093 + }, + { + "epoch": 0.56, + "grad_norm": 0.7208411969746172, + "learning_rate": 8.414918494551756e-06, + "loss": 0.1795, + "step": 11094 + }, + { + "epoch": 0.56, + "grad_norm": 1.0597353018878433, + "learning_rate": 8.41329236895387e-06, + "loss": 0.1828, + "step": 11095 + }, + { + "epoch": 0.56, + "grad_norm": 1.9124523581849326, + "learning_rate": 8.411666286395512e-06, + "loss": 0.1724, + "step": 11096 + }, + { + "epoch": 0.56, + "grad_norm": 0.9149754997881748, + "learning_rate": 8.410040246920799e-06, + "loss": 0.1765, + "step": 11097 + }, + { + "epoch": 0.56, + "grad_norm": 1.8623594055291297, + "learning_rate": 8.408414250573836e-06, + "loss": 0.1808, + "step": 11098 + }, + { + "epoch": 0.56, + "grad_norm": 1.6064872872688194, + "learning_rate": 8.406788297398722e-06, + "loss": 0.2015, + "step": 11099 + }, + { + "epoch": 0.56, + "grad_norm": 0.8449701891957053, + "learning_rate": 8.40516238743957e-06, + "loss": 0.1716, + "step": 11100 + }, + { + "epoch": 0.56, + "grad_norm": 0.8100056755848879, + "learning_rate": 8.403536520740474e-06, + "loss": 0.1733, + "step": 11101 + }, + { + "epoch": 0.56, + "grad_norm": 1.958813365387284, + "learning_rate": 8.401910697345545e-06, + "loss": 0.2077, + "step": 11102 + }, + { + "epoch": 0.56, + "grad_norm": 0.7220376893487891, + "learning_rate": 8.400284917298873e-06, + "loss": 0.1976, + "step": 11103 + }, + { + "epoch": 0.56, + "grad_norm": 1.2678557390705776, + "learning_rate": 8.398659180644566e-06, + "loss": 0.2011, + "step": 11104 + }, + { + "epoch": 0.56, + "grad_norm": 1.1575418812541884, + "learning_rate": 8.397033487426717e-06, + "loss": 0.1832, + "step": 11105 + }, + { + "epoch": 0.56, + "grad_norm": 0.8109308744591895, + "learning_rate": 8.395407837689429e-06, + "loss": 0.186, + "step": 11106 + }, + { + "epoch": 0.56, + "grad_norm": 0.8894855499170428, + "learning_rate": 8.393782231476791e-06, + "loss": 0.1618, + "step": 11107 + }, + { + "epoch": 0.56, + "grad_norm": 0.9600703395707851, + "learning_rate": 8.392156668832904e-06, + "loss": 0.1905, + "step": 11108 + }, + { + "epoch": 0.56, + "grad_norm": 0.7495019141844758, + "learning_rate": 8.390531149801855e-06, + "loss": 0.1779, + "step": 11109 + }, + { + "epoch": 0.56, + "grad_norm": 0.8162713651475454, + "learning_rate": 8.38890567442774e-06, + "loss": 0.1886, + "step": 11110 + }, + { + "epoch": 0.57, + "grad_norm": 0.8336747419583592, + "learning_rate": 8.387280242754647e-06, + "loss": 0.1572, + "step": 11111 + }, + { + "epoch": 0.57, + "grad_norm": 1.2507564505391258, + "learning_rate": 8.385654854826674e-06, + "loss": 0.1887, + "step": 11112 + }, + { + "epoch": 0.57, + "grad_norm": 1.3709963705184494, + "learning_rate": 8.384029510687901e-06, + "loss": 0.1749, + "step": 11113 + }, + { + "epoch": 0.57, + "grad_norm": 1.2214279227277296, + "learning_rate": 8.38240421038242e-06, + "loss": 0.1783, + "step": 11114 + }, + { + "epoch": 0.57, + "grad_norm": 1.1108161355604176, + "learning_rate": 8.380778953954314e-06, + "loss": 0.1769, + "step": 11115 + }, + { + "epoch": 0.57, + "grad_norm": 0.8516403521844824, + "learning_rate": 8.379153741447674e-06, + "loss": 0.1869, + "step": 11116 + }, + { + "epoch": 0.57, + "grad_norm": 1.5159161379968633, + "learning_rate": 8.377528572906577e-06, + "loss": 0.1771, + "step": 11117 + }, + { + "epoch": 0.57, + "grad_norm": 2.1689390073254047, + "learning_rate": 8.375903448375109e-06, + "loss": 0.1902, + "step": 11118 + }, + { + "epoch": 0.57, + "grad_norm": 0.9519725613019046, + "learning_rate": 8.374278367897356e-06, + "loss": 0.1678, + "step": 11119 + }, + { + "epoch": 0.57, + "grad_norm": 1.1234418790420935, + "learning_rate": 8.37265333151739e-06, + "loss": 0.19, + "step": 11120 + }, + { + "epoch": 0.57, + "grad_norm": 0.8878798498587716, + "learning_rate": 8.371028339279294e-06, + "loss": 0.1852, + "step": 11121 + }, + { + "epoch": 0.57, + "grad_norm": 0.8025824103558893, + "learning_rate": 8.369403391227147e-06, + "loss": 0.1799, + "step": 11122 + }, + { + "epoch": 0.57, + "grad_norm": 0.9236287372839385, + "learning_rate": 8.367778487405028e-06, + "loss": 0.1707, + "step": 11123 + }, + { + "epoch": 0.57, + "grad_norm": 0.91283308205594, + "learning_rate": 8.366153627857007e-06, + "loss": 0.1595, + "step": 11124 + }, + { + "epoch": 0.57, + "grad_norm": 0.9898467211961541, + "learning_rate": 8.364528812627164e-06, + "loss": 0.1914, + "step": 11125 + }, + { + "epoch": 0.57, + "grad_norm": 1.496393384975324, + "learning_rate": 8.362904041759565e-06, + "loss": 0.1833, + "step": 11126 + }, + { + "epoch": 0.57, + "grad_norm": 1.1537680813326736, + "learning_rate": 8.361279315298289e-06, + "loss": 0.1994, + "step": 11127 + }, + { + "epoch": 0.57, + "grad_norm": 1.041084597870637, + "learning_rate": 8.359654633287402e-06, + "loss": 0.1973, + "step": 11128 + }, + { + "epoch": 0.57, + "grad_norm": 1.0126117950716194, + "learning_rate": 8.358029995770979e-06, + "loss": 0.1643, + "step": 11129 + }, + { + "epoch": 0.57, + "grad_norm": 1.0569021104004925, + "learning_rate": 8.356405402793081e-06, + "loss": 0.1698, + "step": 11130 + }, + { + "epoch": 0.57, + "grad_norm": 1.026689494861531, + "learning_rate": 8.354780854397782e-06, + "loss": 0.1609, + "step": 11131 + }, + { + "epoch": 0.57, + "grad_norm": 1.0032381083929853, + "learning_rate": 8.353156350629144e-06, + "loss": 0.1742, + "step": 11132 + }, + { + "epoch": 0.57, + "grad_norm": 0.9080894368846892, + "learning_rate": 8.351531891531235e-06, + "loss": 0.1905, + "step": 11133 + }, + { + "epoch": 0.57, + "grad_norm": 0.8909433565269967, + "learning_rate": 8.349907477148117e-06, + "loss": 0.1918, + "step": 11134 + }, + { + "epoch": 0.57, + "grad_norm": 0.9240676936852567, + "learning_rate": 8.348283107523848e-06, + "loss": 0.1654, + "step": 11135 + }, + { + "epoch": 0.57, + "grad_norm": 1.0737051026902602, + "learning_rate": 8.346658782702497e-06, + "loss": 0.1875, + "step": 11136 + }, + { + "epoch": 0.57, + "grad_norm": 1.2452176936417192, + "learning_rate": 8.345034502728117e-06, + "loss": 0.1878, + "step": 11137 + }, + { + "epoch": 0.57, + "grad_norm": 1.0940724955406431, + "learning_rate": 8.343410267644772e-06, + "loss": 0.1712, + "step": 11138 + }, + { + "epoch": 0.57, + "grad_norm": 1.2688468526066765, + "learning_rate": 8.341786077496513e-06, + "loss": 0.1796, + "step": 11139 + }, + { + "epoch": 0.57, + "grad_norm": 1.1625892788590617, + "learning_rate": 8.340161932327405e-06, + "loss": 0.1892, + "step": 11140 + }, + { + "epoch": 0.57, + "grad_norm": 0.8787948953079177, + "learning_rate": 8.338537832181494e-06, + "loss": 0.1679, + "step": 11141 + }, + { + "epoch": 0.57, + "grad_norm": 1.433149178929281, + "learning_rate": 8.336913777102844e-06, + "loss": 0.1955, + "step": 11142 + }, + { + "epoch": 0.57, + "grad_norm": 1.0963719115563995, + "learning_rate": 8.335289767135497e-06, + "loss": 0.1678, + "step": 11143 + }, + { + "epoch": 0.57, + "grad_norm": 0.8733510208261461, + "learning_rate": 8.333665802323511e-06, + "loss": 0.1749, + "step": 11144 + }, + { + "epoch": 0.57, + "grad_norm": 1.0572328492621077, + "learning_rate": 8.33204188271093e-06, + "loss": 0.1763, + "step": 11145 + }, + { + "epoch": 0.57, + "grad_norm": 0.9655645976323488, + "learning_rate": 8.330418008341814e-06, + "loss": 0.1817, + "step": 11146 + }, + { + "epoch": 0.57, + "grad_norm": 1.0939906086573947, + "learning_rate": 8.328794179260199e-06, + "loss": 0.181, + "step": 11147 + }, + { + "epoch": 0.57, + "grad_norm": 1.243143463741493, + "learning_rate": 8.327170395510137e-06, + "loss": 0.182, + "step": 11148 + }, + { + "epoch": 0.57, + "grad_norm": 1.0128360439150017, + "learning_rate": 8.325546657135673e-06, + "loss": 0.1882, + "step": 11149 + }, + { + "epoch": 0.57, + "grad_norm": 1.2114392175763768, + "learning_rate": 8.323922964180853e-06, + "loss": 0.1819, + "step": 11150 + }, + { + "epoch": 0.57, + "grad_norm": 1.111825839568996, + "learning_rate": 8.322299316689717e-06, + "loss": 0.1881, + "step": 11151 + }, + { + "epoch": 0.57, + "grad_norm": 0.9514854567930829, + "learning_rate": 8.320675714706304e-06, + "loss": 0.1885, + "step": 11152 + }, + { + "epoch": 0.57, + "grad_norm": 1.729532871086949, + "learning_rate": 8.319052158274659e-06, + "loss": 0.1857, + "step": 11153 + }, + { + "epoch": 0.57, + "grad_norm": 0.8476108959359467, + "learning_rate": 8.317428647438816e-06, + "loss": 0.1958, + "step": 11154 + }, + { + "epoch": 0.57, + "grad_norm": 1.4892354448303768, + "learning_rate": 8.31580518224282e-06, + "loss": 0.1747, + "step": 11155 + }, + { + "epoch": 0.57, + "grad_norm": 0.9187311605693036, + "learning_rate": 8.3141817627307e-06, + "loss": 0.1769, + "step": 11156 + }, + { + "epoch": 0.57, + "grad_norm": 1.072172900218075, + "learning_rate": 8.312558388946497e-06, + "loss": 0.1733, + "step": 11157 + }, + { + "epoch": 0.57, + "grad_norm": 1.1420718748445113, + "learning_rate": 8.310935060934242e-06, + "loss": 0.1699, + "step": 11158 + }, + { + "epoch": 0.57, + "grad_norm": 0.913240919562509, + "learning_rate": 8.30931177873797e-06, + "loss": 0.1633, + "step": 11159 + }, + { + "epoch": 0.57, + "grad_norm": 1.051073321740484, + "learning_rate": 8.307688542401709e-06, + "loss": 0.1867, + "step": 11160 + }, + { + "epoch": 0.57, + "grad_norm": 0.8171469423370246, + "learning_rate": 8.306065351969494e-06, + "loss": 0.1793, + "step": 11161 + }, + { + "epoch": 0.57, + "grad_norm": 0.8922136251925066, + "learning_rate": 8.30444220748535e-06, + "loss": 0.2011, + "step": 11162 + }, + { + "epoch": 0.57, + "grad_norm": 0.9938900223149604, + "learning_rate": 8.302819108993311e-06, + "loss": 0.1852, + "step": 11163 + }, + { + "epoch": 0.57, + "grad_norm": 1.4287792786208626, + "learning_rate": 8.301196056537398e-06, + "loss": 0.1674, + "step": 11164 + }, + { + "epoch": 0.57, + "grad_norm": 0.7941065747802549, + "learning_rate": 8.299573050161637e-06, + "loss": 0.1573, + "step": 11165 + }, + { + "epoch": 0.57, + "grad_norm": 1.8134429355374344, + "learning_rate": 8.297950089910053e-06, + "loss": 0.1637, + "step": 11166 + }, + { + "epoch": 0.57, + "grad_norm": 1.0627650354860654, + "learning_rate": 8.29632717582667e-06, + "loss": 0.1735, + "step": 11167 + }, + { + "epoch": 0.57, + "grad_norm": 1.1882568419001993, + "learning_rate": 8.29470430795551e-06, + "loss": 0.2061, + "step": 11168 + }, + { + "epoch": 0.57, + "grad_norm": 0.9156524069604196, + "learning_rate": 8.293081486340587e-06, + "loss": 0.1754, + "step": 11169 + }, + { + "epoch": 0.57, + "grad_norm": 0.9397394528457855, + "learning_rate": 8.291458711025928e-06, + "loss": 0.1683, + "step": 11170 + }, + { + "epoch": 0.57, + "grad_norm": 0.9827543442383088, + "learning_rate": 8.289835982055546e-06, + "loss": 0.1969, + "step": 11171 + }, + { + "epoch": 0.57, + "grad_norm": 1.001792440456909, + "learning_rate": 8.288213299473464e-06, + "loss": 0.1875, + "step": 11172 + }, + { + "epoch": 0.57, + "grad_norm": 1.083625888509629, + "learning_rate": 8.286590663323689e-06, + "loss": 0.1876, + "step": 11173 + }, + { + "epoch": 0.57, + "grad_norm": 1.220390786988613, + "learning_rate": 8.28496807365024e-06, + "loss": 0.1822, + "step": 11174 + }, + { + "epoch": 0.57, + "grad_norm": 0.9251997557124158, + "learning_rate": 8.283345530497128e-06, + "loss": 0.1911, + "step": 11175 + }, + { + "epoch": 0.57, + "grad_norm": 0.9765254071523586, + "learning_rate": 8.281723033908369e-06, + "loss": 0.1863, + "step": 11176 + }, + { + "epoch": 0.57, + "grad_norm": 1.4798687261341534, + "learning_rate": 8.280100583927965e-06, + "loss": 0.1853, + "step": 11177 + }, + { + "epoch": 0.57, + "grad_norm": 0.813600417515956, + "learning_rate": 8.278478180599935e-06, + "loss": 0.1628, + "step": 11178 + }, + { + "epoch": 0.57, + "grad_norm": 1.0435886610526612, + "learning_rate": 8.276855823968278e-06, + "loss": 0.1875, + "step": 11179 + }, + { + "epoch": 0.57, + "grad_norm": 0.8112724641517984, + "learning_rate": 8.275233514077003e-06, + "loss": 0.1714, + "step": 11180 + }, + { + "epoch": 0.57, + "grad_norm": 1.3049418762007479, + "learning_rate": 8.273611250970118e-06, + "loss": 0.1742, + "step": 11181 + }, + { + "epoch": 0.57, + "grad_norm": 1.2920212964599433, + "learning_rate": 8.271989034691628e-06, + "loss": 0.1938, + "step": 11182 + }, + { + "epoch": 0.57, + "grad_norm": 0.9233125397139785, + "learning_rate": 8.270366865285528e-06, + "loss": 0.1923, + "step": 11183 + }, + { + "epoch": 0.57, + "grad_norm": 1.1376499852517181, + "learning_rate": 8.268744742795827e-06, + "loss": 0.1899, + "step": 11184 + }, + { + "epoch": 0.57, + "grad_norm": 1.094168903904755, + "learning_rate": 8.26712266726652e-06, + "loss": 0.1805, + "step": 11185 + }, + { + "epoch": 0.57, + "grad_norm": 0.9333248112316166, + "learning_rate": 8.265500638741615e-06, + "loss": 0.1801, + "step": 11186 + }, + { + "epoch": 0.57, + "grad_norm": 0.823286028623516, + "learning_rate": 8.263878657265099e-06, + "loss": 0.1779, + "step": 11187 + }, + { + "epoch": 0.57, + "grad_norm": 0.7560114879876358, + "learning_rate": 8.262256722880972e-06, + "loss": 0.1864, + "step": 11188 + }, + { + "epoch": 0.57, + "grad_norm": 1.336291099600056, + "learning_rate": 8.260634835633232e-06, + "loss": 0.1729, + "step": 11189 + }, + { + "epoch": 0.57, + "grad_norm": 0.7182598354778278, + "learning_rate": 8.259012995565868e-06, + "loss": 0.1624, + "step": 11190 + }, + { + "epoch": 0.57, + "grad_norm": 0.7813488338610487, + "learning_rate": 8.257391202722877e-06, + "loss": 0.1972, + "step": 11191 + }, + { + "epoch": 0.57, + "grad_norm": 1.3591205075647002, + "learning_rate": 8.255769457148245e-06, + "loss": 0.197, + "step": 11192 + }, + { + "epoch": 0.57, + "grad_norm": 0.7968283715538477, + "learning_rate": 8.254147758885967e-06, + "loss": 0.1754, + "step": 11193 + }, + { + "epoch": 0.57, + "grad_norm": 0.7972409968258353, + "learning_rate": 8.252526107980027e-06, + "loss": 0.177, + "step": 11194 + }, + { + "epoch": 0.57, + "grad_norm": 1.092037423266726, + "learning_rate": 8.25090450447442e-06, + "loss": 0.198, + "step": 11195 + }, + { + "epoch": 0.57, + "grad_norm": 0.993110992833045, + "learning_rate": 8.249282948413123e-06, + "loss": 0.2043, + "step": 11196 + }, + { + "epoch": 0.57, + "grad_norm": 2.0829880769116116, + "learning_rate": 8.247661439840126e-06, + "loss": 0.1982, + "step": 11197 + }, + { + "epoch": 0.57, + "grad_norm": 1.15991828521091, + "learning_rate": 8.246039978799409e-06, + "loss": 0.1755, + "step": 11198 + }, + { + "epoch": 0.57, + "grad_norm": 0.8749820174230736, + "learning_rate": 8.244418565334962e-06, + "loss": 0.1856, + "step": 11199 + }, + { + "epoch": 0.57, + "grad_norm": 1.0155370756575854, + "learning_rate": 8.242797199490757e-06, + "loss": 0.1975, + "step": 11200 + }, + { + "epoch": 0.57, + "grad_norm": 0.954844012432073, + "learning_rate": 8.241175881310776e-06, + "loss": 0.15, + "step": 11201 + }, + { + "epoch": 0.57, + "grad_norm": 1.393593559664671, + "learning_rate": 8.239554610839e-06, + "loss": 0.1904, + "step": 11202 + }, + { + "epoch": 0.57, + "grad_norm": 1.0309112780357765, + "learning_rate": 8.237933388119409e-06, + "loss": 0.1784, + "step": 11203 + }, + { + "epoch": 0.57, + "grad_norm": 1.2242804853714373, + "learning_rate": 8.236312213195972e-06, + "loss": 0.1842, + "step": 11204 + }, + { + "epoch": 0.57, + "grad_norm": 1.092592283605871, + "learning_rate": 8.234691086112662e-06, + "loss": 0.1838, + "step": 11205 + }, + { + "epoch": 0.57, + "grad_norm": 0.7873812308441436, + "learning_rate": 8.23307000691346e-06, + "loss": 0.1828, + "step": 11206 + }, + { + "epoch": 0.57, + "grad_norm": 1.1405892274453442, + "learning_rate": 8.231448975642329e-06, + "loss": 0.2055, + "step": 11207 + }, + { + "epoch": 0.57, + "grad_norm": 0.9035227624336404, + "learning_rate": 8.229827992343251e-06, + "loss": 0.1863, + "step": 11208 + }, + { + "epoch": 0.57, + "grad_norm": 0.9515723670404533, + "learning_rate": 8.228207057060184e-06, + "loss": 0.1696, + "step": 11209 + }, + { + "epoch": 0.57, + "grad_norm": 1.2242351733938421, + "learning_rate": 8.226586169837103e-06, + "loss": 0.213, + "step": 11210 + }, + { + "epoch": 0.57, + "grad_norm": 1.0159569573403826, + "learning_rate": 8.224965330717967e-06, + "loss": 0.1848, + "step": 11211 + }, + { + "epoch": 0.57, + "grad_norm": 0.8746281789128181, + "learning_rate": 8.223344539746755e-06, + "loss": 0.1847, + "step": 11212 + }, + { + "epoch": 0.57, + "grad_norm": 0.9441233355578249, + "learning_rate": 8.221723796967416e-06, + "loss": 0.1997, + "step": 11213 + }, + { + "epoch": 0.57, + "grad_norm": 0.9518524036191406, + "learning_rate": 8.220103102423923e-06, + "loss": 0.1789, + "step": 11214 + }, + { + "epoch": 0.57, + "grad_norm": 0.779774266436401, + "learning_rate": 8.21848245616023e-06, + "loss": 0.1759, + "step": 11215 + }, + { + "epoch": 0.57, + "grad_norm": 2.3450116809336112, + "learning_rate": 8.216861858220307e-06, + "loss": 0.1888, + "step": 11216 + }, + { + "epoch": 0.57, + "grad_norm": 1.229616077127556, + "learning_rate": 8.215241308648102e-06, + "loss": 0.194, + "step": 11217 + }, + { + "epoch": 0.57, + "grad_norm": 0.9018917733087243, + "learning_rate": 8.213620807487583e-06, + "loss": 0.168, + "step": 11218 + }, + { + "epoch": 0.57, + "grad_norm": 1.079239171539652, + "learning_rate": 8.212000354782695e-06, + "loss": 0.19, + "step": 11219 + }, + { + "epoch": 0.57, + "grad_norm": 0.8430855813690791, + "learning_rate": 8.210379950577398e-06, + "loss": 0.1706, + "step": 11220 + }, + { + "epoch": 0.57, + "grad_norm": 0.8230309276074811, + "learning_rate": 8.208759594915652e-06, + "loss": 0.1828, + "step": 11221 + }, + { + "epoch": 0.57, + "grad_norm": 1.050508723669258, + "learning_rate": 8.207139287841397e-06, + "loss": 0.1818, + "step": 11222 + }, + { + "epoch": 0.57, + "grad_norm": 0.8161318295355628, + "learning_rate": 8.205519029398592e-06, + "loss": 0.1982, + "step": 11223 + }, + { + "epoch": 0.57, + "grad_norm": 0.8558635578387, + "learning_rate": 8.203898819631183e-06, + "loss": 0.1853, + "step": 11224 + }, + { + "epoch": 0.57, + "grad_norm": 0.8714033130034283, + "learning_rate": 8.202278658583125e-06, + "loss": 0.1865, + "step": 11225 + }, + { + "epoch": 0.57, + "grad_norm": 0.8827239327377759, + "learning_rate": 8.200658546298354e-06, + "loss": 0.1877, + "step": 11226 + }, + { + "epoch": 0.57, + "grad_norm": 1.2222996881129682, + "learning_rate": 8.199038482820824e-06, + "loss": 0.1913, + "step": 11227 + }, + { + "epoch": 0.57, + "grad_norm": 0.8298858935965866, + "learning_rate": 8.197418468194476e-06, + "loss": 0.179, + "step": 11228 + }, + { + "epoch": 0.57, + "grad_norm": 0.8088596354338242, + "learning_rate": 8.195798502463256e-06, + "loss": 0.1849, + "step": 11229 + }, + { + "epoch": 0.57, + "grad_norm": 1.3270774863738242, + "learning_rate": 8.194178585671102e-06, + "loss": 0.1787, + "step": 11230 + }, + { + "epoch": 0.57, + "grad_norm": 1.1797488127169842, + "learning_rate": 8.192558717861956e-06, + "loss": 0.1669, + "step": 11231 + }, + { + "epoch": 0.57, + "grad_norm": 0.790298626038553, + "learning_rate": 8.190938899079756e-06, + "loss": 0.1583, + "step": 11232 + }, + { + "epoch": 0.57, + "grad_norm": 0.7010232969764986, + "learning_rate": 8.189319129368445e-06, + "loss": 0.1859, + "step": 11233 + }, + { + "epoch": 0.57, + "grad_norm": 0.9080372020704749, + "learning_rate": 8.18769940877195e-06, + "loss": 0.1892, + "step": 11234 + }, + { + "epoch": 0.57, + "grad_norm": 1.0889779620400961, + "learning_rate": 8.186079737334214e-06, + "loss": 0.1793, + "step": 11235 + }, + { + "epoch": 0.57, + "grad_norm": 0.91466018716544, + "learning_rate": 8.184460115099165e-06, + "loss": 0.1735, + "step": 11236 + }, + { + "epoch": 0.57, + "grad_norm": 0.8871193225037902, + "learning_rate": 8.182840542110739e-06, + "loss": 0.1565, + "step": 11237 + }, + { + "epoch": 0.57, + "grad_norm": 1.0285746494638235, + "learning_rate": 8.181221018412868e-06, + "loss": 0.1718, + "step": 11238 + }, + { + "epoch": 0.57, + "grad_norm": 1.029761854841097, + "learning_rate": 8.179601544049475e-06, + "loss": 0.187, + "step": 11239 + }, + { + "epoch": 0.57, + "grad_norm": 1.1239798590547327, + "learning_rate": 8.177982119064497e-06, + "loss": 0.1579, + "step": 11240 + }, + { + "epoch": 0.57, + "grad_norm": 1.275798523162937, + "learning_rate": 8.176362743501853e-06, + "loss": 0.1851, + "step": 11241 + }, + { + "epoch": 0.57, + "grad_norm": 1.1377935943260051, + "learning_rate": 8.174743417405479e-06, + "loss": 0.1923, + "step": 11242 + }, + { + "epoch": 0.57, + "grad_norm": 0.7527062986335642, + "learning_rate": 8.173124140819285e-06, + "loss": 0.1969, + "step": 11243 + }, + { + "epoch": 0.57, + "grad_norm": 1.0965192679867608, + "learning_rate": 8.171504913787208e-06, + "loss": 0.1695, + "step": 11244 + }, + { + "epoch": 0.57, + "grad_norm": 0.8068912443926077, + "learning_rate": 8.169885736353158e-06, + "loss": 0.1896, + "step": 11245 + }, + { + "epoch": 0.57, + "grad_norm": 0.8361653287998111, + "learning_rate": 8.168266608561068e-06, + "loss": 0.1828, + "step": 11246 + }, + { + "epoch": 0.57, + "grad_norm": 1.2573793588577753, + "learning_rate": 8.166647530454843e-06, + "loss": 0.1468, + "step": 11247 + }, + { + "epoch": 0.57, + "grad_norm": 0.9078758393689891, + "learning_rate": 8.165028502078412e-06, + "loss": 0.1715, + "step": 11248 + }, + { + "epoch": 0.57, + "grad_norm": 0.7759762634961216, + "learning_rate": 8.163409523475682e-06, + "loss": 0.1895, + "step": 11249 + }, + { + "epoch": 0.57, + "grad_norm": 0.8809090715288981, + "learning_rate": 8.161790594690577e-06, + "loss": 0.1923, + "step": 11250 + }, + { + "epoch": 0.57, + "grad_norm": 1.07713204744277, + "learning_rate": 8.160171715767002e-06, + "loss": 0.1645, + "step": 11251 + }, + { + "epoch": 0.57, + "grad_norm": 0.8341872450950087, + "learning_rate": 8.158552886748878e-06, + "loss": 0.1677, + "step": 11252 + }, + { + "epoch": 0.57, + "grad_norm": 1.642327976245687, + "learning_rate": 8.156934107680106e-06, + "loss": 0.1772, + "step": 11253 + }, + { + "epoch": 0.57, + "grad_norm": 0.822319012757017, + "learning_rate": 8.155315378604602e-06, + "loss": 0.1802, + "step": 11254 + }, + { + "epoch": 0.57, + "grad_norm": 1.0442418900022932, + "learning_rate": 8.153696699566272e-06, + "loss": 0.1798, + "step": 11255 + }, + { + "epoch": 0.57, + "grad_norm": 0.8988307675062374, + "learning_rate": 8.152078070609027e-06, + "loss": 0.1967, + "step": 11256 + }, + { + "epoch": 0.57, + "grad_norm": 1.1779903856225626, + "learning_rate": 8.150459491776765e-06, + "loss": 0.1847, + "step": 11257 + }, + { + "epoch": 0.57, + "grad_norm": 0.8690379175414268, + "learning_rate": 8.148840963113392e-06, + "loss": 0.1668, + "step": 11258 + }, + { + "epoch": 0.57, + "grad_norm": 7.08253893833268, + "learning_rate": 8.147222484662817e-06, + "loss": 0.1828, + "step": 11259 + }, + { + "epoch": 0.57, + "grad_norm": 2.802304156833683, + "learning_rate": 8.145604056468935e-06, + "loss": 0.177, + "step": 11260 + }, + { + "epoch": 0.57, + "grad_norm": 0.8294670880912641, + "learning_rate": 8.14398567857565e-06, + "loss": 0.1621, + "step": 11261 + }, + { + "epoch": 0.57, + "grad_norm": 0.7540643717119795, + "learning_rate": 8.142367351026853e-06, + "loss": 0.1481, + "step": 11262 + }, + { + "epoch": 0.57, + "grad_norm": 1.26321843148518, + "learning_rate": 8.140749073866449e-06, + "loss": 0.1903, + "step": 11263 + }, + { + "epoch": 0.57, + "grad_norm": 1.3476642102613676, + "learning_rate": 8.139130847138332e-06, + "loss": 0.1826, + "step": 11264 + }, + { + "epoch": 0.57, + "grad_norm": 1.2458429990147137, + "learning_rate": 8.137512670886397e-06, + "loss": 0.1746, + "step": 11265 + }, + { + "epoch": 0.57, + "grad_norm": 8.834594923921182, + "learning_rate": 8.135894545154533e-06, + "loss": 0.2108, + "step": 11266 + }, + { + "epoch": 0.57, + "grad_norm": 1.129814446121592, + "learning_rate": 8.134276469986638e-06, + "loss": 0.1707, + "step": 11267 + }, + { + "epoch": 0.57, + "grad_norm": 1.136819679803226, + "learning_rate": 8.132658445426595e-06, + "loss": 0.1569, + "step": 11268 + }, + { + "epoch": 0.57, + "grad_norm": 0.9788466534491648, + "learning_rate": 8.131040471518302e-06, + "loss": 0.173, + "step": 11269 + }, + { + "epoch": 0.57, + "grad_norm": 1.0440416991646695, + "learning_rate": 8.129422548305637e-06, + "loss": 0.1741, + "step": 11270 + }, + { + "epoch": 0.57, + "grad_norm": 1.1616215131566432, + "learning_rate": 8.127804675832494e-06, + "loss": 0.1861, + "step": 11271 + }, + { + "epoch": 0.57, + "grad_norm": 1.1600212126434577, + "learning_rate": 8.126186854142752e-06, + "loss": 0.2059, + "step": 11272 + }, + { + "epoch": 0.57, + "grad_norm": 1.6623548564513533, + "learning_rate": 8.124569083280303e-06, + "loss": 0.1855, + "step": 11273 + }, + { + "epoch": 0.57, + "grad_norm": 1.185514362249397, + "learning_rate": 8.122951363289022e-06, + "loss": 0.1988, + "step": 11274 + }, + { + "epoch": 0.57, + "grad_norm": 2.536505725783757, + "learning_rate": 8.12133369421279e-06, + "loss": 0.1921, + "step": 11275 + }, + { + "epoch": 0.57, + "grad_norm": 1.047302910115353, + "learning_rate": 8.119716076095485e-06, + "loss": 0.2041, + "step": 11276 + }, + { + "epoch": 0.57, + "grad_norm": 1.1227914996846082, + "learning_rate": 8.118098508980989e-06, + "loss": 0.1765, + "step": 11277 + }, + { + "epoch": 0.57, + "grad_norm": 0.8278666307866703, + "learning_rate": 8.116480992913181e-06, + "loss": 0.1863, + "step": 11278 + }, + { + "epoch": 0.57, + "grad_norm": 1.2470534589076179, + "learning_rate": 8.114863527935929e-06, + "loss": 0.1943, + "step": 11279 + }, + { + "epoch": 0.57, + "grad_norm": 0.7763928476800777, + "learning_rate": 8.11324611409311e-06, + "loss": 0.1893, + "step": 11280 + }, + { + "epoch": 0.57, + "grad_norm": 0.7925818183893052, + "learning_rate": 8.111628751428595e-06, + "loss": 0.1634, + "step": 11281 + }, + { + "epoch": 0.57, + "grad_norm": 0.8936196251979709, + "learning_rate": 8.110011439986262e-06, + "loss": 0.163, + "step": 11282 + }, + { + "epoch": 0.57, + "grad_norm": 1.1152052160492107, + "learning_rate": 8.10839417980997e-06, + "loss": 0.1821, + "step": 11283 + }, + { + "epoch": 0.57, + "grad_norm": 0.8475383954634401, + "learning_rate": 8.106776970943597e-06, + "loss": 0.1888, + "step": 11284 + }, + { + "epoch": 0.57, + "grad_norm": 0.9449434158508707, + "learning_rate": 8.105159813431002e-06, + "loss": 0.1883, + "step": 11285 + }, + { + "epoch": 0.57, + "grad_norm": 1.0582156036341694, + "learning_rate": 8.103542707316058e-06, + "loss": 0.1877, + "step": 11286 + }, + { + "epoch": 0.57, + "grad_norm": 0.6783740262877755, + "learning_rate": 8.10192565264262e-06, + "loss": 0.1699, + "step": 11287 + }, + { + "epoch": 0.57, + "grad_norm": 1.2497198731430568, + "learning_rate": 8.100308649454563e-06, + "loss": 0.1756, + "step": 11288 + }, + { + "epoch": 0.57, + "grad_norm": 1.7127747891379077, + "learning_rate": 8.098691697795737e-06, + "loss": 0.1665, + "step": 11289 + }, + { + "epoch": 0.57, + "grad_norm": 0.8696899217535935, + "learning_rate": 8.097074797710007e-06, + "loss": 0.1796, + "step": 11290 + }, + { + "epoch": 0.57, + "grad_norm": 0.8810934002107135, + "learning_rate": 8.095457949241233e-06, + "loss": 0.1798, + "step": 11291 + }, + { + "epoch": 0.57, + "grad_norm": 1.086874398334323, + "learning_rate": 8.093841152433265e-06, + "loss": 0.1778, + "step": 11292 + }, + { + "epoch": 0.57, + "grad_norm": 1.2700382193260717, + "learning_rate": 8.092224407329965e-06, + "loss": 0.1766, + "step": 11293 + }, + { + "epoch": 0.57, + "grad_norm": 1.0137503311295675, + "learning_rate": 8.090607713975182e-06, + "loss": 0.1728, + "step": 11294 + }, + { + "epoch": 0.57, + "grad_norm": 1.280165089528958, + "learning_rate": 8.08899107241278e-06, + "loss": 0.1919, + "step": 11295 + }, + { + "epoch": 0.57, + "grad_norm": 1.2938704499783362, + "learning_rate": 8.087374482686598e-06, + "loss": 0.2076, + "step": 11296 + }, + { + "epoch": 0.57, + "grad_norm": 1.1512383258265375, + "learning_rate": 8.085757944840493e-06, + "loss": 0.2066, + "step": 11297 + }, + { + "epoch": 0.57, + "grad_norm": 0.7807085489114645, + "learning_rate": 8.08414145891831e-06, + "loss": 0.2115, + "step": 11298 + }, + { + "epoch": 0.57, + "grad_norm": 1.05569501529721, + "learning_rate": 8.082525024963902e-06, + "loss": 0.1771, + "step": 11299 + }, + { + "epoch": 0.57, + "grad_norm": 0.9034811271517589, + "learning_rate": 8.080908643021107e-06, + "loss": 0.2346, + "step": 11300 + }, + { + "epoch": 0.57, + "grad_norm": 1.0660529540221337, + "learning_rate": 8.079292313133778e-06, + "loss": 0.1686, + "step": 11301 + }, + { + "epoch": 0.57, + "grad_norm": 1.2568586287865866, + "learning_rate": 8.077676035345748e-06, + "loss": 0.1725, + "step": 11302 + }, + { + "epoch": 0.57, + "grad_norm": 0.8573594071467343, + "learning_rate": 8.076059809700866e-06, + "loss": 0.1938, + "step": 11303 + }, + { + "epoch": 0.57, + "grad_norm": 1.0561172525104656, + "learning_rate": 8.07444363624297e-06, + "loss": 0.1567, + "step": 11304 + }, + { + "epoch": 0.57, + "grad_norm": 0.9525099461969996, + "learning_rate": 8.072827515015902e-06, + "loss": 0.1808, + "step": 11305 + }, + { + "epoch": 0.57, + "grad_norm": 1.1728442563294836, + "learning_rate": 8.071211446063495e-06, + "loss": 0.1774, + "step": 11306 + }, + { + "epoch": 0.57, + "grad_norm": 1.2680203679857898, + "learning_rate": 8.069595429429586e-06, + "loss": 0.1642, + "step": 11307 + }, + { + "epoch": 0.58, + "grad_norm": 0.9485894628866848, + "learning_rate": 8.067979465158013e-06, + "loss": 0.1728, + "step": 11308 + }, + { + "epoch": 0.58, + "grad_norm": 0.9960977651544729, + "learning_rate": 8.0663635532926e-06, + "loss": 0.1747, + "step": 11309 + }, + { + "epoch": 0.58, + "grad_norm": 1.3899001437254705, + "learning_rate": 8.06474769387719e-06, + "loss": 0.1401, + "step": 11310 + }, + { + "epoch": 0.58, + "grad_norm": 2.8600698725128217, + "learning_rate": 8.063131886955605e-06, + "loss": 0.1904, + "step": 11311 + }, + { + "epoch": 0.58, + "grad_norm": 0.9555677688751293, + "learning_rate": 8.061516132571679e-06, + "loss": 0.1964, + "step": 11312 + }, + { + "epoch": 0.58, + "grad_norm": 0.9597259602120783, + "learning_rate": 8.059900430769234e-06, + "loss": 0.185, + "step": 11313 + }, + { + "epoch": 0.58, + "grad_norm": 1.111605309098347, + "learning_rate": 8.058284781592107e-06, + "loss": 0.1626, + "step": 11314 + }, + { + "epoch": 0.58, + "grad_norm": 1.2225140069439981, + "learning_rate": 8.056669185084108e-06, + "loss": 0.1598, + "step": 11315 + }, + { + "epoch": 0.58, + "grad_norm": 1.2846536877935064, + "learning_rate": 8.05505364128907e-06, + "loss": 0.1752, + "step": 11316 + }, + { + "epoch": 0.58, + "grad_norm": 0.8394495233121696, + "learning_rate": 8.053438150250808e-06, + "loss": 0.172, + "step": 11317 + }, + { + "epoch": 0.58, + "grad_norm": 0.9968223731163737, + "learning_rate": 8.051822712013151e-06, + "loss": 0.1919, + "step": 11318 + }, + { + "epoch": 0.58, + "grad_norm": 1.123603368114008, + "learning_rate": 8.05020732661991e-06, + "loss": 0.1887, + "step": 11319 + }, + { + "epoch": 0.58, + "grad_norm": 0.7884087579364176, + "learning_rate": 8.048591994114906e-06, + "loss": 0.1702, + "step": 11320 + }, + { + "epoch": 0.58, + "grad_norm": 1.4011551740812698, + "learning_rate": 8.046976714541953e-06, + "loss": 0.164, + "step": 11321 + }, + { + "epoch": 0.58, + "grad_norm": 0.8102925771158213, + "learning_rate": 8.045361487944872e-06, + "loss": 0.2049, + "step": 11322 + }, + { + "epoch": 0.58, + "grad_norm": 0.7791722073468371, + "learning_rate": 8.043746314367466e-06, + "loss": 0.1966, + "step": 11323 + }, + { + "epoch": 0.58, + "grad_norm": 1.2138756039240277, + "learning_rate": 8.042131193853553e-06, + "loss": 0.1648, + "step": 11324 + }, + { + "epoch": 0.58, + "grad_norm": 1.1604943883973953, + "learning_rate": 8.04051612644694e-06, + "loss": 0.1767, + "step": 11325 + }, + { + "epoch": 0.58, + "grad_norm": 0.7364800594337658, + "learning_rate": 8.038901112191443e-06, + "loss": 0.157, + "step": 11326 + }, + { + "epoch": 0.58, + "grad_norm": 0.9303579297888586, + "learning_rate": 8.037286151130864e-06, + "loss": 0.1788, + "step": 11327 + }, + { + "epoch": 0.58, + "grad_norm": 0.9261735222055252, + "learning_rate": 8.035671243309005e-06, + "loss": 0.2088, + "step": 11328 + }, + { + "epoch": 0.58, + "grad_norm": 1.146395985557706, + "learning_rate": 8.034056388769676e-06, + "loss": 0.1908, + "step": 11329 + }, + { + "epoch": 0.58, + "grad_norm": 0.8834531220910166, + "learning_rate": 8.032441587556676e-06, + "loss": 0.182, + "step": 11330 + }, + { + "epoch": 0.58, + "grad_norm": 0.7483513003591301, + "learning_rate": 8.030826839713814e-06, + "loss": 0.1804, + "step": 11331 + }, + { + "epoch": 0.58, + "grad_norm": 0.9241777250512747, + "learning_rate": 8.02921214528488e-06, + "loss": 0.1819, + "step": 11332 + }, + { + "epoch": 0.58, + "grad_norm": 0.8444863698741362, + "learning_rate": 8.02759750431368e-06, + "loss": 0.1852, + "step": 11333 + }, + { + "epoch": 0.58, + "grad_norm": 1.3926107355585513, + "learning_rate": 8.025982916844008e-06, + "loss": 0.1818, + "step": 11334 + }, + { + "epoch": 0.58, + "grad_norm": 0.8435975489819109, + "learning_rate": 8.024368382919665e-06, + "loss": 0.1855, + "step": 11335 + }, + { + "epoch": 0.58, + "grad_norm": 1.3527644387003557, + "learning_rate": 8.022753902584436e-06, + "loss": 0.206, + "step": 11336 + }, + { + "epoch": 0.58, + "grad_norm": 0.9316082958199031, + "learning_rate": 8.021139475882122e-06, + "loss": 0.1661, + "step": 11337 + }, + { + "epoch": 0.58, + "grad_norm": 0.7054655059974458, + "learning_rate": 8.01952510285651e-06, + "loss": 0.1804, + "step": 11338 + }, + { + "epoch": 0.58, + "grad_norm": 0.908661234462497, + "learning_rate": 8.017910783551394e-06, + "loss": 0.176, + "step": 11339 + }, + { + "epoch": 0.58, + "grad_norm": 0.7743444896095891, + "learning_rate": 8.016296518010558e-06, + "loss": 0.1833, + "step": 11340 + }, + { + "epoch": 0.58, + "grad_norm": 0.9433594583525929, + "learning_rate": 8.014682306277792e-06, + "loss": 0.1766, + "step": 11341 + }, + { + "epoch": 0.58, + "grad_norm": 1.122567249465266, + "learning_rate": 8.013068148396878e-06, + "loss": 0.1981, + "step": 11342 + }, + { + "epoch": 0.58, + "grad_norm": 0.8714340710402263, + "learning_rate": 8.011454044411606e-06, + "loss": 0.1819, + "step": 11343 + }, + { + "epoch": 0.58, + "grad_norm": 1.572982517943364, + "learning_rate": 8.009839994365757e-06, + "loss": 0.1771, + "step": 11344 + }, + { + "epoch": 0.58, + "grad_norm": 1.033687137192266, + "learning_rate": 8.008225998303107e-06, + "loss": 0.1911, + "step": 11345 + }, + { + "epoch": 0.58, + "grad_norm": 1.0309772673882498, + "learning_rate": 8.00661205626744e-06, + "loss": 0.2027, + "step": 11346 + }, + { + "epoch": 0.58, + "grad_norm": 0.9153445694050915, + "learning_rate": 8.004998168302531e-06, + "loss": 0.1847, + "step": 11347 + }, + { + "epoch": 0.58, + "grad_norm": 0.8893563536780098, + "learning_rate": 8.003384334452165e-06, + "loss": 0.1967, + "step": 11348 + }, + { + "epoch": 0.58, + "grad_norm": 0.9472506688885662, + "learning_rate": 8.001770554760107e-06, + "loss": 0.1759, + "step": 11349 + }, + { + "epoch": 0.58, + "grad_norm": 0.8856873542075918, + "learning_rate": 8.000156829270136e-06, + "loss": 0.1713, + "step": 11350 + }, + { + "epoch": 0.58, + "grad_norm": 0.8354670246876854, + "learning_rate": 7.998543158026025e-06, + "loss": 0.1784, + "step": 11351 + }, + { + "epoch": 0.58, + "grad_norm": 1.8998962124671674, + "learning_rate": 7.996929541071545e-06, + "loss": 0.189, + "step": 11352 + }, + { + "epoch": 0.58, + "grad_norm": 0.6963210550002428, + "learning_rate": 7.995315978450462e-06, + "loss": 0.1902, + "step": 11353 + }, + { + "epoch": 0.58, + "grad_norm": 1.7976868012803862, + "learning_rate": 7.993702470206547e-06, + "loss": 0.1982, + "step": 11354 + }, + { + "epoch": 0.58, + "grad_norm": 0.8559801835244099, + "learning_rate": 7.992089016383565e-06, + "loss": 0.1885, + "step": 11355 + }, + { + "epoch": 0.58, + "grad_norm": 0.9255002415299407, + "learning_rate": 7.990475617025286e-06, + "loss": 0.1789, + "step": 11356 + }, + { + "epoch": 0.58, + "grad_norm": 1.0647958227251417, + "learning_rate": 7.988862272175464e-06, + "loss": 0.1602, + "step": 11357 + }, + { + "epoch": 0.58, + "grad_norm": 1.0094849945715152, + "learning_rate": 7.987248981877872e-06, + "loss": 0.1838, + "step": 11358 + }, + { + "epoch": 0.58, + "grad_norm": 0.7584797503213979, + "learning_rate": 7.985635746176261e-06, + "loss": 0.1643, + "step": 11359 + }, + { + "epoch": 0.58, + "grad_norm": 0.9401437229638838, + "learning_rate": 7.984022565114396e-06, + "loss": 0.2103, + "step": 11360 + }, + { + "epoch": 0.58, + "grad_norm": 0.9411254100791999, + "learning_rate": 7.982409438736034e-06, + "loss": 0.1814, + "step": 11361 + }, + { + "epoch": 0.58, + "grad_norm": 2.467728278191582, + "learning_rate": 7.980796367084925e-06, + "loss": 0.1642, + "step": 11362 + }, + { + "epoch": 0.58, + "grad_norm": 0.7544167417792813, + "learning_rate": 7.979183350204833e-06, + "loss": 0.1721, + "step": 11363 + }, + { + "epoch": 0.58, + "grad_norm": 0.7551276841736471, + "learning_rate": 7.977570388139503e-06, + "loss": 0.1781, + "step": 11364 + }, + { + "epoch": 0.58, + "grad_norm": 0.9763581032326041, + "learning_rate": 7.975957480932695e-06, + "loss": 0.1714, + "step": 11365 + }, + { + "epoch": 0.58, + "grad_norm": 0.7418208277674712, + "learning_rate": 7.974344628628151e-06, + "loss": 0.1668, + "step": 11366 + }, + { + "epoch": 0.58, + "grad_norm": 0.8384863204541823, + "learning_rate": 7.972731831269624e-06, + "loss": 0.1927, + "step": 11367 + }, + { + "epoch": 0.58, + "grad_norm": 0.8651724563330945, + "learning_rate": 7.97111908890086e-06, + "loss": 0.1944, + "step": 11368 + }, + { + "epoch": 0.58, + "grad_norm": 1.0535952841849294, + "learning_rate": 7.96950640156561e-06, + "loss": 0.2104, + "step": 11369 + }, + { + "epoch": 0.58, + "grad_norm": 4.437367487342895, + "learning_rate": 7.967893769307608e-06, + "loss": 0.1829, + "step": 11370 + }, + { + "epoch": 0.58, + "grad_norm": 1.0217015755872205, + "learning_rate": 7.966281192170607e-06, + "loss": 0.1721, + "step": 11371 + }, + { + "epoch": 0.58, + "grad_norm": 0.9469785199313246, + "learning_rate": 7.964668670198339e-06, + "loss": 0.1768, + "step": 11372 + }, + { + "epoch": 0.58, + "grad_norm": 0.8351899718021233, + "learning_rate": 7.963056203434552e-06, + "loss": 0.1773, + "step": 11373 + }, + { + "epoch": 0.58, + "grad_norm": 0.8950493743963949, + "learning_rate": 7.961443791922975e-06, + "loss": 0.1957, + "step": 11374 + }, + { + "epoch": 0.58, + "grad_norm": 0.5883478193156267, + "learning_rate": 7.959831435707357e-06, + "loss": 0.1735, + "step": 11375 + }, + { + "epoch": 0.58, + "grad_norm": 1.0007651701611708, + "learning_rate": 7.958219134831423e-06, + "loss": 0.1675, + "step": 11376 + }, + { + "epoch": 0.58, + "grad_norm": 1.183435973584841, + "learning_rate": 7.95660688933891e-06, + "loss": 0.161, + "step": 11377 + }, + { + "epoch": 0.58, + "grad_norm": 0.8598891102521062, + "learning_rate": 7.954994699273555e-06, + "loss": 0.1785, + "step": 11378 + }, + { + "epoch": 0.58, + "grad_norm": 1.0475770668796385, + "learning_rate": 7.953382564679078e-06, + "loss": 0.1632, + "step": 11379 + }, + { + "epoch": 0.58, + "grad_norm": 0.7656142930701677, + "learning_rate": 7.951770485599218e-06, + "loss": 0.1835, + "step": 11380 + }, + { + "epoch": 0.58, + "grad_norm": 0.7597755941350266, + "learning_rate": 7.950158462077697e-06, + "loss": 0.175, + "step": 11381 + }, + { + "epoch": 0.58, + "grad_norm": 1.1075597327894078, + "learning_rate": 7.948546494158247e-06, + "loss": 0.1563, + "step": 11382 + }, + { + "epoch": 0.58, + "grad_norm": 0.919981907436329, + "learning_rate": 7.946934581884585e-06, + "loss": 0.1861, + "step": 11383 + }, + { + "epoch": 0.58, + "grad_norm": 0.8663663674300841, + "learning_rate": 7.945322725300444e-06, + "loss": 0.1851, + "step": 11384 + }, + { + "epoch": 0.58, + "grad_norm": 1.0200009715386287, + "learning_rate": 7.943710924449535e-06, + "loss": 0.1677, + "step": 11385 + }, + { + "epoch": 0.58, + "grad_norm": 0.7627219820305977, + "learning_rate": 7.942099179375585e-06, + "loss": 0.1654, + "step": 11386 + }, + { + "epoch": 0.58, + "grad_norm": 1.0960879832055765, + "learning_rate": 7.940487490122309e-06, + "loss": 0.1573, + "step": 11387 + }, + { + "epoch": 0.58, + "grad_norm": 0.9655957129166881, + "learning_rate": 7.93887585673343e-06, + "loss": 0.1622, + "step": 11388 + }, + { + "epoch": 0.58, + "grad_norm": 1.0674479446431646, + "learning_rate": 7.937264279252657e-06, + "loss": 0.1907, + "step": 11389 + }, + { + "epoch": 0.58, + "grad_norm": 0.7380337556095736, + "learning_rate": 7.935652757723709e-06, + "loss": 0.1808, + "step": 11390 + }, + { + "epoch": 0.58, + "grad_norm": 1.027981513065775, + "learning_rate": 7.934041292190293e-06, + "loss": 0.2013, + "step": 11391 + }, + { + "epoch": 0.58, + "grad_norm": 0.7875171759545012, + "learning_rate": 7.93242988269613e-06, + "loss": 0.1519, + "step": 11392 + }, + { + "epoch": 0.58, + "grad_norm": 1.210993758514946, + "learning_rate": 7.930818529284917e-06, + "loss": 0.1916, + "step": 11393 + }, + { + "epoch": 0.58, + "grad_norm": 0.7811719689055883, + "learning_rate": 7.92920723200037e-06, + "loss": 0.1937, + "step": 11394 + }, + { + "epoch": 0.58, + "grad_norm": 0.8521424976568306, + "learning_rate": 7.927595990886194e-06, + "loss": 0.1843, + "step": 11395 + }, + { + "epoch": 0.58, + "grad_norm": 1.190810739178779, + "learning_rate": 7.925984805986096e-06, + "loss": 0.1903, + "step": 11396 + }, + { + "epoch": 0.58, + "grad_norm": 0.8073776952125132, + "learning_rate": 7.924373677343778e-06, + "loss": 0.1905, + "step": 11397 + }, + { + "epoch": 0.58, + "grad_norm": 0.9438162305535253, + "learning_rate": 7.922762605002938e-06, + "loss": 0.1794, + "step": 11398 + }, + { + "epoch": 0.58, + "grad_norm": 1.0413964995074405, + "learning_rate": 7.92115158900728e-06, + "loss": 0.1932, + "step": 11399 + }, + { + "epoch": 0.58, + "grad_norm": 0.8659254048488056, + "learning_rate": 7.9195406294005e-06, + "loss": 0.1756, + "step": 11400 + }, + { + "epoch": 0.58, + "grad_norm": 0.8148011974376181, + "learning_rate": 7.917929726226305e-06, + "loss": 0.1778, + "step": 11401 + }, + { + "epoch": 0.58, + "grad_norm": 0.8350748355467905, + "learning_rate": 7.916318879528377e-06, + "loss": 0.1828, + "step": 11402 + }, + { + "epoch": 0.58, + "grad_norm": 1.0380626632884626, + "learning_rate": 7.91470808935042e-06, + "loss": 0.1881, + "step": 11403 + }, + { + "epoch": 0.58, + "grad_norm": 0.924415556504601, + "learning_rate": 7.913097355736122e-06, + "loss": 0.1809, + "step": 11404 + }, + { + "epoch": 0.58, + "grad_norm": 0.7720068220086019, + "learning_rate": 7.91148667872918e-06, + "loss": 0.1791, + "step": 11405 + }, + { + "epoch": 0.58, + "grad_norm": 0.8069146836895251, + "learning_rate": 7.909876058373275e-06, + "loss": 0.1721, + "step": 11406 + }, + { + "epoch": 0.58, + "grad_norm": 0.7883066391198847, + "learning_rate": 7.908265494712105e-06, + "loss": 0.2014, + "step": 11407 + }, + { + "epoch": 0.58, + "grad_norm": 1.8683896446322161, + "learning_rate": 7.906654987789346e-06, + "loss": 0.1462, + "step": 11408 + }, + { + "epoch": 0.58, + "grad_norm": 0.7508193646444697, + "learning_rate": 7.905044537648693e-06, + "loss": 0.1703, + "step": 11409 + }, + { + "epoch": 0.58, + "grad_norm": 1.5055920321230043, + "learning_rate": 7.903434144333824e-06, + "loss": 0.1907, + "step": 11410 + }, + { + "epoch": 0.58, + "grad_norm": 0.888206103381393, + "learning_rate": 7.901823807888423e-06, + "loss": 0.1638, + "step": 11411 + }, + { + "epoch": 0.58, + "grad_norm": 0.7553488596562018, + "learning_rate": 7.900213528356167e-06, + "loss": 0.1638, + "step": 11412 + }, + { + "epoch": 0.58, + "grad_norm": 1.0805557840987123, + "learning_rate": 7.898603305780741e-06, + "loss": 0.1854, + "step": 11413 + }, + { + "epoch": 0.58, + "grad_norm": 0.9065778378349452, + "learning_rate": 7.89699314020582e-06, + "loss": 0.1778, + "step": 11414 + }, + { + "epoch": 0.58, + "grad_norm": 0.9282067156258447, + "learning_rate": 7.895383031675074e-06, + "loss": 0.1867, + "step": 11415 + }, + { + "epoch": 0.58, + "grad_norm": 0.9695888508795255, + "learning_rate": 7.893772980232186e-06, + "loss": 0.1935, + "step": 11416 + }, + { + "epoch": 0.58, + "grad_norm": 0.8176310094625524, + "learning_rate": 7.89216298592082e-06, + "loss": 0.1776, + "step": 11417 + }, + { + "epoch": 0.58, + "grad_norm": 0.8476605704866567, + "learning_rate": 7.89055304878466e-06, + "loss": 0.1779, + "step": 11418 + }, + { + "epoch": 0.58, + "grad_norm": 0.7864485731998299, + "learning_rate": 7.88894316886736e-06, + "loss": 0.1586, + "step": 11419 + }, + { + "epoch": 0.58, + "grad_norm": 0.8945343470837589, + "learning_rate": 7.8873333462126e-06, + "loss": 0.1903, + "step": 11420 + }, + { + "epoch": 0.58, + "grad_norm": 1.0553523338334387, + "learning_rate": 7.885723580864039e-06, + "loss": 0.1942, + "step": 11421 + }, + { + "epoch": 0.58, + "grad_norm": 1.4852096499603038, + "learning_rate": 7.884113872865352e-06, + "loss": 0.1909, + "step": 11422 + }, + { + "epoch": 0.58, + "grad_norm": 2.918085514806113, + "learning_rate": 7.882504222260187e-06, + "loss": 0.2025, + "step": 11423 + }, + { + "epoch": 0.58, + "grad_norm": 0.7621380179939279, + "learning_rate": 7.880894629092222e-06, + "loss": 0.1904, + "step": 11424 + }, + { + "epoch": 0.58, + "grad_norm": 0.8141253588089128, + "learning_rate": 7.879285093405105e-06, + "loss": 0.198, + "step": 11425 + }, + { + "epoch": 0.58, + "grad_norm": 0.9091609362931211, + "learning_rate": 7.877675615242502e-06, + "loss": 0.1942, + "step": 11426 + }, + { + "epoch": 0.58, + "grad_norm": 0.779397878754563, + "learning_rate": 7.876066194648066e-06, + "loss": 0.1855, + "step": 11427 + }, + { + "epoch": 0.58, + "grad_norm": 0.894861345972049, + "learning_rate": 7.874456831665457e-06, + "loss": 0.1745, + "step": 11428 + }, + { + "epoch": 0.58, + "grad_norm": 0.8408203658866416, + "learning_rate": 7.872847526338324e-06, + "loss": 0.1742, + "step": 11429 + }, + { + "epoch": 0.58, + "grad_norm": 0.811850660399226, + "learning_rate": 7.871238278710322e-06, + "loss": 0.1826, + "step": 11430 + }, + { + "epoch": 0.58, + "grad_norm": 0.8983352155401033, + "learning_rate": 7.869629088825105e-06, + "loss": 0.187, + "step": 11431 + }, + { + "epoch": 0.58, + "grad_norm": 0.727772267058997, + "learning_rate": 7.868019956726318e-06, + "loss": 0.1926, + "step": 11432 + }, + { + "epoch": 0.58, + "grad_norm": 0.9780031004620156, + "learning_rate": 7.866410882457609e-06, + "loss": 0.1567, + "step": 11433 + }, + { + "epoch": 0.58, + "grad_norm": 0.9214707777852454, + "learning_rate": 7.864801866062624e-06, + "loss": 0.1843, + "step": 11434 + }, + { + "epoch": 0.58, + "grad_norm": 1.1192018977233658, + "learning_rate": 7.863192907585013e-06, + "loss": 0.1722, + "step": 11435 + }, + { + "epoch": 0.58, + "grad_norm": 0.7960680607754248, + "learning_rate": 7.861584007068411e-06, + "loss": 0.1832, + "step": 11436 + }, + { + "epoch": 0.58, + "grad_norm": 0.9839711925450026, + "learning_rate": 7.859975164556468e-06, + "loss": 0.17, + "step": 11437 + }, + { + "epoch": 0.58, + "grad_norm": 1.0211951625381592, + "learning_rate": 7.858366380092814e-06, + "loss": 0.1893, + "step": 11438 + }, + { + "epoch": 0.58, + "grad_norm": 2.425161321405098, + "learning_rate": 7.856757653721097e-06, + "loss": 0.1898, + "step": 11439 + }, + { + "epoch": 0.58, + "grad_norm": 0.8515906228888953, + "learning_rate": 7.855148985484946e-06, + "loss": 0.1855, + "step": 11440 + }, + { + "epoch": 0.58, + "grad_norm": 0.8177327295630341, + "learning_rate": 7.853540375428006e-06, + "loss": 0.1662, + "step": 11441 + }, + { + "epoch": 0.58, + "grad_norm": 0.8708247841613003, + "learning_rate": 7.851931823593897e-06, + "loss": 0.1933, + "step": 11442 + }, + { + "epoch": 0.58, + "grad_norm": 0.7436403301338063, + "learning_rate": 7.850323330026264e-06, + "loss": 0.1819, + "step": 11443 + }, + { + "epoch": 0.58, + "grad_norm": 0.8529954048960217, + "learning_rate": 7.848714894768729e-06, + "loss": 0.1752, + "step": 11444 + }, + { + "epoch": 0.58, + "grad_norm": 0.86165304634647, + "learning_rate": 7.847106517864927e-06, + "loss": 0.1675, + "step": 11445 + }, + { + "epoch": 0.58, + "grad_norm": 0.795950397385831, + "learning_rate": 7.84549819935848e-06, + "loss": 0.1692, + "step": 11446 + }, + { + "epoch": 0.58, + "grad_norm": 0.8796257453564453, + "learning_rate": 7.843889939293017e-06, + "loss": 0.1604, + "step": 11447 + }, + { + "epoch": 0.58, + "grad_norm": 0.8178018265488591, + "learning_rate": 7.842281737712164e-06, + "loss": 0.1763, + "step": 11448 + }, + { + "epoch": 0.58, + "grad_norm": 1.2412486625603991, + "learning_rate": 7.840673594659535e-06, + "loss": 0.1635, + "step": 11449 + }, + { + "epoch": 0.58, + "grad_norm": 5.138558639655599, + "learning_rate": 7.839065510178763e-06, + "loss": 0.151, + "step": 11450 + }, + { + "epoch": 0.58, + "grad_norm": 0.8575207245711235, + "learning_rate": 7.837457484313452e-06, + "loss": 0.1722, + "step": 11451 + }, + { + "epoch": 0.58, + "grad_norm": 1.1377203906398663, + "learning_rate": 7.835849517107237e-06, + "loss": 0.1826, + "step": 11452 + }, + { + "epoch": 0.58, + "grad_norm": 0.9724400883078111, + "learning_rate": 7.834241608603722e-06, + "loss": 0.1759, + "step": 11453 + }, + { + "epoch": 0.58, + "grad_norm": 3.823551349347765, + "learning_rate": 7.83263375884653e-06, + "loss": 0.1969, + "step": 11454 + }, + { + "epoch": 0.58, + "grad_norm": 0.8901574635583533, + "learning_rate": 7.831025967879265e-06, + "loss": 0.1827, + "step": 11455 + }, + { + "epoch": 0.58, + "grad_norm": 0.9064639578916673, + "learning_rate": 7.829418235745547e-06, + "loss": 0.1743, + "step": 11456 + }, + { + "epoch": 0.58, + "grad_norm": 1.1197538740125437, + "learning_rate": 7.827810562488978e-06, + "loss": 0.1735, + "step": 11457 + }, + { + "epoch": 0.58, + "grad_norm": 0.7804009424828282, + "learning_rate": 7.826202948153174e-06, + "loss": 0.1559, + "step": 11458 + }, + { + "epoch": 0.58, + "grad_norm": 0.784302834791359, + "learning_rate": 7.824595392781735e-06, + "loss": 0.1955, + "step": 11459 + }, + { + "epoch": 0.58, + "grad_norm": 0.955136017244413, + "learning_rate": 7.822987896418269e-06, + "loss": 0.1802, + "step": 11460 + }, + { + "epoch": 0.58, + "grad_norm": 1.11692735362752, + "learning_rate": 7.821380459106379e-06, + "loss": 0.1784, + "step": 11461 + }, + { + "epoch": 0.58, + "grad_norm": 0.686381091202176, + "learning_rate": 7.81977308088967e-06, + "loss": 0.1852, + "step": 11462 + }, + { + "epoch": 0.58, + "grad_norm": 0.9285742766920333, + "learning_rate": 7.818165761811736e-06, + "loss": 0.1724, + "step": 11463 + }, + { + "epoch": 0.58, + "grad_norm": 1.5463402825839065, + "learning_rate": 7.81655850191618e-06, + "loss": 0.1889, + "step": 11464 + }, + { + "epoch": 0.58, + "grad_norm": 0.8834470298152093, + "learning_rate": 7.814951301246597e-06, + "loss": 0.2023, + "step": 11465 + }, + { + "epoch": 0.58, + "grad_norm": 0.9091701075412179, + "learning_rate": 7.813344159846588e-06, + "loss": 0.1946, + "step": 11466 + }, + { + "epoch": 0.58, + "grad_norm": 0.8394475643175278, + "learning_rate": 7.811737077759742e-06, + "loss": 0.1687, + "step": 11467 + }, + { + "epoch": 0.58, + "grad_norm": 0.6724614209465645, + "learning_rate": 7.810130055029646e-06, + "loss": 0.1806, + "step": 11468 + }, + { + "epoch": 0.58, + "grad_norm": 0.9378148453117767, + "learning_rate": 7.808523091699898e-06, + "loss": 0.1773, + "step": 11469 + }, + { + "epoch": 0.58, + "grad_norm": 2.5997825594312163, + "learning_rate": 7.806916187814084e-06, + "loss": 0.1728, + "step": 11470 + }, + { + "epoch": 0.58, + "grad_norm": 0.7187489821039615, + "learning_rate": 7.805309343415796e-06, + "loss": 0.1724, + "step": 11471 + }, + { + "epoch": 0.58, + "grad_norm": 0.7144109041965934, + "learning_rate": 7.803702558548611e-06, + "loss": 0.1796, + "step": 11472 + }, + { + "epoch": 0.58, + "grad_norm": 0.9252592793590034, + "learning_rate": 7.802095833256121e-06, + "loss": 0.2001, + "step": 11473 + }, + { + "epoch": 0.58, + "grad_norm": 0.786306724546287, + "learning_rate": 7.800489167581903e-06, + "loss": 0.1819, + "step": 11474 + }, + { + "epoch": 0.58, + "grad_norm": 0.9248066516176838, + "learning_rate": 7.798882561569546e-06, + "loss": 0.1855, + "step": 11475 + }, + { + "epoch": 0.58, + "grad_norm": 1.1746878367894527, + "learning_rate": 7.797276015262619e-06, + "loss": 0.1822, + "step": 11476 + }, + { + "epoch": 0.58, + "grad_norm": 0.9795575556369396, + "learning_rate": 7.795669528704707e-06, + "loss": 0.1721, + "step": 11477 + }, + { + "epoch": 0.58, + "grad_norm": 0.9468774083981828, + "learning_rate": 7.794063101939381e-06, + "loss": 0.1758, + "step": 11478 + }, + { + "epoch": 0.58, + "grad_norm": 0.8166886570894248, + "learning_rate": 7.792456735010223e-06, + "loss": 0.1815, + "step": 11479 + }, + { + "epoch": 0.58, + "grad_norm": 0.9671715919770977, + "learning_rate": 7.790850427960795e-06, + "loss": 0.201, + "step": 11480 + }, + { + "epoch": 0.58, + "grad_norm": 1.357756688548844, + "learning_rate": 7.789244180834679e-06, + "loss": 0.1946, + "step": 11481 + }, + { + "epoch": 0.58, + "grad_norm": 0.8662586324346514, + "learning_rate": 7.787637993675434e-06, + "loss": 0.1871, + "step": 11482 + }, + { + "epoch": 0.58, + "grad_norm": 1.066321292841425, + "learning_rate": 7.786031866526636e-06, + "loss": 0.193, + "step": 11483 + }, + { + "epoch": 0.58, + "grad_norm": 0.8948737952595948, + "learning_rate": 7.784425799431852e-06, + "loss": 0.1686, + "step": 11484 + }, + { + "epoch": 0.58, + "grad_norm": 1.1269647961952738, + "learning_rate": 7.782819792434638e-06, + "loss": 0.174, + "step": 11485 + }, + { + "epoch": 0.58, + "grad_norm": 1.2486855535449515, + "learning_rate": 7.781213845578564e-06, + "loss": 0.181, + "step": 11486 + }, + { + "epoch": 0.58, + "grad_norm": 1.2255911988677148, + "learning_rate": 7.779607958907189e-06, + "loss": 0.2061, + "step": 11487 + }, + { + "epoch": 0.58, + "grad_norm": 0.6697046944408509, + "learning_rate": 7.778002132464077e-06, + "loss": 0.1637, + "step": 11488 + }, + { + "epoch": 0.58, + "grad_norm": 1.1222438318590247, + "learning_rate": 7.77639636629278e-06, + "loss": 0.2002, + "step": 11489 + }, + { + "epoch": 0.58, + "grad_norm": 0.8560548327618978, + "learning_rate": 7.774790660436857e-06, + "loss": 0.1879, + "step": 11490 + }, + { + "epoch": 0.58, + "grad_norm": 1.0547481846929991, + "learning_rate": 7.773185014939863e-06, + "loss": 0.2034, + "step": 11491 + }, + { + "epoch": 0.58, + "grad_norm": 0.9656593366959516, + "learning_rate": 7.771579429845353e-06, + "loss": 0.1883, + "step": 11492 + }, + { + "epoch": 0.58, + "grad_norm": 0.933937310893644, + "learning_rate": 7.769973905196875e-06, + "loss": 0.1685, + "step": 11493 + }, + { + "epoch": 0.58, + "grad_norm": 1.0471187930082608, + "learning_rate": 7.768368441037983e-06, + "loss": 0.1507, + "step": 11494 + }, + { + "epoch": 0.58, + "grad_norm": 0.8376089379719395, + "learning_rate": 7.766763037412219e-06, + "loss": 0.1825, + "step": 11495 + }, + { + "epoch": 0.58, + "grad_norm": 0.9080888688549174, + "learning_rate": 7.765157694363138e-06, + "loss": 0.2009, + "step": 11496 + }, + { + "epoch": 0.58, + "grad_norm": 0.8024492902468562, + "learning_rate": 7.763552411934277e-06, + "loss": 0.1785, + "step": 11497 + }, + { + "epoch": 0.58, + "grad_norm": 0.7860320019469207, + "learning_rate": 7.761947190169188e-06, + "loss": 0.1718, + "step": 11498 + }, + { + "epoch": 0.58, + "grad_norm": 1.0387294443548292, + "learning_rate": 7.760342029111403e-06, + "loss": 0.1829, + "step": 11499 + }, + { + "epoch": 0.58, + "grad_norm": 0.9896381467940631, + "learning_rate": 7.758736928804469e-06, + "loss": 0.1924, + "step": 11500 + }, + { + "epoch": 0.58, + "grad_norm": 1.0018665717164907, + "learning_rate": 7.757131889291925e-06, + "loss": 0.1735, + "step": 11501 + }, + { + "epoch": 0.58, + "grad_norm": 0.9894239106183639, + "learning_rate": 7.7555269106173e-06, + "loss": 0.1751, + "step": 11502 + }, + { + "epoch": 0.58, + "grad_norm": 0.8985708137642188, + "learning_rate": 7.753921992824139e-06, + "loss": 0.1964, + "step": 11503 + }, + { + "epoch": 0.58, + "grad_norm": 1.1661604244942374, + "learning_rate": 7.752317135955966e-06, + "loss": 0.1713, + "step": 11504 + }, + { + "epoch": 0.59, + "grad_norm": 0.8612006619090127, + "learning_rate": 7.750712340056323e-06, + "loss": 0.1987, + "step": 11505 + }, + { + "epoch": 0.59, + "grad_norm": 0.7680521604795602, + "learning_rate": 7.74910760516873e-06, + "loss": 0.1889, + "step": 11506 + }, + { + "epoch": 0.59, + "grad_norm": 0.9533786515303825, + "learning_rate": 7.747502931336726e-06, + "loss": 0.1727, + "step": 11507 + }, + { + "epoch": 0.59, + "grad_norm": 0.7931110934670622, + "learning_rate": 7.745898318603826e-06, + "loss": 0.1815, + "step": 11508 + }, + { + "epoch": 0.59, + "grad_norm": 1.9576609653751136, + "learning_rate": 7.744293767013564e-06, + "loss": 0.1833, + "step": 11509 + }, + { + "epoch": 0.59, + "grad_norm": 0.9853274013225376, + "learning_rate": 7.742689276609459e-06, + "loss": 0.1607, + "step": 11510 + }, + { + "epoch": 0.59, + "grad_norm": 0.8886805308841748, + "learning_rate": 7.74108484743504e-06, + "loss": 0.1788, + "step": 11511 + }, + { + "epoch": 0.59, + "grad_norm": 0.9910320281706113, + "learning_rate": 7.739480479533818e-06, + "loss": 0.1914, + "step": 11512 + }, + { + "epoch": 0.59, + "grad_norm": 0.9545227317871207, + "learning_rate": 7.737876172949317e-06, + "loss": 0.1613, + "step": 11513 + }, + { + "epoch": 0.59, + "grad_norm": 0.7746215432805033, + "learning_rate": 7.73627192772505e-06, + "loss": 0.1784, + "step": 11514 + }, + { + "epoch": 0.59, + "grad_norm": 1.1258108748177553, + "learning_rate": 7.73466774390454e-06, + "loss": 0.1493, + "step": 11515 + }, + { + "epoch": 0.59, + "grad_norm": 1.1555646260024954, + "learning_rate": 7.73306362153129e-06, + "loss": 0.1875, + "step": 11516 + }, + { + "epoch": 0.59, + "grad_norm": 1.0687324728634058, + "learning_rate": 7.73145956064882e-06, + "loss": 0.1546, + "step": 11517 + }, + { + "epoch": 0.59, + "grad_norm": 1.106528469273153, + "learning_rate": 7.72985556130064e-06, + "loss": 0.1887, + "step": 11518 + }, + { + "epoch": 0.59, + "grad_norm": 0.837873803762906, + "learning_rate": 7.728251623530253e-06, + "loss": 0.1703, + "step": 11519 + }, + { + "epoch": 0.59, + "grad_norm": 0.9792314296285167, + "learning_rate": 7.726647747381171e-06, + "loss": 0.1769, + "step": 11520 + }, + { + "epoch": 0.59, + "grad_norm": 1.036453018636925, + "learning_rate": 7.725043932896895e-06, + "loss": 0.1763, + "step": 11521 + }, + { + "epoch": 0.59, + "grad_norm": 1.6120548814324298, + "learning_rate": 7.723440180120932e-06, + "loss": 0.1884, + "step": 11522 + }, + { + "epoch": 0.59, + "grad_norm": 0.952343846906782, + "learning_rate": 7.72183648909678e-06, + "loss": 0.1594, + "step": 11523 + }, + { + "epoch": 0.59, + "grad_norm": 0.8464787482722366, + "learning_rate": 7.720232859867946e-06, + "loss": 0.1709, + "step": 11524 + }, + { + "epoch": 0.59, + "grad_norm": 0.84795764242471, + "learning_rate": 7.71862929247792e-06, + "loss": 0.1867, + "step": 11525 + }, + { + "epoch": 0.59, + "grad_norm": 1.0405235420112218, + "learning_rate": 7.717025786970208e-06, + "loss": 0.176, + "step": 11526 + }, + { + "epoch": 0.59, + "grad_norm": 1.781721864117291, + "learning_rate": 7.715422343388296e-06, + "loss": 0.1801, + "step": 11527 + }, + { + "epoch": 0.59, + "grad_norm": 0.9009036342142364, + "learning_rate": 7.713818961775686e-06, + "loss": 0.1775, + "step": 11528 + }, + { + "epoch": 0.59, + "grad_norm": 0.889676087190506, + "learning_rate": 7.712215642175862e-06, + "loss": 0.1806, + "step": 11529 + }, + { + "epoch": 0.59, + "grad_norm": 1.537611419687043, + "learning_rate": 7.710612384632321e-06, + "loss": 0.1986, + "step": 11530 + }, + { + "epoch": 0.59, + "grad_norm": 0.9885073435263368, + "learning_rate": 7.709009189188546e-06, + "loss": 0.2099, + "step": 11531 + }, + { + "epoch": 0.59, + "grad_norm": 1.157845104013614, + "learning_rate": 7.70740605588803e-06, + "loss": 0.1774, + "step": 11532 + }, + { + "epoch": 0.59, + "grad_norm": 1.5695577270079757, + "learning_rate": 7.70580298477425e-06, + "loss": 0.1729, + "step": 11533 + }, + { + "epoch": 0.59, + "grad_norm": 0.7935269584611251, + "learning_rate": 7.704199975890698e-06, + "loss": 0.1811, + "step": 11534 + }, + { + "epoch": 0.59, + "grad_norm": 1.302641620541749, + "learning_rate": 7.702597029280848e-06, + "loss": 0.1637, + "step": 11535 + }, + { + "epoch": 0.59, + "grad_norm": 1.0267924968478008, + "learning_rate": 7.700994144988183e-06, + "loss": 0.1992, + "step": 11536 + }, + { + "epoch": 0.59, + "grad_norm": 1.0705791649139917, + "learning_rate": 7.699391323056184e-06, + "loss": 0.1867, + "step": 11537 + }, + { + "epoch": 0.59, + "grad_norm": 1.0443080061831171, + "learning_rate": 7.697788563528323e-06, + "loss": 0.1743, + "step": 11538 + }, + { + "epoch": 0.59, + "grad_norm": 4.91455270222627, + "learning_rate": 7.696185866448079e-06, + "loss": 0.1781, + "step": 11539 + }, + { + "epoch": 0.59, + "grad_norm": 0.9676560323614188, + "learning_rate": 7.694583231858921e-06, + "loss": 0.1898, + "step": 11540 + }, + { + "epoch": 0.59, + "grad_norm": 0.8959806001854843, + "learning_rate": 7.692980659804327e-06, + "loss": 0.1763, + "step": 11541 + }, + { + "epoch": 0.59, + "grad_norm": 1.0203472241064597, + "learning_rate": 7.691378150327759e-06, + "loss": 0.1764, + "step": 11542 + }, + { + "epoch": 0.59, + "grad_norm": 1.1974668086090456, + "learning_rate": 7.689775703472691e-06, + "loss": 0.1866, + "step": 11543 + }, + { + "epoch": 0.59, + "grad_norm": 1.7475706747831443, + "learning_rate": 7.688173319282586e-06, + "loss": 0.1976, + "step": 11544 + }, + { + "epoch": 0.59, + "grad_norm": 0.935656524326181, + "learning_rate": 7.686570997800914e-06, + "loss": 0.1605, + "step": 11545 + }, + { + "epoch": 0.59, + "grad_norm": 0.8084136464393837, + "learning_rate": 7.68496873907113e-06, + "loss": 0.2016, + "step": 11546 + }, + { + "epoch": 0.59, + "grad_norm": 0.9290286115022512, + "learning_rate": 7.683366543136703e-06, + "loss": 0.1894, + "step": 11547 + }, + { + "epoch": 0.59, + "grad_norm": 1.4974991685353187, + "learning_rate": 7.681764410041087e-06, + "loss": 0.1957, + "step": 11548 + }, + { + "epoch": 0.59, + "grad_norm": 0.8232639356455898, + "learning_rate": 7.680162339827744e-06, + "loss": 0.1841, + "step": 11549 + }, + { + "epoch": 0.59, + "grad_norm": 0.9511264667655934, + "learning_rate": 7.678560332540126e-06, + "loss": 0.1775, + "step": 11550 + }, + { + "epoch": 0.59, + "grad_norm": 0.9769977386312791, + "learning_rate": 7.676958388221693e-06, + "loss": 0.1797, + "step": 11551 + }, + { + "epoch": 0.59, + "grad_norm": 1.167911556068729, + "learning_rate": 7.675356506915892e-06, + "loss": 0.1962, + "step": 11552 + }, + { + "epoch": 0.59, + "grad_norm": 0.8463201079565178, + "learning_rate": 7.67375468866618e-06, + "loss": 0.1771, + "step": 11553 + }, + { + "epoch": 0.59, + "grad_norm": 1.2272626867654062, + "learning_rate": 7.672152933516005e-06, + "loss": 0.1973, + "step": 11554 + }, + { + "epoch": 0.59, + "grad_norm": 2.094380301002933, + "learning_rate": 7.670551241508809e-06, + "loss": 0.1678, + "step": 11555 + }, + { + "epoch": 0.59, + "grad_norm": 0.915630672766507, + "learning_rate": 7.668949612688044e-06, + "loss": 0.1837, + "step": 11556 + }, + { + "epoch": 0.59, + "grad_norm": 0.7441259642618644, + "learning_rate": 7.667348047097151e-06, + "loss": 0.1744, + "step": 11557 + }, + { + "epoch": 0.59, + "grad_norm": 0.761767486300642, + "learning_rate": 7.665746544779577e-06, + "loss": 0.172, + "step": 11558 + }, + { + "epoch": 0.59, + "grad_norm": 0.6978167876222314, + "learning_rate": 7.664145105778755e-06, + "loss": 0.1646, + "step": 11559 + }, + { + "epoch": 0.59, + "grad_norm": 0.8980322487996942, + "learning_rate": 7.662543730138136e-06, + "loss": 0.1795, + "step": 11560 + }, + { + "epoch": 0.59, + "grad_norm": 1.1376196282122772, + "learning_rate": 7.660942417901145e-06, + "loss": 0.1729, + "step": 11561 + }, + { + "epoch": 0.59, + "grad_norm": 1.372232682551164, + "learning_rate": 7.659341169111222e-06, + "loss": 0.1562, + "step": 11562 + }, + { + "epoch": 0.59, + "grad_norm": 0.9663702113761, + "learning_rate": 7.657739983811803e-06, + "loss": 0.2076, + "step": 11563 + }, + { + "epoch": 0.59, + "grad_norm": 1.0129999221068482, + "learning_rate": 7.656138862046323e-06, + "loss": 0.1767, + "step": 11564 + }, + { + "epoch": 0.59, + "grad_norm": 0.9004518055692233, + "learning_rate": 7.654537803858205e-06, + "loss": 0.186, + "step": 11565 + }, + { + "epoch": 0.59, + "grad_norm": 0.8210445742427821, + "learning_rate": 7.652936809290883e-06, + "loss": 0.2024, + "step": 11566 + }, + { + "epoch": 0.59, + "grad_norm": 0.9394699808210847, + "learning_rate": 7.65133587838778e-06, + "loss": 0.1847, + "step": 11567 + }, + { + "epoch": 0.59, + "grad_norm": 0.9609354948526633, + "learning_rate": 7.649735011192329e-06, + "loss": 0.1912, + "step": 11568 + }, + { + "epoch": 0.59, + "grad_norm": 1.654273035810828, + "learning_rate": 7.648134207747944e-06, + "loss": 0.1791, + "step": 11569 + }, + { + "epoch": 0.59, + "grad_norm": 0.8138671952721602, + "learning_rate": 7.646533468098054e-06, + "loss": 0.1994, + "step": 11570 + }, + { + "epoch": 0.59, + "grad_norm": 0.9362668301075935, + "learning_rate": 7.644932792286078e-06, + "loss": 0.2046, + "step": 11571 + }, + { + "epoch": 0.59, + "grad_norm": 0.8749552721286621, + "learning_rate": 7.64333218035543e-06, + "loss": 0.1914, + "step": 11572 + }, + { + "epoch": 0.59, + "grad_norm": 0.8649832225609275, + "learning_rate": 7.64173163234953e-06, + "loss": 0.1579, + "step": 11573 + }, + { + "epoch": 0.59, + "grad_norm": 1.171430802862261, + "learning_rate": 7.640131148311791e-06, + "loss": 0.19, + "step": 11574 + }, + { + "epoch": 0.59, + "grad_norm": 1.2259477037304825, + "learning_rate": 7.638530728285633e-06, + "loss": 0.1808, + "step": 11575 + }, + { + "epoch": 0.59, + "grad_norm": 1.4841824061106357, + "learning_rate": 7.636930372314457e-06, + "loss": 0.19, + "step": 11576 + }, + { + "epoch": 0.59, + "grad_norm": 0.9639021119081354, + "learning_rate": 7.635330080441684e-06, + "loss": 0.1811, + "step": 11577 + }, + { + "epoch": 0.59, + "grad_norm": 0.9701950679898839, + "learning_rate": 7.633729852710711e-06, + "loss": 0.1714, + "step": 11578 + }, + { + "epoch": 0.59, + "grad_norm": 1.0768229910505707, + "learning_rate": 7.632129689164951e-06, + "loss": 0.2228, + "step": 11579 + }, + { + "epoch": 0.59, + "grad_norm": 1.3353356856633012, + "learning_rate": 7.630529589847807e-06, + "loss": 0.1921, + "step": 11580 + }, + { + "epoch": 0.59, + "grad_norm": 1.3622609087993482, + "learning_rate": 7.628929554802683e-06, + "loss": 0.1911, + "step": 11581 + }, + { + "epoch": 0.59, + "grad_norm": 0.8975720323589307, + "learning_rate": 7.627329584072976e-06, + "loss": 0.1796, + "step": 11582 + }, + { + "epoch": 0.59, + "grad_norm": 0.9448305446099222, + "learning_rate": 7.625729677702089e-06, + "loss": 0.197, + "step": 11583 + }, + { + "epoch": 0.59, + "grad_norm": 1.414447297300965, + "learning_rate": 7.624129835733418e-06, + "loss": 0.1847, + "step": 11584 + }, + { + "epoch": 0.59, + "grad_norm": 1.1403047850573143, + "learning_rate": 7.622530058210363e-06, + "loss": 0.1705, + "step": 11585 + }, + { + "epoch": 0.59, + "grad_norm": 0.7769183214743727, + "learning_rate": 7.62093034517631e-06, + "loss": 0.1736, + "step": 11586 + }, + { + "epoch": 0.59, + "grad_norm": 0.8832212847030111, + "learning_rate": 7.619330696674658e-06, + "loss": 0.1749, + "step": 11587 + }, + { + "epoch": 0.59, + "grad_norm": 1.0324852829353546, + "learning_rate": 7.6177311127487984e-06, + "loss": 0.17, + "step": 11588 + }, + { + "epoch": 0.59, + "grad_norm": 1.0803947078290193, + "learning_rate": 7.616131593442111e-06, + "loss": 0.1657, + "step": 11589 + }, + { + "epoch": 0.59, + "grad_norm": 1.3593432430994428, + "learning_rate": 7.614532138797994e-06, + "loss": 0.2016, + "step": 11590 + }, + { + "epoch": 0.59, + "grad_norm": 1.1694049472944021, + "learning_rate": 7.6129327488598225e-06, + "loss": 0.1627, + "step": 11591 + }, + { + "epoch": 0.59, + "grad_norm": 0.9088413784789771, + "learning_rate": 7.611333423670988e-06, + "loss": 0.184, + "step": 11592 + }, + { + "epoch": 0.59, + "grad_norm": 1.2524174806149122, + "learning_rate": 7.609734163274867e-06, + "loss": 0.1803, + "step": 11593 + }, + { + "epoch": 0.59, + "grad_norm": 0.8086586822961561, + "learning_rate": 7.608134967714846e-06, + "loss": 0.1657, + "step": 11594 + }, + { + "epoch": 0.59, + "grad_norm": 1.0821545601165399, + "learning_rate": 7.606535837034295e-06, + "loss": 0.1814, + "step": 11595 + }, + { + "epoch": 0.59, + "grad_norm": 1.1909757173673818, + "learning_rate": 7.604936771276596e-06, + "loss": 0.1925, + "step": 11596 + }, + { + "epoch": 0.59, + "grad_norm": 1.1089161855517706, + "learning_rate": 7.603337770485122e-06, + "loss": 0.2052, + "step": 11597 + }, + { + "epoch": 0.59, + "grad_norm": 0.9012722774072845, + "learning_rate": 7.601738834703249e-06, + "loss": 0.1869, + "step": 11598 + }, + { + "epoch": 0.59, + "grad_norm": 0.7300428580712354, + "learning_rate": 7.600139963974341e-06, + "loss": 0.1819, + "step": 11599 + }, + { + "epoch": 0.59, + "grad_norm": 0.9574724234455277, + "learning_rate": 7.598541158341774e-06, + "loss": 0.1799, + "step": 11600 + }, + { + "epoch": 0.59, + "grad_norm": 1.2387333692896372, + "learning_rate": 7.5969424178489134e-06, + "loss": 0.17, + "step": 11601 + }, + { + "epoch": 0.59, + "grad_norm": 0.907897336158818, + "learning_rate": 7.5953437425391284e-06, + "loss": 0.1887, + "step": 11602 + }, + { + "epoch": 0.59, + "grad_norm": 1.4548819371977963, + "learning_rate": 7.593745132455776e-06, + "loss": 0.1931, + "step": 11603 + }, + { + "epoch": 0.59, + "grad_norm": 1.0247766606758641, + "learning_rate": 7.592146587642227e-06, + "loss": 0.1961, + "step": 11604 + }, + { + "epoch": 0.59, + "grad_norm": 1.237579976507833, + "learning_rate": 7.5905481081418365e-06, + "loss": 0.1764, + "step": 11605 + }, + { + "epoch": 0.59, + "grad_norm": 0.9641963466945946, + "learning_rate": 7.588949693997962e-06, + "loss": 0.1806, + "step": 11606 + }, + { + "epoch": 0.59, + "grad_norm": 1.0322631441623458, + "learning_rate": 7.587351345253968e-06, + "loss": 0.1658, + "step": 11607 + }, + { + "epoch": 0.59, + "grad_norm": 0.8648225298624148, + "learning_rate": 7.585753061953199e-06, + "loss": 0.1559, + "step": 11608 + }, + { + "epoch": 0.59, + "grad_norm": 0.8683653108191068, + "learning_rate": 7.584154844139019e-06, + "loss": 0.1815, + "step": 11609 + }, + { + "epoch": 0.59, + "grad_norm": 2.291889450017652, + "learning_rate": 7.582556691854772e-06, + "loss": 0.1659, + "step": 11610 + }, + { + "epoch": 0.59, + "grad_norm": 1.074740180118149, + "learning_rate": 7.580958605143816e-06, + "loss": 0.1688, + "step": 11611 + }, + { + "epoch": 0.59, + "grad_norm": 1.8152071906384597, + "learning_rate": 7.579360584049489e-06, + "loss": 0.1947, + "step": 11612 + }, + { + "epoch": 0.59, + "grad_norm": 1.1801412874720607, + "learning_rate": 7.577762628615146e-06, + "loss": 0.1859, + "step": 11613 + }, + { + "epoch": 0.59, + "grad_norm": 1.034604716980372, + "learning_rate": 7.576164738884126e-06, + "loss": 0.1831, + "step": 11614 + }, + { + "epoch": 0.59, + "grad_norm": 0.9786064729118367, + "learning_rate": 7.574566914899779e-06, + "loss": 0.1892, + "step": 11615 + }, + { + "epoch": 0.59, + "grad_norm": 1.1607546685118113, + "learning_rate": 7.572969156705437e-06, + "loss": 0.1854, + "step": 11616 + }, + { + "epoch": 0.59, + "grad_norm": 0.8754173581044404, + "learning_rate": 7.571371464344448e-06, + "loss": 0.1663, + "step": 11617 + }, + { + "epoch": 0.59, + "grad_norm": 0.9708501178409704, + "learning_rate": 7.5697738378601406e-06, + "loss": 0.2012, + "step": 11618 + }, + { + "epoch": 0.59, + "grad_norm": 1.0361952174520443, + "learning_rate": 7.568176277295858e-06, + "loss": 0.1848, + "step": 11619 + }, + { + "epoch": 0.59, + "grad_norm": 1.31195992549576, + "learning_rate": 7.566578782694928e-06, + "loss": 0.1803, + "step": 11620 + }, + { + "epoch": 0.59, + "grad_norm": 1.0422363977834639, + "learning_rate": 7.564981354100691e-06, + "loss": 0.1711, + "step": 11621 + }, + { + "epoch": 0.59, + "grad_norm": 1.7208745670437828, + "learning_rate": 7.563383991556468e-06, + "loss": 0.1681, + "step": 11622 + }, + { + "epoch": 0.59, + "grad_norm": 1.4976767554787114, + "learning_rate": 7.561786695105593e-06, + "loss": 0.1692, + "step": 11623 + }, + { + "epoch": 0.59, + "grad_norm": 1.1242864380247015, + "learning_rate": 7.5601894647913955e-06, + "loss": 0.1749, + "step": 11624 + }, + { + "epoch": 0.59, + "grad_norm": 1.1508597414092376, + "learning_rate": 7.55859230065719e-06, + "loss": 0.165, + "step": 11625 + }, + { + "epoch": 0.59, + "grad_norm": 0.9724639392749773, + "learning_rate": 7.556995202746311e-06, + "loss": 0.1606, + "step": 11626 + }, + { + "epoch": 0.59, + "grad_norm": 0.7715827756096528, + "learning_rate": 7.555398171102072e-06, + "loss": 0.1771, + "step": 11627 + }, + { + "epoch": 0.59, + "grad_norm": 1.3122835713518486, + "learning_rate": 7.5538012057677985e-06, + "loss": 0.1746, + "step": 11628 + }, + { + "epoch": 0.59, + "grad_norm": 1.5803497435595275, + "learning_rate": 7.5522043067868034e-06, + "loss": 0.1704, + "step": 11629 + }, + { + "epoch": 0.59, + "grad_norm": 1.1103465528131011, + "learning_rate": 7.550607474202407e-06, + "loss": 0.1813, + "step": 11630 + }, + { + "epoch": 0.59, + "grad_norm": 0.998855733673789, + "learning_rate": 7.549010708057919e-06, + "loss": 0.1905, + "step": 11631 + }, + { + "epoch": 0.59, + "grad_norm": 2.5212565968715075, + "learning_rate": 7.5474140083966544e-06, + "loss": 0.1947, + "step": 11632 + }, + { + "epoch": 0.59, + "grad_norm": 0.973467654139691, + "learning_rate": 7.545817375261921e-06, + "loss": 0.1931, + "step": 11633 + }, + { + "epoch": 0.59, + "grad_norm": 0.941498013978441, + "learning_rate": 7.544220808697036e-06, + "loss": 0.1799, + "step": 11634 + }, + { + "epoch": 0.59, + "grad_norm": 0.7971826901881703, + "learning_rate": 7.542624308745294e-06, + "loss": 0.1502, + "step": 11635 + }, + { + "epoch": 0.59, + "grad_norm": 0.8618617274901741, + "learning_rate": 7.541027875450011e-06, + "loss": 0.1794, + "step": 11636 + }, + { + "epoch": 0.59, + "grad_norm": 0.8212372318307198, + "learning_rate": 7.539431508854482e-06, + "loss": 0.1771, + "step": 11637 + }, + { + "epoch": 0.59, + "grad_norm": 0.8450855924204931, + "learning_rate": 7.537835209002015e-06, + "loss": 0.1869, + "step": 11638 + }, + { + "epoch": 0.59, + "grad_norm": 2.1731059901473806, + "learning_rate": 7.536238975935906e-06, + "loss": 0.1832, + "step": 11639 + }, + { + "epoch": 0.59, + "grad_norm": 0.8971279390004603, + "learning_rate": 7.534642809699455e-06, + "loss": 0.1556, + "step": 11640 + }, + { + "epoch": 0.59, + "grad_norm": 1.279437219873534, + "learning_rate": 7.533046710335959e-06, + "loss": 0.2015, + "step": 11641 + }, + { + "epoch": 0.59, + "grad_norm": 1.3048828184889434, + "learning_rate": 7.531450677888706e-06, + "loss": 0.192, + "step": 11642 + }, + { + "epoch": 0.59, + "grad_norm": 1.0784834754808177, + "learning_rate": 7.529854712400996e-06, + "loss": 0.1835, + "step": 11643 + }, + { + "epoch": 0.59, + "grad_norm": 1.8167241931927995, + "learning_rate": 7.528258813916113e-06, + "loss": 0.175, + "step": 11644 + }, + { + "epoch": 0.59, + "grad_norm": 0.9433414492128455, + "learning_rate": 7.5266629824773506e-06, + "loss": 0.1663, + "step": 11645 + }, + { + "epoch": 0.59, + "grad_norm": 0.9127783252817138, + "learning_rate": 7.525067218127994e-06, + "loss": 0.1837, + "step": 11646 + }, + { + "epoch": 0.59, + "grad_norm": 0.8962957098263203, + "learning_rate": 7.523471520911332e-06, + "loss": 0.1634, + "step": 11647 + }, + { + "epoch": 0.59, + "grad_norm": 0.9029306025815423, + "learning_rate": 7.521875890870641e-06, + "loss": 0.1654, + "step": 11648 + }, + { + "epoch": 0.59, + "grad_norm": 1.472572365862214, + "learning_rate": 7.520280328049209e-06, + "loss": 0.1952, + "step": 11649 + }, + { + "epoch": 0.59, + "grad_norm": 0.8529342448124863, + "learning_rate": 7.518684832490311e-06, + "loss": 0.1789, + "step": 11650 + }, + { + "epoch": 0.59, + "grad_norm": 1.0128886629264706, + "learning_rate": 7.51708940423723e-06, + "loss": 0.1848, + "step": 11651 + }, + { + "epoch": 0.59, + "grad_norm": 0.8367934406403794, + "learning_rate": 7.5154940433332354e-06, + "loss": 0.1677, + "step": 11652 + }, + { + "epoch": 0.59, + "grad_norm": 1.7232024685942742, + "learning_rate": 7.513898749821607e-06, + "loss": 0.189, + "step": 11653 + }, + { + "epoch": 0.59, + "grad_norm": 0.9930819245270107, + "learning_rate": 7.5123035237456145e-06, + "loss": 0.19, + "step": 11654 + }, + { + "epoch": 0.59, + "grad_norm": 0.813383634612531, + "learning_rate": 7.510708365148534e-06, + "loss": 0.1614, + "step": 11655 + }, + { + "epoch": 0.59, + "grad_norm": 1.0880834403732438, + "learning_rate": 7.509113274073624e-06, + "loss": 0.1766, + "step": 11656 + }, + { + "epoch": 0.59, + "grad_norm": 1.0095368000735225, + "learning_rate": 7.507518250564162e-06, + "loss": 0.1605, + "step": 11657 + }, + { + "epoch": 0.59, + "grad_norm": 0.8563653106348561, + "learning_rate": 7.505923294663407e-06, + "loss": 0.1922, + "step": 11658 + }, + { + "epoch": 0.59, + "grad_norm": 0.9991615525590314, + "learning_rate": 7.5043284064146195e-06, + "loss": 0.1761, + "step": 11659 + }, + { + "epoch": 0.59, + "grad_norm": 0.8596434111528071, + "learning_rate": 7.50273358586107e-06, + "loss": 0.1658, + "step": 11660 + }, + { + "epoch": 0.59, + "grad_norm": 0.8708077223179779, + "learning_rate": 7.501138833046009e-06, + "loss": 0.17, + "step": 11661 + }, + { + "epoch": 0.59, + "grad_norm": 0.9135953852571289, + "learning_rate": 7.499544148012701e-06, + "loss": 0.1691, + "step": 11662 + }, + { + "epoch": 0.59, + "grad_norm": 1.1459883181004817, + "learning_rate": 7.4979495308043956e-06, + "loss": 0.1632, + "step": 11663 + }, + { + "epoch": 0.59, + "grad_norm": 1.0935777455155868, + "learning_rate": 7.496354981464355e-06, + "loss": 0.1817, + "step": 11664 + }, + { + "epoch": 0.59, + "grad_norm": 0.9206013159913617, + "learning_rate": 7.494760500035824e-06, + "loss": 0.1793, + "step": 11665 + }, + { + "epoch": 0.59, + "grad_norm": 0.9309447002221981, + "learning_rate": 7.493166086562057e-06, + "loss": 0.1669, + "step": 11666 + }, + { + "epoch": 0.59, + "grad_norm": 0.847841347921362, + "learning_rate": 7.4915717410862985e-06, + "loss": 0.1515, + "step": 11667 + }, + { + "epoch": 0.59, + "grad_norm": 0.8058277967072249, + "learning_rate": 7.489977463651805e-06, + "loss": 0.1861, + "step": 11668 + }, + { + "epoch": 0.59, + "grad_norm": 1.0099173797801007, + "learning_rate": 7.488383254301809e-06, + "loss": 0.1436, + "step": 11669 + }, + { + "epoch": 0.59, + "grad_norm": 1.146285183247257, + "learning_rate": 7.4867891130795625e-06, + "loss": 0.1771, + "step": 11670 + }, + { + "epoch": 0.59, + "grad_norm": 0.9109887240430162, + "learning_rate": 7.4851950400283e-06, + "loss": 0.1819, + "step": 11671 + }, + { + "epoch": 0.59, + "grad_norm": 1.0952192354772308, + "learning_rate": 7.483601035191265e-06, + "loss": 0.1899, + "step": 11672 + }, + { + "epoch": 0.59, + "grad_norm": 0.7876174516574952, + "learning_rate": 7.482007098611694e-06, + "loss": 0.1723, + "step": 11673 + }, + { + "epoch": 0.59, + "grad_norm": 0.9015007081376896, + "learning_rate": 7.480413230332826e-06, + "loss": 0.189, + "step": 11674 + }, + { + "epoch": 0.59, + "grad_norm": 1.1568448543002259, + "learning_rate": 7.478819430397888e-06, + "loss": 0.1847, + "step": 11675 + }, + { + "epoch": 0.59, + "grad_norm": 1.333752114037828, + "learning_rate": 7.4772256988501145e-06, + "loss": 0.1753, + "step": 11676 + }, + { + "epoch": 0.59, + "grad_norm": 0.8048652598854352, + "learning_rate": 7.4756320357327406e-06, + "loss": 0.1843, + "step": 11677 + }, + { + "epoch": 0.59, + "grad_norm": 0.8734669456812948, + "learning_rate": 7.474038441088987e-06, + "loss": 0.1565, + "step": 11678 + }, + { + "epoch": 0.59, + "grad_norm": 1.6532642783039464, + "learning_rate": 7.472444914962084e-06, + "loss": 0.1814, + "step": 11679 + }, + { + "epoch": 0.59, + "grad_norm": 1.0401268716166456, + "learning_rate": 7.470851457395254e-06, + "loss": 0.182, + "step": 11680 + }, + { + "epoch": 0.59, + "grad_norm": 1.4617086415693403, + "learning_rate": 7.469258068431724e-06, + "loss": 0.2092, + "step": 11681 + }, + { + "epoch": 0.59, + "grad_norm": 0.8619882782489962, + "learning_rate": 7.467664748114709e-06, + "loss": 0.1808, + "step": 11682 + }, + { + "epoch": 0.59, + "grad_norm": 0.9972063938232998, + "learning_rate": 7.466071496487432e-06, + "loss": 0.1763, + "step": 11683 + }, + { + "epoch": 0.59, + "grad_norm": 0.8998667580089821, + "learning_rate": 7.4644783135931076e-06, + "loss": 0.2041, + "step": 11684 + }, + { + "epoch": 0.59, + "grad_norm": 0.8611236553868576, + "learning_rate": 7.462885199474956e-06, + "loss": 0.1838, + "step": 11685 + }, + { + "epoch": 0.59, + "grad_norm": 1.5463735071999285, + "learning_rate": 7.461292154176183e-06, + "loss": 0.164, + "step": 11686 + }, + { + "epoch": 0.59, + "grad_norm": 0.9550680424020984, + "learning_rate": 7.459699177740006e-06, + "loss": 0.1831, + "step": 11687 + }, + { + "epoch": 0.59, + "grad_norm": 1.0858276043205346, + "learning_rate": 7.4581062702096295e-06, + "loss": 0.1742, + "step": 11688 + }, + { + "epoch": 0.59, + "grad_norm": 1.1960381092670782, + "learning_rate": 7.456513431628266e-06, + "loss": 0.2053, + "step": 11689 + }, + { + "epoch": 0.59, + "grad_norm": 1.127318310682017, + "learning_rate": 7.454920662039118e-06, + "loss": 0.1765, + "step": 11690 + }, + { + "epoch": 0.59, + "grad_norm": 0.8470517961431754, + "learning_rate": 7.4533279614853935e-06, + "loss": 0.1777, + "step": 11691 + }, + { + "epoch": 0.59, + "grad_norm": 0.9298438108234138, + "learning_rate": 7.451735330010288e-06, + "loss": 0.1779, + "step": 11692 + }, + { + "epoch": 0.59, + "grad_norm": 0.8895116893439401, + "learning_rate": 7.450142767657009e-06, + "loss": 0.1914, + "step": 11693 + }, + { + "epoch": 0.59, + "grad_norm": 1.2369497075642366, + "learning_rate": 7.448550274468752e-06, + "loss": 0.1621, + "step": 11694 + }, + { + "epoch": 0.59, + "grad_norm": 1.7270373270555748, + "learning_rate": 7.4469578504887094e-06, + "loss": 0.1751, + "step": 11695 + }, + { + "epoch": 0.59, + "grad_norm": 1.0363877318541614, + "learning_rate": 7.445365495760082e-06, + "loss": 0.1832, + "step": 11696 + }, + { + "epoch": 0.59, + "grad_norm": 1.473420292278499, + "learning_rate": 7.443773210326057e-06, + "loss": 0.1829, + "step": 11697 + }, + { + "epoch": 0.59, + "grad_norm": 1.0751573517871695, + "learning_rate": 7.4421809942298305e-06, + "loss": 0.1953, + "step": 11698 + }, + { + "epoch": 0.59, + "grad_norm": 0.9729007669559803, + "learning_rate": 7.440588847514587e-06, + "loss": 0.1871, + "step": 11699 + }, + { + "epoch": 0.59, + "grad_norm": 0.9134498944853251, + "learning_rate": 7.43899677022352e-06, + "loss": 0.1819, + "step": 11700 + }, + { + "epoch": 0.6, + "grad_norm": 0.9035168835614964, + "learning_rate": 7.437404762399805e-06, + "loss": 0.1806, + "step": 11701 + }, + { + "epoch": 0.6, + "grad_norm": 1.1679300875833856, + "learning_rate": 7.435812824086632e-06, + "loss": 0.1791, + "step": 11702 + }, + { + "epoch": 0.6, + "grad_norm": 0.8328750754816019, + "learning_rate": 7.434220955327181e-06, + "loss": 0.1596, + "step": 11703 + }, + { + "epoch": 0.6, + "grad_norm": 1.354004614535836, + "learning_rate": 7.4326291561646345e-06, + "loss": 0.1971, + "step": 11704 + }, + { + "epoch": 0.6, + "grad_norm": 1.2102840579710277, + "learning_rate": 7.4310374266421625e-06, + "loss": 0.1502, + "step": 11705 + }, + { + "epoch": 0.6, + "grad_norm": 1.0356127604610017, + "learning_rate": 7.429445766802949e-06, + "loss": 0.1783, + "step": 11706 + }, + { + "epoch": 0.6, + "grad_norm": 0.8936017296139833, + "learning_rate": 7.427854176690161e-06, + "loss": 0.1605, + "step": 11707 + }, + { + "epoch": 0.6, + "grad_norm": 1.4480314333799575, + "learning_rate": 7.426262656346979e-06, + "loss": 0.1862, + "step": 11708 + }, + { + "epoch": 0.6, + "grad_norm": 0.7651554481125494, + "learning_rate": 7.424671205816562e-06, + "loss": 0.1647, + "step": 11709 + }, + { + "epoch": 0.6, + "grad_norm": 1.125730464461837, + "learning_rate": 7.4230798251420865e-06, + "loss": 0.2053, + "step": 11710 + }, + { + "epoch": 0.6, + "grad_norm": 1.0174053546119461, + "learning_rate": 7.421488514366719e-06, + "loss": 0.1772, + "step": 11711 + }, + { + "epoch": 0.6, + "grad_norm": 1.0451093555191666, + "learning_rate": 7.419897273533616e-06, + "loss": 0.1855, + "step": 11712 + }, + { + "epoch": 0.6, + "grad_norm": 0.770060246482943, + "learning_rate": 7.41830610268595e-06, + "loss": 0.1818, + "step": 11713 + }, + { + "epoch": 0.6, + "grad_norm": 1.2007239777086642, + "learning_rate": 7.416715001866873e-06, + "loss": 0.184, + "step": 11714 + }, + { + "epoch": 0.6, + "grad_norm": 1.1565457242610033, + "learning_rate": 7.415123971119549e-06, + "loss": 0.1896, + "step": 11715 + }, + { + "epoch": 0.6, + "grad_norm": 0.9928207158486638, + "learning_rate": 7.4135330104871315e-06, + "loss": 0.1813, + "step": 11716 + }, + { + "epoch": 0.6, + "grad_norm": 1.1910867213681666, + "learning_rate": 7.411942120012781e-06, + "loss": 0.1823, + "step": 11717 + }, + { + "epoch": 0.6, + "grad_norm": 1.1848318570575225, + "learning_rate": 7.4103512997396434e-06, + "loss": 0.1878, + "step": 11718 + }, + { + "epoch": 0.6, + "grad_norm": 1.2032769546753332, + "learning_rate": 7.408760549710874e-06, + "loss": 0.1879, + "step": 11719 + }, + { + "epoch": 0.6, + "grad_norm": 0.7872865465004285, + "learning_rate": 7.40716986996962e-06, + "loss": 0.1565, + "step": 11720 + }, + { + "epoch": 0.6, + "grad_norm": 2.1778944733024477, + "learning_rate": 7.405579260559033e-06, + "loss": 0.1751, + "step": 11721 + }, + { + "epoch": 0.6, + "grad_norm": 2.6036289736231404, + "learning_rate": 7.4039887215222515e-06, + "loss": 0.1724, + "step": 11722 + }, + { + "epoch": 0.6, + "grad_norm": 0.7619375542330458, + "learning_rate": 7.402398252902425e-06, + "loss": 0.169, + "step": 11723 + }, + { + "epoch": 0.6, + "grad_norm": 1.0087057949594154, + "learning_rate": 7.400807854742689e-06, + "loss": 0.1545, + "step": 11724 + }, + { + "epoch": 0.6, + "grad_norm": 1.008949785204754, + "learning_rate": 7.399217527086192e-06, + "loss": 0.1866, + "step": 11725 + }, + { + "epoch": 0.6, + "grad_norm": 1.3812962714038624, + "learning_rate": 7.397627269976062e-06, + "loss": 0.1737, + "step": 11726 + }, + { + "epoch": 0.6, + "grad_norm": 1.0063596374669785, + "learning_rate": 7.396037083455444e-06, + "loss": 0.1723, + "step": 11727 + }, + { + "epoch": 0.6, + "grad_norm": 0.9817553872537692, + "learning_rate": 7.394446967567464e-06, + "loss": 0.1541, + "step": 11728 + }, + { + "epoch": 0.6, + "grad_norm": 0.9842802734124819, + "learning_rate": 7.392856922355256e-06, + "loss": 0.1935, + "step": 11729 + }, + { + "epoch": 0.6, + "grad_norm": 0.932910795695672, + "learning_rate": 7.3912669478619555e-06, + "loss": 0.1928, + "step": 11730 + }, + { + "epoch": 0.6, + "grad_norm": 0.9222028155289357, + "learning_rate": 7.389677044130682e-06, + "loss": 0.1671, + "step": 11731 + }, + { + "epoch": 0.6, + "grad_norm": 1.0197399095709354, + "learning_rate": 7.3880872112045685e-06, + "loss": 0.1622, + "step": 11732 + }, + { + "epoch": 0.6, + "grad_norm": 1.3319641346176132, + "learning_rate": 7.386497449126735e-06, + "loss": 0.1827, + "step": 11733 + }, + { + "epoch": 0.6, + "grad_norm": 1.352246925301863, + "learning_rate": 7.384907757940309e-06, + "loss": 0.1698, + "step": 11734 + }, + { + "epoch": 0.6, + "grad_norm": 1.2899917207740446, + "learning_rate": 7.383318137688403e-06, + "loss": 0.1954, + "step": 11735 + }, + { + "epoch": 0.6, + "grad_norm": 0.8328991342166945, + "learning_rate": 7.381728588414143e-06, + "loss": 0.1813, + "step": 11736 + }, + { + "epoch": 0.6, + "grad_norm": 1.0932700968247486, + "learning_rate": 7.380139110160642e-06, + "loss": 0.1883, + "step": 11737 + }, + { + "epoch": 0.6, + "grad_norm": 0.8564260352071182, + "learning_rate": 7.378549702971018e-06, + "loss": 0.1905, + "step": 11738 + }, + { + "epoch": 0.6, + "grad_norm": 1.2383818779988518, + "learning_rate": 7.3769603668883794e-06, + "loss": 0.1784, + "step": 11739 + }, + { + "epoch": 0.6, + "grad_norm": 0.9750897662967867, + "learning_rate": 7.375371101955842e-06, + "loss": 0.1606, + "step": 11740 + }, + { + "epoch": 0.6, + "grad_norm": 1.0507731167395518, + "learning_rate": 7.373781908216507e-06, + "loss": 0.174, + "step": 11741 + }, + { + "epoch": 0.6, + "grad_norm": 2.124368626038594, + "learning_rate": 7.372192785713489e-06, + "loss": 0.1719, + "step": 11742 + }, + { + "epoch": 0.6, + "grad_norm": 0.7837162544033184, + "learning_rate": 7.370603734489887e-06, + "loss": 0.159, + "step": 11743 + }, + { + "epoch": 0.6, + "grad_norm": 0.8844333450966607, + "learning_rate": 7.3690147545888124e-06, + "loss": 0.2085, + "step": 11744 + }, + { + "epoch": 0.6, + "grad_norm": 1.2703046460006664, + "learning_rate": 7.36742584605336e-06, + "loss": 0.1899, + "step": 11745 + }, + { + "epoch": 0.6, + "grad_norm": 1.0418363545100195, + "learning_rate": 7.365837008926626e-06, + "loss": 0.1729, + "step": 11746 + }, + { + "epoch": 0.6, + "grad_norm": 0.9182324287203589, + "learning_rate": 7.364248243251717e-06, + "loss": 0.1685, + "step": 11747 + }, + { + "epoch": 0.6, + "grad_norm": 1.4520427465418861, + "learning_rate": 7.362659549071719e-06, + "loss": 0.203, + "step": 11748 + }, + { + "epoch": 0.6, + "grad_norm": 0.8872792416800015, + "learning_rate": 7.361070926429732e-06, + "loss": 0.1743, + "step": 11749 + }, + { + "epoch": 0.6, + "grad_norm": 0.9247274282638314, + "learning_rate": 7.359482375368843e-06, + "loss": 0.1945, + "step": 11750 + }, + { + "epoch": 0.6, + "grad_norm": 1.0557085759243954, + "learning_rate": 7.357893895932148e-06, + "loss": 0.1891, + "step": 11751 + }, + { + "epoch": 0.6, + "grad_norm": 1.8863500229256194, + "learning_rate": 7.356305488162725e-06, + "loss": 0.1874, + "step": 11752 + }, + { + "epoch": 0.6, + "grad_norm": 0.6322796010243121, + "learning_rate": 7.3547171521036705e-06, + "loss": 0.1572, + "step": 11753 + }, + { + "epoch": 0.6, + "grad_norm": 1.1802599977822752, + "learning_rate": 7.353128887798058e-06, + "loss": 0.1852, + "step": 11754 + }, + { + "epoch": 0.6, + "grad_norm": 0.9031348770048434, + "learning_rate": 7.351540695288977e-06, + "loss": 0.1597, + "step": 11755 + }, + { + "epoch": 0.6, + "grad_norm": 1.3039431244166906, + "learning_rate": 7.349952574619501e-06, + "loss": 0.1914, + "step": 11756 + }, + { + "epoch": 0.6, + "grad_norm": 0.8685734518172754, + "learning_rate": 7.3483645258327145e-06, + "loss": 0.1813, + "step": 11757 + }, + { + "epoch": 0.6, + "grad_norm": 1.0207187065630585, + "learning_rate": 7.346776548971687e-06, + "loss": 0.198, + "step": 11758 + }, + { + "epoch": 0.6, + "grad_norm": 1.0524126214529708, + "learning_rate": 7.345188644079497e-06, + "loss": 0.1833, + "step": 11759 + }, + { + "epoch": 0.6, + "grad_norm": 0.8143507974511598, + "learning_rate": 7.3436008111992145e-06, + "loss": 0.1725, + "step": 11760 + }, + { + "epoch": 0.6, + "grad_norm": 0.8899011291190664, + "learning_rate": 7.342013050373913e-06, + "loss": 0.1763, + "step": 11761 + }, + { + "epoch": 0.6, + "grad_norm": 0.9923921534932216, + "learning_rate": 7.340425361646653e-06, + "loss": 0.1843, + "step": 11762 + }, + { + "epoch": 0.6, + "grad_norm": 0.7734285672321267, + "learning_rate": 7.338837745060508e-06, + "loss": 0.1713, + "step": 11763 + }, + { + "epoch": 0.6, + "grad_norm": 0.9661992269342103, + "learning_rate": 7.337250200658541e-06, + "loss": 0.1829, + "step": 11764 + }, + { + "epoch": 0.6, + "grad_norm": 0.9162980784153151, + "learning_rate": 7.335662728483808e-06, + "loss": 0.1751, + "step": 11765 + }, + { + "epoch": 0.6, + "grad_norm": 2.6337898949008385, + "learning_rate": 7.33407532857938e-06, + "loss": 0.1695, + "step": 11766 + }, + { + "epoch": 0.6, + "grad_norm": 1.1236715789822076, + "learning_rate": 7.332488000988303e-06, + "loss": 0.1855, + "step": 11767 + }, + { + "epoch": 0.6, + "grad_norm": 1.157821899167246, + "learning_rate": 7.330900745753643e-06, + "loss": 0.1811, + "step": 11768 + }, + { + "epoch": 0.6, + "grad_norm": 0.9014010739784609, + "learning_rate": 7.329313562918449e-06, + "loss": 0.1748, + "step": 11769 + }, + { + "epoch": 0.6, + "grad_norm": 1.13217901184811, + "learning_rate": 7.327726452525779e-06, + "loss": 0.1707, + "step": 11770 + }, + { + "epoch": 0.6, + "grad_norm": 0.9580859428605121, + "learning_rate": 7.326139414618676e-06, + "loss": 0.1931, + "step": 11771 + }, + { + "epoch": 0.6, + "grad_norm": 0.9840830862799712, + "learning_rate": 7.324552449240194e-06, + "loss": 0.1843, + "step": 11772 + }, + { + "epoch": 0.6, + "grad_norm": 0.9225628334085767, + "learning_rate": 7.322965556433377e-06, + "loss": 0.1967, + "step": 11773 + }, + { + "epoch": 0.6, + "grad_norm": 0.8297351049262989, + "learning_rate": 7.321378736241274e-06, + "loss": 0.1662, + "step": 11774 + }, + { + "epoch": 0.6, + "grad_norm": 0.9391575526978314, + "learning_rate": 7.31979198870692e-06, + "loss": 0.1803, + "step": 11775 + }, + { + "epoch": 0.6, + "grad_norm": 0.8288981321997082, + "learning_rate": 7.318205313873361e-06, + "loss": 0.1572, + "step": 11776 + }, + { + "epoch": 0.6, + "grad_norm": 1.0773086929648976, + "learning_rate": 7.316618711783634e-06, + "loss": 0.2063, + "step": 11777 + }, + { + "epoch": 0.6, + "grad_norm": 1.1221116623307208, + "learning_rate": 7.315032182480779e-06, + "loss": 0.181, + "step": 11778 + }, + { + "epoch": 0.6, + "grad_norm": 1.0846276680180476, + "learning_rate": 7.313445726007824e-06, + "loss": 0.1942, + "step": 11779 + }, + { + "epoch": 0.6, + "grad_norm": 0.9901471340187721, + "learning_rate": 7.311859342407809e-06, + "loss": 0.1945, + "step": 11780 + }, + { + "epoch": 0.6, + "grad_norm": 0.8943804861069643, + "learning_rate": 7.310273031723759e-06, + "loss": 0.1569, + "step": 11781 + }, + { + "epoch": 0.6, + "grad_norm": 0.8785188472162674, + "learning_rate": 7.3086867939987025e-06, + "loss": 0.1808, + "step": 11782 + }, + { + "epoch": 0.6, + "grad_norm": 1.2553735501788523, + "learning_rate": 7.307100629275674e-06, + "loss": 0.1856, + "step": 11783 + }, + { + "epoch": 0.6, + "grad_norm": 0.7430195608217605, + "learning_rate": 7.305514537597689e-06, + "loss": 0.1745, + "step": 11784 + }, + { + "epoch": 0.6, + "grad_norm": 0.8762282857033288, + "learning_rate": 7.303928519007776e-06, + "loss": 0.1854, + "step": 11785 + }, + { + "epoch": 0.6, + "grad_norm": 0.973383706689411, + "learning_rate": 7.302342573548952e-06, + "loss": 0.2029, + "step": 11786 + }, + { + "epoch": 0.6, + "grad_norm": 1.2048069280922016, + "learning_rate": 7.300756701264242e-06, + "loss": 0.1754, + "step": 11787 + }, + { + "epoch": 0.6, + "grad_norm": 1.330264833832681, + "learning_rate": 7.299170902196655e-06, + "loss": 0.18, + "step": 11788 + }, + { + "epoch": 0.6, + "grad_norm": 0.8528049614191048, + "learning_rate": 7.297585176389212e-06, + "loss": 0.1892, + "step": 11789 + }, + { + "epoch": 0.6, + "grad_norm": 0.9450155315439861, + "learning_rate": 7.295999523884921e-06, + "loss": 0.168, + "step": 11790 + }, + { + "epoch": 0.6, + "grad_norm": 0.9239769701781485, + "learning_rate": 7.294413944726801e-06, + "loss": 0.1762, + "step": 11791 + }, + { + "epoch": 0.6, + "grad_norm": 1.1140158903085968, + "learning_rate": 7.292828438957851e-06, + "loss": 0.1669, + "step": 11792 + }, + { + "epoch": 0.6, + "grad_norm": 0.7712235884304885, + "learning_rate": 7.291243006621084e-06, + "loss": 0.1739, + "step": 11793 + }, + { + "epoch": 0.6, + "grad_norm": 1.0059461187022227, + "learning_rate": 7.289657647759501e-06, + "loss": 0.163, + "step": 11794 + }, + { + "epoch": 0.6, + "grad_norm": 0.9109098709361532, + "learning_rate": 7.288072362416112e-06, + "loss": 0.1922, + "step": 11795 + }, + { + "epoch": 0.6, + "grad_norm": 0.9489883402080502, + "learning_rate": 7.28648715063391e-06, + "loss": 0.16, + "step": 11796 + }, + { + "epoch": 0.6, + "grad_norm": 1.0745977552488903, + "learning_rate": 7.2849020124559015e-06, + "loss": 0.1783, + "step": 11797 + }, + { + "epoch": 0.6, + "grad_norm": 0.9793700759659624, + "learning_rate": 7.283316947925075e-06, + "loss": 0.2013, + "step": 11798 + }, + { + "epoch": 0.6, + "grad_norm": 0.9171954822057149, + "learning_rate": 7.28173195708443e-06, + "loss": 0.1914, + "step": 11799 + }, + { + "epoch": 0.6, + "grad_norm": 0.9517807304858662, + "learning_rate": 7.280147039976963e-06, + "loss": 0.1757, + "step": 11800 + }, + { + "epoch": 0.6, + "grad_norm": 0.9207643617133459, + "learning_rate": 7.278562196645656e-06, + "loss": 0.1978, + "step": 11801 + }, + { + "epoch": 0.6, + "grad_norm": 1.2926951372931075, + "learning_rate": 7.276977427133505e-06, + "loss": 0.2026, + "step": 11802 + }, + { + "epoch": 0.6, + "grad_norm": 1.1629358351213168, + "learning_rate": 7.275392731483495e-06, + "loss": 0.1937, + "step": 11803 + }, + { + "epoch": 0.6, + "grad_norm": 0.9957458561200179, + "learning_rate": 7.273808109738614e-06, + "loss": 0.1602, + "step": 11804 + }, + { + "epoch": 0.6, + "grad_norm": 1.4993122787846052, + "learning_rate": 7.272223561941837e-06, + "loss": 0.1736, + "step": 11805 + }, + { + "epoch": 0.6, + "grad_norm": 1.3441130422411014, + "learning_rate": 7.270639088136154e-06, + "loss": 0.1756, + "step": 11806 + }, + { + "epoch": 0.6, + "grad_norm": 0.9196103667338753, + "learning_rate": 7.269054688364535e-06, + "loss": 0.1838, + "step": 11807 + }, + { + "epoch": 0.6, + "grad_norm": 1.0711928579048098, + "learning_rate": 7.2674703626699685e-06, + "loss": 0.1731, + "step": 11808 + }, + { + "epoch": 0.6, + "grad_norm": 1.1727203349150228, + "learning_rate": 7.265886111095417e-06, + "loss": 0.2018, + "step": 11809 + }, + { + "epoch": 0.6, + "grad_norm": 1.19263133780406, + "learning_rate": 7.264301933683864e-06, + "loss": 0.1866, + "step": 11810 + }, + { + "epoch": 0.6, + "grad_norm": 0.9406769046267162, + "learning_rate": 7.262717830478272e-06, + "loss": 0.1979, + "step": 11811 + }, + { + "epoch": 0.6, + "grad_norm": 0.9433538747251087, + "learning_rate": 7.261133801521614e-06, + "loss": 0.1738, + "step": 11812 + }, + { + "epoch": 0.6, + "grad_norm": 0.9999985459271858, + "learning_rate": 7.259549846856855e-06, + "loss": 0.1768, + "step": 11813 + }, + { + "epoch": 0.6, + "grad_norm": 2.089501907253873, + "learning_rate": 7.257965966526966e-06, + "loss": 0.1793, + "step": 11814 + }, + { + "epoch": 0.6, + "grad_norm": 1.1513213292828732, + "learning_rate": 7.256382160574902e-06, + "loss": 0.1967, + "step": 11815 + }, + { + "epoch": 0.6, + "grad_norm": 0.9955636307423217, + "learning_rate": 7.254798429043626e-06, + "loss": 0.1881, + "step": 11816 + }, + { + "epoch": 0.6, + "grad_norm": 0.9804159920200639, + "learning_rate": 7.2532147719761e-06, + "loss": 0.1697, + "step": 11817 + }, + { + "epoch": 0.6, + "grad_norm": 1.1111216413748202, + "learning_rate": 7.251631189415275e-06, + "loss": 0.1902, + "step": 11818 + }, + { + "epoch": 0.6, + "grad_norm": 1.0571395710658018, + "learning_rate": 7.250047681404113e-06, + "loss": 0.2079, + "step": 11819 + }, + { + "epoch": 0.6, + "grad_norm": 2.1265894734074346, + "learning_rate": 7.248464247985558e-06, + "loss": 0.192, + "step": 11820 + }, + { + "epoch": 0.6, + "grad_norm": 0.8806117017761279, + "learning_rate": 7.246880889202572e-06, + "loss": 0.155, + "step": 11821 + }, + { + "epoch": 0.6, + "grad_norm": 0.9507389103704897, + "learning_rate": 7.245297605098093e-06, + "loss": 0.1478, + "step": 11822 + }, + { + "epoch": 0.6, + "grad_norm": 1.2828201375748278, + "learning_rate": 7.243714395715076e-06, + "loss": 0.1679, + "step": 11823 + }, + { + "epoch": 0.6, + "grad_norm": 2.137428365187284, + "learning_rate": 7.242131261096457e-06, + "loss": 0.1591, + "step": 11824 + }, + { + "epoch": 0.6, + "grad_norm": 0.9450990752849141, + "learning_rate": 7.240548201285186e-06, + "loss": 0.1715, + "step": 11825 + }, + { + "epoch": 0.6, + "grad_norm": 0.9348710156084757, + "learning_rate": 7.238965216324199e-06, + "loss": 0.1897, + "step": 11826 + }, + { + "epoch": 0.6, + "grad_norm": 1.2180133790398777, + "learning_rate": 7.2373823062564396e-06, + "loss": 0.1676, + "step": 11827 + }, + { + "epoch": 0.6, + "grad_norm": 1.4145821638061544, + "learning_rate": 7.235799471124838e-06, + "loss": 0.1753, + "step": 11828 + }, + { + "epoch": 0.6, + "grad_norm": 1.479901347156181, + "learning_rate": 7.234216710972333e-06, + "loss": 0.1736, + "step": 11829 + }, + { + "epoch": 0.6, + "grad_norm": 1.0622603323670472, + "learning_rate": 7.232634025841853e-06, + "loss": 0.1667, + "step": 11830 + }, + { + "epoch": 0.6, + "grad_norm": 0.8733703542173635, + "learning_rate": 7.231051415776338e-06, + "loss": 0.1768, + "step": 11831 + }, + { + "epoch": 0.6, + "grad_norm": 0.7926624980613768, + "learning_rate": 7.2294688808187045e-06, + "loss": 0.1545, + "step": 11832 + }, + { + "epoch": 0.6, + "grad_norm": 0.8767977208712247, + "learning_rate": 7.227886421011886e-06, + "loss": 0.1706, + "step": 11833 + }, + { + "epoch": 0.6, + "grad_norm": 0.9092722362527742, + "learning_rate": 7.226304036398808e-06, + "loss": 0.179, + "step": 11834 + }, + { + "epoch": 0.6, + "grad_norm": 0.9596541254080619, + "learning_rate": 7.224721727022384e-06, + "loss": 0.1998, + "step": 11835 + }, + { + "epoch": 0.6, + "grad_norm": 1.2707631192094582, + "learning_rate": 7.223139492925544e-06, + "loss": 0.1838, + "step": 11836 + }, + { + "epoch": 0.6, + "grad_norm": 1.4641264206313687, + "learning_rate": 7.221557334151199e-06, + "loss": 0.1954, + "step": 11837 + }, + { + "epoch": 0.6, + "grad_norm": 1.0628876393856026, + "learning_rate": 7.21997525074227e-06, + "loss": 0.1831, + "step": 11838 + }, + { + "epoch": 0.6, + "grad_norm": 0.8944603679621254, + "learning_rate": 7.218393242741667e-06, + "loss": 0.1661, + "step": 11839 + }, + { + "epoch": 0.6, + "grad_norm": 0.8294708528212483, + "learning_rate": 7.2168113101923085e-06, + "loss": 0.1595, + "step": 11840 + }, + { + "epoch": 0.6, + "grad_norm": 0.9545116326224699, + "learning_rate": 7.215229453137097e-06, + "loss": 0.1907, + "step": 11841 + }, + { + "epoch": 0.6, + "grad_norm": 1.5609981833288196, + "learning_rate": 7.213647671618945e-06, + "loss": 0.2064, + "step": 11842 + }, + { + "epoch": 0.6, + "grad_norm": 1.2486096471058774, + "learning_rate": 7.212065965680755e-06, + "loss": 0.1733, + "step": 11843 + }, + { + "epoch": 0.6, + "grad_norm": 0.9830128638983774, + "learning_rate": 7.210484335365438e-06, + "loss": 0.1777, + "step": 11844 + }, + { + "epoch": 0.6, + "grad_norm": 1.1222461529512562, + "learning_rate": 7.208902780715888e-06, + "loss": 0.1849, + "step": 11845 + }, + { + "epoch": 0.6, + "grad_norm": 0.9584478418686001, + "learning_rate": 7.207321301775008e-06, + "loss": 0.1651, + "step": 11846 + }, + { + "epoch": 0.6, + "grad_norm": 1.0400600781793397, + "learning_rate": 7.205739898585693e-06, + "loss": 0.2062, + "step": 11847 + }, + { + "epoch": 0.6, + "grad_norm": 2.101326132606511, + "learning_rate": 7.204158571190847e-06, + "loss": 0.1983, + "step": 11848 + }, + { + "epoch": 0.6, + "grad_norm": 0.9634348795913377, + "learning_rate": 7.202577319633353e-06, + "loss": 0.1736, + "step": 11849 + }, + { + "epoch": 0.6, + "grad_norm": 0.8806079665225696, + "learning_rate": 7.200996143956111e-06, + "loss": 0.1777, + "step": 11850 + }, + { + "epoch": 0.6, + "grad_norm": 1.1346657628354975, + "learning_rate": 7.199415044202004e-06, + "loss": 0.1747, + "step": 11851 + }, + { + "epoch": 0.6, + "grad_norm": 1.7454248060467574, + "learning_rate": 7.1978340204139205e-06, + "loss": 0.229, + "step": 11852 + }, + { + "epoch": 0.6, + "grad_norm": 0.8133745687679493, + "learning_rate": 7.196253072634751e-06, + "loss": 0.1764, + "step": 11853 + }, + { + "epoch": 0.6, + "grad_norm": 0.9192064226925258, + "learning_rate": 7.194672200907373e-06, + "loss": 0.1787, + "step": 11854 + }, + { + "epoch": 0.6, + "grad_norm": 0.9325417312120747, + "learning_rate": 7.193091405274671e-06, + "loss": 0.1826, + "step": 11855 + }, + { + "epoch": 0.6, + "grad_norm": 0.8704646661650539, + "learning_rate": 7.19151068577952e-06, + "loss": 0.1806, + "step": 11856 + }, + { + "epoch": 0.6, + "grad_norm": 0.9427144139646895, + "learning_rate": 7.189930042464806e-06, + "loss": 0.1696, + "step": 11857 + }, + { + "epoch": 0.6, + "grad_norm": 0.7537092950958617, + "learning_rate": 7.188349475373393e-06, + "loss": 0.16, + "step": 11858 + }, + { + "epoch": 0.6, + "grad_norm": 1.35349247055404, + "learning_rate": 7.186768984548162e-06, + "loss": 0.1975, + "step": 11859 + }, + { + "epoch": 0.6, + "grad_norm": 1.1981338252651514, + "learning_rate": 7.185188570031979e-06, + "loss": 0.19, + "step": 11860 + }, + { + "epoch": 0.6, + "grad_norm": 1.001442490909649, + "learning_rate": 7.18360823186772e-06, + "loss": 0.1824, + "step": 11861 + }, + { + "epoch": 0.6, + "grad_norm": 0.8589437176538642, + "learning_rate": 7.182027970098242e-06, + "loss": 0.1514, + "step": 11862 + }, + { + "epoch": 0.6, + "grad_norm": 1.3932418492827867, + "learning_rate": 7.180447784766418e-06, + "loss": 0.1654, + "step": 11863 + }, + { + "epoch": 0.6, + "grad_norm": 1.680823262999588, + "learning_rate": 7.178867675915104e-06, + "loss": 0.1665, + "step": 11864 + }, + { + "epoch": 0.6, + "grad_norm": 0.8770714709369166, + "learning_rate": 7.177287643587164e-06, + "loss": 0.1608, + "step": 11865 + }, + { + "epoch": 0.6, + "grad_norm": 1.4125927927249728, + "learning_rate": 7.175707687825455e-06, + "loss": 0.1637, + "step": 11866 + }, + { + "epoch": 0.6, + "grad_norm": 0.9644948475142574, + "learning_rate": 7.1741278086728395e-06, + "loss": 0.1644, + "step": 11867 + }, + { + "epoch": 0.6, + "grad_norm": 1.0163483995642684, + "learning_rate": 7.172548006172164e-06, + "loss": 0.202, + "step": 11868 + }, + { + "epoch": 0.6, + "grad_norm": 1.0437266979742186, + "learning_rate": 7.170968280366281e-06, + "loss": 0.1642, + "step": 11869 + }, + { + "epoch": 0.6, + "grad_norm": 1.8267503919071744, + "learning_rate": 7.1693886312980475e-06, + "loss": 0.1618, + "step": 11870 + }, + { + "epoch": 0.6, + "grad_norm": 1.1865281005301913, + "learning_rate": 7.1678090590103035e-06, + "loss": 0.1923, + "step": 11871 + }, + { + "epoch": 0.6, + "grad_norm": 0.9358919918321995, + "learning_rate": 7.166229563545901e-06, + "loss": 0.1808, + "step": 11872 + }, + { + "epoch": 0.6, + "grad_norm": 2.050465910789463, + "learning_rate": 7.164650144947679e-06, + "loss": 0.1753, + "step": 11873 + }, + { + "epoch": 0.6, + "grad_norm": 1.0196472656738296, + "learning_rate": 7.163070803258486e-06, + "loss": 0.2121, + "step": 11874 + }, + { + "epoch": 0.6, + "grad_norm": 2.553112700593638, + "learning_rate": 7.161491538521154e-06, + "loss": 0.1763, + "step": 11875 + }, + { + "epoch": 0.6, + "grad_norm": 1.012673181882685, + "learning_rate": 7.159912350778528e-06, + "loss": 0.1572, + "step": 11876 + }, + { + "epoch": 0.6, + "grad_norm": 1.216975617994192, + "learning_rate": 7.158333240073436e-06, + "loss": 0.1983, + "step": 11877 + }, + { + "epoch": 0.6, + "grad_norm": 0.9765534866537904, + "learning_rate": 7.156754206448718e-06, + "loss": 0.1688, + "step": 11878 + }, + { + "epoch": 0.6, + "grad_norm": 0.7845748811610675, + "learning_rate": 7.1551752499472005e-06, + "loss": 0.19, + "step": 11879 + }, + { + "epoch": 0.6, + "grad_norm": 1.424597105206769, + "learning_rate": 7.153596370611719e-06, + "loss": 0.1788, + "step": 11880 + }, + { + "epoch": 0.6, + "grad_norm": 1.3585442671939263, + "learning_rate": 7.152017568485092e-06, + "loss": 0.1835, + "step": 11881 + }, + { + "epoch": 0.6, + "grad_norm": 2.7360441143193666, + "learning_rate": 7.150438843610152e-06, + "loss": 0.189, + "step": 11882 + }, + { + "epoch": 0.6, + "grad_norm": 0.9452059160457844, + "learning_rate": 7.148860196029717e-06, + "loss": 0.2015, + "step": 11883 + }, + { + "epoch": 0.6, + "grad_norm": 1.2493024368049896, + "learning_rate": 7.147281625786615e-06, + "loss": 0.1752, + "step": 11884 + }, + { + "epoch": 0.6, + "grad_norm": 0.883847817100227, + "learning_rate": 7.145703132923657e-06, + "loss": 0.1823, + "step": 11885 + }, + { + "epoch": 0.6, + "grad_norm": 1.906830011163742, + "learning_rate": 7.144124717483661e-06, + "loss": 0.175, + "step": 11886 + }, + { + "epoch": 0.6, + "grad_norm": 0.9714690367328279, + "learning_rate": 7.1425463795094476e-06, + "loss": 0.163, + "step": 11887 + }, + { + "epoch": 0.6, + "grad_norm": 1.1441987658528388, + "learning_rate": 7.14096811904382e-06, + "loss": 0.1728, + "step": 11888 + }, + { + "epoch": 0.6, + "grad_norm": 0.9804796842368632, + "learning_rate": 7.139389936129599e-06, + "loss": 0.1925, + "step": 11889 + }, + { + "epoch": 0.6, + "grad_norm": 1.2599611555433587, + "learning_rate": 7.1378118308095835e-06, + "loss": 0.1869, + "step": 11890 + }, + { + "epoch": 0.6, + "grad_norm": 1.0853613521821157, + "learning_rate": 7.136233803126584e-06, + "loss": 0.1751, + "step": 11891 + }, + { + "epoch": 0.6, + "grad_norm": 2.6086541355655184, + "learning_rate": 7.1346558531234046e-06, + "loss": 0.1936, + "step": 11892 + }, + { + "epoch": 0.6, + "grad_norm": 1.1234407378780253, + "learning_rate": 7.133077980842851e-06, + "loss": 0.1855, + "step": 11893 + }, + { + "epoch": 0.6, + "grad_norm": 1.1006285198696308, + "learning_rate": 7.1315001863277135e-06, + "loss": 0.1831, + "step": 11894 + }, + { + "epoch": 0.6, + "grad_norm": 1.128971427493001, + "learning_rate": 7.129922469620798e-06, + "loss": 0.1685, + "step": 11895 + }, + { + "epoch": 0.6, + "grad_norm": 0.878266359471762, + "learning_rate": 7.128344830764895e-06, + "loss": 0.1826, + "step": 11896 + }, + { + "epoch": 0.6, + "grad_norm": 1.4104581272003494, + "learning_rate": 7.126767269802806e-06, + "loss": 0.1746, + "step": 11897 + }, + { + "epoch": 0.61, + "grad_norm": 1.1426165349149782, + "learning_rate": 7.125189786777312e-06, + "loss": 0.1707, + "step": 11898 + }, + { + "epoch": 0.61, + "grad_norm": 1.3657276142785477, + "learning_rate": 7.123612381731211e-06, + "loss": 0.1663, + "step": 11899 + }, + { + "epoch": 0.61, + "grad_norm": 1.014592637511237, + "learning_rate": 7.122035054707283e-06, + "loss": 0.1808, + "step": 11900 + }, + { + "epoch": 0.61, + "grad_norm": 1.059580390073793, + "learning_rate": 7.1204578057483206e-06, + "loss": 0.1851, + "step": 11901 + }, + { + "epoch": 0.61, + "grad_norm": 1.1218977907688872, + "learning_rate": 7.1188806348971e-06, + "loss": 0.192, + "step": 11902 + }, + { + "epoch": 0.61, + "grad_norm": 0.8440398569371811, + "learning_rate": 7.117303542196406e-06, + "loss": 0.1728, + "step": 11903 + }, + { + "epoch": 0.61, + "grad_norm": 1.0808151942873958, + "learning_rate": 7.1157265276890195e-06, + "loss": 0.1944, + "step": 11904 + }, + { + "epoch": 0.61, + "grad_norm": 1.194100575050271, + "learning_rate": 7.114149591417709e-06, + "loss": 0.1842, + "step": 11905 + }, + { + "epoch": 0.61, + "grad_norm": 0.9826936837280463, + "learning_rate": 7.112572733425257e-06, + "loss": 0.1748, + "step": 11906 + }, + { + "epoch": 0.61, + "grad_norm": 1.234040095182392, + "learning_rate": 7.11099595375443e-06, + "loss": 0.1875, + "step": 11907 + }, + { + "epoch": 0.61, + "grad_norm": 0.9664874950364423, + "learning_rate": 7.1094192524480025e-06, + "loss": 0.1562, + "step": 11908 + }, + { + "epoch": 0.61, + "grad_norm": 0.7664885752023356, + "learning_rate": 7.107842629548738e-06, + "loss": 0.1667, + "step": 11909 + }, + { + "epoch": 0.61, + "grad_norm": 0.8510699508661388, + "learning_rate": 7.106266085099412e-06, + "loss": 0.1942, + "step": 11910 + }, + { + "epoch": 0.61, + "grad_norm": 1.783986982922015, + "learning_rate": 7.104689619142775e-06, + "loss": 0.1705, + "step": 11911 + }, + { + "epoch": 0.61, + "grad_norm": 0.7713147450056765, + "learning_rate": 7.103113231721599e-06, + "loss": 0.1825, + "step": 11912 + }, + { + "epoch": 0.61, + "grad_norm": 0.8175255022774114, + "learning_rate": 7.101536922878638e-06, + "loss": 0.1721, + "step": 11913 + }, + { + "epoch": 0.61, + "grad_norm": 1.5835071361004465, + "learning_rate": 7.0999606926566554e-06, + "loss": 0.1853, + "step": 11914 + }, + { + "epoch": 0.61, + "grad_norm": 1.4009658683843862, + "learning_rate": 7.0983845410984e-06, + "loss": 0.187, + "step": 11915 + }, + { + "epoch": 0.61, + "grad_norm": 1.0056035099506464, + "learning_rate": 7.096808468246629e-06, + "loss": 0.1735, + "step": 11916 + }, + { + "epoch": 0.61, + "grad_norm": 1.0911482613971564, + "learning_rate": 7.095232474144089e-06, + "loss": 0.1802, + "step": 11917 + }, + { + "epoch": 0.61, + "grad_norm": 0.9992152632875951, + "learning_rate": 7.0936565588335386e-06, + "loss": 0.1792, + "step": 11918 + }, + { + "epoch": 0.61, + "grad_norm": 1.3219752435746845, + "learning_rate": 7.092080722357713e-06, + "loss": 0.1683, + "step": 11919 + }, + { + "epoch": 0.61, + "grad_norm": 1.1595662144721943, + "learning_rate": 7.090504964759366e-06, + "loss": 0.1739, + "step": 11920 + }, + { + "epoch": 0.61, + "grad_norm": 1.0274332516187044, + "learning_rate": 7.0889292860812344e-06, + "loss": 0.1694, + "step": 11921 + }, + { + "epoch": 0.61, + "grad_norm": 0.7438566646687913, + "learning_rate": 7.087353686366059e-06, + "loss": 0.1743, + "step": 11922 + }, + { + "epoch": 0.61, + "grad_norm": 0.7802668225695544, + "learning_rate": 7.085778165656581e-06, + "loss": 0.1644, + "step": 11923 + }, + { + "epoch": 0.61, + "grad_norm": 1.044161670020728, + "learning_rate": 7.084202723995533e-06, + "loss": 0.1855, + "step": 11924 + }, + { + "epoch": 0.61, + "grad_norm": 0.8374455063926431, + "learning_rate": 7.082627361425652e-06, + "loss": 0.1736, + "step": 11925 + }, + { + "epoch": 0.61, + "grad_norm": 2.0659359053226045, + "learning_rate": 7.081052077989668e-06, + "loss": 0.1615, + "step": 11926 + }, + { + "epoch": 0.61, + "grad_norm": 1.0140527059862043, + "learning_rate": 7.0794768737303135e-06, + "loss": 0.1732, + "step": 11927 + }, + { + "epoch": 0.61, + "grad_norm": 0.9475497891387962, + "learning_rate": 7.07790174869031e-06, + "loss": 0.1554, + "step": 11928 + }, + { + "epoch": 0.61, + "grad_norm": 0.8568709849395552, + "learning_rate": 7.076326702912388e-06, + "loss": 0.1957, + "step": 11929 + }, + { + "epoch": 0.61, + "grad_norm": 1.0945804443320943, + "learning_rate": 7.0747517364392694e-06, + "loss": 0.1911, + "step": 11930 + }, + { + "epoch": 0.61, + "grad_norm": 1.042478240027438, + "learning_rate": 7.073176849313678e-06, + "loss": 0.193, + "step": 11931 + }, + { + "epoch": 0.61, + "grad_norm": 0.9780085983785852, + "learning_rate": 7.071602041578325e-06, + "loss": 0.2077, + "step": 11932 + }, + { + "epoch": 0.61, + "grad_norm": 0.8456134232105421, + "learning_rate": 7.0700273132759374e-06, + "loss": 0.1859, + "step": 11933 + }, + { + "epoch": 0.61, + "grad_norm": 1.0665183972480694, + "learning_rate": 7.068452664449219e-06, + "loss": 0.189, + "step": 11934 + }, + { + "epoch": 0.61, + "grad_norm": 7.7600552381034, + "learning_rate": 7.066878095140892e-06, + "loss": 0.1883, + "step": 11935 + }, + { + "epoch": 0.61, + "grad_norm": 1.7283221691703692, + "learning_rate": 7.065303605393659e-06, + "loss": 0.1785, + "step": 11936 + }, + { + "epoch": 0.61, + "grad_norm": 3.944965829965286, + "learning_rate": 7.0637291952502355e-06, + "loss": 0.191, + "step": 11937 + }, + { + "epoch": 0.61, + "grad_norm": 1.2512234638902588, + "learning_rate": 7.062154864753321e-06, + "loss": 0.1475, + "step": 11938 + }, + { + "epoch": 0.61, + "grad_norm": 0.8827605647103475, + "learning_rate": 7.0605806139456205e-06, + "loss": 0.1713, + "step": 11939 + }, + { + "epoch": 0.61, + "grad_norm": 0.9519568727584321, + "learning_rate": 7.05900644286984e-06, + "loss": 0.1674, + "step": 11940 + }, + { + "epoch": 0.61, + "grad_norm": 1.0050025496707462, + "learning_rate": 7.057432351568671e-06, + "loss": 0.1691, + "step": 11941 + }, + { + "epoch": 0.61, + "grad_norm": 1.1691513647351677, + "learning_rate": 7.05585834008482e-06, + "loss": 0.1958, + "step": 11942 + }, + { + "epoch": 0.61, + "grad_norm": 1.082537787781271, + "learning_rate": 7.054284408460974e-06, + "loss": 0.1719, + "step": 11943 + }, + { + "epoch": 0.61, + "grad_norm": 1.0093555069161197, + "learning_rate": 7.052710556739835e-06, + "loss": 0.1721, + "step": 11944 + }, + { + "epoch": 0.61, + "grad_norm": 1.367301110642045, + "learning_rate": 7.051136784964083e-06, + "loss": 0.1933, + "step": 11945 + }, + { + "epoch": 0.61, + "grad_norm": 0.8675198631297651, + "learning_rate": 7.049563093176418e-06, + "loss": 0.1958, + "step": 11946 + }, + { + "epoch": 0.61, + "grad_norm": 0.8474305343227716, + "learning_rate": 7.047989481419516e-06, + "loss": 0.1806, + "step": 11947 + }, + { + "epoch": 0.61, + "grad_norm": 1.2477633360015177, + "learning_rate": 7.0464159497360675e-06, + "loss": 0.2078, + "step": 11948 + }, + { + "epoch": 0.61, + "grad_norm": 1.37049127199201, + "learning_rate": 7.044842498168752e-06, + "loss": 0.1967, + "step": 11949 + }, + { + "epoch": 0.61, + "grad_norm": 1.081084129031886, + "learning_rate": 7.043269126760255e-06, + "loss": 0.1803, + "step": 11950 + }, + { + "epoch": 0.61, + "grad_norm": 1.0286902605759753, + "learning_rate": 7.041695835553245e-06, + "loss": 0.1862, + "step": 11951 + }, + { + "epoch": 0.61, + "grad_norm": 0.9401373508312021, + "learning_rate": 7.040122624590405e-06, + "loss": 0.1669, + "step": 11952 + }, + { + "epoch": 0.61, + "grad_norm": 1.520605516020862, + "learning_rate": 7.038549493914404e-06, + "loss": 0.1609, + "step": 11953 + }, + { + "epoch": 0.61, + "grad_norm": 1.1363660276509426, + "learning_rate": 7.03697644356792e-06, + "loss": 0.1696, + "step": 11954 + }, + { + "epoch": 0.61, + "grad_norm": 1.269973237982814, + "learning_rate": 7.035403473593614e-06, + "loss": 0.1637, + "step": 11955 + }, + { + "epoch": 0.61, + "grad_norm": 1.0499017769975487, + "learning_rate": 7.0338305840341535e-06, + "loss": 0.1809, + "step": 11956 + }, + { + "epoch": 0.61, + "grad_norm": 0.9581650638544686, + "learning_rate": 7.032257774932212e-06, + "loss": 0.202, + "step": 11957 + }, + { + "epoch": 0.61, + "grad_norm": 1.1165249053239028, + "learning_rate": 7.030685046330441e-06, + "loss": 0.1592, + "step": 11958 + }, + { + "epoch": 0.61, + "grad_norm": 3.303437766903637, + "learning_rate": 7.02911239827151e-06, + "loss": 0.1623, + "step": 11959 + }, + { + "epoch": 0.61, + "grad_norm": 0.8286811962313174, + "learning_rate": 7.027539830798069e-06, + "loss": 0.1495, + "step": 11960 + }, + { + "epoch": 0.61, + "grad_norm": 2.4931590119544556, + "learning_rate": 7.02596734395278e-06, + "loss": 0.1829, + "step": 11961 + }, + { + "epoch": 0.61, + "grad_norm": 0.9899743456092847, + "learning_rate": 7.024394937778293e-06, + "loss": 0.178, + "step": 11962 + }, + { + "epoch": 0.61, + "grad_norm": 1.1519840371806078, + "learning_rate": 7.022822612317265e-06, + "loss": 0.2048, + "step": 11963 + }, + { + "epoch": 0.61, + "grad_norm": 0.9024290437544673, + "learning_rate": 7.021250367612338e-06, + "loss": 0.1552, + "step": 11964 + }, + { + "epoch": 0.61, + "grad_norm": 0.9192970344585016, + "learning_rate": 7.019678203706164e-06, + "loss": 0.1918, + "step": 11965 + }, + { + "epoch": 0.61, + "grad_norm": 0.9435428055972597, + "learning_rate": 7.018106120641386e-06, + "loss": 0.1798, + "step": 11966 + }, + { + "epoch": 0.61, + "grad_norm": 0.9971047001488691, + "learning_rate": 7.016534118460652e-06, + "loss": 0.1695, + "step": 11967 + }, + { + "epoch": 0.61, + "grad_norm": 1.1748885021929922, + "learning_rate": 7.014962197206594e-06, + "loss": 0.175, + "step": 11968 + }, + { + "epoch": 0.61, + "grad_norm": 0.9969796770369592, + "learning_rate": 7.013390356921858e-06, + "loss": 0.1897, + "step": 11969 + }, + { + "epoch": 0.61, + "grad_norm": 0.9595108683100719, + "learning_rate": 7.011818597649074e-06, + "loss": 0.1694, + "step": 11970 + }, + { + "epoch": 0.61, + "grad_norm": 0.9267782372098969, + "learning_rate": 7.010246919430884e-06, + "loss": 0.1956, + "step": 11971 + }, + { + "epoch": 0.61, + "grad_norm": 0.8777993144052069, + "learning_rate": 7.008675322309911e-06, + "loss": 0.1847, + "step": 11972 + }, + { + "epoch": 0.61, + "grad_norm": 1.7337740983158976, + "learning_rate": 7.0071038063287935e-06, + "loss": 0.1991, + "step": 11973 + }, + { + "epoch": 0.61, + "grad_norm": 1.0782367779242605, + "learning_rate": 7.005532371530152e-06, + "loss": 0.201, + "step": 11974 + }, + { + "epoch": 0.61, + "grad_norm": 1.501456289799915, + "learning_rate": 7.003961017956611e-06, + "loss": 0.1869, + "step": 11975 + }, + { + "epoch": 0.61, + "grad_norm": 1.1395341072149203, + "learning_rate": 7.002389745650801e-06, + "loss": 0.2017, + "step": 11976 + }, + { + "epoch": 0.61, + "grad_norm": 1.1050083080002921, + "learning_rate": 7.000818554655335e-06, + "loss": 0.1697, + "step": 11977 + }, + { + "epoch": 0.61, + "grad_norm": 0.7721561404794662, + "learning_rate": 6.9992474450128375e-06, + "loss": 0.1542, + "step": 11978 + }, + { + "epoch": 0.61, + "grad_norm": 1.043825789826784, + "learning_rate": 6.997676416765919e-06, + "loss": 0.1771, + "step": 11979 + }, + { + "epoch": 0.61, + "grad_norm": 3.473082441806016, + "learning_rate": 6.996105469957204e-06, + "loss": 0.1672, + "step": 11980 + }, + { + "epoch": 0.61, + "grad_norm": 1.1440472364717835, + "learning_rate": 6.994534604629291e-06, + "loss": 0.1782, + "step": 11981 + }, + { + "epoch": 0.61, + "grad_norm": 0.9716464255364722, + "learning_rate": 6.9929638208247994e-06, + "loss": 0.189, + "step": 11982 + }, + { + "epoch": 0.61, + "grad_norm": 1.069848684646463, + "learning_rate": 6.991393118586333e-06, + "loss": 0.199, + "step": 11983 + }, + { + "epoch": 0.61, + "grad_norm": 0.909313878941909, + "learning_rate": 6.989822497956501e-06, + "loss": 0.182, + "step": 11984 + }, + { + "epoch": 0.61, + "grad_norm": 1.029764795486539, + "learning_rate": 6.9882519589779005e-06, + "loss": 0.194, + "step": 11985 + }, + { + "epoch": 0.61, + "grad_norm": 0.9275625368461713, + "learning_rate": 6.986681501693139e-06, + "loss": 0.1921, + "step": 11986 + }, + { + "epoch": 0.61, + "grad_norm": 1.0466489063396667, + "learning_rate": 6.985111126144808e-06, + "loss": 0.1705, + "step": 11987 + }, + { + "epoch": 0.61, + "grad_norm": 0.7498953008113175, + "learning_rate": 6.983540832375511e-06, + "loss": 0.1567, + "step": 11988 + }, + { + "epoch": 0.61, + "grad_norm": 2.3164636095920277, + "learning_rate": 6.981970620427836e-06, + "loss": 0.1582, + "step": 11989 + }, + { + "epoch": 0.61, + "grad_norm": 0.9719462276135705, + "learning_rate": 6.980400490344383e-06, + "loss": 0.2122, + "step": 11990 + }, + { + "epoch": 0.61, + "grad_norm": 0.8601857009168751, + "learning_rate": 6.9788304421677355e-06, + "loss": 0.1746, + "step": 11991 + }, + { + "epoch": 0.61, + "grad_norm": 0.861866966346215, + "learning_rate": 6.97726047594048e-06, + "loss": 0.1765, + "step": 11992 + }, + { + "epoch": 0.61, + "grad_norm": 0.9007785415154862, + "learning_rate": 6.975690591705211e-06, + "loss": 0.162, + "step": 11993 + }, + { + "epoch": 0.61, + "grad_norm": 0.9620423411317283, + "learning_rate": 6.974120789504499e-06, + "loss": 0.1736, + "step": 11994 + }, + { + "epoch": 0.61, + "grad_norm": 4.686444458854959, + "learning_rate": 6.972551069380935e-06, + "loss": 0.1866, + "step": 11995 + }, + { + "epoch": 0.61, + "grad_norm": 1.3545266700952874, + "learning_rate": 6.9709814313770905e-06, + "loss": 0.1644, + "step": 11996 + }, + { + "epoch": 0.61, + "grad_norm": 1.1660286325084908, + "learning_rate": 6.969411875535552e-06, + "loss": 0.1799, + "step": 11997 + }, + { + "epoch": 0.61, + "grad_norm": 1.0370267507532889, + "learning_rate": 6.967842401898882e-06, + "loss": 0.2005, + "step": 11998 + }, + { + "epoch": 0.61, + "grad_norm": 0.7969908814107881, + "learning_rate": 6.966273010509663e-06, + "loss": 0.1664, + "step": 11999 + }, + { + "epoch": 0.61, + "grad_norm": 1.3401157501995602, + "learning_rate": 6.964703701410455e-06, + "loss": 0.1846, + "step": 12000 + }, + { + "epoch": 0.61, + "grad_norm": 1.0017537274961412, + "learning_rate": 6.963134474643834e-06, + "loss": 0.2044, + "step": 12001 + }, + { + "epoch": 0.61, + "grad_norm": 0.9056146251290355, + "learning_rate": 6.961565330252358e-06, + "loss": 0.1871, + "step": 12002 + }, + { + "epoch": 0.61, + "grad_norm": 1.2843447159022305, + "learning_rate": 6.959996268278599e-06, + "loss": 0.1831, + "step": 12003 + }, + { + "epoch": 0.61, + "grad_norm": 0.9759637968679855, + "learning_rate": 6.958427288765108e-06, + "loss": 0.1687, + "step": 12004 + }, + { + "epoch": 0.61, + "grad_norm": 1.1690450666793533, + "learning_rate": 6.956858391754453e-06, + "loss": 0.2056, + "step": 12005 + }, + { + "epoch": 0.61, + "grad_norm": 1.412385050354641, + "learning_rate": 6.955289577289181e-06, + "loss": 0.1845, + "step": 12006 + }, + { + "epoch": 0.61, + "grad_norm": 0.9436818805180692, + "learning_rate": 6.953720845411858e-06, + "loss": 0.1676, + "step": 12007 + }, + { + "epoch": 0.61, + "grad_norm": 1.3523117395487558, + "learning_rate": 6.952152196165025e-06, + "loss": 0.1984, + "step": 12008 + }, + { + "epoch": 0.61, + "grad_norm": 0.8942514384754394, + "learning_rate": 6.950583629591235e-06, + "loss": 0.1808, + "step": 12009 + }, + { + "epoch": 0.61, + "grad_norm": 0.9647460971315064, + "learning_rate": 6.94901514573304e-06, + "loss": 0.1716, + "step": 12010 + }, + { + "epoch": 0.61, + "grad_norm": 0.7947737276649559, + "learning_rate": 6.9474467446329775e-06, + "loss": 0.1601, + "step": 12011 + }, + { + "epoch": 0.61, + "grad_norm": 1.2135854992453965, + "learning_rate": 6.9458784263335965e-06, + "loss": 0.1749, + "step": 12012 + }, + { + "epoch": 0.61, + "grad_norm": 0.8258705553834466, + "learning_rate": 6.944310190877433e-06, + "loss": 0.1806, + "step": 12013 + }, + { + "epoch": 0.61, + "grad_norm": 1.0624409171900406, + "learning_rate": 6.942742038307033e-06, + "loss": 0.1654, + "step": 12014 + }, + { + "epoch": 0.61, + "grad_norm": 0.8998384943681688, + "learning_rate": 6.941173968664923e-06, + "loss": 0.1806, + "step": 12015 + }, + { + "epoch": 0.61, + "grad_norm": 1.0775549450326525, + "learning_rate": 6.939605981993647e-06, + "loss": 0.1928, + "step": 12016 + }, + { + "epoch": 0.61, + "grad_norm": 1.0758381169200557, + "learning_rate": 6.938038078335727e-06, + "loss": 0.1669, + "step": 12017 + }, + { + "epoch": 0.61, + "grad_norm": 1.0478686096447207, + "learning_rate": 6.936470257733699e-06, + "loss": 0.1935, + "step": 12018 + }, + { + "epoch": 0.61, + "grad_norm": 1.0007796945466119, + "learning_rate": 6.9349025202300865e-06, + "loss": 0.163, + "step": 12019 + }, + { + "epoch": 0.61, + "grad_norm": 1.062954767058354, + "learning_rate": 6.93333486586742e-06, + "loss": 0.174, + "step": 12020 + }, + { + "epoch": 0.61, + "grad_norm": 1.1895420775002619, + "learning_rate": 6.931767294688214e-06, + "loss": 0.1786, + "step": 12021 + }, + { + "epoch": 0.61, + "grad_norm": 1.150659466077958, + "learning_rate": 6.930199806734996e-06, + "loss": 0.1693, + "step": 12022 + }, + { + "epoch": 0.61, + "grad_norm": 0.9054373439655421, + "learning_rate": 6.92863240205028e-06, + "loss": 0.1825, + "step": 12023 + }, + { + "epoch": 0.61, + "grad_norm": 1.0779083796299886, + "learning_rate": 6.927065080676587e-06, + "loss": 0.1796, + "step": 12024 + }, + { + "epoch": 0.61, + "grad_norm": 1.6111483127425923, + "learning_rate": 6.9254978426564256e-06, + "loss": 0.1789, + "step": 12025 + }, + { + "epoch": 0.61, + "grad_norm": 1.7561342831981865, + "learning_rate": 6.923930688032308e-06, + "loss": 0.1653, + "step": 12026 + }, + { + "epoch": 0.61, + "grad_norm": 1.252568257892285, + "learning_rate": 6.922363616846746e-06, + "loss": 0.1928, + "step": 12027 + }, + { + "epoch": 0.61, + "grad_norm": 1.3861545130349302, + "learning_rate": 6.920796629142242e-06, + "loss": 0.1789, + "step": 12028 + }, + { + "epoch": 0.61, + "grad_norm": 1.126739769588076, + "learning_rate": 6.9192297249613074e-06, + "loss": 0.1963, + "step": 12029 + }, + { + "epoch": 0.61, + "grad_norm": 1.376665325309207, + "learning_rate": 6.9176629043464364e-06, + "loss": 0.1705, + "step": 12030 + }, + { + "epoch": 0.61, + "grad_norm": 1.0270356795638427, + "learning_rate": 6.916096167340134e-06, + "loss": 0.1797, + "step": 12031 + }, + { + "epoch": 0.61, + "grad_norm": 0.872751930898111, + "learning_rate": 6.9145295139848954e-06, + "loss": 0.1788, + "step": 12032 + }, + { + "epoch": 0.61, + "grad_norm": 1.3292159171289397, + "learning_rate": 6.9129629443232235e-06, + "loss": 0.184, + "step": 12033 + }, + { + "epoch": 0.61, + "grad_norm": 0.9125351804266222, + "learning_rate": 6.911396458397602e-06, + "loss": 0.1744, + "step": 12034 + }, + { + "epoch": 0.61, + "grad_norm": 0.8528246667288966, + "learning_rate": 6.909830056250527e-06, + "loss": 0.1622, + "step": 12035 + }, + { + "epoch": 0.61, + "grad_norm": 1.270843585328278, + "learning_rate": 6.9082637379244844e-06, + "loss": 0.2054, + "step": 12036 + }, + { + "epoch": 0.61, + "grad_norm": 0.9391457519420248, + "learning_rate": 6.906697503461968e-06, + "loss": 0.1799, + "step": 12037 + }, + { + "epoch": 0.61, + "grad_norm": 0.7632331087196969, + "learning_rate": 6.90513135290545e-06, + "loss": 0.1588, + "step": 12038 + }, + { + "epoch": 0.61, + "grad_norm": 1.0727942068343892, + "learning_rate": 6.903565286297422e-06, + "loss": 0.1679, + "step": 12039 + }, + { + "epoch": 0.61, + "grad_norm": 1.0133257925068448, + "learning_rate": 6.901999303680359e-06, + "loss": 0.177, + "step": 12040 + }, + { + "epoch": 0.61, + "grad_norm": 0.8623895125847567, + "learning_rate": 6.900433405096744e-06, + "loss": 0.1623, + "step": 12041 + }, + { + "epoch": 0.61, + "grad_norm": 0.9602670825421511, + "learning_rate": 6.898867590589047e-06, + "loss": 0.1617, + "step": 12042 + }, + { + "epoch": 0.61, + "grad_norm": 0.9567573291126384, + "learning_rate": 6.897301860199738e-06, + "loss": 0.1533, + "step": 12043 + }, + { + "epoch": 0.61, + "grad_norm": 0.9042572859837555, + "learning_rate": 6.895736213971293e-06, + "loss": 0.1654, + "step": 12044 + }, + { + "epoch": 0.61, + "grad_norm": 1.330908865010921, + "learning_rate": 6.8941706519461785e-06, + "loss": 0.1989, + "step": 12045 + }, + { + "epoch": 0.61, + "grad_norm": 1.0736449432758612, + "learning_rate": 6.892605174166862e-06, + "loss": 0.1762, + "step": 12046 + }, + { + "epoch": 0.61, + "grad_norm": 0.8609311260815152, + "learning_rate": 6.891039780675803e-06, + "loss": 0.1609, + "step": 12047 + }, + { + "epoch": 0.61, + "grad_norm": 1.502432617527464, + "learning_rate": 6.889474471515467e-06, + "loss": 0.1671, + "step": 12048 + }, + { + "epoch": 0.61, + "grad_norm": 1.5916558758401025, + "learning_rate": 6.887909246728311e-06, + "loss": 0.1678, + "step": 12049 + }, + { + "epoch": 0.61, + "grad_norm": 1.7377823963807983, + "learning_rate": 6.886344106356795e-06, + "loss": 0.1733, + "step": 12050 + }, + { + "epoch": 0.61, + "grad_norm": 0.8830484258834155, + "learning_rate": 6.8847790504433664e-06, + "loss": 0.162, + "step": 12051 + }, + { + "epoch": 0.61, + "grad_norm": 1.1390348137066064, + "learning_rate": 6.883214079030485e-06, + "loss": 0.1737, + "step": 12052 + }, + { + "epoch": 0.61, + "grad_norm": 0.8768182692622531, + "learning_rate": 6.881649192160596e-06, + "loss": 0.1553, + "step": 12053 + }, + { + "epoch": 0.61, + "grad_norm": 1.3875687021614835, + "learning_rate": 6.880084389876153e-06, + "loss": 0.1863, + "step": 12054 + }, + { + "epoch": 0.61, + "grad_norm": 0.9544048864076122, + "learning_rate": 6.878519672219592e-06, + "loss": 0.1617, + "step": 12055 + }, + { + "epoch": 0.61, + "grad_norm": 1.0780909741086129, + "learning_rate": 6.8769550392333665e-06, + "loss": 0.1881, + "step": 12056 + }, + { + "epoch": 0.61, + "grad_norm": 0.954038343043023, + "learning_rate": 6.875390490959907e-06, + "loss": 0.1743, + "step": 12057 + }, + { + "epoch": 0.61, + "grad_norm": 1.1336834996926124, + "learning_rate": 6.873826027441659e-06, + "loss": 0.1784, + "step": 12058 + }, + { + "epoch": 0.61, + "grad_norm": 0.9306362669905568, + "learning_rate": 6.872261648721055e-06, + "loss": 0.1879, + "step": 12059 + }, + { + "epoch": 0.61, + "grad_norm": 1.197695678667343, + "learning_rate": 6.870697354840534e-06, + "loss": 0.1962, + "step": 12060 + }, + { + "epoch": 0.61, + "grad_norm": 0.8779976121881782, + "learning_rate": 6.869133145842521e-06, + "loss": 0.1757, + "step": 12061 + }, + { + "epoch": 0.61, + "grad_norm": 1.0792172420476411, + "learning_rate": 6.8675690217694476e-06, + "loss": 0.1962, + "step": 12062 + }, + { + "epoch": 0.61, + "grad_norm": 1.0307635970176123, + "learning_rate": 6.866004982663746e-06, + "loss": 0.1805, + "step": 12063 + }, + { + "epoch": 0.61, + "grad_norm": 0.8749574919822711, + "learning_rate": 6.864441028567831e-06, + "loss": 0.1726, + "step": 12064 + }, + { + "epoch": 0.61, + "grad_norm": 1.898665738913217, + "learning_rate": 6.862877159524133e-06, + "loss": 0.1558, + "step": 12065 + }, + { + "epoch": 0.61, + "grad_norm": 1.4824044475075975, + "learning_rate": 6.861313375575067e-06, + "loss": 0.1659, + "step": 12066 + }, + { + "epoch": 0.61, + "grad_norm": 0.9629904483904526, + "learning_rate": 6.859749676763056e-06, + "loss": 0.1581, + "step": 12067 + }, + { + "epoch": 0.61, + "grad_norm": 0.8272828329013648, + "learning_rate": 6.85818606313051e-06, + "loss": 0.1689, + "step": 12068 + }, + { + "epoch": 0.61, + "grad_norm": 1.6118105822053284, + "learning_rate": 6.856622534719848e-06, + "loss": 0.1891, + "step": 12069 + }, + { + "epoch": 0.61, + "grad_norm": 1.7255331847586153, + "learning_rate": 6.855059091573472e-06, + "loss": 0.2084, + "step": 12070 + }, + { + "epoch": 0.61, + "grad_norm": 1.1749647430370458, + "learning_rate": 6.853495733733799e-06, + "loss": 0.2064, + "step": 12071 + }, + { + "epoch": 0.61, + "grad_norm": 0.8938854197590528, + "learning_rate": 6.851932461243229e-06, + "loss": 0.1925, + "step": 12072 + }, + { + "epoch": 0.61, + "grad_norm": 1.0110531702751793, + "learning_rate": 6.850369274144174e-06, + "loss": 0.1726, + "step": 12073 + }, + { + "epoch": 0.61, + "grad_norm": 0.6937841164262705, + "learning_rate": 6.848806172479025e-06, + "loss": 0.1823, + "step": 12074 + }, + { + "epoch": 0.61, + "grad_norm": 0.9360324682026385, + "learning_rate": 6.847243156290191e-06, + "loss": 0.173, + "step": 12075 + }, + { + "epoch": 0.61, + "grad_norm": 2.5232523179973194, + "learning_rate": 6.8456802256200596e-06, + "loss": 0.1734, + "step": 12076 + }, + { + "epoch": 0.61, + "grad_norm": 1.0482809026447173, + "learning_rate": 6.844117380511036e-06, + "loss": 0.1811, + "step": 12077 + }, + { + "epoch": 0.61, + "grad_norm": 0.8719802085341933, + "learning_rate": 6.842554621005504e-06, + "loss": 0.1792, + "step": 12078 + }, + { + "epoch": 0.61, + "grad_norm": 1.224889727426434, + "learning_rate": 6.840991947145854e-06, + "loss": 0.1858, + "step": 12079 + }, + { + "epoch": 0.61, + "grad_norm": 1.0220060453992144, + "learning_rate": 6.839429358974481e-06, + "loss": 0.1755, + "step": 12080 + }, + { + "epoch": 0.61, + "grad_norm": 0.9639989927823014, + "learning_rate": 6.837866856533761e-06, + "loss": 0.1683, + "step": 12081 + }, + { + "epoch": 0.61, + "grad_norm": 1.2119181742534815, + "learning_rate": 6.836304439866084e-06, + "loss": 0.1741, + "step": 12082 + }, + { + "epoch": 0.61, + "grad_norm": 1.02227900587153, + "learning_rate": 6.834742109013823e-06, + "loss": 0.1968, + "step": 12083 + }, + { + "epoch": 0.61, + "grad_norm": 1.3070559048308237, + "learning_rate": 6.833179864019366e-06, + "loss": 0.1761, + "step": 12084 + }, + { + "epoch": 0.61, + "grad_norm": 1.1660429668454133, + "learning_rate": 6.83161770492508e-06, + "loss": 0.1861, + "step": 12085 + }, + { + "epoch": 0.61, + "grad_norm": 0.8861079705636326, + "learning_rate": 6.830055631773347e-06, + "loss": 0.1627, + "step": 12086 + }, + { + "epoch": 0.61, + "grad_norm": 0.838978038894428, + "learning_rate": 6.82849364460653e-06, + "loss": 0.1684, + "step": 12087 + }, + { + "epoch": 0.61, + "grad_norm": 0.8761185023942043, + "learning_rate": 6.826931743467004e-06, + "loss": 0.1795, + "step": 12088 + }, + { + "epoch": 0.61, + "grad_norm": 0.8362184895394403, + "learning_rate": 6.825369928397132e-06, + "loss": 0.1841, + "step": 12089 + }, + { + "epoch": 0.61, + "grad_norm": 1.3908870975417034, + "learning_rate": 6.8238081994392836e-06, + "loss": 0.1791, + "step": 12090 + }, + { + "epoch": 0.61, + "grad_norm": 0.8640804279105561, + "learning_rate": 6.822246556635814e-06, + "loss": 0.1689, + "step": 12091 + }, + { + "epoch": 0.61, + "grad_norm": 1.0446730178037433, + "learning_rate": 6.820685000029087e-06, + "loss": 0.1835, + "step": 12092 + }, + { + "epoch": 0.61, + "grad_norm": 1.0395863481397598, + "learning_rate": 6.819123529661458e-06, + "loss": 0.1669, + "step": 12093 + }, + { + "epoch": 0.62, + "grad_norm": 0.9949242524187777, + "learning_rate": 6.817562145575285e-06, + "loss": 0.1808, + "step": 12094 + }, + { + "epoch": 0.62, + "grad_norm": 1.2442467941307531, + "learning_rate": 6.81600084781292e-06, + "loss": 0.1768, + "step": 12095 + }, + { + "epoch": 0.62, + "grad_norm": 0.9768991335555441, + "learning_rate": 6.814439636416708e-06, + "loss": 0.1734, + "step": 12096 + }, + { + "epoch": 0.62, + "grad_norm": 1.1049467939463726, + "learning_rate": 6.812878511429002e-06, + "loss": 0.2294, + "step": 12097 + }, + { + "epoch": 0.62, + "grad_norm": 0.8864407710560207, + "learning_rate": 6.811317472892145e-06, + "loss": 0.1815, + "step": 12098 + }, + { + "epoch": 0.62, + "grad_norm": 1.648388743148969, + "learning_rate": 6.809756520848486e-06, + "loss": 0.1769, + "step": 12099 + }, + { + "epoch": 0.62, + "grad_norm": 0.7739231396867304, + "learning_rate": 6.808195655340357e-06, + "loss": 0.1562, + "step": 12100 + }, + { + "epoch": 0.62, + "grad_norm": 0.9479706936152373, + "learning_rate": 6.806634876410103e-06, + "loss": 0.1788, + "step": 12101 + }, + { + "epoch": 0.62, + "grad_norm": 1.5049638635512383, + "learning_rate": 6.805074184100056e-06, + "loss": 0.1851, + "step": 12102 + }, + { + "epoch": 0.62, + "grad_norm": 1.1612459449559867, + "learning_rate": 6.803513578452557e-06, + "loss": 0.1992, + "step": 12103 + }, + { + "epoch": 0.62, + "grad_norm": 0.8635805305573898, + "learning_rate": 6.8019530595099294e-06, + "loss": 0.1798, + "step": 12104 + }, + { + "epoch": 0.62, + "grad_norm": 0.9803848651176261, + "learning_rate": 6.800392627314506e-06, + "loss": 0.1819, + "step": 12105 + }, + { + "epoch": 0.62, + "grad_norm": 0.934349171865883, + "learning_rate": 6.798832281908612e-06, + "loss": 0.1826, + "step": 12106 + }, + { + "epoch": 0.62, + "grad_norm": 1.284915985657181, + "learning_rate": 6.797272023334578e-06, + "loss": 0.17, + "step": 12107 + }, + { + "epoch": 0.62, + "grad_norm": 0.967496996176986, + "learning_rate": 6.7957118516347156e-06, + "loss": 0.1915, + "step": 12108 + }, + { + "epoch": 0.62, + "grad_norm": 1.028295447950258, + "learning_rate": 6.794151766851356e-06, + "loss": 0.1713, + "step": 12109 + }, + { + "epoch": 0.62, + "grad_norm": 0.9704153265494159, + "learning_rate": 6.792591769026804e-06, + "loss": 0.1794, + "step": 12110 + }, + { + "epoch": 0.62, + "grad_norm": 1.5479240050610468, + "learning_rate": 6.791031858203385e-06, + "loss": 0.17, + "step": 12111 + }, + { + "epoch": 0.62, + "grad_norm": 1.0705127091317566, + "learning_rate": 6.789472034423412e-06, + "loss": 0.1824, + "step": 12112 + }, + { + "epoch": 0.62, + "grad_norm": 1.0649028556132054, + "learning_rate": 6.787912297729184e-06, + "loss": 0.1883, + "step": 12113 + }, + { + "epoch": 0.62, + "grad_norm": 0.9102264829131282, + "learning_rate": 6.786352648163022e-06, + "loss": 0.1838, + "step": 12114 + }, + { + "epoch": 0.62, + "grad_norm": 1.0511014344801506, + "learning_rate": 6.7847930857672205e-06, + "loss": 0.1784, + "step": 12115 + }, + { + "epoch": 0.62, + "grad_norm": 1.0186815963205775, + "learning_rate": 6.783233610584095e-06, + "loss": 0.1676, + "step": 12116 + }, + { + "epoch": 0.62, + "grad_norm": 0.8880854123514019, + "learning_rate": 6.781674222655934e-06, + "loss": 0.1794, + "step": 12117 + }, + { + "epoch": 0.62, + "grad_norm": 0.9295879693635356, + "learning_rate": 6.780114922025043e-06, + "loss": 0.1856, + "step": 12118 + }, + { + "epoch": 0.62, + "grad_norm": 0.9006839537874907, + "learning_rate": 6.778555708733715e-06, + "loss": 0.1743, + "step": 12119 + }, + { + "epoch": 0.62, + "grad_norm": 0.9975869625126415, + "learning_rate": 6.7769965828242505e-06, + "loss": 0.1928, + "step": 12120 + }, + { + "epoch": 0.62, + "grad_norm": 1.1032710418317961, + "learning_rate": 6.7754375443389294e-06, + "loss": 0.1958, + "step": 12121 + }, + { + "epoch": 0.62, + "grad_norm": 1.0261541172084292, + "learning_rate": 6.773878593320052e-06, + "loss": 0.1651, + "step": 12122 + }, + { + "epoch": 0.62, + "grad_norm": 0.7787786381676972, + "learning_rate": 6.772319729809895e-06, + "loss": 0.1636, + "step": 12123 + }, + { + "epoch": 0.62, + "grad_norm": 1.6696086999669617, + "learning_rate": 6.770760953850754e-06, + "loss": 0.1811, + "step": 12124 + }, + { + "epoch": 0.62, + "grad_norm": 1.4360424505617724, + "learning_rate": 6.769202265484899e-06, + "loss": 0.1558, + "step": 12125 + }, + { + "epoch": 0.62, + "grad_norm": 1.1170506912862783, + "learning_rate": 6.767643664754619e-06, + "loss": 0.1812, + "step": 12126 + }, + { + "epoch": 0.62, + "grad_norm": 1.2946728030781767, + "learning_rate": 6.766085151702184e-06, + "loss": 0.1774, + "step": 12127 + }, + { + "epoch": 0.62, + "grad_norm": 0.9326024358102776, + "learning_rate": 6.764526726369873e-06, + "loss": 0.1827, + "step": 12128 + }, + { + "epoch": 0.62, + "grad_norm": 1.076727177272974, + "learning_rate": 6.762968388799958e-06, + "loss": 0.1913, + "step": 12129 + }, + { + "epoch": 0.62, + "grad_norm": 0.9799196474439371, + "learning_rate": 6.7614101390347095e-06, + "loss": 0.1742, + "step": 12130 + }, + { + "epoch": 0.62, + "grad_norm": 0.9460898423948457, + "learning_rate": 6.759851977116392e-06, + "loss": 0.1788, + "step": 12131 + }, + { + "epoch": 0.62, + "grad_norm": 0.9967772399446618, + "learning_rate": 6.758293903087272e-06, + "loss": 0.1802, + "step": 12132 + }, + { + "epoch": 0.62, + "grad_norm": 0.9750470589847444, + "learning_rate": 6.756735916989616e-06, + "loss": 0.169, + "step": 12133 + }, + { + "epoch": 0.62, + "grad_norm": 0.9776038603721809, + "learning_rate": 6.755178018865678e-06, + "loss": 0.1588, + "step": 12134 + }, + { + "epoch": 0.62, + "grad_norm": 0.8506390814787281, + "learning_rate": 6.753620208757721e-06, + "loss": 0.1761, + "step": 12135 + }, + { + "epoch": 0.62, + "grad_norm": 0.8690918817459329, + "learning_rate": 6.7520624867079965e-06, + "loss": 0.1851, + "step": 12136 + }, + { + "epoch": 0.62, + "grad_norm": 0.9535446340652028, + "learning_rate": 6.7505048527587656e-06, + "loss": 0.1634, + "step": 12137 + }, + { + "epoch": 0.62, + "grad_norm": 1.3906783266482012, + "learning_rate": 6.748947306952269e-06, + "loss": 0.179, + "step": 12138 + }, + { + "epoch": 0.62, + "grad_norm": 1.0931209334496226, + "learning_rate": 6.747389849330765e-06, + "loss": 0.1685, + "step": 12139 + }, + { + "epoch": 0.62, + "grad_norm": 1.2113979580898313, + "learning_rate": 6.745832479936492e-06, + "loss": 0.186, + "step": 12140 + }, + { + "epoch": 0.62, + "grad_norm": 1.0226123145274468, + "learning_rate": 6.744275198811698e-06, + "loss": 0.2114, + "step": 12141 + }, + { + "epoch": 0.62, + "grad_norm": 0.8793336866514591, + "learning_rate": 6.742718005998621e-06, + "loss": 0.1767, + "step": 12142 + }, + { + "epoch": 0.62, + "grad_norm": 1.0782116318894777, + "learning_rate": 6.741160901539506e-06, + "loss": 0.1711, + "step": 12143 + }, + { + "epoch": 0.62, + "grad_norm": 1.1707774557584145, + "learning_rate": 6.7396038854765825e-06, + "loss": 0.1655, + "step": 12144 + }, + { + "epoch": 0.62, + "grad_norm": 1.104390856186119, + "learning_rate": 6.738046957852089e-06, + "loss": 0.1661, + "step": 12145 + }, + { + "epoch": 0.62, + "grad_norm": 0.9243443440720327, + "learning_rate": 6.736490118708253e-06, + "loss": 0.1777, + "step": 12146 + }, + { + "epoch": 0.62, + "grad_norm": 1.7956070831990771, + "learning_rate": 6.7349333680873134e-06, + "loss": 0.1696, + "step": 12147 + }, + { + "epoch": 0.62, + "grad_norm": 1.0515919332880705, + "learning_rate": 6.733376706031486e-06, + "loss": 0.1781, + "step": 12148 + }, + { + "epoch": 0.62, + "grad_norm": 0.857621003636688, + "learning_rate": 6.731820132582999e-06, + "loss": 0.1746, + "step": 12149 + }, + { + "epoch": 0.62, + "grad_norm": 1.1617687067828353, + "learning_rate": 6.730263647784079e-06, + "loss": 0.1528, + "step": 12150 + }, + { + "epoch": 0.62, + "grad_norm": 2.0959333069840698, + "learning_rate": 6.728707251676939e-06, + "loss": 0.1886, + "step": 12151 + }, + { + "epoch": 0.62, + "grad_norm": 1.1623054341465726, + "learning_rate": 6.727150944303804e-06, + "loss": 0.1776, + "step": 12152 + }, + { + "epoch": 0.62, + "grad_norm": 1.200230087745242, + "learning_rate": 6.7255947257068785e-06, + "loss": 0.1949, + "step": 12153 + }, + { + "epoch": 0.62, + "grad_norm": 0.8042497751515356, + "learning_rate": 6.724038595928385e-06, + "loss": 0.1624, + "step": 12154 + }, + { + "epoch": 0.62, + "grad_norm": 1.3636760843144802, + "learning_rate": 6.722482555010528e-06, + "loss": 0.1907, + "step": 12155 + }, + { + "epoch": 0.62, + "grad_norm": 0.8964440433985537, + "learning_rate": 6.720926602995519e-06, + "loss": 0.177, + "step": 12156 + }, + { + "epoch": 0.62, + "grad_norm": 0.782888558585591, + "learning_rate": 6.719370739925557e-06, + "loss": 0.1531, + "step": 12157 + }, + { + "epoch": 0.62, + "grad_norm": 1.4599766269420338, + "learning_rate": 6.717814965842852e-06, + "loss": 0.1639, + "step": 12158 + }, + { + "epoch": 0.62, + "grad_norm": 2.032260911561945, + "learning_rate": 6.716259280789599e-06, + "loss": 0.2493, + "step": 12159 + }, + { + "epoch": 0.62, + "grad_norm": 1.4459519392693336, + "learning_rate": 6.714703684808004e-06, + "loss": 0.1685, + "step": 12160 + }, + { + "epoch": 0.62, + "grad_norm": 0.8809204338200619, + "learning_rate": 6.7131481779402505e-06, + "loss": 0.178, + "step": 12161 + }, + { + "epoch": 0.62, + "grad_norm": 2.0338654307609594, + "learning_rate": 6.7115927602285424e-06, + "loss": 0.1912, + "step": 12162 + }, + { + "epoch": 0.62, + "grad_norm": 1.0140484819462559, + "learning_rate": 6.710037431715063e-06, + "loss": 0.1967, + "step": 12163 + }, + { + "epoch": 0.62, + "grad_norm": 1.0985829362573525, + "learning_rate": 6.70848219244201e-06, + "loss": 0.1755, + "step": 12164 + }, + { + "epoch": 0.62, + "grad_norm": 0.7797192565191957, + "learning_rate": 6.706927042451561e-06, + "loss": 0.1654, + "step": 12165 + }, + { + "epoch": 0.62, + "grad_norm": 1.0983960363940297, + "learning_rate": 6.7053719817859e-06, + "loss": 0.1785, + "step": 12166 + }, + { + "epoch": 0.62, + "grad_norm": 0.75585709682769, + "learning_rate": 6.7038170104872106e-06, + "loss": 0.1605, + "step": 12167 + }, + { + "epoch": 0.62, + "grad_norm": 1.2958472121653062, + "learning_rate": 6.70226212859767e-06, + "loss": 0.217, + "step": 12168 + }, + { + "epoch": 0.62, + "grad_norm": 0.8712282797124832, + "learning_rate": 6.700707336159458e-06, + "loss": 0.1797, + "step": 12169 + }, + { + "epoch": 0.62, + "grad_norm": 0.9107466535849766, + "learning_rate": 6.699152633214743e-06, + "loss": 0.18, + "step": 12170 + }, + { + "epoch": 0.62, + "grad_norm": 0.8591545415978744, + "learning_rate": 6.697598019805701e-06, + "loss": 0.175, + "step": 12171 + }, + { + "epoch": 0.62, + "grad_norm": 1.1046279818870088, + "learning_rate": 6.696043495974498e-06, + "loss": 0.1778, + "step": 12172 + }, + { + "epoch": 0.62, + "grad_norm": 1.0874204812001573, + "learning_rate": 6.6944890617633055e-06, + "loss": 0.1929, + "step": 12173 + }, + { + "epoch": 0.62, + "grad_norm": 1.1738304949725218, + "learning_rate": 6.6929347172142785e-06, + "loss": 0.208, + "step": 12174 + }, + { + "epoch": 0.62, + "grad_norm": 1.1878166769378955, + "learning_rate": 6.691380462369588e-06, + "loss": 0.1731, + "step": 12175 + }, + { + "epoch": 0.62, + "grad_norm": 0.9056200496531202, + "learning_rate": 6.689826297271384e-06, + "loss": 0.1787, + "step": 12176 + }, + { + "epoch": 0.62, + "grad_norm": 1.1374102316252948, + "learning_rate": 6.6882722219618355e-06, + "loss": 0.1701, + "step": 12177 + }, + { + "epoch": 0.62, + "grad_norm": 0.695640372186796, + "learning_rate": 6.686718236483086e-06, + "loss": 0.1782, + "step": 12178 + }, + { + "epoch": 0.62, + "grad_norm": 1.3815010096582772, + "learning_rate": 6.685164340877295e-06, + "loss": 0.1695, + "step": 12179 + }, + { + "epoch": 0.62, + "grad_norm": 1.083543432973144, + "learning_rate": 6.683610535186604e-06, + "loss": 0.1866, + "step": 12180 + }, + { + "epoch": 0.62, + "grad_norm": 1.2725443888858214, + "learning_rate": 6.682056819453168e-06, + "loss": 0.1651, + "step": 12181 + }, + { + "epoch": 0.62, + "grad_norm": 0.8442178166320784, + "learning_rate": 6.680503193719129e-06, + "loss": 0.1743, + "step": 12182 + }, + { + "epoch": 0.62, + "grad_norm": 1.0029030646090114, + "learning_rate": 6.678949658026625e-06, + "loss": 0.1923, + "step": 12183 + }, + { + "epoch": 0.62, + "grad_norm": 2.3476567787546982, + "learning_rate": 6.677396212417801e-06, + "loss": 0.1663, + "step": 12184 + }, + { + "epoch": 0.62, + "grad_norm": 1.0698221213216408, + "learning_rate": 6.675842856934789e-06, + "loss": 0.1724, + "step": 12185 + }, + { + "epoch": 0.62, + "grad_norm": 1.0245881277985018, + "learning_rate": 6.674289591619732e-06, + "loss": 0.1664, + "step": 12186 + }, + { + "epoch": 0.62, + "grad_norm": 0.81864411656269, + "learning_rate": 6.672736416514754e-06, + "loss": 0.1814, + "step": 12187 + }, + { + "epoch": 0.62, + "grad_norm": 1.1444151316489966, + "learning_rate": 6.671183331661991e-06, + "loss": 0.1682, + "step": 12188 + }, + { + "epoch": 0.62, + "grad_norm": 1.0227672142427702, + "learning_rate": 6.669630337103565e-06, + "loss": 0.194, + "step": 12189 + }, + { + "epoch": 0.62, + "grad_norm": 1.1132674106865061, + "learning_rate": 6.66807743288161e-06, + "loss": 0.1802, + "step": 12190 + }, + { + "epoch": 0.62, + "grad_norm": 9.422171075788219, + "learning_rate": 6.666524619038237e-06, + "loss": 0.1802, + "step": 12191 + }, + { + "epoch": 0.62, + "grad_norm": 1.1871758296843817, + "learning_rate": 6.664971895615578e-06, + "loss": 0.1834, + "step": 12192 + }, + { + "epoch": 0.62, + "grad_norm": 2.460493666235359, + "learning_rate": 6.663419262655739e-06, + "loss": 0.1733, + "step": 12193 + }, + { + "epoch": 0.62, + "grad_norm": 0.7359353917510435, + "learning_rate": 6.6618667202008435e-06, + "loss": 0.1693, + "step": 12194 + }, + { + "epoch": 0.62, + "grad_norm": 1.112629516499026, + "learning_rate": 6.660314268293e-06, + "loss": 0.1551, + "step": 12195 + }, + { + "epoch": 0.62, + "grad_norm": 1.0067294238530116, + "learning_rate": 6.6587619069743236e-06, + "loss": 0.1956, + "step": 12196 + }, + { + "epoch": 0.62, + "grad_norm": 1.1353157521263597, + "learning_rate": 6.6572096362869165e-06, + "loss": 0.1654, + "step": 12197 + }, + { + "epoch": 0.62, + "grad_norm": 1.0968623503957977, + "learning_rate": 6.655657456272888e-06, + "loss": 0.2003, + "step": 12198 + }, + { + "epoch": 0.62, + "grad_norm": 0.9659530346298896, + "learning_rate": 6.6541053669743375e-06, + "loss": 0.1741, + "step": 12199 + }, + { + "epoch": 0.62, + "grad_norm": 1.1324711606541336, + "learning_rate": 6.6525533684333724e-06, + "loss": 0.1946, + "step": 12200 + }, + { + "epoch": 0.62, + "grad_norm": 1.06053526847367, + "learning_rate": 6.6510014606920845e-06, + "loss": 0.1825, + "step": 12201 + }, + { + "epoch": 0.62, + "grad_norm": 0.936716622629329, + "learning_rate": 6.64944964379257e-06, + "loss": 0.1448, + "step": 12202 + }, + { + "epoch": 0.62, + "grad_norm": 0.8731618283716971, + "learning_rate": 6.647897917776925e-06, + "loss": 0.1658, + "step": 12203 + }, + { + "epoch": 0.62, + "grad_norm": 0.9600048699949525, + "learning_rate": 6.646346282687235e-06, + "loss": 0.179, + "step": 12204 + }, + { + "epoch": 0.62, + "grad_norm": 1.387098874350567, + "learning_rate": 6.644794738565597e-06, + "loss": 0.1876, + "step": 12205 + }, + { + "epoch": 0.62, + "grad_norm": 0.8351157077754774, + "learning_rate": 6.643243285454086e-06, + "loss": 0.1786, + "step": 12206 + }, + { + "epoch": 0.62, + "grad_norm": 0.8903763554787368, + "learning_rate": 6.641691923394792e-06, + "loss": 0.1825, + "step": 12207 + }, + { + "epoch": 0.62, + "grad_norm": 0.9163699840386397, + "learning_rate": 6.640140652429793e-06, + "loss": 0.1649, + "step": 12208 + }, + { + "epoch": 0.62, + "grad_norm": 1.1858028194793624, + "learning_rate": 6.6385894726011725e-06, + "loss": 0.166, + "step": 12209 + }, + { + "epoch": 0.62, + "grad_norm": 1.400037704545918, + "learning_rate": 6.637038383950998e-06, + "loss": 0.1737, + "step": 12210 + }, + { + "epoch": 0.62, + "grad_norm": 0.9340838632121498, + "learning_rate": 6.63548738652135e-06, + "loss": 0.1666, + "step": 12211 + }, + { + "epoch": 0.62, + "grad_norm": 1.4298851897685383, + "learning_rate": 6.633936480354294e-06, + "loss": 0.1829, + "step": 12212 + }, + { + "epoch": 0.62, + "grad_norm": 1.153294646208856, + "learning_rate": 6.632385665491905e-06, + "loss": 0.1828, + "step": 12213 + }, + { + "epoch": 0.62, + "grad_norm": 1.5212721972224903, + "learning_rate": 6.630834941976241e-06, + "loss": 0.1757, + "step": 12214 + }, + { + "epoch": 0.62, + "grad_norm": 1.8293599964046527, + "learning_rate": 6.629284309849373e-06, + "loss": 0.1726, + "step": 12215 + }, + { + "epoch": 0.62, + "grad_norm": 1.1039891440162553, + "learning_rate": 6.627733769153355e-06, + "loss": 0.182, + "step": 12216 + }, + { + "epoch": 0.62, + "grad_norm": 0.9536046109231258, + "learning_rate": 6.626183319930253e-06, + "loss": 0.1731, + "step": 12217 + }, + { + "epoch": 0.62, + "grad_norm": 1.04469142650679, + "learning_rate": 6.624632962222119e-06, + "loss": 0.1766, + "step": 12218 + }, + { + "epoch": 0.62, + "grad_norm": 0.9295249528325686, + "learning_rate": 6.6230826960710035e-06, + "loss": 0.1728, + "step": 12219 + }, + { + "epoch": 0.62, + "grad_norm": 1.0219966420634277, + "learning_rate": 6.621532521518962e-06, + "loss": 0.1728, + "step": 12220 + }, + { + "epoch": 0.62, + "grad_norm": 1.3289858108783676, + "learning_rate": 6.619982438608039e-06, + "loss": 0.1957, + "step": 12221 + }, + { + "epoch": 0.62, + "grad_norm": 1.2373544417696125, + "learning_rate": 6.618432447380288e-06, + "loss": 0.1997, + "step": 12222 + }, + { + "epoch": 0.62, + "grad_norm": 0.9934359092593344, + "learning_rate": 6.616882547877743e-06, + "loss": 0.1508, + "step": 12223 + }, + { + "epoch": 0.62, + "grad_norm": 0.8810613838875165, + "learning_rate": 6.615332740142454e-06, + "loss": 0.1403, + "step": 12224 + }, + { + "epoch": 0.62, + "grad_norm": 0.9879691489052049, + "learning_rate": 6.613783024216451e-06, + "loss": 0.172, + "step": 12225 + }, + { + "epoch": 0.62, + "grad_norm": 0.9526821220696458, + "learning_rate": 6.612233400141781e-06, + "loss": 0.1726, + "step": 12226 + }, + { + "epoch": 0.62, + "grad_norm": 1.3838159800652556, + "learning_rate": 6.610683867960466e-06, + "loss": 0.1858, + "step": 12227 + }, + { + "epoch": 0.62, + "grad_norm": 1.4162296287964915, + "learning_rate": 6.6091344277145456e-06, + "loss": 0.2036, + "step": 12228 + }, + { + "epoch": 0.62, + "grad_norm": 1.1086836490155574, + "learning_rate": 6.6075850794460414e-06, + "loss": 0.1688, + "step": 12229 + }, + { + "epoch": 0.62, + "grad_norm": 1.8066030135254134, + "learning_rate": 6.60603582319699e-06, + "loss": 0.1664, + "step": 12230 + }, + { + "epoch": 0.62, + "grad_norm": 1.4957728270277422, + "learning_rate": 6.604486659009404e-06, + "loss": 0.1833, + "step": 12231 + }, + { + "epoch": 0.62, + "grad_norm": 1.0987554878137216, + "learning_rate": 6.602937586925309e-06, + "loss": 0.188, + "step": 12232 + }, + { + "epoch": 0.62, + "grad_norm": 1.1942177727237888, + "learning_rate": 6.6013886069867235e-06, + "loss": 0.1785, + "step": 12233 + }, + { + "epoch": 0.62, + "grad_norm": 0.8794121473028255, + "learning_rate": 6.599839719235668e-06, + "loss": 0.1964, + "step": 12234 + }, + { + "epoch": 0.62, + "grad_norm": 1.3678191518713683, + "learning_rate": 6.598290923714152e-06, + "loss": 0.1797, + "step": 12235 + }, + { + "epoch": 0.62, + "grad_norm": 0.9275715467464166, + "learning_rate": 6.596742220464183e-06, + "loss": 0.1843, + "step": 12236 + }, + { + "epoch": 0.62, + "grad_norm": 0.7509251617436092, + "learning_rate": 6.595193609527774e-06, + "loss": 0.1704, + "step": 12237 + }, + { + "epoch": 0.62, + "grad_norm": 2.261131799563364, + "learning_rate": 6.593645090946932e-06, + "loss": 0.1765, + "step": 12238 + }, + { + "epoch": 0.62, + "grad_norm": 1.393015058725475, + "learning_rate": 6.592096664763661e-06, + "loss": 0.1702, + "step": 12239 + }, + { + "epoch": 0.62, + "grad_norm": 0.956100047072734, + "learning_rate": 6.590548331019957e-06, + "loss": 0.1815, + "step": 12240 + }, + { + "epoch": 0.62, + "grad_norm": 0.9956111845289233, + "learning_rate": 6.589000089757822e-06, + "loss": 0.1566, + "step": 12241 + }, + { + "epoch": 0.62, + "grad_norm": 0.7385208387149487, + "learning_rate": 6.587451941019253e-06, + "loss": 0.1718, + "step": 12242 + }, + { + "epoch": 0.62, + "grad_norm": 0.7324993874826935, + "learning_rate": 6.585903884846245e-06, + "loss": 0.1692, + "step": 12243 + }, + { + "epoch": 0.62, + "grad_norm": 0.8834736171907606, + "learning_rate": 6.584355921280785e-06, + "loss": 0.1764, + "step": 12244 + }, + { + "epoch": 0.62, + "grad_norm": 1.1075738748603206, + "learning_rate": 6.582808050364864e-06, + "loss": 0.1816, + "step": 12245 + }, + { + "epoch": 0.62, + "grad_norm": 0.8171360429587897, + "learning_rate": 6.581260272140466e-06, + "loss": 0.1796, + "step": 12246 + }, + { + "epoch": 0.62, + "grad_norm": 1.085648007323475, + "learning_rate": 6.579712586649581e-06, + "loss": 0.1715, + "step": 12247 + }, + { + "epoch": 0.62, + "grad_norm": 1.0691469016265858, + "learning_rate": 6.5781649939341794e-06, + "loss": 0.1849, + "step": 12248 + }, + { + "epoch": 0.62, + "grad_norm": 0.9259858332783487, + "learning_rate": 6.5766174940362505e-06, + "loss": 0.1606, + "step": 12249 + }, + { + "epoch": 0.62, + "grad_norm": 1.446395498306882, + "learning_rate": 6.575070086997762e-06, + "loss": 0.1877, + "step": 12250 + }, + { + "epoch": 0.62, + "grad_norm": 0.9671137700828732, + "learning_rate": 6.573522772860692e-06, + "loss": 0.1925, + "step": 12251 + }, + { + "epoch": 0.62, + "grad_norm": 0.8860157833790498, + "learning_rate": 6.571975551667014e-06, + "loss": 0.1821, + "step": 12252 + }, + { + "epoch": 0.62, + "grad_norm": 1.2918582948810673, + "learning_rate": 6.570428423458687e-06, + "loss": 0.1804, + "step": 12253 + }, + { + "epoch": 0.62, + "grad_norm": 1.3403658730770927, + "learning_rate": 6.568881388277685e-06, + "loss": 0.1773, + "step": 12254 + }, + { + "epoch": 0.62, + "grad_norm": 1.3153845570882179, + "learning_rate": 6.567334446165967e-06, + "loss": 0.1775, + "step": 12255 + }, + { + "epoch": 0.62, + "grad_norm": 0.7505597276119826, + "learning_rate": 6.565787597165501e-06, + "loss": 0.1549, + "step": 12256 + }, + { + "epoch": 0.62, + "grad_norm": 1.7234390957600532, + "learning_rate": 6.5642408413182345e-06, + "loss": 0.1667, + "step": 12257 + }, + { + "epoch": 0.62, + "grad_norm": 0.9668169901742122, + "learning_rate": 6.5626941786661335e-06, + "loss": 0.1609, + "step": 12258 + }, + { + "epoch": 0.62, + "grad_norm": 0.9568058195100089, + "learning_rate": 6.5611476092511435e-06, + "loss": 0.194, + "step": 12259 + }, + { + "epoch": 0.62, + "grad_norm": 2.6035064737348144, + "learning_rate": 6.559601133115223e-06, + "loss": 0.202, + "step": 12260 + }, + { + "epoch": 0.62, + "grad_norm": 0.96974975639801, + "learning_rate": 6.558054750300313e-06, + "loss": 0.1978, + "step": 12261 + }, + { + "epoch": 0.62, + "grad_norm": 0.79531616090137, + "learning_rate": 6.556508460848365e-06, + "loss": 0.1568, + "step": 12262 + }, + { + "epoch": 0.62, + "grad_norm": 1.071490924175172, + "learning_rate": 6.554962264801316e-06, + "loss": 0.1729, + "step": 12263 + }, + { + "epoch": 0.62, + "grad_norm": 1.036724040066627, + "learning_rate": 6.553416162201114e-06, + "loss": 0.1712, + "step": 12264 + }, + { + "epoch": 0.62, + "grad_norm": 1.3034074952249906, + "learning_rate": 6.55187015308969e-06, + "loss": 0.2014, + "step": 12265 + }, + { + "epoch": 0.62, + "grad_norm": 0.8093630274081415, + "learning_rate": 6.550324237508986e-06, + "loss": 0.1779, + "step": 12266 + }, + { + "epoch": 0.62, + "grad_norm": 1.093760841897314, + "learning_rate": 6.5487784155009285e-06, + "loss": 0.1687, + "step": 12267 + }, + { + "epoch": 0.62, + "grad_norm": 0.9507641187044447, + "learning_rate": 6.547232687107453e-06, + "loss": 0.1768, + "step": 12268 + }, + { + "epoch": 0.62, + "grad_norm": 0.9106248716503401, + "learning_rate": 6.5456870523704845e-06, + "loss": 0.1669, + "step": 12269 + }, + { + "epoch": 0.62, + "grad_norm": 0.8055247993376253, + "learning_rate": 6.544141511331954e-06, + "loss": 0.1783, + "step": 12270 + }, + { + "epoch": 0.62, + "grad_norm": 0.9886344291566992, + "learning_rate": 6.542596064033777e-06, + "loss": 0.1875, + "step": 12271 + }, + { + "epoch": 0.62, + "grad_norm": 1.0897904038840873, + "learning_rate": 6.541050710517875e-06, + "loss": 0.1544, + "step": 12272 + }, + { + "epoch": 0.62, + "grad_norm": 0.99314961842094, + "learning_rate": 6.539505450826174e-06, + "loss": 0.1726, + "step": 12273 + }, + { + "epoch": 0.62, + "grad_norm": 0.8909882895410705, + "learning_rate": 6.537960285000577e-06, + "loss": 0.1981, + "step": 12274 + }, + { + "epoch": 0.62, + "grad_norm": 1.2303960200796495, + "learning_rate": 6.536415213083007e-06, + "loss": 0.1855, + "step": 12275 + }, + { + "epoch": 0.62, + "grad_norm": 0.8916355656067946, + "learning_rate": 6.534870235115367e-06, + "loss": 0.1833, + "step": 12276 + }, + { + "epoch": 0.62, + "grad_norm": 0.8624614446017977, + "learning_rate": 6.533325351139569e-06, + "loss": 0.1747, + "step": 12277 + }, + { + "epoch": 0.62, + "grad_norm": 1.103022254689305, + "learning_rate": 6.531780561197514e-06, + "loss": 0.1913, + "step": 12278 + }, + { + "epoch": 0.62, + "grad_norm": 1.3641657032827077, + "learning_rate": 6.530235865331112e-06, + "loss": 0.1624, + "step": 12279 + }, + { + "epoch": 0.62, + "grad_norm": 0.8736367851666924, + "learning_rate": 6.528691263582254e-06, + "loss": 0.1776, + "step": 12280 + }, + { + "epoch": 0.62, + "grad_norm": 0.9389189226683222, + "learning_rate": 6.527146755992844e-06, + "loss": 0.1697, + "step": 12281 + }, + { + "epoch": 0.62, + "grad_norm": 1.536533177111427, + "learning_rate": 6.525602342604771e-06, + "loss": 0.1688, + "step": 12282 + }, + { + "epoch": 0.62, + "grad_norm": 1.0136640445014258, + "learning_rate": 6.524058023459936e-06, + "loss": 0.2022, + "step": 12283 + }, + { + "epoch": 0.62, + "grad_norm": 0.9594188684445367, + "learning_rate": 6.522513798600219e-06, + "loss": 0.1913, + "step": 12284 + }, + { + "epoch": 0.62, + "grad_norm": 0.8637535389066199, + "learning_rate": 6.520969668067514e-06, + "loss": 0.1587, + "step": 12285 + }, + { + "epoch": 0.62, + "grad_norm": 0.7712698643365918, + "learning_rate": 6.5194256319036996e-06, + "loss": 0.1583, + "step": 12286 + }, + { + "epoch": 0.62, + "grad_norm": 1.1062185087587382, + "learning_rate": 6.517881690150667e-06, + "loss": 0.1824, + "step": 12287 + }, + { + "epoch": 0.62, + "grad_norm": 0.8197071505996901, + "learning_rate": 6.516337842850291e-06, + "loss": 0.1703, + "step": 12288 + }, + { + "epoch": 0.62, + "grad_norm": 1.5756187693803567, + "learning_rate": 6.514794090044443e-06, + "loss": 0.175, + "step": 12289 + }, + { + "epoch": 0.62, + "grad_norm": 1.7986011584614001, + "learning_rate": 6.513250431775003e-06, + "loss": 0.1599, + "step": 12290 + }, + { + "epoch": 0.63, + "grad_norm": 0.876400412985864, + "learning_rate": 6.511706868083842e-06, + "loss": 0.1826, + "step": 12291 + }, + { + "epoch": 0.63, + "grad_norm": 1.3253744757454275, + "learning_rate": 6.510163399012832e-06, + "loss": 0.2196, + "step": 12292 + }, + { + "epoch": 0.63, + "grad_norm": 0.8228314003915462, + "learning_rate": 6.508620024603833e-06, + "loss": 0.1873, + "step": 12293 + }, + { + "epoch": 0.63, + "grad_norm": 1.182334128744673, + "learning_rate": 6.507076744898715e-06, + "loss": 0.1665, + "step": 12294 + }, + { + "epoch": 0.63, + "grad_norm": 1.0461347391662688, + "learning_rate": 6.505533559939335e-06, + "loss": 0.1864, + "step": 12295 + }, + { + "epoch": 0.63, + "grad_norm": 0.896008704215264, + "learning_rate": 6.50399046976756e-06, + "loss": 0.2103, + "step": 12296 + }, + { + "epoch": 0.63, + "grad_norm": 1.2823807091290649, + "learning_rate": 6.502447474425235e-06, + "loss": 0.1633, + "step": 12297 + }, + { + "epoch": 0.63, + "grad_norm": 0.9199393892955969, + "learning_rate": 6.5009045739542235e-06, + "loss": 0.1988, + "step": 12298 + }, + { + "epoch": 0.63, + "grad_norm": 1.0069335458303679, + "learning_rate": 6.499361768396371e-06, + "loss": 0.1854, + "step": 12299 + }, + { + "epoch": 0.63, + "grad_norm": 1.0795233172269108, + "learning_rate": 6.497819057793531e-06, + "loss": 0.1732, + "step": 12300 + }, + { + "epoch": 0.63, + "grad_norm": 0.9021808424439925, + "learning_rate": 6.496276442187543e-06, + "loss": 0.1833, + "step": 12301 + }, + { + "epoch": 0.63, + "grad_norm": 0.965671577262932, + "learning_rate": 6.49473392162026e-06, + "loss": 0.1763, + "step": 12302 + }, + { + "epoch": 0.63, + "grad_norm": 1.0642907663234293, + "learning_rate": 6.493191496133513e-06, + "loss": 0.1626, + "step": 12303 + }, + { + "epoch": 0.63, + "grad_norm": 0.9872091817808939, + "learning_rate": 6.491649165769145e-06, + "loss": 0.1797, + "step": 12304 + }, + { + "epoch": 0.63, + "grad_norm": 0.829784055595061, + "learning_rate": 6.4901069305689955e-06, + "loss": 0.1834, + "step": 12305 + }, + { + "epoch": 0.63, + "grad_norm": 1.2547817572673163, + "learning_rate": 6.488564790574889e-06, + "loss": 0.1705, + "step": 12306 + }, + { + "epoch": 0.63, + "grad_norm": 1.2223911861358498, + "learning_rate": 6.487022745828663e-06, + "loss": 0.1894, + "step": 12307 + }, + { + "epoch": 0.63, + "grad_norm": 2.015859280347554, + "learning_rate": 6.485480796372141e-06, + "loss": 0.1863, + "step": 12308 + }, + { + "epoch": 0.63, + "grad_norm": 1.0186987150755753, + "learning_rate": 6.483938942247155e-06, + "loss": 0.1735, + "step": 12309 + }, + { + "epoch": 0.63, + "grad_norm": 0.9826918667646444, + "learning_rate": 6.482397183495519e-06, + "loss": 0.1746, + "step": 12310 + }, + { + "epoch": 0.63, + "grad_norm": 1.171723404689601, + "learning_rate": 6.4808555201590614e-06, + "loss": 0.1505, + "step": 12311 + }, + { + "epoch": 0.63, + "grad_norm": 1.1341544628466593, + "learning_rate": 6.479313952279594e-06, + "loss": 0.181, + "step": 12312 + }, + { + "epoch": 0.63, + "grad_norm": 0.9730907582567121, + "learning_rate": 6.47777247989894e-06, + "loss": 0.1756, + "step": 12313 + }, + { + "epoch": 0.63, + "grad_norm": 1.165375281334116, + "learning_rate": 6.476231103058901e-06, + "loss": 0.1869, + "step": 12314 + }, + { + "epoch": 0.63, + "grad_norm": 1.2957891090340417, + "learning_rate": 6.474689821801295e-06, + "loss": 0.1717, + "step": 12315 + }, + { + "epoch": 0.63, + "grad_norm": 0.8354411998000495, + "learning_rate": 6.473148636167925e-06, + "loss": 0.1796, + "step": 12316 + }, + { + "epoch": 0.63, + "grad_norm": 1.0039260067099076, + "learning_rate": 6.471607546200598e-06, + "loss": 0.1761, + "step": 12317 + }, + { + "epoch": 0.63, + "grad_norm": 0.9107910951230012, + "learning_rate": 6.470066551941114e-06, + "loss": 0.1643, + "step": 12318 + }, + { + "epoch": 0.63, + "grad_norm": 3.013960752228629, + "learning_rate": 6.468525653431279e-06, + "loss": 0.1653, + "step": 12319 + }, + { + "epoch": 0.63, + "grad_norm": 0.8127586630711379, + "learning_rate": 6.466984850712881e-06, + "loss": 0.1688, + "step": 12320 + }, + { + "epoch": 0.63, + "grad_norm": 0.8394294595716062, + "learning_rate": 6.4654441438277194e-06, + "loss": 0.1725, + "step": 12321 + }, + { + "epoch": 0.63, + "grad_norm": 1.2239762650728694, + "learning_rate": 6.463903532817587e-06, + "loss": 0.1498, + "step": 12322 + }, + { + "epoch": 0.63, + "grad_norm": 1.5568944801202331, + "learning_rate": 6.462363017724267e-06, + "loss": 0.182, + "step": 12323 + }, + { + "epoch": 0.63, + "grad_norm": 0.9073127865426454, + "learning_rate": 6.460822598589554e-06, + "loss": 0.1526, + "step": 12324 + }, + { + "epoch": 0.63, + "grad_norm": 1.2372909375299115, + "learning_rate": 6.459282275455223e-06, + "loss": 0.1721, + "step": 12325 + }, + { + "epoch": 0.63, + "grad_norm": 0.8625107471525778, + "learning_rate": 6.457742048363066e-06, + "loss": 0.1461, + "step": 12326 + }, + { + "epoch": 0.63, + "grad_norm": 2.449375047735124, + "learning_rate": 6.456201917354852e-06, + "loss": 0.1865, + "step": 12327 + }, + { + "epoch": 0.63, + "grad_norm": 0.952600062618889, + "learning_rate": 6.454661882472364e-06, + "loss": 0.1612, + "step": 12328 + }, + { + "epoch": 0.63, + "grad_norm": 1.0525552911764944, + "learning_rate": 6.45312194375737e-06, + "loss": 0.1621, + "step": 12329 + }, + { + "epoch": 0.63, + "grad_norm": 1.1407915087173905, + "learning_rate": 6.451582101251645e-06, + "loss": 0.2157, + "step": 12330 + }, + { + "epoch": 0.63, + "grad_norm": 0.9388793084022125, + "learning_rate": 6.450042354996954e-06, + "loss": 0.1724, + "step": 12331 + }, + { + "epoch": 0.63, + "grad_norm": 2.195697188482703, + "learning_rate": 6.448502705035069e-06, + "loss": 0.1737, + "step": 12332 + }, + { + "epoch": 0.63, + "grad_norm": 26.04404863008669, + "learning_rate": 6.446963151407743e-06, + "loss": 0.1697, + "step": 12333 + }, + { + "epoch": 0.63, + "grad_norm": 1.0686488893519888, + "learning_rate": 6.445423694156746e-06, + "loss": 0.1445, + "step": 12334 + }, + { + "epoch": 0.63, + "grad_norm": 1.2770191889968054, + "learning_rate": 6.44388433332383e-06, + "loss": 0.1686, + "step": 12335 + }, + { + "epoch": 0.63, + "grad_norm": 0.9588144049117461, + "learning_rate": 6.442345068950755e-06, + "loss": 0.1631, + "step": 12336 + }, + { + "epoch": 0.63, + "grad_norm": 1.0817090132952616, + "learning_rate": 6.440805901079268e-06, + "loss": 0.2015, + "step": 12337 + }, + { + "epoch": 0.63, + "grad_norm": 1.1930738953388942, + "learning_rate": 6.4392668297511244e-06, + "loss": 0.1689, + "step": 12338 + }, + { + "epoch": 0.63, + "grad_norm": 1.5876164763829481, + "learning_rate": 6.4377278550080664e-06, + "loss": 0.1798, + "step": 12339 + }, + { + "epoch": 0.63, + "grad_norm": 1.1346038126888687, + "learning_rate": 6.436188976891846e-06, + "loss": 0.1722, + "step": 12340 + }, + { + "epoch": 0.63, + "grad_norm": 0.8509113581377679, + "learning_rate": 6.434650195444199e-06, + "loss": 0.1723, + "step": 12341 + }, + { + "epoch": 0.63, + "grad_norm": 1.4524280102048681, + "learning_rate": 6.433111510706864e-06, + "loss": 0.1895, + "step": 12342 + }, + { + "epoch": 0.63, + "grad_norm": 1.2121512139484962, + "learning_rate": 6.431572922721585e-06, + "loss": 0.1904, + "step": 12343 + }, + { + "epoch": 0.63, + "grad_norm": 1.0653587248529126, + "learning_rate": 6.430034431530088e-06, + "loss": 0.1584, + "step": 12344 + }, + { + "epoch": 0.63, + "grad_norm": 0.9241859336426226, + "learning_rate": 6.428496037174112e-06, + "loss": 0.1727, + "step": 12345 + }, + { + "epoch": 0.63, + "grad_norm": 1.7452389512234971, + "learning_rate": 6.42695773969538e-06, + "loss": 0.1622, + "step": 12346 + }, + { + "epoch": 0.63, + "grad_norm": 1.2684237230799293, + "learning_rate": 6.425419539135622e-06, + "loss": 0.1634, + "step": 12347 + }, + { + "epoch": 0.63, + "grad_norm": 1.237057056550504, + "learning_rate": 6.42388143553656e-06, + "loss": 0.1799, + "step": 12348 + }, + { + "epoch": 0.63, + "grad_norm": 0.9888821411178348, + "learning_rate": 6.422343428939919e-06, + "loss": 0.1728, + "step": 12349 + }, + { + "epoch": 0.63, + "grad_norm": 1.4023727794262242, + "learning_rate": 6.420805519387412e-06, + "loss": 0.1853, + "step": 12350 + }, + { + "epoch": 0.63, + "grad_norm": 1.2704004212987643, + "learning_rate": 6.419267706920758e-06, + "loss": 0.1814, + "step": 12351 + }, + { + "epoch": 0.63, + "grad_norm": 0.9049368121777028, + "learning_rate": 6.417729991581668e-06, + "loss": 0.1794, + "step": 12352 + }, + { + "epoch": 0.63, + "grad_norm": 1.0791248063852146, + "learning_rate": 6.4161923734118594e-06, + "loss": 0.178, + "step": 12353 + }, + { + "epoch": 0.63, + "grad_norm": 1.0154705817518106, + "learning_rate": 6.41465485245303e-06, + "loss": 0.1718, + "step": 12354 + }, + { + "epoch": 0.63, + "grad_norm": 0.8059650877129417, + "learning_rate": 6.413117428746892e-06, + "loss": 0.1903, + "step": 12355 + }, + { + "epoch": 0.63, + "grad_norm": 0.8510180569734818, + "learning_rate": 6.4115801023351444e-06, + "loss": 0.1621, + "step": 12356 + }, + { + "epoch": 0.63, + "grad_norm": 1.0253418963036458, + "learning_rate": 6.410042873259494e-06, + "loss": 0.147, + "step": 12357 + }, + { + "epoch": 0.63, + "grad_norm": 1.1512407414739831, + "learning_rate": 6.408505741561633e-06, + "loss": 0.1798, + "step": 12358 + }, + { + "epoch": 0.63, + "grad_norm": 0.9846029617594317, + "learning_rate": 6.406968707283253e-06, + "loss": 0.1722, + "step": 12359 + }, + { + "epoch": 0.63, + "grad_norm": 1.2493645732998286, + "learning_rate": 6.405431770466051e-06, + "loss": 0.1677, + "step": 12360 + }, + { + "epoch": 0.63, + "grad_norm": 0.9348559490222239, + "learning_rate": 6.403894931151714e-06, + "loss": 0.1615, + "step": 12361 + }, + { + "epoch": 0.63, + "grad_norm": 0.9569479373941275, + "learning_rate": 6.4023581893819345e-06, + "loss": 0.1838, + "step": 12362 + }, + { + "epoch": 0.63, + "grad_norm": 0.7611388126812884, + "learning_rate": 6.4008215451983864e-06, + "loss": 0.1756, + "step": 12363 + }, + { + "epoch": 0.63, + "grad_norm": 1.2822095789309997, + "learning_rate": 6.399284998642761e-06, + "loss": 0.2031, + "step": 12364 + }, + { + "epoch": 0.63, + "grad_norm": 0.7669714612070854, + "learning_rate": 6.39774854975673e-06, + "loss": 0.17, + "step": 12365 + }, + { + "epoch": 0.63, + "grad_norm": 0.9943848290476993, + "learning_rate": 6.396212198581978e-06, + "loss": 0.1678, + "step": 12366 + }, + { + "epoch": 0.63, + "grad_norm": 2.708205758868274, + "learning_rate": 6.394675945160169e-06, + "loss": 0.1662, + "step": 12367 + }, + { + "epoch": 0.63, + "grad_norm": 1.5553428840830763, + "learning_rate": 6.39313978953298e-06, + "loss": 0.1749, + "step": 12368 + }, + { + "epoch": 0.63, + "grad_norm": 0.8969706376294244, + "learning_rate": 6.391603731742078e-06, + "loss": 0.1733, + "step": 12369 + }, + { + "epoch": 0.63, + "grad_norm": 1.1488854694275947, + "learning_rate": 6.390067771829132e-06, + "loss": 0.1677, + "step": 12370 + }, + { + "epoch": 0.63, + "grad_norm": 0.9769116755166073, + "learning_rate": 6.3885319098357966e-06, + "loss": 0.1584, + "step": 12371 + }, + { + "epoch": 0.63, + "grad_norm": 0.8587655311387853, + "learning_rate": 6.386996145803741e-06, + "loss": 0.1848, + "step": 12372 + }, + { + "epoch": 0.63, + "grad_norm": 1.6151992229384966, + "learning_rate": 6.385460479774616e-06, + "loss": 0.1781, + "step": 12373 + }, + { + "epoch": 0.63, + "grad_norm": 1.161128554908505, + "learning_rate": 6.383924911790081e-06, + "loss": 0.1978, + "step": 12374 + }, + { + "epoch": 0.63, + "grad_norm": 4.102166422327023, + "learning_rate": 6.3823894418917895e-06, + "loss": 0.1716, + "step": 12375 + }, + { + "epoch": 0.63, + "grad_norm": 0.9866300969961047, + "learning_rate": 6.380854070121385e-06, + "loss": 0.1741, + "step": 12376 + }, + { + "epoch": 0.63, + "grad_norm": 0.8744583736841051, + "learning_rate": 6.37931879652052e-06, + "loss": 0.1776, + "step": 12377 + }, + { + "epoch": 0.63, + "grad_norm": 0.9033550694190644, + "learning_rate": 6.377783621130834e-06, + "loss": 0.1815, + "step": 12378 + }, + { + "epoch": 0.63, + "grad_norm": 1.7952117507679426, + "learning_rate": 6.376248543993977e-06, + "loss": 0.1769, + "step": 12379 + }, + { + "epoch": 0.63, + "grad_norm": 1.0221285316727067, + "learning_rate": 6.374713565151579e-06, + "loss": 0.1809, + "step": 12380 + }, + { + "epoch": 0.63, + "grad_norm": 0.9361376699622568, + "learning_rate": 6.373178684645283e-06, + "loss": 0.1831, + "step": 12381 + }, + { + "epoch": 0.63, + "grad_norm": 1.000278952974784, + "learning_rate": 6.371643902516715e-06, + "loss": 0.2024, + "step": 12382 + }, + { + "epoch": 0.63, + "grad_norm": 0.8337946910176021, + "learning_rate": 6.3701092188075176e-06, + "loss": 0.1733, + "step": 12383 + }, + { + "epoch": 0.63, + "grad_norm": 1.068934840910689, + "learning_rate": 6.368574633559308e-06, + "loss": 0.1867, + "step": 12384 + }, + { + "epoch": 0.63, + "grad_norm": 1.0862813904471362, + "learning_rate": 6.367040146813721e-06, + "loss": 0.1759, + "step": 12385 + }, + { + "epoch": 0.63, + "grad_norm": 0.9875804752664659, + "learning_rate": 6.365505758612371e-06, + "loss": 0.1781, + "step": 12386 + }, + { + "epoch": 0.63, + "grad_norm": 1.0816791746408998, + "learning_rate": 6.363971468996883e-06, + "loss": 0.1785, + "step": 12387 + }, + { + "epoch": 0.63, + "grad_norm": 1.2488081898099574, + "learning_rate": 6.362437278008875e-06, + "loss": 0.1764, + "step": 12388 + }, + { + "epoch": 0.63, + "grad_norm": 1.1154057711008427, + "learning_rate": 6.360903185689964e-06, + "loss": 0.1629, + "step": 12389 + }, + { + "epoch": 0.63, + "grad_norm": 0.7958664729630673, + "learning_rate": 6.359369192081756e-06, + "loss": 0.164, + "step": 12390 + }, + { + "epoch": 0.63, + "grad_norm": 0.8140188803014982, + "learning_rate": 6.357835297225865e-06, + "loss": 0.1648, + "step": 12391 + }, + { + "epoch": 0.63, + "grad_norm": 0.9086263255788167, + "learning_rate": 6.356301501163901e-06, + "loss": 0.1498, + "step": 12392 + }, + { + "epoch": 0.63, + "grad_norm": 1.1784567810825564, + "learning_rate": 6.3547678039374595e-06, + "loss": 0.1702, + "step": 12393 + }, + { + "epoch": 0.63, + "grad_norm": 0.9426996480766275, + "learning_rate": 6.35323420558815e-06, + "loss": 0.1861, + "step": 12394 + }, + { + "epoch": 0.63, + "grad_norm": 0.8746147919053582, + "learning_rate": 6.351700706157565e-06, + "loss": 0.1816, + "step": 12395 + }, + { + "epoch": 0.63, + "grad_norm": 0.9674403403160589, + "learning_rate": 6.350167305687309e-06, + "loss": 0.1854, + "step": 12396 + }, + { + "epoch": 0.63, + "grad_norm": 2.3616418400384807, + "learning_rate": 6.348634004218969e-06, + "loss": 0.1557, + "step": 12397 + }, + { + "epoch": 0.63, + "grad_norm": 1.191532683707872, + "learning_rate": 6.3471008017941396e-06, + "loss": 0.1812, + "step": 12398 + }, + { + "epoch": 0.63, + "grad_norm": 0.9497595078806705, + "learning_rate": 6.345567698454405e-06, + "loss": 0.1836, + "step": 12399 + }, + { + "epoch": 0.63, + "grad_norm": 1.061604308953469, + "learning_rate": 6.344034694241353e-06, + "loss": 0.1713, + "step": 12400 + }, + { + "epoch": 0.63, + "grad_norm": 0.9765112821432018, + "learning_rate": 6.342501789196565e-06, + "loss": 0.1707, + "step": 12401 + }, + { + "epoch": 0.63, + "grad_norm": 0.9497694404575935, + "learning_rate": 6.340968983361629e-06, + "loss": 0.1837, + "step": 12402 + }, + { + "epoch": 0.63, + "grad_norm": 1.1070548884603109, + "learning_rate": 6.339436276778108e-06, + "loss": 0.1638, + "step": 12403 + }, + { + "epoch": 0.63, + "grad_norm": 1.5135514613785872, + "learning_rate": 6.33790366948759e-06, + "loss": 0.1823, + "step": 12404 + }, + { + "epoch": 0.63, + "grad_norm": 0.9890899244681153, + "learning_rate": 6.3363711615316384e-06, + "loss": 0.1621, + "step": 12405 + }, + { + "epoch": 0.63, + "grad_norm": 2.085404642674482, + "learning_rate": 6.334838752951829e-06, + "loss": 0.2035, + "step": 12406 + }, + { + "epoch": 0.63, + "grad_norm": 0.821864051302577, + "learning_rate": 6.333306443789723e-06, + "loss": 0.1637, + "step": 12407 + }, + { + "epoch": 0.63, + "grad_norm": 1.018056615727074, + "learning_rate": 6.331774234086888e-06, + "loss": 0.1798, + "step": 12408 + }, + { + "epoch": 0.63, + "grad_norm": 1.1267528713054624, + "learning_rate": 6.330242123884882e-06, + "loss": 0.1932, + "step": 12409 + }, + { + "epoch": 0.63, + "grad_norm": 0.9204954399604649, + "learning_rate": 6.328710113225271e-06, + "loss": 0.1607, + "step": 12410 + }, + { + "epoch": 0.63, + "grad_norm": 1.134368920284439, + "learning_rate": 6.327178202149604e-06, + "loss": 0.1712, + "step": 12411 + }, + { + "epoch": 0.63, + "grad_norm": 0.8948304924405651, + "learning_rate": 6.325646390699432e-06, + "loss": 0.1707, + "step": 12412 + }, + { + "epoch": 0.63, + "grad_norm": 1.148432251960856, + "learning_rate": 6.324114678916312e-06, + "loss": 0.1651, + "step": 12413 + }, + { + "epoch": 0.63, + "grad_norm": 1.3243261950780123, + "learning_rate": 6.322583066841787e-06, + "loss": 0.2044, + "step": 12414 + }, + { + "epoch": 0.63, + "grad_norm": 1.0739468116695186, + "learning_rate": 6.321051554517406e-06, + "loss": 0.1714, + "step": 12415 + }, + { + "epoch": 0.63, + "grad_norm": 0.8085880976560276, + "learning_rate": 6.3195201419847075e-06, + "loss": 0.1876, + "step": 12416 + }, + { + "epoch": 0.63, + "grad_norm": 1.2755854149125554, + "learning_rate": 6.3179888292852345e-06, + "loss": 0.1897, + "step": 12417 + }, + { + "epoch": 0.63, + "grad_norm": 0.8972934946635283, + "learning_rate": 6.316457616460521e-06, + "loss": 0.175, + "step": 12418 + }, + { + "epoch": 0.63, + "grad_norm": 0.8943975090876413, + "learning_rate": 6.314926503552106e-06, + "loss": 0.1985, + "step": 12419 + }, + { + "epoch": 0.63, + "grad_norm": 1.0943262305329209, + "learning_rate": 6.313395490601513e-06, + "loss": 0.182, + "step": 12420 + }, + { + "epoch": 0.63, + "grad_norm": 0.9752335207142642, + "learning_rate": 6.311864577650278e-06, + "loss": 0.1536, + "step": 12421 + }, + { + "epoch": 0.63, + "grad_norm": 1.255788513507667, + "learning_rate": 6.310333764739922e-06, + "loss": 0.1692, + "step": 12422 + }, + { + "epoch": 0.63, + "grad_norm": 0.8946464195463422, + "learning_rate": 6.308803051911977e-06, + "loss": 0.1734, + "step": 12423 + }, + { + "epoch": 0.63, + "grad_norm": 1.0774757219799522, + "learning_rate": 6.307272439207952e-06, + "loss": 0.2096, + "step": 12424 + }, + { + "epoch": 0.63, + "grad_norm": 0.8735326158800345, + "learning_rate": 6.305741926669376e-06, + "loss": 0.1648, + "step": 12425 + }, + { + "epoch": 0.63, + "grad_norm": 0.7804412362106942, + "learning_rate": 6.304211514337755e-06, + "loss": 0.1792, + "step": 12426 + }, + { + "epoch": 0.63, + "grad_norm": 0.8648967530320314, + "learning_rate": 6.302681202254605e-06, + "loss": 0.1727, + "step": 12427 + }, + { + "epoch": 0.63, + "grad_norm": 0.951635997434637, + "learning_rate": 6.30115099046144e-06, + "loss": 0.1663, + "step": 12428 + }, + { + "epoch": 0.63, + "grad_norm": 0.8540674273278183, + "learning_rate": 6.299620878999759e-06, + "loss": 0.1723, + "step": 12429 + }, + { + "epoch": 0.63, + "grad_norm": 0.8448351618492159, + "learning_rate": 6.298090867911073e-06, + "loss": 0.1673, + "step": 12430 + }, + { + "epoch": 0.63, + "grad_norm": 0.8999928356265918, + "learning_rate": 6.296560957236879e-06, + "loss": 0.1645, + "step": 12431 + }, + { + "epoch": 0.63, + "grad_norm": 1.0263325239331658, + "learning_rate": 6.295031147018682e-06, + "loss": 0.1971, + "step": 12432 + }, + { + "epoch": 0.63, + "grad_norm": 0.8343603797823569, + "learning_rate": 6.293501437297971e-06, + "loss": 0.1734, + "step": 12433 + }, + { + "epoch": 0.63, + "grad_norm": 1.2727832738429121, + "learning_rate": 6.291971828116244e-06, + "loss": 0.1658, + "step": 12434 + }, + { + "epoch": 0.63, + "grad_norm": 0.9658412176768761, + "learning_rate": 6.290442319514989e-06, + "loss": 0.1793, + "step": 12435 + }, + { + "epoch": 0.63, + "grad_norm": 1.2050351172057916, + "learning_rate": 6.288912911535701e-06, + "loss": 0.174, + "step": 12436 + }, + { + "epoch": 0.63, + "grad_norm": 1.2826664564756307, + "learning_rate": 6.2873836042198546e-06, + "loss": 0.1864, + "step": 12437 + }, + { + "epoch": 0.63, + "grad_norm": 0.7933695499610162, + "learning_rate": 6.285854397608941e-06, + "loss": 0.1513, + "step": 12438 + }, + { + "epoch": 0.63, + "grad_norm": 0.8704491729302477, + "learning_rate": 6.284325291744433e-06, + "loss": 0.1652, + "step": 12439 + }, + { + "epoch": 0.63, + "grad_norm": 0.8751594756844792, + "learning_rate": 6.282796286667814e-06, + "loss": 0.179, + "step": 12440 + }, + { + "epoch": 0.63, + "grad_norm": 1.152411953728966, + "learning_rate": 6.281267382420553e-06, + "loss": 0.191, + "step": 12441 + }, + { + "epoch": 0.63, + "grad_norm": 1.0783429025431217, + "learning_rate": 6.2797385790441275e-06, + "loss": 0.1865, + "step": 12442 + }, + { + "epoch": 0.63, + "grad_norm": 1.165853509016558, + "learning_rate": 6.278209876580002e-06, + "loss": 0.1671, + "step": 12443 + }, + { + "epoch": 0.63, + "grad_norm": 1.4369216323054037, + "learning_rate": 6.2766812750696425e-06, + "loss": 0.1707, + "step": 12444 + }, + { + "epoch": 0.63, + "grad_norm": 1.1509816814159848, + "learning_rate": 6.275152774554518e-06, + "loss": 0.1796, + "step": 12445 + }, + { + "epoch": 0.63, + "grad_norm": 1.0754526489308998, + "learning_rate": 6.273624375076079e-06, + "loss": 0.178, + "step": 12446 + }, + { + "epoch": 0.63, + "grad_norm": 0.9512043301201804, + "learning_rate": 6.272096076675794e-06, + "loss": 0.1706, + "step": 12447 + }, + { + "epoch": 0.63, + "grad_norm": 0.9603013115681072, + "learning_rate": 6.2705678793951085e-06, + "loss": 0.2025, + "step": 12448 + }, + { + "epoch": 0.63, + "grad_norm": 1.0168091469068452, + "learning_rate": 6.269039783275486e-06, + "loss": 0.1876, + "step": 12449 + }, + { + "epoch": 0.63, + "grad_norm": 1.3743891386632616, + "learning_rate": 6.267511788358365e-06, + "loss": 0.1839, + "step": 12450 + }, + { + "epoch": 0.63, + "grad_norm": 0.8779440589301847, + "learning_rate": 6.265983894685199e-06, + "loss": 0.1658, + "step": 12451 + }, + { + "epoch": 0.63, + "grad_norm": 0.9168517762241354, + "learning_rate": 6.264456102297431e-06, + "loss": 0.1808, + "step": 12452 + }, + { + "epoch": 0.63, + "grad_norm": 1.478232498156621, + "learning_rate": 6.262928411236504e-06, + "loss": 0.1785, + "step": 12453 + }, + { + "epoch": 0.63, + "grad_norm": 1.850537886066106, + "learning_rate": 6.261400821543853e-06, + "loss": 0.1629, + "step": 12454 + }, + { + "epoch": 0.63, + "grad_norm": 0.945733684470051, + "learning_rate": 6.259873333260917e-06, + "loss": 0.175, + "step": 12455 + }, + { + "epoch": 0.63, + "grad_norm": 1.1153704099256443, + "learning_rate": 6.258345946429127e-06, + "loss": 0.1927, + "step": 12456 + }, + { + "epoch": 0.63, + "grad_norm": 1.0169887154193404, + "learning_rate": 6.256818661089914e-06, + "loss": 0.1717, + "step": 12457 + }, + { + "epoch": 0.63, + "grad_norm": 1.5532608557851424, + "learning_rate": 6.255291477284706e-06, + "loss": 0.1806, + "step": 12458 + }, + { + "epoch": 0.63, + "grad_norm": 0.8137842106602222, + "learning_rate": 6.253764395054931e-06, + "loss": 0.1781, + "step": 12459 + }, + { + "epoch": 0.63, + "grad_norm": 0.778704510268674, + "learning_rate": 6.252237414442006e-06, + "loss": 0.1645, + "step": 12460 + }, + { + "epoch": 0.63, + "grad_norm": 0.9806884124431697, + "learning_rate": 6.250710535487354e-06, + "loss": 0.1584, + "step": 12461 + }, + { + "epoch": 0.63, + "grad_norm": 0.7972196638544355, + "learning_rate": 6.249183758232391e-06, + "loss": 0.1791, + "step": 12462 + }, + { + "epoch": 0.63, + "grad_norm": 0.8762787480998664, + "learning_rate": 6.247657082718528e-06, + "loss": 0.1787, + "step": 12463 + }, + { + "epoch": 0.63, + "grad_norm": 1.5548117723420225, + "learning_rate": 6.246130508987181e-06, + "loss": 0.1589, + "step": 12464 + }, + { + "epoch": 0.63, + "grad_norm": 0.9578502567917115, + "learning_rate": 6.244604037079754e-06, + "loss": 0.1862, + "step": 12465 + }, + { + "epoch": 0.63, + "grad_norm": 1.1923637270836631, + "learning_rate": 6.2430776670376565e-06, + "loss": 0.1628, + "step": 12466 + }, + { + "epoch": 0.63, + "grad_norm": 1.132564443816655, + "learning_rate": 6.241551398902288e-06, + "loss": 0.1805, + "step": 12467 + }, + { + "epoch": 0.63, + "grad_norm": 1.297131069014862, + "learning_rate": 6.240025232715052e-06, + "loss": 0.1896, + "step": 12468 + }, + { + "epoch": 0.63, + "grad_norm": 1.0555852137876773, + "learning_rate": 6.2384991685173415e-06, + "loss": 0.1655, + "step": 12469 + }, + { + "epoch": 0.63, + "grad_norm": 1.2255988170345553, + "learning_rate": 6.236973206350554e-06, + "loss": 0.1969, + "step": 12470 + }, + { + "epoch": 0.63, + "grad_norm": 0.857306568176004, + "learning_rate": 6.23544734625608e-06, + "loss": 0.1773, + "step": 12471 + }, + { + "epoch": 0.63, + "grad_norm": 1.4411426640641447, + "learning_rate": 6.233921588275313e-06, + "loss": 0.1704, + "step": 12472 + }, + { + "epoch": 0.63, + "grad_norm": 1.081017069593761, + "learning_rate": 6.232395932449632e-06, + "loss": 0.179, + "step": 12473 + }, + { + "epoch": 0.63, + "grad_norm": 0.9309147223998671, + "learning_rate": 6.230870378820426e-06, + "loss": 0.1675, + "step": 12474 + }, + { + "epoch": 0.63, + "grad_norm": 1.3926729763150256, + "learning_rate": 6.22934492742907e-06, + "loss": 0.1871, + "step": 12475 + }, + { + "epoch": 0.63, + "grad_norm": 1.05205076758149, + "learning_rate": 6.2278195783169525e-06, + "loss": 0.1799, + "step": 12476 + }, + { + "epoch": 0.63, + "grad_norm": 0.9209143818079015, + "learning_rate": 6.226294331525437e-06, + "loss": 0.1982, + "step": 12477 + }, + { + "epoch": 0.63, + "grad_norm": 0.9332230467328521, + "learning_rate": 6.224769187095903e-06, + "loss": 0.1717, + "step": 12478 + }, + { + "epoch": 0.63, + "grad_norm": 0.8431590555877424, + "learning_rate": 6.223244145069715e-06, + "loss": 0.1714, + "step": 12479 + }, + { + "epoch": 0.63, + "grad_norm": 0.9699148561201477, + "learning_rate": 6.221719205488248e-06, + "loss": 0.1887, + "step": 12480 + }, + { + "epoch": 0.63, + "grad_norm": 0.8557638152178924, + "learning_rate": 6.220194368392862e-06, + "loss": 0.1801, + "step": 12481 + }, + { + "epoch": 0.63, + "grad_norm": 1.0081281462198912, + "learning_rate": 6.218669633824911e-06, + "loss": 0.1779, + "step": 12482 + }, + { + "epoch": 0.63, + "grad_norm": 0.9963855583977796, + "learning_rate": 6.2171450018257625e-06, + "loss": 0.1647, + "step": 12483 + }, + { + "epoch": 0.63, + "grad_norm": 0.812626859159139, + "learning_rate": 6.2156204724367674e-06, + "loss": 0.1848, + "step": 12484 + }, + { + "epoch": 0.63, + "grad_norm": 1.1443046290186372, + "learning_rate": 6.214096045699285e-06, + "loss": 0.1837, + "step": 12485 + }, + { + "epoch": 0.63, + "grad_norm": 1.0078120851075847, + "learning_rate": 6.212571721654658e-06, + "loss": 0.1919, + "step": 12486 + }, + { + "epoch": 0.63, + "grad_norm": 0.9909980525748541, + "learning_rate": 6.211047500344239e-06, + "loss": 0.1795, + "step": 12487 + }, + { + "epoch": 0.64, + "grad_norm": 0.9126111150104824, + "learning_rate": 6.209523381809366e-06, + "loss": 0.1545, + "step": 12488 + }, + { + "epoch": 0.64, + "grad_norm": 0.9009539115573664, + "learning_rate": 6.207999366091392e-06, + "loss": 0.1921, + "step": 12489 + }, + { + "epoch": 0.64, + "grad_norm": 0.7706876863747594, + "learning_rate": 6.206475453231644e-06, + "loss": 0.1635, + "step": 12490 + }, + { + "epoch": 0.64, + "grad_norm": 1.0875709472808894, + "learning_rate": 6.204951643271466e-06, + "loss": 0.1667, + "step": 12491 + }, + { + "epoch": 0.64, + "grad_norm": 1.060787182075934, + "learning_rate": 6.2034279362521866e-06, + "loss": 0.1804, + "step": 12492 + }, + { + "epoch": 0.64, + "grad_norm": 1.0740900720219337, + "learning_rate": 6.201904332215143e-06, + "loss": 0.1743, + "step": 12493 + }, + { + "epoch": 0.64, + "grad_norm": 0.9956019682721083, + "learning_rate": 6.200380831201655e-06, + "loss": 0.1902, + "step": 12494 + }, + { + "epoch": 0.64, + "grad_norm": 0.8991276352608935, + "learning_rate": 6.198857433253056e-06, + "loss": 0.1836, + "step": 12495 + }, + { + "epoch": 0.64, + "grad_norm": 1.0371989188888475, + "learning_rate": 6.19733413841066e-06, + "loss": 0.1809, + "step": 12496 + }, + { + "epoch": 0.64, + "grad_norm": 1.694860645984785, + "learning_rate": 6.1958109467157925e-06, + "loss": 0.1632, + "step": 12497 + }, + { + "epoch": 0.64, + "grad_norm": 1.22687686411913, + "learning_rate": 6.1942878582097685e-06, + "loss": 0.1845, + "step": 12498 + }, + { + "epoch": 0.64, + "grad_norm": 0.976504010769984, + "learning_rate": 6.192764872933899e-06, + "loss": 0.1771, + "step": 12499 + }, + { + "epoch": 0.64, + "grad_norm": 1.0657364619208907, + "learning_rate": 6.191241990929498e-06, + "loss": 0.1724, + "step": 12500 + }, + { + "epoch": 0.64, + "grad_norm": 1.0860099335201034, + "learning_rate": 6.1897192122378714e-06, + "loss": 0.1886, + "step": 12501 + }, + { + "epoch": 0.64, + "grad_norm": 0.9996622761610917, + "learning_rate": 6.18819653690033e-06, + "loss": 0.1871, + "step": 12502 + }, + { + "epoch": 0.64, + "grad_norm": 1.1164634795618917, + "learning_rate": 6.18667396495817e-06, + "loss": 0.1976, + "step": 12503 + }, + { + "epoch": 0.64, + "grad_norm": 0.8602778593863528, + "learning_rate": 6.185151496452695e-06, + "loss": 0.1791, + "step": 12504 + }, + { + "epoch": 0.64, + "grad_norm": 1.145776520803997, + "learning_rate": 6.1836291314252e-06, + "loss": 0.1649, + "step": 12505 + }, + { + "epoch": 0.64, + "grad_norm": 1.152848083543941, + "learning_rate": 6.182106869916984e-06, + "loss": 0.1775, + "step": 12506 + }, + { + "epoch": 0.64, + "grad_norm": 1.6093748241813428, + "learning_rate": 6.180584711969331e-06, + "loss": 0.1949, + "step": 12507 + }, + { + "epoch": 0.64, + "grad_norm": 1.1685262483998686, + "learning_rate": 6.179062657623536e-06, + "loss": 0.2048, + "step": 12508 + }, + { + "epoch": 0.64, + "grad_norm": 1.0364475450915975, + "learning_rate": 6.17754070692088e-06, + "loss": 0.1723, + "step": 12509 + }, + { + "epoch": 0.64, + "grad_norm": 1.137829764269063, + "learning_rate": 6.17601885990265e-06, + "loss": 0.1862, + "step": 12510 + }, + { + "epoch": 0.64, + "grad_norm": 0.9867987488422779, + "learning_rate": 6.174497116610121e-06, + "loss": 0.1683, + "step": 12511 + }, + { + "epoch": 0.64, + "grad_norm": 1.1026191481551983, + "learning_rate": 6.1729754770845795e-06, + "loss": 0.1665, + "step": 12512 + }, + { + "epoch": 0.64, + "grad_norm": 1.263035244463893, + "learning_rate": 6.171453941367289e-06, + "loss": 0.1601, + "step": 12513 + }, + { + "epoch": 0.64, + "grad_norm": 0.9018081110097101, + "learning_rate": 6.1699325094995284e-06, + "loss": 0.1554, + "step": 12514 + }, + { + "epoch": 0.64, + "grad_norm": 0.8358416616204396, + "learning_rate": 6.168411181522569e-06, + "loss": 0.1863, + "step": 12515 + }, + { + "epoch": 0.64, + "grad_norm": 1.3526473813417075, + "learning_rate": 6.1668899574776665e-06, + "loss": 0.1791, + "step": 12516 + }, + { + "epoch": 0.64, + "grad_norm": 1.074541065702537, + "learning_rate": 6.165368837406094e-06, + "loss": 0.1742, + "step": 12517 + }, + { + "epoch": 0.64, + "grad_norm": 1.266699220462641, + "learning_rate": 6.1638478213491045e-06, + "loss": 0.1811, + "step": 12518 + }, + { + "epoch": 0.64, + "grad_norm": 1.1978464666007855, + "learning_rate": 6.162326909347964e-06, + "loss": 0.1596, + "step": 12519 + }, + { + "epoch": 0.64, + "grad_norm": 0.9428725976666876, + "learning_rate": 6.160806101443919e-06, + "loss": 0.2014, + "step": 12520 + }, + { + "epoch": 0.64, + "grad_norm": 2.2711091819725837, + "learning_rate": 6.159285397678231e-06, + "loss": 0.1745, + "step": 12521 + }, + { + "epoch": 0.64, + "grad_norm": 0.8918084712482156, + "learning_rate": 6.157764798092139e-06, + "loss": 0.169, + "step": 12522 + }, + { + "epoch": 0.64, + "grad_norm": 1.1336763768588909, + "learning_rate": 6.156244302726894e-06, + "loss": 0.1734, + "step": 12523 + }, + { + "epoch": 0.64, + "grad_norm": 0.8880467909572677, + "learning_rate": 6.154723911623739e-06, + "loss": 0.1854, + "step": 12524 + }, + { + "epoch": 0.64, + "grad_norm": 2.2989210515788927, + "learning_rate": 6.153203624823918e-06, + "loss": 0.1713, + "step": 12525 + }, + { + "epoch": 0.64, + "grad_norm": 0.845146306048577, + "learning_rate": 6.151683442368662e-06, + "loss": 0.1709, + "step": 12526 + }, + { + "epoch": 0.64, + "grad_norm": 0.9185256844844901, + "learning_rate": 6.150163364299213e-06, + "loss": 0.1749, + "step": 12527 + }, + { + "epoch": 0.64, + "grad_norm": 0.797325566838219, + "learning_rate": 6.148643390656797e-06, + "loss": 0.1799, + "step": 12528 + }, + { + "epoch": 0.64, + "grad_norm": 1.075427656446058, + "learning_rate": 6.147123521482652e-06, + "loss": 0.2035, + "step": 12529 + }, + { + "epoch": 0.64, + "grad_norm": 1.4026009435662508, + "learning_rate": 6.145603756817994e-06, + "loss": 0.174, + "step": 12530 + }, + { + "epoch": 0.64, + "grad_norm": 1.1020490093254394, + "learning_rate": 6.144084096704054e-06, + "loss": 0.1807, + "step": 12531 + }, + { + "epoch": 0.64, + "grad_norm": 1.7151474986946946, + "learning_rate": 6.142564541182052e-06, + "loss": 0.1773, + "step": 12532 + }, + { + "epoch": 0.64, + "grad_norm": 0.8427975962075497, + "learning_rate": 6.141045090293203e-06, + "loss": 0.1482, + "step": 12533 + }, + { + "epoch": 0.64, + "grad_norm": 1.4389259546810442, + "learning_rate": 6.1395257440787246e-06, + "loss": 0.1911, + "step": 12534 + }, + { + "epoch": 0.64, + "grad_norm": 1.1012280045114835, + "learning_rate": 6.1380065025798275e-06, + "loss": 0.1771, + "step": 12535 + }, + { + "epoch": 0.64, + "grad_norm": 0.9123329869617892, + "learning_rate": 6.136487365837723e-06, + "loss": 0.1756, + "step": 12536 + }, + { + "epoch": 0.64, + "grad_norm": 0.8711802443983307, + "learning_rate": 6.134968333893614e-06, + "loss": 0.1823, + "step": 12537 + }, + { + "epoch": 0.64, + "grad_norm": 1.8182388385244617, + "learning_rate": 6.133449406788712e-06, + "loss": 0.1746, + "step": 12538 + }, + { + "epoch": 0.64, + "grad_norm": 0.7628933579636543, + "learning_rate": 6.13193058456421e-06, + "loss": 0.1636, + "step": 12539 + }, + { + "epoch": 0.64, + "grad_norm": 1.0452787874927696, + "learning_rate": 6.13041186726131e-06, + "loss": 0.1752, + "step": 12540 + }, + { + "epoch": 0.64, + "grad_norm": 1.1334682158832199, + "learning_rate": 6.128893254921204e-06, + "loss": 0.1732, + "step": 12541 + }, + { + "epoch": 0.64, + "grad_norm": 0.9651552788366533, + "learning_rate": 6.127374747585093e-06, + "loss": 0.1817, + "step": 12542 + }, + { + "epoch": 0.64, + "grad_norm": 1.003239206354539, + "learning_rate": 6.125856345294156e-06, + "loss": 0.1699, + "step": 12543 + }, + { + "epoch": 0.64, + "grad_norm": 1.0489086733374817, + "learning_rate": 6.124338048089586e-06, + "loss": 0.1975, + "step": 12544 + }, + { + "epoch": 0.64, + "grad_norm": 1.6214492669460177, + "learning_rate": 6.122819856012564e-06, + "loss": 0.1545, + "step": 12545 + }, + { + "epoch": 0.64, + "grad_norm": 0.9810971851715937, + "learning_rate": 6.121301769104277e-06, + "loss": 0.1707, + "step": 12546 + }, + { + "epoch": 0.64, + "grad_norm": 0.9002404769464839, + "learning_rate": 6.119783787405893e-06, + "loss": 0.1655, + "step": 12547 + }, + { + "epoch": 0.64, + "grad_norm": 0.9440022409126835, + "learning_rate": 6.118265910958599e-06, + "loss": 0.1719, + "step": 12548 + }, + { + "epoch": 0.64, + "grad_norm": 0.8769209904214279, + "learning_rate": 6.116748139803554e-06, + "loss": 0.1809, + "step": 12549 + }, + { + "epoch": 0.64, + "grad_norm": 1.039009549593771, + "learning_rate": 6.115230473981939e-06, + "loss": 0.18, + "step": 12550 + }, + { + "epoch": 0.64, + "grad_norm": 0.8954688446998333, + "learning_rate": 6.113712913534919e-06, + "loss": 0.1879, + "step": 12551 + }, + { + "epoch": 0.64, + "grad_norm": 0.8868688380705804, + "learning_rate": 6.1121954585036525e-06, + "loss": 0.1731, + "step": 12552 + }, + { + "epoch": 0.64, + "grad_norm": 0.9068755428729511, + "learning_rate": 6.110678108929304e-06, + "loss": 0.1735, + "step": 12553 + }, + { + "epoch": 0.64, + "grad_norm": 0.8247271190545595, + "learning_rate": 6.109160864853031e-06, + "loss": 0.1609, + "step": 12554 + }, + { + "epoch": 0.64, + "grad_norm": 0.8450076184849792, + "learning_rate": 6.107643726315993e-06, + "loss": 0.1572, + "step": 12555 + }, + { + "epoch": 0.64, + "grad_norm": 0.8092402376512072, + "learning_rate": 6.106126693359334e-06, + "loss": 0.1653, + "step": 12556 + }, + { + "epoch": 0.64, + "grad_norm": 0.8671305954230574, + "learning_rate": 6.104609766024211e-06, + "loss": 0.1621, + "step": 12557 + }, + { + "epoch": 0.64, + "grad_norm": 0.9829882238021675, + "learning_rate": 6.103092944351766e-06, + "loss": 0.1917, + "step": 12558 + }, + { + "epoch": 0.64, + "grad_norm": 0.9452635175929243, + "learning_rate": 6.1015762283831485e-06, + "loss": 0.225, + "step": 12559 + }, + { + "epoch": 0.64, + "grad_norm": 0.9586078615730995, + "learning_rate": 6.100059618159493e-06, + "loss": 0.1717, + "step": 12560 + }, + { + "epoch": 0.64, + "grad_norm": 1.0276062367385321, + "learning_rate": 6.098543113721942e-06, + "loss": 0.1911, + "step": 12561 + }, + { + "epoch": 0.64, + "grad_norm": 0.928138926530368, + "learning_rate": 6.097026715111627e-06, + "loss": 0.1668, + "step": 12562 + }, + { + "epoch": 0.64, + "grad_norm": 1.5221052583071613, + "learning_rate": 6.095510422369687e-06, + "loss": 0.1712, + "step": 12563 + }, + { + "epoch": 0.64, + "grad_norm": 1.0930100736865827, + "learning_rate": 6.093994235537244e-06, + "loss": 0.1807, + "step": 12564 + }, + { + "epoch": 0.64, + "grad_norm": 0.8970169138993683, + "learning_rate": 6.092478154655431e-06, + "loss": 0.1762, + "step": 12565 + }, + { + "epoch": 0.64, + "grad_norm": 0.8136805000756253, + "learning_rate": 6.090962179765365e-06, + "loss": 0.1645, + "step": 12566 + }, + { + "epoch": 0.64, + "grad_norm": 1.0157663073300447, + "learning_rate": 6.089446310908174e-06, + "loss": 0.1983, + "step": 12567 + }, + { + "epoch": 0.64, + "grad_norm": 1.4187437187365042, + "learning_rate": 6.087930548124973e-06, + "loss": 0.1778, + "step": 12568 + }, + { + "epoch": 0.64, + "grad_norm": 0.8998068081604149, + "learning_rate": 6.086414891456873e-06, + "loss": 0.2043, + "step": 12569 + }, + { + "epoch": 0.64, + "grad_norm": 1.23602924999552, + "learning_rate": 6.084899340944993e-06, + "loss": 0.1731, + "step": 12570 + }, + { + "epoch": 0.64, + "grad_norm": 1.0015833512001684, + "learning_rate": 6.083383896630437e-06, + "loss": 0.181, + "step": 12571 + }, + { + "epoch": 0.64, + "grad_norm": 1.1870337585167698, + "learning_rate": 6.081868558554318e-06, + "loss": 0.1765, + "step": 12572 + }, + { + "epoch": 0.64, + "grad_norm": 1.041099892707607, + "learning_rate": 6.080353326757732e-06, + "loss": 0.1851, + "step": 12573 + }, + { + "epoch": 0.64, + "grad_norm": 0.9109856770529133, + "learning_rate": 6.078838201281785e-06, + "loss": 0.1775, + "step": 12574 + }, + { + "epoch": 0.64, + "grad_norm": 0.8825147559118742, + "learning_rate": 6.077323182167572e-06, + "loss": 0.1584, + "step": 12575 + }, + { + "epoch": 0.64, + "grad_norm": 0.9183584383098351, + "learning_rate": 6.075808269456191e-06, + "loss": 0.1842, + "step": 12576 + }, + { + "epoch": 0.64, + "grad_norm": 0.8299857824314143, + "learning_rate": 6.074293463188731e-06, + "loss": 0.1685, + "step": 12577 + }, + { + "epoch": 0.64, + "grad_norm": 0.815838704444408, + "learning_rate": 6.072778763406285e-06, + "loss": 0.171, + "step": 12578 + }, + { + "epoch": 0.64, + "grad_norm": 0.8010028661546602, + "learning_rate": 6.071264170149933e-06, + "loss": 0.1635, + "step": 12579 + }, + { + "epoch": 0.64, + "grad_norm": 1.0617631073565157, + "learning_rate": 6.069749683460765e-06, + "loss": 0.1699, + "step": 12580 + }, + { + "epoch": 0.64, + "grad_norm": 0.8113835607544837, + "learning_rate": 6.068235303379857e-06, + "loss": 0.1667, + "step": 12581 + }, + { + "epoch": 0.64, + "grad_norm": 0.7552633298934263, + "learning_rate": 6.066721029948291e-06, + "loss": 0.1869, + "step": 12582 + }, + { + "epoch": 0.64, + "grad_norm": 0.8411484353367596, + "learning_rate": 6.065206863207136e-06, + "loss": 0.1598, + "step": 12583 + }, + { + "epoch": 0.64, + "grad_norm": 1.1801608714988914, + "learning_rate": 6.06369280319747e-06, + "loss": 0.1732, + "step": 12584 + }, + { + "epoch": 0.64, + "grad_norm": 0.9325254706005588, + "learning_rate": 6.062178849960359e-06, + "loss": 0.1748, + "step": 12585 + }, + { + "epoch": 0.64, + "grad_norm": 0.8197909597762826, + "learning_rate": 6.060665003536868e-06, + "loss": 0.1739, + "step": 12586 + }, + { + "epoch": 0.64, + "grad_norm": 0.8392306554506301, + "learning_rate": 6.059151263968061e-06, + "loss": 0.1668, + "step": 12587 + }, + { + "epoch": 0.64, + "grad_norm": 0.9631135744680915, + "learning_rate": 6.057637631294997e-06, + "loss": 0.1712, + "step": 12588 + }, + { + "epoch": 0.64, + "grad_norm": 0.9695296619769307, + "learning_rate": 6.0561241055587385e-06, + "loss": 0.1689, + "step": 12589 + }, + { + "epoch": 0.64, + "grad_norm": 0.9120100410242605, + "learning_rate": 6.054610686800333e-06, + "loss": 0.1876, + "step": 12590 + }, + { + "epoch": 0.64, + "grad_norm": 0.7689129787731763, + "learning_rate": 6.053097375060839e-06, + "loss": 0.1474, + "step": 12591 + }, + { + "epoch": 0.64, + "grad_norm": 1.12835739084911, + "learning_rate": 6.051584170381298e-06, + "loss": 0.1648, + "step": 12592 + }, + { + "epoch": 0.64, + "grad_norm": 0.7884327382625383, + "learning_rate": 6.050071072802761e-06, + "loss": 0.154, + "step": 12593 + }, + { + "epoch": 0.64, + "grad_norm": 0.9505681884985597, + "learning_rate": 6.048558082366269e-06, + "loss": 0.173, + "step": 12594 + }, + { + "epoch": 0.64, + "grad_norm": 1.1454112915904329, + "learning_rate": 6.047045199112865e-06, + "loss": 0.1908, + "step": 12595 + }, + { + "epoch": 0.64, + "grad_norm": 1.2481260398457952, + "learning_rate": 6.045532423083578e-06, + "loss": 0.1882, + "step": 12596 + }, + { + "epoch": 0.64, + "grad_norm": 1.0597575897436244, + "learning_rate": 6.04401975431945e-06, + "loss": 0.1596, + "step": 12597 + }, + { + "epoch": 0.64, + "grad_norm": 0.9179478487130808, + "learning_rate": 6.042507192861509e-06, + "loss": 0.1782, + "step": 12598 + }, + { + "epoch": 0.64, + "grad_norm": 1.1493989123689738, + "learning_rate": 6.040994738750788e-06, + "loss": 0.1578, + "step": 12599 + }, + { + "epoch": 0.64, + "grad_norm": 1.1523802744773493, + "learning_rate": 6.039482392028302e-06, + "loss": 0.162, + "step": 12600 + }, + { + "epoch": 0.64, + "grad_norm": 0.9364003989788462, + "learning_rate": 6.037970152735083e-06, + "loss": 0.1816, + "step": 12601 + }, + { + "epoch": 0.64, + "grad_norm": 0.9241871003791564, + "learning_rate": 6.036458020912151e-06, + "loss": 0.1792, + "step": 12602 + }, + { + "epoch": 0.64, + "grad_norm": 0.8505269578328596, + "learning_rate": 6.034945996600512e-06, + "loss": 0.1693, + "step": 12603 + }, + { + "epoch": 0.64, + "grad_norm": 1.070369815758006, + "learning_rate": 6.033434079841192e-06, + "loss": 0.1704, + "step": 12604 + }, + { + "epoch": 0.64, + "grad_norm": 0.7797409791480164, + "learning_rate": 6.031922270675193e-06, + "loss": 0.1754, + "step": 12605 + }, + { + "epoch": 0.64, + "grad_norm": 0.9061342005549365, + "learning_rate": 6.0304105691435285e-06, + "loss": 0.1869, + "step": 12606 + }, + { + "epoch": 0.64, + "grad_norm": 1.2881616547284978, + "learning_rate": 6.028898975287199e-06, + "loss": 0.1554, + "step": 12607 + }, + { + "epoch": 0.64, + "grad_norm": 1.3061192645383484, + "learning_rate": 6.027387489147214e-06, + "loss": 0.1809, + "step": 12608 + }, + { + "epoch": 0.64, + "grad_norm": 1.1680155757501567, + "learning_rate": 6.025876110764563e-06, + "loss": 0.1547, + "step": 12609 + }, + { + "epoch": 0.64, + "grad_norm": 0.8902388411726355, + "learning_rate": 6.02436484018025e-06, + "loss": 0.1789, + "step": 12610 + }, + { + "epoch": 0.64, + "grad_norm": 0.8750040119051603, + "learning_rate": 6.022853677435262e-06, + "loss": 0.1615, + "step": 12611 + }, + { + "epoch": 0.64, + "grad_norm": 1.0092482811801622, + "learning_rate": 6.021342622570597e-06, + "loss": 0.1881, + "step": 12612 + }, + { + "epoch": 0.64, + "grad_norm": 1.0436717957655843, + "learning_rate": 6.019831675627235e-06, + "loss": 0.184, + "step": 12613 + }, + { + "epoch": 0.64, + "grad_norm": 1.0653181286038373, + "learning_rate": 6.018320836646164e-06, + "loss": 0.1948, + "step": 12614 + }, + { + "epoch": 0.64, + "grad_norm": 1.280309670639057, + "learning_rate": 6.016810105668365e-06, + "loss": 0.1689, + "step": 12615 + }, + { + "epoch": 0.64, + "grad_norm": 1.231377345198479, + "learning_rate": 6.015299482734819e-06, + "loss": 0.1624, + "step": 12616 + }, + { + "epoch": 0.64, + "grad_norm": 0.7934969383349629, + "learning_rate": 6.013788967886496e-06, + "loss": 0.1635, + "step": 12617 + }, + { + "epoch": 0.64, + "grad_norm": 0.9639688222591635, + "learning_rate": 6.012278561164377e-06, + "loss": 0.2132, + "step": 12618 + }, + { + "epoch": 0.64, + "grad_norm": 0.8849522972008587, + "learning_rate": 6.010768262609425e-06, + "loss": 0.162, + "step": 12619 + }, + { + "epoch": 0.64, + "grad_norm": 1.0277230971550373, + "learning_rate": 6.009258072262607e-06, + "loss": 0.1852, + "step": 12620 + }, + { + "epoch": 0.64, + "grad_norm": 1.0009072609625371, + "learning_rate": 6.0077479901648935e-06, + "loss": 0.149, + "step": 12621 + }, + { + "epoch": 0.64, + "grad_norm": 1.136258000812448, + "learning_rate": 6.006238016357238e-06, + "loss": 0.1721, + "step": 12622 + }, + { + "epoch": 0.64, + "grad_norm": 0.8486374392042341, + "learning_rate": 6.0047281508806035e-06, + "loss": 0.1714, + "step": 12623 + }, + { + "epoch": 0.64, + "grad_norm": 0.9263199151783211, + "learning_rate": 6.00321839377594e-06, + "loss": 0.1878, + "step": 12624 + }, + { + "epoch": 0.64, + "grad_norm": 0.8905235912213837, + "learning_rate": 6.001708745084209e-06, + "loss": 0.1812, + "step": 12625 + }, + { + "epoch": 0.64, + "grad_norm": 1.0831038098071641, + "learning_rate": 6.000199204846348e-06, + "loss": 0.1646, + "step": 12626 + }, + { + "epoch": 0.64, + "grad_norm": 0.742740153937068, + "learning_rate": 5.998689773103314e-06, + "loss": 0.1742, + "step": 12627 + }, + { + "epoch": 0.64, + "grad_norm": 0.8225453319335321, + "learning_rate": 5.997180449896043e-06, + "loss": 0.1609, + "step": 12628 + }, + { + "epoch": 0.64, + "grad_norm": 0.9766197525212346, + "learning_rate": 5.995671235265483e-06, + "loss": 0.2027, + "step": 12629 + }, + { + "epoch": 0.64, + "grad_norm": 0.8413232819910352, + "learning_rate": 5.994162129252561e-06, + "loss": 0.2092, + "step": 12630 + }, + { + "epoch": 0.64, + "grad_norm": 3.6884841068191743, + "learning_rate": 5.992653131898223e-06, + "loss": 0.1658, + "step": 12631 + }, + { + "epoch": 0.64, + "grad_norm": 1.094459052333219, + "learning_rate": 5.991144243243392e-06, + "loss": 0.1712, + "step": 12632 + }, + { + "epoch": 0.64, + "grad_norm": 1.5800840062814814, + "learning_rate": 5.989635463329e-06, + "loss": 0.1637, + "step": 12633 + }, + { + "epoch": 0.64, + "grad_norm": 0.8334496722945175, + "learning_rate": 5.988126792195972e-06, + "loss": 0.161, + "step": 12634 + }, + { + "epoch": 0.64, + "grad_norm": 1.0395517635284837, + "learning_rate": 5.986618229885234e-06, + "loss": 0.1679, + "step": 12635 + }, + { + "epoch": 0.64, + "grad_norm": 1.1216746978677126, + "learning_rate": 5.985109776437699e-06, + "loss": 0.1818, + "step": 12636 + }, + { + "epoch": 0.64, + "grad_norm": 1.139123745295039, + "learning_rate": 5.983601431894291e-06, + "loss": 0.1953, + "step": 12637 + }, + { + "epoch": 0.64, + "grad_norm": 0.8956866938954258, + "learning_rate": 5.982093196295924e-06, + "loss": 0.1775, + "step": 12638 + }, + { + "epoch": 0.64, + "grad_norm": 0.939937101816068, + "learning_rate": 5.9805850696835e-06, + "loss": 0.1668, + "step": 12639 + }, + { + "epoch": 0.64, + "grad_norm": 0.9370917688385507, + "learning_rate": 5.979077052097936e-06, + "loss": 0.1718, + "step": 12640 + }, + { + "epoch": 0.64, + "grad_norm": 0.9764839246852787, + "learning_rate": 5.977569143580132e-06, + "loss": 0.1749, + "step": 12641 + }, + { + "epoch": 0.64, + "grad_norm": 0.857365213898431, + "learning_rate": 5.976061344170995e-06, + "loss": 0.1691, + "step": 12642 + }, + { + "epoch": 0.64, + "grad_norm": 1.000499061221936, + "learning_rate": 5.974553653911419e-06, + "loss": 0.1784, + "step": 12643 + }, + { + "epoch": 0.64, + "grad_norm": 0.8623887963384143, + "learning_rate": 5.973046072842305e-06, + "loss": 0.1772, + "step": 12644 + }, + { + "epoch": 0.64, + "grad_norm": 0.749589177560146, + "learning_rate": 5.971538601004542e-06, + "loss": 0.1734, + "step": 12645 + }, + { + "epoch": 0.64, + "grad_norm": 2.1602944806291715, + "learning_rate": 5.970031238439023e-06, + "loss": 0.1871, + "step": 12646 + }, + { + "epoch": 0.64, + "grad_norm": 0.8914992617167609, + "learning_rate": 5.968523985186632e-06, + "loss": 0.1541, + "step": 12647 + }, + { + "epoch": 0.64, + "grad_norm": 0.7662373818745853, + "learning_rate": 5.967016841288258e-06, + "loss": 0.1553, + "step": 12648 + }, + { + "epoch": 0.64, + "grad_norm": 2.1534363779308747, + "learning_rate": 5.965509806784777e-06, + "loss": 0.1768, + "step": 12649 + }, + { + "epoch": 0.64, + "grad_norm": 1.753453688553895, + "learning_rate": 5.964002881717073e-06, + "loss": 0.1986, + "step": 12650 + }, + { + "epoch": 0.64, + "grad_norm": 1.079062233365335, + "learning_rate": 5.962496066126018e-06, + "loss": 0.1727, + "step": 12651 + }, + { + "epoch": 0.64, + "grad_norm": 1.7864402552268221, + "learning_rate": 5.960989360052487e-06, + "loss": 0.1631, + "step": 12652 + }, + { + "epoch": 0.64, + "grad_norm": 1.3834907670938499, + "learning_rate": 5.959482763537344e-06, + "loss": 0.2107, + "step": 12653 + }, + { + "epoch": 0.64, + "grad_norm": 0.8888693044438032, + "learning_rate": 5.9579762766214624e-06, + "loss": 0.166, + "step": 12654 + }, + { + "epoch": 0.64, + "grad_norm": 1.041141009502544, + "learning_rate": 5.956469899345704e-06, + "loss": 0.1655, + "step": 12655 + }, + { + "epoch": 0.64, + "grad_norm": 1.14499034697284, + "learning_rate": 5.954963631750923e-06, + "loss": 0.1748, + "step": 12656 + }, + { + "epoch": 0.64, + "grad_norm": 0.7490457820595138, + "learning_rate": 5.953457473877988e-06, + "loss": 0.1484, + "step": 12657 + }, + { + "epoch": 0.64, + "grad_norm": 1.1030787733107046, + "learning_rate": 5.9519514257677416e-06, + "loss": 0.1658, + "step": 12658 + }, + { + "epoch": 0.64, + "grad_norm": 1.1598241699401937, + "learning_rate": 5.950445487461045e-06, + "loss": 0.1905, + "step": 12659 + }, + { + "epoch": 0.64, + "grad_norm": 1.3936158789622728, + "learning_rate": 5.94893965899874e-06, + "loss": 0.1799, + "step": 12660 + }, + { + "epoch": 0.64, + "grad_norm": 0.9178583971284034, + "learning_rate": 5.947433940421681e-06, + "loss": 0.1729, + "step": 12661 + }, + { + "epoch": 0.64, + "grad_norm": 0.8496088616896511, + "learning_rate": 5.9459283317707e-06, + "loss": 0.1826, + "step": 12662 + }, + { + "epoch": 0.64, + "grad_norm": 0.973870769727679, + "learning_rate": 5.944422833086645e-06, + "loss": 0.169, + "step": 12663 + }, + { + "epoch": 0.64, + "grad_norm": 0.7730557051735637, + "learning_rate": 5.942917444410346e-06, + "loss": 0.1772, + "step": 12664 + }, + { + "epoch": 0.64, + "grad_norm": 0.8075041620874908, + "learning_rate": 5.941412165782645e-06, + "loss": 0.1764, + "step": 12665 + }, + { + "epoch": 0.64, + "grad_norm": 0.8686461335552085, + "learning_rate": 5.939906997244364e-06, + "loss": 0.1511, + "step": 12666 + }, + { + "epoch": 0.64, + "grad_norm": 0.94060984040844, + "learning_rate": 5.938401938836339e-06, + "loss": 0.1719, + "step": 12667 + }, + { + "epoch": 0.64, + "grad_norm": 1.0649589557653525, + "learning_rate": 5.936896990599388e-06, + "loss": 0.2042, + "step": 12668 + }, + { + "epoch": 0.64, + "grad_norm": 1.4109876552900762, + "learning_rate": 5.9353921525743394e-06, + "loss": 0.1642, + "step": 12669 + }, + { + "epoch": 0.64, + "grad_norm": 0.9844463011964707, + "learning_rate": 5.933887424802003e-06, + "loss": 0.174, + "step": 12670 + }, + { + "epoch": 0.64, + "grad_norm": 0.8399276919431213, + "learning_rate": 5.9323828073232025e-06, + "loss": 0.1947, + "step": 12671 + }, + { + "epoch": 0.64, + "grad_norm": 1.3259786476616642, + "learning_rate": 5.930878300178751e-06, + "loss": 0.1677, + "step": 12672 + }, + { + "epoch": 0.64, + "grad_norm": 1.0698628567492015, + "learning_rate": 5.929373903409451e-06, + "loss": 0.1983, + "step": 12673 + }, + { + "epoch": 0.64, + "grad_norm": 0.9322445915079713, + "learning_rate": 5.9278696170561175e-06, + "loss": 0.1894, + "step": 12674 + }, + { + "epoch": 0.64, + "grad_norm": 1.0557079540673695, + "learning_rate": 5.926365441159547e-06, + "loss": 0.1894, + "step": 12675 + }, + { + "epoch": 0.64, + "grad_norm": 0.8609547739005059, + "learning_rate": 5.924861375760547e-06, + "loss": 0.1706, + "step": 12676 + }, + { + "epoch": 0.64, + "grad_norm": 1.1008738440055796, + "learning_rate": 5.923357420899908e-06, + "loss": 0.1648, + "step": 12677 + }, + { + "epoch": 0.64, + "grad_norm": 1.2090115629624052, + "learning_rate": 5.921853576618435e-06, + "loss": 0.1855, + "step": 12678 + }, + { + "epoch": 0.64, + "grad_norm": 1.6107429435083536, + "learning_rate": 5.920349842956909e-06, + "loss": 0.1756, + "step": 12679 + }, + { + "epoch": 0.64, + "grad_norm": 0.9722248729245764, + "learning_rate": 5.918846219956126e-06, + "loss": 0.1689, + "step": 12680 + }, + { + "epoch": 0.64, + "grad_norm": 0.8651383834716352, + "learning_rate": 5.917342707656868e-06, + "loss": 0.1806, + "step": 12681 + }, + { + "epoch": 0.64, + "grad_norm": 1.0928700656041868, + "learning_rate": 5.915839306099924e-06, + "loss": 0.1755, + "step": 12682 + }, + { + "epoch": 0.64, + "grad_norm": 1.1338462339538051, + "learning_rate": 5.9143360153260655e-06, + "loss": 0.1763, + "step": 12683 + }, + { + "epoch": 0.65, + "grad_norm": 0.9910656380198765, + "learning_rate": 5.912832835376074e-06, + "loss": 0.1552, + "step": 12684 + }, + { + "epoch": 0.65, + "grad_norm": 1.0793574090034495, + "learning_rate": 5.911329766290723e-06, + "loss": 0.1802, + "step": 12685 + }, + { + "epoch": 0.65, + "grad_norm": 0.9980755060477906, + "learning_rate": 5.9098268081107855e-06, + "loss": 0.1837, + "step": 12686 + }, + { + "epoch": 0.65, + "grad_norm": 1.013160068361227, + "learning_rate": 5.9083239608770225e-06, + "loss": 0.1778, + "step": 12687 + }, + { + "epoch": 0.65, + "grad_norm": 1.8906471754734917, + "learning_rate": 5.9068212246302084e-06, + "loss": 0.1656, + "step": 12688 + }, + { + "epoch": 0.65, + "grad_norm": 0.8351175530599811, + "learning_rate": 5.9053185994110975e-06, + "loss": 0.1603, + "step": 12689 + }, + { + "epoch": 0.65, + "grad_norm": 1.2711286630467495, + "learning_rate": 5.903816085260447e-06, + "loss": 0.1776, + "step": 12690 + }, + { + "epoch": 0.65, + "grad_norm": 0.928024545479977, + "learning_rate": 5.902313682219023e-06, + "loss": 0.1774, + "step": 12691 + }, + { + "epoch": 0.65, + "grad_norm": 1.6213176678927754, + "learning_rate": 5.9008113903275675e-06, + "loss": 0.1622, + "step": 12692 + }, + { + "epoch": 0.65, + "grad_norm": 0.9402086897119339, + "learning_rate": 5.899309209626836e-06, + "loss": 0.1659, + "step": 12693 + }, + { + "epoch": 0.65, + "grad_norm": 1.0009811336498147, + "learning_rate": 5.8978071401575724e-06, + "loss": 0.1802, + "step": 12694 + }, + { + "epoch": 0.65, + "grad_norm": 0.9628070707067192, + "learning_rate": 5.896305181960524e-06, + "loss": 0.1425, + "step": 12695 + }, + { + "epoch": 0.65, + "grad_norm": 0.7368779807655511, + "learning_rate": 5.894803335076427e-06, + "loss": 0.1902, + "step": 12696 + }, + { + "epoch": 0.65, + "grad_norm": 0.9819880577436897, + "learning_rate": 5.8933015995460215e-06, + "loss": 0.1744, + "step": 12697 + }, + { + "epoch": 0.65, + "grad_norm": 1.1420654284111416, + "learning_rate": 5.8917999754100415e-06, + "loss": 0.1839, + "step": 12698 + }, + { + "epoch": 0.65, + "grad_norm": 0.8692782871985733, + "learning_rate": 5.890298462709224e-06, + "loss": 0.175, + "step": 12699 + }, + { + "epoch": 0.65, + "grad_norm": 1.945136366281455, + "learning_rate": 5.888797061484288e-06, + "loss": 0.1802, + "step": 12700 + }, + { + "epoch": 0.65, + "grad_norm": 0.8843237227460644, + "learning_rate": 5.887295771775968e-06, + "loss": 0.1718, + "step": 12701 + }, + { + "epoch": 0.65, + "grad_norm": 1.2556922365483774, + "learning_rate": 5.885794593624978e-06, + "loss": 0.1818, + "step": 12702 + }, + { + "epoch": 0.65, + "grad_norm": 0.9307613969627148, + "learning_rate": 5.884293527072045e-06, + "loss": 0.1816, + "step": 12703 + }, + { + "epoch": 0.65, + "grad_norm": 0.9934432917494341, + "learning_rate": 5.88279257215788e-06, + "loss": 0.1676, + "step": 12704 + }, + { + "epoch": 0.65, + "grad_norm": 0.9168098448675442, + "learning_rate": 5.881291728923202e-06, + "loss": 0.1724, + "step": 12705 + }, + { + "epoch": 0.65, + "grad_norm": 0.9078438921079398, + "learning_rate": 5.8797909974087166e-06, + "loss": 0.1746, + "step": 12706 + }, + { + "epoch": 0.65, + "grad_norm": 0.9363191791158435, + "learning_rate": 5.878290377655134e-06, + "loss": 0.1869, + "step": 12707 + }, + { + "epoch": 0.65, + "grad_norm": 0.8066846892597944, + "learning_rate": 5.876789869703159e-06, + "loss": 0.1675, + "step": 12708 + }, + { + "epoch": 0.65, + "grad_norm": 1.1161164446921086, + "learning_rate": 5.875289473593489e-06, + "loss": 0.1872, + "step": 12709 + }, + { + "epoch": 0.65, + "grad_norm": 0.8283708904113805, + "learning_rate": 5.8737891893668255e-06, + "loss": 0.1833, + "step": 12710 + }, + { + "epoch": 0.65, + "grad_norm": 0.9024395515456646, + "learning_rate": 5.872289017063861e-06, + "loss": 0.1809, + "step": 12711 + }, + { + "epoch": 0.65, + "grad_norm": 1.864448371808424, + "learning_rate": 5.8707889567252965e-06, + "loss": 0.1474, + "step": 12712 + }, + { + "epoch": 0.65, + "grad_norm": 0.9419293887724072, + "learning_rate": 5.869289008391809e-06, + "loss": 0.1964, + "step": 12713 + }, + { + "epoch": 0.65, + "grad_norm": 2.14700717152764, + "learning_rate": 5.8677891721040945e-06, + "loss": 0.1757, + "step": 12714 + }, + { + "epoch": 0.65, + "grad_norm": 0.8869622133560018, + "learning_rate": 5.866289447902829e-06, + "loss": 0.2008, + "step": 12715 + }, + { + "epoch": 0.65, + "grad_norm": 1.4162474657019513, + "learning_rate": 5.864789835828697e-06, + "loss": 0.1503, + "step": 12716 + }, + { + "epoch": 0.65, + "grad_norm": 1.0489769046663078, + "learning_rate": 5.863290335922371e-06, + "loss": 0.1688, + "step": 12717 + }, + { + "epoch": 0.65, + "grad_norm": 1.1416131733336279, + "learning_rate": 5.861790948224535e-06, + "loss": 0.2004, + "step": 12718 + }, + { + "epoch": 0.65, + "grad_norm": 0.7944055566838395, + "learning_rate": 5.860291672775847e-06, + "loss": 0.1544, + "step": 12719 + }, + { + "epoch": 0.65, + "grad_norm": 1.1178239682169353, + "learning_rate": 5.858792509616984e-06, + "loss": 0.1741, + "step": 12720 + }, + { + "epoch": 0.65, + "grad_norm": 1.9122154797112343, + "learning_rate": 5.857293458788607e-06, + "loss": 0.1751, + "step": 12721 + }, + { + "epoch": 0.65, + "grad_norm": 0.9993588874074452, + "learning_rate": 5.855794520331382e-06, + "loss": 0.1545, + "step": 12722 + }, + { + "epoch": 0.65, + "grad_norm": 0.9449544422557149, + "learning_rate": 5.854295694285961e-06, + "loss": 0.1748, + "step": 12723 + }, + { + "epoch": 0.65, + "grad_norm": 1.0191948675121945, + "learning_rate": 5.852796980693005e-06, + "loss": 0.1866, + "step": 12724 + }, + { + "epoch": 0.65, + "grad_norm": 0.9340845236893027, + "learning_rate": 5.8512983795931665e-06, + "loss": 0.1717, + "step": 12725 + }, + { + "epoch": 0.65, + "grad_norm": 0.9371937668222649, + "learning_rate": 5.8497998910270915e-06, + "loss": 0.1793, + "step": 12726 + }, + { + "epoch": 0.65, + "grad_norm": 0.8167577931936453, + "learning_rate": 5.848301515035433e-06, + "loss": 0.1629, + "step": 12727 + }, + { + "epoch": 0.65, + "grad_norm": 0.9070997042355996, + "learning_rate": 5.846803251658824e-06, + "loss": 0.1653, + "step": 12728 + }, + { + "epoch": 0.65, + "grad_norm": 1.2774767895726513, + "learning_rate": 5.8453051009379145e-06, + "loss": 0.1733, + "step": 12729 + }, + { + "epoch": 0.65, + "grad_norm": 0.753620675806103, + "learning_rate": 5.843807062913338e-06, + "loss": 0.1849, + "step": 12730 + }, + { + "epoch": 0.65, + "grad_norm": 0.9565505103888103, + "learning_rate": 5.842309137625732e-06, + "loss": 0.1838, + "step": 12731 + }, + { + "epoch": 0.65, + "grad_norm": 1.5290954314169458, + "learning_rate": 5.840811325115723e-06, + "loss": 0.1738, + "step": 12732 + }, + { + "epoch": 0.65, + "grad_norm": 0.979728759438856, + "learning_rate": 5.8393136254239424e-06, + "loss": 0.1738, + "step": 12733 + }, + { + "epoch": 0.65, + "grad_norm": 0.8586160518040348, + "learning_rate": 5.837816038591016e-06, + "loss": 0.1518, + "step": 12734 + }, + { + "epoch": 0.65, + "grad_norm": 1.0986488341282836, + "learning_rate": 5.836318564657561e-06, + "loss": 0.1572, + "step": 12735 + }, + { + "epoch": 0.65, + "grad_norm": 0.970494590695129, + "learning_rate": 5.8348212036642004e-06, + "loss": 0.206, + "step": 12736 + }, + { + "epoch": 0.65, + "grad_norm": 1.1964209550960125, + "learning_rate": 5.833323955651555e-06, + "loss": 0.1912, + "step": 12737 + }, + { + "epoch": 0.65, + "grad_norm": 1.7420502057662117, + "learning_rate": 5.831826820660228e-06, + "loss": 0.1594, + "step": 12738 + }, + { + "epoch": 0.65, + "grad_norm": 0.9275542125230787, + "learning_rate": 5.8303297987308384e-06, + "loss": 0.1778, + "step": 12739 + }, + { + "epoch": 0.65, + "grad_norm": 1.4329471083008936, + "learning_rate": 5.828832889903983e-06, + "loss": 0.1749, + "step": 12740 + }, + { + "epoch": 0.65, + "grad_norm": 1.0909640951660313, + "learning_rate": 5.827336094220278e-06, + "loss": 0.1742, + "step": 12741 + }, + { + "epoch": 0.65, + "grad_norm": 0.8744495369447978, + "learning_rate": 5.825839411720314e-06, + "loss": 0.1972, + "step": 12742 + }, + { + "epoch": 0.65, + "grad_norm": 1.1558371119444353, + "learning_rate": 5.824342842444689e-06, + "loss": 0.1768, + "step": 12743 + }, + { + "epoch": 0.65, + "grad_norm": 1.0440422547177555, + "learning_rate": 5.822846386434e-06, + "loss": 0.1891, + "step": 12744 + }, + { + "epoch": 0.65, + "grad_norm": 0.9694946099458643, + "learning_rate": 5.82135004372884e-06, + "loss": 0.1823, + "step": 12745 + }, + { + "epoch": 0.65, + "grad_norm": 0.8690001007512806, + "learning_rate": 5.819853814369798e-06, + "loss": 0.1734, + "step": 12746 + }, + { + "epoch": 0.65, + "grad_norm": 0.9975380839997835, + "learning_rate": 5.818357698397455e-06, + "loss": 0.1795, + "step": 12747 + }, + { + "epoch": 0.65, + "grad_norm": 2.109368298211608, + "learning_rate": 5.816861695852398e-06, + "loss": 0.1872, + "step": 12748 + }, + { + "epoch": 0.65, + "grad_norm": 0.9314307134735961, + "learning_rate": 5.815365806775201e-06, + "loss": 0.1967, + "step": 12749 + }, + { + "epoch": 0.65, + "grad_norm": 1.1428904285367345, + "learning_rate": 5.813870031206448e-06, + "loss": 0.1875, + "step": 12750 + }, + { + "epoch": 0.65, + "grad_norm": 0.9960408697173141, + "learning_rate": 5.812374369186701e-06, + "loss": 0.1655, + "step": 12751 + }, + { + "epoch": 0.65, + "grad_norm": 1.2568265105055656, + "learning_rate": 5.8108788207565355e-06, + "loss": 0.1743, + "step": 12752 + }, + { + "epoch": 0.65, + "grad_norm": 0.8431837679769912, + "learning_rate": 5.8093833859565196e-06, + "loss": 0.1824, + "step": 12753 + }, + { + "epoch": 0.65, + "grad_norm": 1.1127045480558082, + "learning_rate": 5.80788806482722e-06, + "loss": 0.1508, + "step": 12754 + }, + { + "epoch": 0.65, + "grad_norm": 1.0746524208411485, + "learning_rate": 5.806392857409189e-06, + "loss": 0.213, + "step": 12755 + }, + { + "epoch": 0.65, + "grad_norm": 0.9716316543874394, + "learning_rate": 5.8048977637429925e-06, + "loss": 0.1939, + "step": 12756 + }, + { + "epoch": 0.65, + "grad_norm": 1.3417170022179468, + "learning_rate": 5.803402783869178e-06, + "loss": 0.1571, + "step": 12757 + }, + { + "epoch": 0.65, + "grad_norm": 0.9285050897401939, + "learning_rate": 5.801907917828303e-06, + "loss": 0.1817, + "step": 12758 + }, + { + "epoch": 0.65, + "grad_norm": 1.824448289024012, + "learning_rate": 5.800413165660913e-06, + "loss": 0.1817, + "step": 12759 + }, + { + "epoch": 0.65, + "grad_norm": 3.9595828542615563, + "learning_rate": 5.798918527407549e-06, + "loss": 0.1864, + "step": 12760 + }, + { + "epoch": 0.65, + "grad_norm": 0.9974578852520701, + "learning_rate": 5.797424003108758e-06, + "loss": 0.1507, + "step": 12761 + }, + { + "epoch": 0.65, + "grad_norm": 0.9464195385603775, + "learning_rate": 5.795929592805077e-06, + "loss": 0.16, + "step": 12762 + }, + { + "epoch": 0.65, + "grad_norm": 1.1374103732685328, + "learning_rate": 5.794435296537049e-06, + "loss": 0.1781, + "step": 12763 + }, + { + "epoch": 0.65, + "grad_norm": 1.1827366485411164, + "learning_rate": 5.7929411143451955e-06, + "loss": 0.1668, + "step": 12764 + }, + { + "epoch": 0.65, + "grad_norm": 1.168608591288615, + "learning_rate": 5.791447046270055e-06, + "loss": 0.1834, + "step": 12765 + }, + { + "epoch": 0.65, + "grad_norm": 0.9011955344686472, + "learning_rate": 5.78995309235215e-06, + "loss": 0.1653, + "step": 12766 + }, + { + "epoch": 0.65, + "grad_norm": 1.4317286725861893, + "learning_rate": 5.788459252632008e-06, + "loss": 0.1901, + "step": 12767 + }, + { + "epoch": 0.65, + "grad_norm": 1.3077605328902828, + "learning_rate": 5.7869655271501415e-06, + "loss": 0.1947, + "step": 12768 + }, + { + "epoch": 0.65, + "grad_norm": 0.8625789708204211, + "learning_rate": 5.785471915947078e-06, + "loss": 0.1683, + "step": 12769 + }, + { + "epoch": 0.65, + "grad_norm": 0.9455906607045814, + "learning_rate": 5.783978419063323e-06, + "loss": 0.1848, + "step": 12770 + }, + { + "epoch": 0.65, + "grad_norm": 1.2640924961109428, + "learning_rate": 5.782485036539391e-06, + "loss": 0.1574, + "step": 12771 + }, + { + "epoch": 0.65, + "grad_norm": 1.0937487504731338, + "learning_rate": 5.7809917684157915e-06, + "loss": 0.1685, + "step": 12772 + }, + { + "epoch": 0.65, + "grad_norm": 1.0232287650417144, + "learning_rate": 5.779498614733032e-06, + "loss": 0.1958, + "step": 12773 + }, + { + "epoch": 0.65, + "grad_norm": 0.9582307803099454, + "learning_rate": 5.778005575531606e-06, + "loss": 0.173, + "step": 12774 + }, + { + "epoch": 0.65, + "grad_norm": 1.08070120123374, + "learning_rate": 5.7765126508520216e-06, + "loss": 0.1669, + "step": 12775 + }, + { + "epoch": 0.65, + "grad_norm": 2.898585807582454, + "learning_rate": 5.775019840734768e-06, + "loss": 0.2019, + "step": 12776 + }, + { + "epoch": 0.65, + "grad_norm": 0.8173973148695457, + "learning_rate": 5.773527145220341e-06, + "loss": 0.1813, + "step": 12777 + }, + { + "epoch": 0.65, + "grad_norm": 1.1002081814255371, + "learning_rate": 5.772034564349227e-06, + "loss": 0.166, + "step": 12778 + }, + { + "epoch": 0.65, + "grad_norm": 0.9156571269819482, + "learning_rate": 5.770542098161913e-06, + "loss": 0.1703, + "step": 12779 + }, + { + "epoch": 0.65, + "grad_norm": 0.9475744155041969, + "learning_rate": 5.769049746698889e-06, + "loss": 0.1599, + "step": 12780 + }, + { + "epoch": 0.65, + "grad_norm": 1.027253670713997, + "learning_rate": 5.767557510000624e-06, + "loss": 0.1714, + "step": 12781 + }, + { + "epoch": 0.65, + "grad_norm": 0.9115685276161519, + "learning_rate": 5.7660653881076045e-06, + "loss": 0.1624, + "step": 12782 + }, + { + "epoch": 0.65, + "grad_norm": 0.9189978581613581, + "learning_rate": 5.7645733810602975e-06, + "loss": 0.1615, + "step": 12783 + }, + { + "epoch": 0.65, + "grad_norm": 1.6394894428716966, + "learning_rate": 5.76308148889918e-06, + "loss": 0.2053, + "step": 12784 + }, + { + "epoch": 0.65, + "grad_norm": 1.3596295229586994, + "learning_rate": 5.761589711664714e-06, + "loss": 0.1913, + "step": 12785 + }, + { + "epoch": 0.65, + "grad_norm": 0.8407133594687236, + "learning_rate": 5.760098049397369e-06, + "loss": 0.1632, + "step": 12786 + }, + { + "epoch": 0.65, + "grad_norm": 0.9167228532336161, + "learning_rate": 5.7586065021376e-06, + "loss": 0.1623, + "step": 12787 + }, + { + "epoch": 0.65, + "grad_norm": 1.0360576474570997, + "learning_rate": 5.7571150699258695e-06, + "loss": 0.2142, + "step": 12788 + }, + { + "epoch": 0.65, + "grad_norm": 1.1085304668570894, + "learning_rate": 5.7556237528026325e-06, + "loss": 0.1695, + "step": 12789 + }, + { + "epoch": 0.65, + "grad_norm": 1.0896331639110117, + "learning_rate": 5.754132550808345e-06, + "loss": 0.1871, + "step": 12790 + }, + { + "epoch": 0.65, + "grad_norm": 1.0283976013604725, + "learning_rate": 5.752641463983446e-06, + "loss": 0.1893, + "step": 12791 + }, + { + "epoch": 0.65, + "grad_norm": 0.9949917672413399, + "learning_rate": 5.751150492368394e-06, + "loss": 0.17, + "step": 12792 + }, + { + "epoch": 0.65, + "grad_norm": 1.0405359463683412, + "learning_rate": 5.749659636003619e-06, + "loss": 0.1585, + "step": 12793 + }, + { + "epoch": 0.65, + "grad_norm": 1.6514389831675638, + "learning_rate": 5.748168894929571e-06, + "loss": 0.1664, + "step": 12794 + }, + { + "epoch": 0.65, + "grad_norm": 0.9551165337416043, + "learning_rate": 5.746678269186682e-06, + "loss": 0.1687, + "step": 12795 + }, + { + "epoch": 0.65, + "grad_norm": 0.7885972333474208, + "learning_rate": 5.7451877588153805e-06, + "loss": 0.1671, + "step": 12796 + }, + { + "epoch": 0.65, + "grad_norm": 0.9364830001912637, + "learning_rate": 5.743697363856103e-06, + "loss": 0.1723, + "step": 12797 + }, + { + "epoch": 0.65, + "grad_norm": 1.2316441009063175, + "learning_rate": 5.742207084349274e-06, + "loss": 0.1726, + "step": 12798 + }, + { + "epoch": 0.65, + "grad_norm": 0.9264953776930172, + "learning_rate": 5.740716920335321e-06, + "loss": 0.1784, + "step": 12799 + }, + { + "epoch": 0.65, + "grad_norm": 1.0204461207639197, + "learning_rate": 5.739226871854659e-06, + "loss": 0.1939, + "step": 12800 + }, + { + "epoch": 0.65, + "grad_norm": 1.665770547649148, + "learning_rate": 5.737736938947713e-06, + "loss": 0.1666, + "step": 12801 + }, + { + "epoch": 0.65, + "grad_norm": 0.8495805423139131, + "learning_rate": 5.73624712165489e-06, + "loss": 0.1623, + "step": 12802 + }, + { + "epoch": 0.65, + "grad_norm": 1.0323761081415743, + "learning_rate": 5.734757420016608e-06, + "loss": 0.1841, + "step": 12803 + }, + { + "epoch": 0.65, + "grad_norm": 1.0378100442313927, + "learning_rate": 5.733267834073267e-06, + "loss": 0.1752, + "step": 12804 + }, + { + "epoch": 0.65, + "grad_norm": 1.1876253510119588, + "learning_rate": 5.731778363865278e-06, + "loss": 0.1599, + "step": 12805 + }, + { + "epoch": 0.65, + "grad_norm": 1.041679902340334, + "learning_rate": 5.730289009433041e-06, + "loss": 0.1814, + "step": 12806 + }, + { + "epoch": 0.65, + "grad_norm": 0.9306180346304497, + "learning_rate": 5.7287997708169615e-06, + "loss": 0.1777, + "step": 12807 + }, + { + "epoch": 0.65, + "grad_norm": 0.8422275117744921, + "learning_rate": 5.7273106480574245e-06, + "loss": 0.1842, + "step": 12808 + }, + { + "epoch": 0.65, + "grad_norm": 0.9099700432256908, + "learning_rate": 5.725821641194831e-06, + "loss": 0.1839, + "step": 12809 + }, + { + "epoch": 0.65, + "grad_norm": 1.0335046237387424, + "learning_rate": 5.724332750269563e-06, + "loss": 0.1808, + "step": 12810 + }, + { + "epoch": 0.65, + "grad_norm": 0.7342455579777192, + "learning_rate": 5.722843975322015e-06, + "loss": 0.1691, + "step": 12811 + }, + { + "epoch": 0.65, + "grad_norm": 1.021904470086725, + "learning_rate": 5.721355316392566e-06, + "loss": 0.1678, + "step": 12812 + }, + { + "epoch": 0.65, + "grad_norm": 0.8413994432957763, + "learning_rate": 5.719866773521592e-06, + "loss": 0.1739, + "step": 12813 + }, + { + "epoch": 0.65, + "grad_norm": 0.7656323654203047, + "learning_rate": 5.718378346749473e-06, + "loss": 0.173, + "step": 12814 + }, + { + "epoch": 0.65, + "grad_norm": 0.8695568552072197, + "learning_rate": 5.716890036116582e-06, + "loss": 0.1816, + "step": 12815 + }, + { + "epoch": 0.65, + "grad_norm": 0.637543553189384, + "learning_rate": 5.715401841663296e-06, + "loss": 0.1538, + "step": 12816 + }, + { + "epoch": 0.65, + "grad_norm": 0.8986297573930118, + "learning_rate": 5.713913763429972e-06, + "loss": 0.1765, + "step": 12817 + }, + { + "epoch": 0.65, + "grad_norm": 1.165151118458408, + "learning_rate": 5.712425801456984e-06, + "loss": 0.1858, + "step": 12818 + }, + { + "epoch": 0.65, + "grad_norm": 1.2275823408117625, + "learning_rate": 5.710937955784686e-06, + "loss": 0.1808, + "step": 12819 + }, + { + "epoch": 0.65, + "grad_norm": 1.0067028818244466, + "learning_rate": 5.709450226453439e-06, + "loss": 0.2044, + "step": 12820 + }, + { + "epoch": 0.65, + "grad_norm": 0.7775459709638857, + "learning_rate": 5.707962613503595e-06, + "loss": 0.1662, + "step": 12821 + }, + { + "epoch": 0.65, + "grad_norm": 2.4527849280844114, + "learning_rate": 5.706475116975512e-06, + "loss": 0.2129, + "step": 12822 + }, + { + "epoch": 0.65, + "grad_norm": 0.9436189868348397, + "learning_rate": 5.704987736909529e-06, + "loss": 0.1751, + "step": 12823 + }, + { + "epoch": 0.65, + "grad_norm": 1.0299563360182766, + "learning_rate": 5.703500473345995e-06, + "loss": 0.1849, + "step": 12824 + }, + { + "epoch": 0.65, + "grad_norm": 1.3373956988323639, + "learning_rate": 5.702013326325256e-06, + "loss": 0.1731, + "step": 12825 + }, + { + "epoch": 0.65, + "grad_norm": 0.6450668643735773, + "learning_rate": 5.700526295887649e-06, + "loss": 0.1551, + "step": 12826 + }, + { + "epoch": 0.65, + "grad_norm": 1.4047761777979644, + "learning_rate": 5.699039382073508e-06, + "loss": 0.159, + "step": 12827 + }, + { + "epoch": 0.65, + "grad_norm": 0.9549652310257992, + "learning_rate": 5.69755258492317e-06, + "loss": 0.1792, + "step": 12828 + }, + { + "epoch": 0.65, + "grad_norm": 0.8410795021030328, + "learning_rate": 5.6960659044769596e-06, + "loss": 0.1677, + "step": 12829 + }, + { + "epoch": 0.65, + "grad_norm": 1.8490038369881001, + "learning_rate": 5.694579340775202e-06, + "loss": 0.1716, + "step": 12830 + }, + { + "epoch": 0.65, + "grad_norm": 1.0215088608250185, + "learning_rate": 5.693092893858223e-06, + "loss": 0.1724, + "step": 12831 + }, + { + "epoch": 0.65, + "grad_norm": 0.801362536481239, + "learning_rate": 5.691606563766341e-06, + "loss": 0.1882, + "step": 12832 + }, + { + "epoch": 0.65, + "grad_norm": 1.1402567424615575, + "learning_rate": 5.6901203505398805e-06, + "loss": 0.1853, + "step": 12833 + }, + { + "epoch": 0.65, + "grad_norm": 1.134326759007839, + "learning_rate": 5.688634254219143e-06, + "loss": 0.1587, + "step": 12834 + }, + { + "epoch": 0.65, + "grad_norm": 0.9233574208843645, + "learning_rate": 5.687148274844449e-06, + "loss": 0.167, + "step": 12835 + }, + { + "epoch": 0.65, + "grad_norm": 0.7866757718859764, + "learning_rate": 5.6856624124560985e-06, + "loss": 0.1697, + "step": 12836 + }, + { + "epoch": 0.65, + "grad_norm": 1.0403016153270666, + "learning_rate": 5.684176667094403e-06, + "loss": 0.1639, + "step": 12837 + }, + { + "epoch": 0.65, + "grad_norm": 1.605615224634009, + "learning_rate": 5.682691038799655e-06, + "loss": 0.1811, + "step": 12838 + }, + { + "epoch": 0.65, + "grad_norm": 0.9004022258891372, + "learning_rate": 5.68120552761216e-06, + "loss": 0.2044, + "step": 12839 + }, + { + "epoch": 0.65, + "grad_norm": 0.9103926230618025, + "learning_rate": 5.6797201335722064e-06, + "loss": 0.181, + "step": 12840 + }, + { + "epoch": 0.65, + "grad_norm": 0.9655309677333925, + "learning_rate": 5.678234856720086e-06, + "loss": 0.1693, + "step": 12841 + }, + { + "epoch": 0.65, + "grad_norm": 0.9890053890231713, + "learning_rate": 5.67674969709609e-06, + "loss": 0.1832, + "step": 12842 + }, + { + "epoch": 0.65, + "grad_norm": 1.561738973816005, + "learning_rate": 5.675264654740506e-06, + "loss": 0.1944, + "step": 12843 + }, + { + "epoch": 0.65, + "grad_norm": 1.112333814532282, + "learning_rate": 5.67377972969361e-06, + "loss": 0.1898, + "step": 12844 + }, + { + "epoch": 0.65, + "grad_norm": 0.8642933030164457, + "learning_rate": 5.672294921995687e-06, + "loss": 0.186, + "step": 12845 + }, + { + "epoch": 0.65, + "grad_norm": 0.9086013400978419, + "learning_rate": 5.670810231687004e-06, + "loss": 0.16, + "step": 12846 + }, + { + "epoch": 0.65, + "grad_norm": 2.064045502304253, + "learning_rate": 5.669325658807843e-06, + "loss": 0.1797, + "step": 12847 + }, + { + "epoch": 0.65, + "grad_norm": 1.3122020336260767, + "learning_rate": 5.667841203398463e-06, + "loss": 0.1961, + "step": 12848 + }, + { + "epoch": 0.65, + "grad_norm": 1.319368124499809, + "learning_rate": 5.666356865499134e-06, + "loss": 0.1906, + "step": 12849 + }, + { + "epoch": 0.65, + "grad_norm": 1.129651978672975, + "learning_rate": 5.664872645150126e-06, + "loss": 0.1678, + "step": 12850 + }, + { + "epoch": 0.65, + "grad_norm": 0.8886433832662072, + "learning_rate": 5.663388542391687e-06, + "loss": 0.1729, + "step": 12851 + }, + { + "epoch": 0.65, + "grad_norm": 0.8454505714582762, + "learning_rate": 5.661904557264083e-06, + "loss": 0.1826, + "step": 12852 + }, + { + "epoch": 0.65, + "grad_norm": 0.909024630136565, + "learning_rate": 5.6604206898075595e-06, + "loss": 0.1766, + "step": 12853 + }, + { + "epoch": 0.65, + "grad_norm": 0.7422197954953109, + "learning_rate": 5.658936940062373e-06, + "loss": 0.1818, + "step": 12854 + }, + { + "epoch": 0.65, + "grad_norm": 0.9900409014516947, + "learning_rate": 5.657453308068763e-06, + "loss": 0.1645, + "step": 12855 + }, + { + "epoch": 0.65, + "grad_norm": 1.0872223642690009, + "learning_rate": 5.655969793866982e-06, + "loss": 0.1686, + "step": 12856 + }, + { + "epoch": 0.65, + "grad_norm": 1.1858898783551484, + "learning_rate": 5.654486397497262e-06, + "loss": 0.172, + "step": 12857 + }, + { + "epoch": 0.65, + "grad_norm": 1.69732530121704, + "learning_rate": 5.653003118999843e-06, + "loss": 0.1616, + "step": 12858 + }, + { + "epoch": 0.65, + "grad_norm": 1.0433921844175187, + "learning_rate": 5.651519958414961e-06, + "loss": 0.1705, + "step": 12859 + }, + { + "epoch": 0.65, + "grad_norm": 1.003929183034557, + "learning_rate": 5.650036915782849e-06, + "loss": 0.1941, + "step": 12860 + }, + { + "epoch": 0.65, + "grad_norm": 0.7799695862625812, + "learning_rate": 5.648553991143728e-06, + "loss": 0.1883, + "step": 12861 + }, + { + "epoch": 0.65, + "grad_norm": 0.9482306796546718, + "learning_rate": 5.647071184537829e-06, + "loss": 0.1592, + "step": 12862 + }, + { + "epoch": 0.65, + "grad_norm": 1.9391228858816858, + "learning_rate": 5.6455884960053655e-06, + "loss": 0.1697, + "step": 12863 + }, + { + "epoch": 0.65, + "grad_norm": 1.0291871921144702, + "learning_rate": 5.6441059255865645e-06, + "loss": 0.1647, + "step": 12864 + }, + { + "epoch": 0.65, + "grad_norm": 0.9809468989705948, + "learning_rate": 5.642623473321638e-06, + "loss": 0.179, + "step": 12865 + }, + { + "epoch": 0.65, + "grad_norm": 1.0666735024576603, + "learning_rate": 5.64114113925079e-06, + "loss": 0.1894, + "step": 12866 + }, + { + "epoch": 0.65, + "grad_norm": 0.9172747178146132, + "learning_rate": 5.639658923414235e-06, + "loss": 0.1778, + "step": 12867 + }, + { + "epoch": 0.65, + "grad_norm": 0.9556114312786075, + "learning_rate": 5.638176825852178e-06, + "loss": 0.1634, + "step": 12868 + }, + { + "epoch": 0.65, + "grad_norm": 0.8090140618805673, + "learning_rate": 5.636694846604825e-06, + "loss": 0.1591, + "step": 12869 + }, + { + "epoch": 0.65, + "grad_norm": 0.8509532820931227, + "learning_rate": 5.635212985712366e-06, + "loss": 0.1727, + "step": 12870 + }, + { + "epoch": 0.65, + "grad_norm": 6.221724279621428, + "learning_rate": 5.633731243215007e-06, + "loss": 0.1786, + "step": 12871 + }, + { + "epoch": 0.65, + "grad_norm": 0.9225094148693288, + "learning_rate": 5.63224961915293e-06, + "loss": 0.1659, + "step": 12872 + }, + { + "epoch": 0.65, + "grad_norm": 0.789810703620753, + "learning_rate": 5.6307681135663315e-06, + "loss": 0.1734, + "step": 12873 + }, + { + "epoch": 0.65, + "grad_norm": 0.8656698646274308, + "learning_rate": 5.629286726495393e-06, + "loss": 0.1606, + "step": 12874 + }, + { + "epoch": 0.65, + "grad_norm": 0.9659152400307496, + "learning_rate": 5.627805457980298e-06, + "loss": 0.1709, + "step": 12875 + }, + { + "epoch": 0.65, + "grad_norm": 1.0158763651419542, + "learning_rate": 5.626324308061226e-06, + "loss": 0.183, + "step": 12876 + }, + { + "epoch": 0.65, + "grad_norm": 0.9541796052338082, + "learning_rate": 5.624843276778358e-06, + "loss": 0.1869, + "step": 12877 + }, + { + "epoch": 0.65, + "grad_norm": 0.7038071148065027, + "learning_rate": 5.62336236417186e-06, + "loss": 0.1458, + "step": 12878 + }, + { + "epoch": 0.65, + "grad_norm": 0.8309038062333706, + "learning_rate": 5.621881570281909e-06, + "loss": 0.1536, + "step": 12879 + }, + { + "epoch": 0.65, + "grad_norm": 3.1898632042402766, + "learning_rate": 5.6204008951486636e-06, + "loss": 0.1907, + "step": 12880 + }, + { + "epoch": 0.66, + "grad_norm": 0.9013225322317349, + "learning_rate": 5.618920338812295e-06, + "loss": 0.1781, + "step": 12881 + }, + { + "epoch": 0.66, + "grad_norm": 0.8402968879388498, + "learning_rate": 5.61743990131296e-06, + "loss": 0.1772, + "step": 12882 + }, + { + "epoch": 0.66, + "grad_norm": 1.2585658510983027, + "learning_rate": 5.615959582690812e-06, + "loss": 0.2075, + "step": 12883 + }, + { + "epoch": 0.66, + "grad_norm": 1.0642373692264568, + "learning_rate": 5.614479382986007e-06, + "loss": 0.1954, + "step": 12884 + }, + { + "epoch": 0.66, + "grad_norm": 0.8121988348561141, + "learning_rate": 5.612999302238696e-06, + "loss": 0.1607, + "step": 12885 + }, + { + "epoch": 0.66, + "grad_norm": 0.9981702401051303, + "learning_rate": 5.611519340489031e-06, + "loss": 0.1698, + "step": 12886 + }, + { + "epoch": 0.66, + "grad_norm": 1.01944572613643, + "learning_rate": 5.610039497777149e-06, + "loss": 0.1731, + "step": 12887 + }, + { + "epoch": 0.66, + "grad_norm": 1.2615462260108041, + "learning_rate": 5.608559774143196e-06, + "loss": 0.1783, + "step": 12888 + }, + { + "epoch": 0.66, + "grad_norm": 1.2403582568494242, + "learning_rate": 5.607080169627304e-06, + "loss": 0.1756, + "step": 12889 + }, + { + "epoch": 0.66, + "grad_norm": 0.8311748159017751, + "learning_rate": 5.6056006842696145e-06, + "loss": 0.1722, + "step": 12890 + }, + { + "epoch": 0.66, + "grad_norm": 1.5292143098094482, + "learning_rate": 5.60412131811025e-06, + "loss": 0.1706, + "step": 12891 + }, + { + "epoch": 0.66, + "grad_norm": 1.1798060723173425, + "learning_rate": 5.6026420711893485e-06, + "loss": 0.2004, + "step": 12892 + }, + { + "epoch": 0.66, + "grad_norm": 1.9015841379843128, + "learning_rate": 5.601162943547023e-06, + "loss": 0.16, + "step": 12893 + }, + { + "epoch": 0.66, + "grad_norm": 1.1723385310959136, + "learning_rate": 5.599683935223402e-06, + "loss": 0.152, + "step": 12894 + }, + { + "epoch": 0.66, + "grad_norm": 1.1839672034771838, + "learning_rate": 5.598205046258603e-06, + "loss": 0.1623, + "step": 12895 + }, + { + "epoch": 0.66, + "grad_norm": 2.3569216140387987, + "learning_rate": 5.596726276692745e-06, + "loss": 0.1868, + "step": 12896 + }, + { + "epoch": 0.66, + "grad_norm": 0.9215860827965177, + "learning_rate": 5.5952476265659315e-06, + "loss": 0.1629, + "step": 12897 + }, + { + "epoch": 0.66, + "grad_norm": 1.2795995011922863, + "learning_rate": 5.593769095918278e-06, + "loss": 0.189, + "step": 12898 + }, + { + "epoch": 0.66, + "grad_norm": 1.0940029725833822, + "learning_rate": 5.592290684789887e-06, + "loss": 0.1557, + "step": 12899 + }, + { + "epoch": 0.66, + "grad_norm": 0.846845827744864, + "learning_rate": 5.5908123932208565e-06, + "loss": 0.1829, + "step": 12900 + }, + { + "epoch": 0.66, + "grad_norm": 1.0259373068826925, + "learning_rate": 5.589334221251289e-06, + "loss": 0.1777, + "step": 12901 + }, + { + "epoch": 0.66, + "grad_norm": 0.9493271016656901, + "learning_rate": 5.587856168921279e-06, + "loss": 0.1728, + "step": 12902 + }, + { + "epoch": 0.66, + "grad_norm": 1.137369310293687, + "learning_rate": 5.586378236270925e-06, + "loss": 0.1685, + "step": 12903 + }, + { + "epoch": 0.66, + "grad_norm": 0.8186890126068809, + "learning_rate": 5.584900423340306e-06, + "loss": 0.177, + "step": 12904 + }, + { + "epoch": 0.66, + "grad_norm": 4.218637687925129, + "learning_rate": 5.5834227301695166e-06, + "loss": 0.2081, + "step": 12905 + }, + { + "epoch": 0.66, + "grad_norm": 1.6991846261385979, + "learning_rate": 5.581945156798629e-06, + "loss": 0.1579, + "step": 12906 + }, + { + "epoch": 0.66, + "grad_norm": 1.0576293401743173, + "learning_rate": 5.580467703267736e-06, + "loss": 0.1876, + "step": 12907 + }, + { + "epoch": 0.66, + "grad_norm": 0.8104066484939919, + "learning_rate": 5.578990369616899e-06, + "loss": 0.1455, + "step": 12908 + }, + { + "epoch": 0.66, + "grad_norm": 0.8376631152931486, + "learning_rate": 5.577513155886204e-06, + "loss": 0.1837, + "step": 12909 + }, + { + "epoch": 0.66, + "grad_norm": 0.8523762348499792, + "learning_rate": 5.576036062115709e-06, + "loss": 0.1626, + "step": 12910 + }, + { + "epoch": 0.66, + "grad_norm": 0.9641752481361844, + "learning_rate": 5.574559088345487e-06, + "loss": 0.1724, + "step": 12911 + }, + { + "epoch": 0.66, + "grad_norm": 0.9164492673667283, + "learning_rate": 5.573082234615599e-06, + "loss": 0.1857, + "step": 12912 + }, + { + "epoch": 0.66, + "grad_norm": 2.019011994175837, + "learning_rate": 5.57160550096611e-06, + "loss": 0.1662, + "step": 12913 + }, + { + "epoch": 0.66, + "grad_norm": 0.9286318752863383, + "learning_rate": 5.570128887437067e-06, + "loss": 0.1789, + "step": 12914 + }, + { + "epoch": 0.66, + "grad_norm": 1.2982218911285703, + "learning_rate": 5.568652394068532e-06, + "loss": 0.1752, + "step": 12915 + }, + { + "epoch": 0.66, + "grad_norm": 1.1241713257522084, + "learning_rate": 5.567176020900549e-06, + "loss": 0.1662, + "step": 12916 + }, + { + "epoch": 0.66, + "grad_norm": 1.113905632611933, + "learning_rate": 5.565699767973169e-06, + "loss": 0.1792, + "step": 12917 + }, + { + "epoch": 0.66, + "grad_norm": 0.8693497758333975, + "learning_rate": 5.564223635326433e-06, + "loss": 0.1824, + "step": 12918 + }, + { + "epoch": 0.66, + "grad_norm": 1.2387170423762486, + "learning_rate": 5.562747623000379e-06, + "loss": 0.1758, + "step": 12919 + }, + { + "epoch": 0.66, + "grad_norm": 1.1359411489145923, + "learning_rate": 5.561271731035045e-06, + "loss": 0.2008, + "step": 12920 + }, + { + "epoch": 0.66, + "grad_norm": 0.9097943534585743, + "learning_rate": 5.559795959470467e-06, + "loss": 0.1707, + "step": 12921 + }, + { + "epoch": 0.66, + "grad_norm": 1.0803842029847395, + "learning_rate": 5.558320308346677e-06, + "loss": 0.1905, + "step": 12922 + }, + { + "epoch": 0.66, + "grad_norm": 1.2140417100881817, + "learning_rate": 5.556844777703697e-06, + "loss": 0.1703, + "step": 12923 + }, + { + "epoch": 0.66, + "grad_norm": 1.0870153760009866, + "learning_rate": 5.5553693675815565e-06, + "loss": 0.1776, + "step": 12924 + }, + { + "epoch": 0.66, + "grad_norm": 1.008402963959758, + "learning_rate": 5.55389407802027e-06, + "loss": 0.1832, + "step": 12925 + }, + { + "epoch": 0.66, + "grad_norm": 0.943314946152503, + "learning_rate": 5.55241890905986e-06, + "loss": 0.1818, + "step": 12926 + }, + { + "epoch": 0.66, + "grad_norm": 0.9804005130777163, + "learning_rate": 5.5509438607403355e-06, + "loss": 0.1663, + "step": 12927 + }, + { + "epoch": 0.66, + "grad_norm": 1.6517596718905303, + "learning_rate": 5.549468933101709e-06, + "loss": 0.1787, + "step": 12928 + }, + { + "epoch": 0.66, + "grad_norm": 1.2459658247423453, + "learning_rate": 5.547994126183991e-06, + "loss": 0.1752, + "step": 12929 + }, + { + "epoch": 0.66, + "grad_norm": 0.9106347033122705, + "learning_rate": 5.546519440027186e-06, + "loss": 0.1606, + "step": 12930 + }, + { + "epoch": 0.66, + "grad_norm": 1.641041700811944, + "learning_rate": 5.545044874671289e-06, + "loss": 0.1646, + "step": 12931 + }, + { + "epoch": 0.66, + "grad_norm": 0.9307740229504946, + "learning_rate": 5.543570430156307e-06, + "loss": 0.1647, + "step": 12932 + }, + { + "epoch": 0.66, + "grad_norm": 0.8617543708343491, + "learning_rate": 5.542096106522224e-06, + "loss": 0.1474, + "step": 12933 + }, + { + "epoch": 0.66, + "grad_norm": 0.7869774137012839, + "learning_rate": 5.540621903809038e-06, + "loss": 0.1718, + "step": 12934 + }, + { + "epoch": 0.66, + "grad_norm": 0.8044176966116101, + "learning_rate": 5.539147822056736e-06, + "loss": 0.165, + "step": 12935 + }, + { + "epoch": 0.66, + "grad_norm": 0.885086608903983, + "learning_rate": 5.537673861305297e-06, + "loss": 0.1713, + "step": 12936 + }, + { + "epoch": 0.66, + "grad_norm": 0.9198934470644922, + "learning_rate": 5.536200021594707e-06, + "loss": 0.1562, + "step": 12937 + }, + { + "epoch": 0.66, + "grad_norm": 1.415345934333553, + "learning_rate": 5.534726302964944e-06, + "loss": 0.1745, + "step": 12938 + }, + { + "epoch": 0.66, + "grad_norm": 1.1173915430681507, + "learning_rate": 5.533252705455985e-06, + "loss": 0.1775, + "step": 12939 + }, + { + "epoch": 0.66, + "grad_norm": 1.3559267057784843, + "learning_rate": 5.531779229107797e-06, + "loss": 0.1655, + "step": 12940 + }, + { + "epoch": 0.66, + "grad_norm": 0.8986687901180408, + "learning_rate": 5.530305873960351e-06, + "loss": 0.1875, + "step": 12941 + }, + { + "epoch": 0.66, + "grad_norm": 0.9091720492697619, + "learning_rate": 5.528832640053607e-06, + "loss": 0.1675, + "step": 12942 + }, + { + "epoch": 0.66, + "grad_norm": 1.0879286275960187, + "learning_rate": 5.527359527427536e-06, + "loss": 0.1863, + "step": 12943 + }, + { + "epoch": 0.66, + "grad_norm": 1.1110812546096838, + "learning_rate": 5.525886536122085e-06, + "loss": 0.1794, + "step": 12944 + }, + { + "epoch": 0.66, + "grad_norm": 1.0303150589230612, + "learning_rate": 5.524413666177216e-06, + "loss": 0.1943, + "step": 12945 + }, + { + "epoch": 0.66, + "grad_norm": 0.8707871663246685, + "learning_rate": 5.522940917632878e-06, + "loss": 0.1703, + "step": 12946 + }, + { + "epoch": 0.66, + "grad_norm": 0.8765865925819567, + "learning_rate": 5.521468290529023e-06, + "loss": 0.1785, + "step": 12947 + }, + { + "epoch": 0.66, + "grad_norm": 0.8522416033424861, + "learning_rate": 5.5199957849055905e-06, + "loss": 0.1682, + "step": 12948 + }, + { + "epoch": 0.66, + "grad_norm": 1.2651317087868765, + "learning_rate": 5.51852340080253e-06, + "loss": 0.1779, + "step": 12949 + }, + { + "epoch": 0.66, + "grad_norm": 1.036138580945707, + "learning_rate": 5.517051138259771e-06, + "loss": 0.1651, + "step": 12950 + }, + { + "epoch": 0.66, + "grad_norm": 1.5286022891139401, + "learning_rate": 5.515578997317257e-06, + "loss": 0.1616, + "step": 12951 + }, + { + "epoch": 0.66, + "grad_norm": 0.9511716474931488, + "learning_rate": 5.514106978014917e-06, + "loss": 0.1965, + "step": 12952 + }, + { + "epoch": 0.66, + "grad_norm": 0.8597682366979487, + "learning_rate": 5.512635080392673e-06, + "loss": 0.1596, + "step": 12953 + }, + { + "epoch": 0.66, + "grad_norm": 1.0870180016132283, + "learning_rate": 5.511163304490456e-06, + "loss": 0.1469, + "step": 12954 + }, + { + "epoch": 0.66, + "grad_norm": 1.0480633504288328, + "learning_rate": 5.50969165034819e-06, + "loss": 0.1846, + "step": 12955 + }, + { + "epoch": 0.66, + "grad_norm": 3.5381871452442013, + "learning_rate": 5.508220118005794e-06, + "loss": 0.1666, + "step": 12956 + }, + { + "epoch": 0.66, + "grad_norm": 0.8148043083133922, + "learning_rate": 5.5067487075031764e-06, + "loss": 0.1735, + "step": 12957 + }, + { + "epoch": 0.66, + "grad_norm": 1.0187217086868456, + "learning_rate": 5.505277418880259e-06, + "loss": 0.1874, + "step": 12958 + }, + { + "epoch": 0.66, + "grad_norm": 1.4282204424031621, + "learning_rate": 5.503806252176941e-06, + "loss": 0.169, + "step": 12959 + }, + { + "epoch": 0.66, + "grad_norm": 0.9937963162216736, + "learning_rate": 5.502335207433136e-06, + "loss": 0.1993, + "step": 12960 + }, + { + "epoch": 0.66, + "grad_norm": 0.930548165625113, + "learning_rate": 5.500864284688739e-06, + "loss": 0.1654, + "step": 12961 + }, + { + "epoch": 0.66, + "grad_norm": 0.8048369428436289, + "learning_rate": 5.499393483983657e-06, + "loss": 0.1646, + "step": 12962 + }, + { + "epoch": 0.66, + "grad_norm": 1.2232545060032005, + "learning_rate": 5.497922805357776e-06, + "loss": 0.1819, + "step": 12963 + }, + { + "epoch": 0.66, + "grad_norm": 0.8363692909755749, + "learning_rate": 5.496452248850994e-06, + "loss": 0.153, + "step": 12964 + }, + { + "epoch": 0.66, + "grad_norm": 0.8803241764374603, + "learning_rate": 5.494981814503199e-06, + "loss": 0.1801, + "step": 12965 + }, + { + "epoch": 0.66, + "grad_norm": 1.189162147818141, + "learning_rate": 5.49351150235428e-06, + "loss": 0.181, + "step": 12966 + }, + { + "epoch": 0.66, + "grad_norm": 2.0669972255809883, + "learning_rate": 5.492041312444112e-06, + "loss": 0.1623, + "step": 12967 + }, + { + "epoch": 0.66, + "grad_norm": 1.7783963693138267, + "learning_rate": 5.490571244812582e-06, + "loss": 0.1841, + "step": 12968 + }, + { + "epoch": 0.66, + "grad_norm": 1.0361244849682094, + "learning_rate": 5.489101299499562e-06, + "loss": 0.1925, + "step": 12969 + }, + { + "epoch": 0.66, + "grad_norm": 0.9770182854110955, + "learning_rate": 5.487631476544921e-06, + "loss": 0.15, + "step": 12970 + }, + { + "epoch": 0.66, + "grad_norm": 1.600128640473354, + "learning_rate": 5.48616177598853e-06, + "loss": 0.1938, + "step": 12971 + }, + { + "epoch": 0.66, + "grad_norm": 1.070133379570775, + "learning_rate": 5.484692197870256e-06, + "loss": 0.193, + "step": 12972 + }, + { + "epoch": 0.66, + "grad_norm": 1.1204297316426883, + "learning_rate": 5.483222742229964e-06, + "loss": 0.1566, + "step": 12973 + }, + { + "epoch": 0.66, + "grad_norm": 3.001429122190217, + "learning_rate": 5.4817534091075084e-06, + "loss": 0.1688, + "step": 12974 + }, + { + "epoch": 0.66, + "grad_norm": 1.0830367798031313, + "learning_rate": 5.480284198542749e-06, + "loss": 0.1836, + "step": 12975 + }, + { + "epoch": 0.66, + "grad_norm": 1.0061740479549004, + "learning_rate": 5.4788151105755326e-06, + "loss": 0.1923, + "step": 12976 + }, + { + "epoch": 0.66, + "grad_norm": 1.008732350438919, + "learning_rate": 5.477346145245717e-06, + "loss": 0.1805, + "step": 12977 + }, + { + "epoch": 0.66, + "grad_norm": 1.1567289166163461, + "learning_rate": 5.475877302593135e-06, + "loss": 0.1672, + "step": 12978 + }, + { + "epoch": 0.66, + "grad_norm": 0.9044833916082834, + "learning_rate": 5.4744085826576445e-06, + "loss": 0.1732, + "step": 12979 + }, + { + "epoch": 0.66, + "grad_norm": 0.7896497231676506, + "learning_rate": 5.472939985479071e-06, + "loss": 0.1773, + "step": 12980 + }, + { + "epoch": 0.66, + "grad_norm": 1.1109312907063378, + "learning_rate": 5.471471511097257e-06, + "loss": 0.1797, + "step": 12981 + }, + { + "epoch": 0.66, + "grad_norm": 0.9273909805501587, + "learning_rate": 5.470003159552033e-06, + "loss": 0.158, + "step": 12982 + }, + { + "epoch": 0.66, + "grad_norm": 0.9094058333615452, + "learning_rate": 5.468534930883234e-06, + "loss": 0.1557, + "step": 12983 + }, + { + "epoch": 0.66, + "grad_norm": 1.0902517874089719, + "learning_rate": 5.467066825130676e-06, + "loss": 0.1632, + "step": 12984 + }, + { + "epoch": 0.66, + "grad_norm": 1.0784363867926698, + "learning_rate": 5.465598842334192e-06, + "loss": 0.1722, + "step": 12985 + }, + { + "epoch": 0.66, + "grad_norm": 1.0900263158696295, + "learning_rate": 5.46413098253359e-06, + "loss": 0.168, + "step": 12986 + }, + { + "epoch": 0.66, + "grad_norm": 1.8354048699959953, + "learning_rate": 5.462663245768696e-06, + "loss": 0.1683, + "step": 12987 + }, + { + "epoch": 0.66, + "grad_norm": 0.8168123012717525, + "learning_rate": 5.461195632079317e-06, + "loss": 0.1612, + "step": 12988 + }, + { + "epoch": 0.66, + "grad_norm": 1.2364602564754394, + "learning_rate": 5.459728141505259e-06, + "loss": 0.1733, + "step": 12989 + }, + { + "epoch": 0.66, + "grad_norm": 1.041956726976596, + "learning_rate": 5.458260774086332e-06, + "loss": 0.1809, + "step": 12990 + }, + { + "epoch": 0.66, + "grad_norm": 0.8923402773137696, + "learning_rate": 5.4567935298623385e-06, + "loss": 0.188, + "step": 12991 + }, + { + "epoch": 0.66, + "grad_norm": 0.9256412258996756, + "learning_rate": 5.45532640887308e-06, + "loss": 0.203, + "step": 12992 + }, + { + "epoch": 0.66, + "grad_norm": 0.9424634298516746, + "learning_rate": 5.453859411158347e-06, + "loss": 0.1721, + "step": 12993 + }, + { + "epoch": 0.66, + "grad_norm": 3.748854710324556, + "learning_rate": 5.452392536757936e-06, + "loss": 0.1591, + "step": 12994 + }, + { + "epoch": 0.66, + "grad_norm": 1.107341764985314, + "learning_rate": 5.450925785711632e-06, + "loss": 0.1865, + "step": 12995 + }, + { + "epoch": 0.66, + "grad_norm": 1.000456094721113, + "learning_rate": 5.449459158059226e-06, + "loss": 0.1719, + "step": 12996 + }, + { + "epoch": 0.66, + "grad_norm": 0.8533555961879594, + "learning_rate": 5.447992653840494e-06, + "loss": 0.1715, + "step": 12997 + }, + { + "epoch": 0.66, + "grad_norm": 1.0126316604351642, + "learning_rate": 5.4465262730952186e-06, + "loss": 0.1931, + "step": 12998 + }, + { + "epoch": 0.66, + "grad_norm": 0.8904401314425945, + "learning_rate": 5.445060015863175e-06, + "loss": 0.1852, + "step": 12999 + }, + { + "epoch": 0.66, + "grad_norm": 2.886172356588746, + "learning_rate": 5.443593882184139e-06, + "loss": 0.1705, + "step": 13000 + }, + { + "epoch": 0.66, + "grad_norm": 0.9153670957628168, + "learning_rate": 5.442127872097873e-06, + "loss": 0.1782, + "step": 13001 + }, + { + "epoch": 0.66, + "grad_norm": 1.2129372270740568, + "learning_rate": 5.440661985644149e-06, + "loss": 0.165, + "step": 13002 + }, + { + "epoch": 0.66, + "grad_norm": 0.8697255131527631, + "learning_rate": 5.439196222862724e-06, + "loss": 0.1648, + "step": 13003 + }, + { + "epoch": 0.66, + "grad_norm": 1.1582811645269684, + "learning_rate": 5.437730583793362e-06, + "loss": 0.1658, + "step": 13004 + }, + { + "epoch": 0.66, + "grad_norm": 1.3861422946046051, + "learning_rate": 5.436265068475815e-06, + "loss": 0.1846, + "step": 13005 + }, + { + "epoch": 0.66, + "grad_norm": 0.7321198610302062, + "learning_rate": 5.4347996769498315e-06, + "loss": 0.1696, + "step": 13006 + }, + { + "epoch": 0.66, + "grad_norm": 0.9401040464943988, + "learning_rate": 5.433334409255165e-06, + "loss": 0.1754, + "step": 13007 + }, + { + "epoch": 0.66, + "grad_norm": 2.078890431543232, + "learning_rate": 5.431869265431562e-06, + "loss": 0.1951, + "step": 13008 + }, + { + "epoch": 0.66, + "grad_norm": 1.0439385899204332, + "learning_rate": 5.430404245518766e-06, + "loss": 0.1818, + "step": 13009 + }, + { + "epoch": 0.66, + "grad_norm": 0.9609589053298132, + "learning_rate": 5.4289393495565076e-06, + "loss": 0.1804, + "step": 13010 + }, + { + "epoch": 0.66, + "grad_norm": 1.1166850002485198, + "learning_rate": 5.427474577584534e-06, + "loss": 0.1579, + "step": 13011 + }, + { + "epoch": 0.66, + "grad_norm": 0.9587875712844113, + "learning_rate": 5.426009929642566e-06, + "loss": 0.1869, + "step": 13012 + }, + { + "epoch": 0.66, + "grad_norm": 1.0499277335217805, + "learning_rate": 5.424545405770341e-06, + "loss": 0.1932, + "step": 13013 + }, + { + "epoch": 0.66, + "grad_norm": 0.9020401891335598, + "learning_rate": 5.423081006007576e-06, + "loss": 0.1855, + "step": 13014 + }, + { + "epoch": 0.66, + "grad_norm": 1.0372741435503523, + "learning_rate": 5.421616730394e-06, + "loss": 0.1919, + "step": 13015 + }, + { + "epoch": 0.66, + "grad_norm": 1.9728797046954707, + "learning_rate": 5.420152578969327e-06, + "loss": 0.1659, + "step": 13016 + }, + { + "epoch": 0.66, + "grad_norm": 1.0922675199537346, + "learning_rate": 5.4186885517732724e-06, + "loss": 0.1897, + "step": 13017 + }, + { + "epoch": 0.66, + "grad_norm": 0.9800919632695411, + "learning_rate": 5.417224648845551e-06, + "loss": 0.2087, + "step": 13018 + }, + { + "epoch": 0.66, + "grad_norm": 0.946962467462363, + "learning_rate": 5.415760870225873e-06, + "loss": 0.1881, + "step": 13019 + }, + { + "epoch": 0.66, + "grad_norm": 2.648603475752819, + "learning_rate": 5.414297215953937e-06, + "loss": 0.1628, + "step": 13020 + }, + { + "epoch": 0.66, + "grad_norm": 1.042906718375941, + "learning_rate": 5.41283368606945e-06, + "loss": 0.1598, + "step": 13021 + }, + { + "epoch": 0.66, + "grad_norm": 1.1437852574832168, + "learning_rate": 5.411370280612109e-06, + "loss": 0.1707, + "step": 13022 + }, + { + "epoch": 0.66, + "grad_norm": 1.109094609556804, + "learning_rate": 5.4099069996216055e-06, + "loss": 0.1531, + "step": 13023 + }, + { + "epoch": 0.66, + "grad_norm": 1.1181170488361203, + "learning_rate": 5.408443843137634e-06, + "loss": 0.1735, + "step": 13024 + }, + { + "epoch": 0.66, + "grad_norm": 1.0072555196252104, + "learning_rate": 5.406980811199881e-06, + "loss": 0.1584, + "step": 13025 + }, + { + "epoch": 0.66, + "grad_norm": 1.9447327044724418, + "learning_rate": 5.405517903848039e-06, + "loss": 0.1512, + "step": 13026 + }, + { + "epoch": 0.66, + "grad_norm": 1.0406129068479135, + "learning_rate": 5.404055121121778e-06, + "loss": 0.1772, + "step": 13027 + }, + { + "epoch": 0.66, + "grad_norm": 0.9813397660614563, + "learning_rate": 5.402592463060785e-06, + "loss": 0.1917, + "step": 13028 + }, + { + "epoch": 0.66, + "grad_norm": 0.9043884637346777, + "learning_rate": 5.401129929704727e-06, + "loss": 0.1743, + "step": 13029 + }, + { + "epoch": 0.66, + "grad_norm": 3.7330322694414013, + "learning_rate": 5.399667521093285e-06, + "loss": 0.1953, + "step": 13030 + }, + { + "epoch": 0.66, + "grad_norm": 1.2142528716335321, + "learning_rate": 5.398205237266116e-06, + "loss": 0.1697, + "step": 13031 + }, + { + "epoch": 0.66, + "grad_norm": 0.9287096481877403, + "learning_rate": 5.396743078262895e-06, + "loss": 0.1736, + "step": 13032 + }, + { + "epoch": 0.66, + "grad_norm": 1.2355905776550489, + "learning_rate": 5.395281044123273e-06, + "loss": 0.1645, + "step": 13033 + }, + { + "epoch": 0.66, + "grad_norm": 0.8630831827981413, + "learning_rate": 5.393819134886913e-06, + "loss": 0.1682, + "step": 13034 + }, + { + "epoch": 0.66, + "grad_norm": 2.0966432650161853, + "learning_rate": 5.392357350593469e-06, + "loss": 0.1745, + "step": 13035 + }, + { + "epoch": 0.66, + "grad_norm": 0.8858228134039371, + "learning_rate": 5.390895691282596e-06, + "loss": 0.1785, + "step": 13036 + }, + { + "epoch": 0.66, + "grad_norm": 0.9077285942791732, + "learning_rate": 5.389434156993935e-06, + "loss": 0.169, + "step": 13037 + }, + { + "epoch": 0.66, + "grad_norm": 0.9508245618541968, + "learning_rate": 5.387972747767136e-06, + "loss": 0.1893, + "step": 13038 + }, + { + "epoch": 0.66, + "grad_norm": 1.127200876715937, + "learning_rate": 5.386511463641836e-06, + "loss": 0.1729, + "step": 13039 + }, + { + "epoch": 0.66, + "grad_norm": 1.5310594407487175, + "learning_rate": 5.38505030465767e-06, + "loss": 0.1898, + "step": 13040 + }, + { + "epoch": 0.66, + "grad_norm": 0.938278714469896, + "learning_rate": 5.383589270854279e-06, + "loss": 0.1894, + "step": 13041 + }, + { + "epoch": 0.66, + "grad_norm": 1.0370258145919886, + "learning_rate": 5.382128362271285e-06, + "loss": 0.1794, + "step": 13042 + }, + { + "epoch": 0.66, + "grad_norm": 2.064224882974358, + "learning_rate": 5.380667578948321e-06, + "loss": 0.1656, + "step": 13043 + }, + { + "epoch": 0.66, + "grad_norm": 0.9352410513608885, + "learning_rate": 5.379206920925009e-06, + "loss": 0.1976, + "step": 13044 + }, + { + "epoch": 0.66, + "grad_norm": 0.7886476725295988, + "learning_rate": 5.3777463882409744e-06, + "loss": 0.1864, + "step": 13045 + }, + { + "epoch": 0.66, + "grad_norm": 1.079848051901478, + "learning_rate": 5.376285980935827e-06, + "loss": 0.1585, + "step": 13046 + }, + { + "epoch": 0.66, + "grad_norm": 0.9483693289000759, + "learning_rate": 5.374825699049186e-06, + "loss": 0.1597, + "step": 13047 + }, + { + "epoch": 0.66, + "grad_norm": 1.109683010946804, + "learning_rate": 5.3733655426206564e-06, + "loss": 0.1881, + "step": 13048 + }, + { + "epoch": 0.66, + "grad_norm": 0.9538865416720401, + "learning_rate": 5.371905511689852e-06, + "loss": 0.1738, + "step": 13049 + }, + { + "epoch": 0.66, + "grad_norm": 0.8736303533856425, + "learning_rate": 5.3704456062963674e-06, + "loss": 0.1677, + "step": 13050 + }, + { + "epoch": 0.66, + "grad_norm": 0.9141910495008178, + "learning_rate": 5.368985826479807e-06, + "loss": 0.1821, + "step": 13051 + }, + { + "epoch": 0.66, + "grad_norm": 0.9491642343236588, + "learning_rate": 5.367526172279768e-06, + "loss": 0.1619, + "step": 13052 + }, + { + "epoch": 0.66, + "grad_norm": 1.1240984076090763, + "learning_rate": 5.366066643735847e-06, + "loss": 0.1568, + "step": 13053 + }, + { + "epoch": 0.66, + "grad_norm": 1.389968272416857, + "learning_rate": 5.364607240887626e-06, + "loss": 0.1866, + "step": 13054 + }, + { + "epoch": 0.66, + "grad_norm": 3.3602336034440015, + "learning_rate": 5.3631479637747e-06, + "loss": 0.1631, + "step": 13055 + }, + { + "epoch": 0.66, + "grad_norm": 2.0299834130513226, + "learning_rate": 5.361688812436642e-06, + "loss": 0.1609, + "step": 13056 + }, + { + "epoch": 0.66, + "grad_norm": 1.2674930173145462, + "learning_rate": 5.360229786913042e-06, + "loss": 0.1657, + "step": 13057 + }, + { + "epoch": 0.66, + "grad_norm": 1.13060064340207, + "learning_rate": 5.3587708872434705e-06, + "loss": 0.1746, + "step": 13058 + }, + { + "epoch": 0.66, + "grad_norm": 0.8379752729010114, + "learning_rate": 5.357312113467497e-06, + "loss": 0.1534, + "step": 13059 + }, + { + "epoch": 0.66, + "grad_norm": 0.9557520113683214, + "learning_rate": 5.355853465624695e-06, + "loss": 0.1711, + "step": 13060 + }, + { + "epoch": 0.66, + "grad_norm": 0.9239994296030585, + "learning_rate": 5.354394943754631e-06, + "loss": 0.1737, + "step": 13061 + }, + { + "epoch": 0.66, + "grad_norm": 1.321773416829171, + "learning_rate": 5.352936547896868e-06, + "loss": 0.1484, + "step": 13062 + }, + { + "epoch": 0.66, + "grad_norm": 0.9572197883433509, + "learning_rate": 5.351478278090962e-06, + "loss": 0.1839, + "step": 13063 + }, + { + "epoch": 0.66, + "grad_norm": 1.5778148636154194, + "learning_rate": 5.350020134376472e-06, + "loss": 0.1724, + "step": 13064 + }, + { + "epoch": 0.66, + "grad_norm": 0.9079286587968833, + "learning_rate": 5.348562116792946e-06, + "loss": 0.1738, + "step": 13065 + }, + { + "epoch": 0.66, + "grad_norm": 0.9419826033561322, + "learning_rate": 5.34710422537994e-06, + "loss": 0.1763, + "step": 13066 + }, + { + "epoch": 0.66, + "grad_norm": 1.116714619089233, + "learning_rate": 5.345646460176989e-06, + "loss": 0.1757, + "step": 13067 + }, + { + "epoch": 0.66, + "grad_norm": 1.1682603017157274, + "learning_rate": 5.344188821223642e-06, + "loss": 0.1545, + "step": 13068 + }, + { + "epoch": 0.66, + "grad_norm": 0.9448118760115939, + "learning_rate": 5.342731308559435e-06, + "loss": 0.163, + "step": 13069 + }, + { + "epoch": 0.66, + "grad_norm": 1.2298396046807265, + "learning_rate": 5.341273922223908e-06, + "loss": 0.1825, + "step": 13070 + }, + { + "epoch": 0.66, + "grad_norm": 1.1640638526000524, + "learning_rate": 5.339816662256587e-06, + "loss": 0.1556, + "step": 13071 + }, + { + "epoch": 0.66, + "grad_norm": 1.1082203563147006, + "learning_rate": 5.338359528697005e-06, + "loss": 0.1632, + "step": 13072 + }, + { + "epoch": 0.66, + "grad_norm": 1.2232992504838813, + "learning_rate": 5.3369025215846796e-06, + "loss": 0.1789, + "step": 13073 + }, + { + "epoch": 0.66, + "grad_norm": 0.9051337295054136, + "learning_rate": 5.3354456409591405e-06, + "loss": 0.1649, + "step": 13074 + }, + { + "epoch": 0.66, + "grad_norm": 0.9972396031450846, + "learning_rate": 5.333988886859903e-06, + "loss": 0.1574, + "step": 13075 + }, + { + "epoch": 0.66, + "grad_norm": 0.7576046547142246, + "learning_rate": 5.332532259326476e-06, + "loss": 0.1912, + "step": 13076 + }, + { + "epoch": 0.66, + "grad_norm": 0.9144689190373975, + "learning_rate": 5.331075758398375e-06, + "loss": 0.1796, + "step": 13077 + }, + { + "epoch": 0.67, + "grad_norm": 1.7592614649841753, + "learning_rate": 5.329619384115108e-06, + "loss": 0.1616, + "step": 13078 + }, + { + "epoch": 0.67, + "grad_norm": 1.0801539664317343, + "learning_rate": 5.328163136516184e-06, + "loss": 0.1411, + "step": 13079 + }, + { + "epoch": 0.67, + "grad_norm": 1.0975411316744548, + "learning_rate": 5.326707015641093e-06, + "loss": 0.1754, + "step": 13080 + }, + { + "epoch": 0.67, + "grad_norm": 2.636575145510722, + "learning_rate": 5.325251021529343e-06, + "loss": 0.1778, + "step": 13081 + }, + { + "epoch": 0.67, + "grad_norm": 1.4382356623470685, + "learning_rate": 5.323795154220419e-06, + "loss": 0.1736, + "step": 13082 + }, + { + "epoch": 0.67, + "grad_norm": 1.6580238322303211, + "learning_rate": 5.322339413753819e-06, + "loss": 0.1732, + "step": 13083 + }, + { + "epoch": 0.67, + "grad_norm": 1.0080504503860495, + "learning_rate": 5.3208838001690236e-06, + "loss": 0.1907, + "step": 13084 + }, + { + "epoch": 0.67, + "grad_norm": 0.7317639621699024, + "learning_rate": 5.319428313505523e-06, + "loss": 0.1658, + "step": 13085 + }, + { + "epoch": 0.67, + "grad_norm": 1.4234701788945343, + "learning_rate": 5.317972953802789e-06, + "loss": 0.205, + "step": 13086 + }, + { + "epoch": 0.67, + "grad_norm": 0.8881240899721385, + "learning_rate": 5.316517721100304e-06, + "loss": 0.1756, + "step": 13087 + }, + { + "epoch": 0.67, + "grad_norm": 1.0114148632119997, + "learning_rate": 5.31506261543754e-06, + "loss": 0.1618, + "step": 13088 + }, + { + "epoch": 0.67, + "grad_norm": 1.2493343913754495, + "learning_rate": 5.3136076368539706e-06, + "loss": 0.1857, + "step": 13089 + }, + { + "epoch": 0.67, + "grad_norm": 1.011576010409811, + "learning_rate": 5.312152785389056e-06, + "loss": 0.1718, + "step": 13090 + }, + { + "epoch": 0.67, + "grad_norm": 1.1400992672593633, + "learning_rate": 5.310698061082264e-06, + "loss": 0.187, + "step": 13091 + }, + { + "epoch": 0.67, + "grad_norm": 0.8391929239457808, + "learning_rate": 5.309243463973054e-06, + "loss": 0.1662, + "step": 13092 + }, + { + "epoch": 0.67, + "grad_norm": 1.0521217755250742, + "learning_rate": 5.307788994100876e-06, + "loss": 0.189, + "step": 13093 + }, + { + "epoch": 0.67, + "grad_norm": 0.8679850090640902, + "learning_rate": 5.306334651505185e-06, + "loss": 0.1759, + "step": 13094 + }, + { + "epoch": 0.67, + "grad_norm": 1.5800351297499289, + "learning_rate": 5.304880436225432e-06, + "loss": 0.2178, + "step": 13095 + }, + { + "epoch": 0.67, + "grad_norm": 1.1295677367762624, + "learning_rate": 5.303426348301066e-06, + "loss": 0.157, + "step": 13096 + }, + { + "epoch": 0.67, + "grad_norm": 0.9239596443120076, + "learning_rate": 5.3019723877715235e-06, + "loss": 0.1811, + "step": 13097 + }, + { + "epoch": 0.67, + "grad_norm": 0.9849840091548583, + "learning_rate": 5.300518554676247e-06, + "loss": 0.1619, + "step": 13098 + }, + { + "epoch": 0.67, + "grad_norm": 0.9109699828134366, + "learning_rate": 5.299064849054667e-06, + "loss": 0.1476, + "step": 13099 + }, + { + "epoch": 0.67, + "grad_norm": 1.3790099251749453, + "learning_rate": 5.297611270946223e-06, + "loss": 0.1761, + "step": 13100 + }, + { + "epoch": 0.67, + "grad_norm": 1.226670570464689, + "learning_rate": 5.296157820390335e-06, + "loss": 0.1539, + "step": 13101 + }, + { + "epoch": 0.67, + "grad_norm": 0.7468050795649048, + "learning_rate": 5.294704497426435e-06, + "loss": 0.1555, + "step": 13102 + }, + { + "epoch": 0.67, + "grad_norm": 0.9152207253205684, + "learning_rate": 5.293251302093938e-06, + "loss": 0.1565, + "step": 13103 + }, + { + "epoch": 0.67, + "grad_norm": 1.1090519116839588, + "learning_rate": 5.291798234432264e-06, + "loss": 0.1788, + "step": 13104 + }, + { + "epoch": 0.67, + "grad_norm": 1.0360390926658547, + "learning_rate": 5.2903452944808294e-06, + "loss": 0.2028, + "step": 13105 + }, + { + "epoch": 0.67, + "grad_norm": 0.7909596590664957, + "learning_rate": 5.28889248227905e-06, + "loss": 0.1699, + "step": 13106 + }, + { + "epoch": 0.67, + "grad_norm": 1.618341463656754, + "learning_rate": 5.287439797866323e-06, + "loss": 0.1906, + "step": 13107 + }, + { + "epoch": 0.67, + "grad_norm": 0.9067444923206541, + "learning_rate": 5.2859872412820625e-06, + "loss": 0.1802, + "step": 13108 + }, + { + "epoch": 0.67, + "grad_norm": 1.4808531136665637, + "learning_rate": 5.284534812565663e-06, + "loss": 0.1836, + "step": 13109 + }, + { + "epoch": 0.67, + "grad_norm": 0.8096508917230821, + "learning_rate": 5.283082511756519e-06, + "loss": 0.173, + "step": 13110 + }, + { + "epoch": 0.67, + "grad_norm": 0.9257012351305746, + "learning_rate": 5.281630338894032e-06, + "loss": 0.1756, + "step": 13111 + }, + { + "epoch": 0.67, + "grad_norm": 1.064130585477865, + "learning_rate": 5.280178294017586e-06, + "loss": 0.1563, + "step": 13112 + }, + { + "epoch": 0.67, + "grad_norm": 1.3097633680081981, + "learning_rate": 5.27872637716657e-06, + "loss": 0.1749, + "step": 13113 + }, + { + "epoch": 0.67, + "grad_norm": 1.4768187093469949, + "learning_rate": 5.277274588380368e-06, + "loss": 0.1736, + "step": 13114 + }, + { + "epoch": 0.67, + "grad_norm": 0.9706272896638604, + "learning_rate": 5.275822927698362e-06, + "loss": 0.2017, + "step": 13115 + }, + { + "epoch": 0.67, + "grad_norm": 1.6035603029810908, + "learning_rate": 5.274371395159923e-06, + "loss": 0.1456, + "step": 13116 + }, + { + "epoch": 0.67, + "grad_norm": 1.7868879807142999, + "learning_rate": 5.27291999080443e-06, + "loss": 0.172, + "step": 13117 + }, + { + "epoch": 0.67, + "grad_norm": 1.16407422123123, + "learning_rate": 5.271468714671247e-06, + "loss": 0.1899, + "step": 13118 + }, + { + "epoch": 0.67, + "grad_norm": 1.121016751150056, + "learning_rate": 5.2700175667997456e-06, + "loss": 0.1558, + "step": 13119 + }, + { + "epoch": 0.67, + "grad_norm": 1.0911689136172678, + "learning_rate": 5.2685665472292805e-06, + "loss": 0.1773, + "step": 13120 + }, + { + "epoch": 0.67, + "grad_norm": 0.7623012469449829, + "learning_rate": 5.267115655999214e-06, + "loss": 0.1694, + "step": 13121 + }, + { + "epoch": 0.67, + "grad_norm": 1.2615801018605743, + "learning_rate": 5.265664893148904e-06, + "loss": 0.1792, + "step": 13122 + }, + { + "epoch": 0.67, + "grad_norm": 1.2063061781367201, + "learning_rate": 5.264214258717705e-06, + "loss": 0.1627, + "step": 13123 + }, + { + "epoch": 0.67, + "grad_norm": 1.36952437005172, + "learning_rate": 5.2627637527449575e-06, + "loss": 0.1762, + "step": 13124 + }, + { + "epoch": 0.67, + "grad_norm": 0.9686427549608618, + "learning_rate": 5.2613133752700145e-06, + "loss": 0.1918, + "step": 13125 + }, + { + "epoch": 0.67, + "grad_norm": 1.341735433036232, + "learning_rate": 5.2598631263322145e-06, + "loss": 0.1809, + "step": 13126 + }, + { + "epoch": 0.67, + "grad_norm": 1.3729701286309794, + "learning_rate": 5.25841300597089e-06, + "loss": 0.1642, + "step": 13127 + }, + { + "epoch": 0.67, + "grad_norm": 0.9975682349986622, + "learning_rate": 5.256963014225385e-06, + "loss": 0.1864, + "step": 13128 + }, + { + "epoch": 0.67, + "grad_norm": 1.4672529989764542, + "learning_rate": 5.255513151135022e-06, + "loss": 0.2066, + "step": 13129 + }, + { + "epoch": 0.67, + "grad_norm": 0.7928599134134791, + "learning_rate": 5.2540634167391325e-06, + "loss": 0.195, + "step": 13130 + }, + { + "epoch": 0.67, + "grad_norm": 1.0659403602991226, + "learning_rate": 5.252613811077042e-06, + "loss": 0.1718, + "step": 13131 + }, + { + "epoch": 0.67, + "grad_norm": 1.20100561882506, + "learning_rate": 5.251164334188073e-06, + "loss": 0.1737, + "step": 13132 + }, + { + "epoch": 0.67, + "grad_norm": 1.405450961930424, + "learning_rate": 5.249714986111536e-06, + "loss": 0.1619, + "step": 13133 + }, + { + "epoch": 0.67, + "grad_norm": 1.3184257732901865, + "learning_rate": 5.248265766886752e-06, + "loss": 0.1524, + "step": 13134 + }, + { + "epoch": 0.67, + "grad_norm": 1.032123083377175, + "learning_rate": 5.246816676553024e-06, + "loss": 0.1774, + "step": 13135 + }, + { + "epoch": 0.67, + "grad_norm": 2.3472235154009833, + "learning_rate": 5.245367715149665e-06, + "loss": 0.1861, + "step": 13136 + }, + { + "epoch": 0.67, + "grad_norm": 4.587312343510043, + "learning_rate": 5.243918882715973e-06, + "loss": 0.1769, + "step": 13137 + }, + { + "epoch": 0.67, + "grad_norm": 1.1996450448267364, + "learning_rate": 5.242470179291253e-06, + "loss": 0.1543, + "step": 13138 + }, + { + "epoch": 0.67, + "grad_norm": 0.9134390388709754, + "learning_rate": 5.241021604914793e-06, + "loss": 0.178, + "step": 13139 + }, + { + "epoch": 0.67, + "grad_norm": 0.897598360509561, + "learning_rate": 5.2395731596258925e-06, + "loss": 0.1726, + "step": 13140 + }, + { + "epoch": 0.67, + "grad_norm": 1.2008256831395474, + "learning_rate": 5.238124843463839e-06, + "loss": 0.1792, + "step": 13141 + }, + { + "epoch": 0.67, + "grad_norm": 1.025255671697507, + "learning_rate": 5.236676656467921e-06, + "loss": 0.1699, + "step": 13142 + }, + { + "epoch": 0.67, + "grad_norm": 1.1622225323396218, + "learning_rate": 5.235228598677413e-06, + "loss": 0.1631, + "step": 13143 + }, + { + "epoch": 0.67, + "grad_norm": 0.9028964025869091, + "learning_rate": 5.233780670131603e-06, + "loss": 0.1655, + "step": 13144 + }, + { + "epoch": 0.67, + "grad_norm": 0.947513699797315, + "learning_rate": 5.232332870869763e-06, + "loss": 0.1798, + "step": 13145 + }, + { + "epoch": 0.67, + "grad_norm": 0.9029104736554413, + "learning_rate": 5.2308852009311576e-06, + "loss": 0.1928, + "step": 13146 + }, + { + "epoch": 0.67, + "grad_norm": 1.686357294204009, + "learning_rate": 5.229437660355061e-06, + "loss": 0.1809, + "step": 13147 + }, + { + "epoch": 0.67, + "grad_norm": 0.9317309818471727, + "learning_rate": 5.227990249180737e-06, + "loss": 0.1723, + "step": 13148 + }, + { + "epoch": 0.67, + "grad_norm": 1.0209870204447664, + "learning_rate": 5.226542967447452e-06, + "loss": 0.1816, + "step": 13149 + }, + { + "epoch": 0.67, + "grad_norm": 1.167574722120494, + "learning_rate": 5.225095815194453e-06, + "loss": 0.1733, + "step": 13150 + }, + { + "epoch": 0.67, + "grad_norm": 0.9041556488319442, + "learning_rate": 5.223648792461005e-06, + "loss": 0.1856, + "step": 13151 + }, + { + "epoch": 0.67, + "grad_norm": 1.0624582657746249, + "learning_rate": 5.222201899286349e-06, + "loss": 0.1697, + "step": 13152 + }, + { + "epoch": 0.67, + "grad_norm": 1.0023796482199938, + "learning_rate": 5.2207551357097395e-06, + "loss": 0.1599, + "step": 13153 + }, + { + "epoch": 0.67, + "grad_norm": 0.8018090044150327, + "learning_rate": 5.219308501770415e-06, + "loss": 0.1589, + "step": 13154 + }, + { + "epoch": 0.67, + "grad_norm": 1.6515551581906176, + "learning_rate": 5.217861997507618e-06, + "loss": 0.192, + "step": 13155 + }, + { + "epoch": 0.67, + "grad_norm": 0.8821744422221828, + "learning_rate": 5.2164156229605835e-06, + "loss": 0.1587, + "step": 13156 + }, + { + "epoch": 0.67, + "grad_norm": 26.084781713598552, + "learning_rate": 5.214969378168544e-06, + "loss": 0.1689, + "step": 13157 + }, + { + "epoch": 0.67, + "grad_norm": 1.2620184943781705, + "learning_rate": 5.213523263170731e-06, + "loss": 0.1788, + "step": 13158 + }, + { + "epoch": 0.67, + "grad_norm": 0.9441269351658541, + "learning_rate": 5.2120772780063735e-06, + "loss": 0.1803, + "step": 13159 + }, + { + "epoch": 0.67, + "grad_norm": 0.8263735589892514, + "learning_rate": 5.210631422714686e-06, + "loss": 0.177, + "step": 13160 + }, + { + "epoch": 0.67, + "grad_norm": 0.8343292258524222, + "learning_rate": 5.209185697334895e-06, + "loss": 0.1935, + "step": 13161 + }, + { + "epoch": 0.67, + "grad_norm": 0.7870435131244253, + "learning_rate": 5.207740101906215e-06, + "loss": 0.1649, + "step": 13162 + }, + { + "epoch": 0.67, + "grad_norm": 1.8376019067077287, + "learning_rate": 5.20629463646785e-06, + "loss": 0.1656, + "step": 13163 + }, + { + "epoch": 0.67, + "grad_norm": 1.1280017623926835, + "learning_rate": 5.2048493010590125e-06, + "loss": 0.1756, + "step": 13164 + }, + { + "epoch": 0.67, + "grad_norm": 1.1939937173125184, + "learning_rate": 5.20340409571891e-06, + "loss": 0.1779, + "step": 13165 + }, + { + "epoch": 0.67, + "grad_norm": 1.0052027725381725, + "learning_rate": 5.201959020486746e-06, + "loss": 0.1924, + "step": 13166 + }, + { + "epoch": 0.67, + "grad_norm": 1.968058426905328, + "learning_rate": 5.20051407540171e-06, + "loss": 0.1891, + "step": 13167 + }, + { + "epoch": 0.67, + "grad_norm": 0.9770179576061162, + "learning_rate": 5.199069260503006e-06, + "loss": 0.1449, + "step": 13168 + }, + { + "epoch": 0.67, + "grad_norm": 1.2173618399010426, + "learning_rate": 5.197624575829815e-06, + "loss": 0.2029, + "step": 13169 + }, + { + "epoch": 0.67, + "grad_norm": 3.1799476798417223, + "learning_rate": 5.196180021421332e-06, + "loss": 0.1988, + "step": 13170 + }, + { + "epoch": 0.67, + "grad_norm": 1.5495054811517865, + "learning_rate": 5.194735597316733e-06, + "loss": 0.1916, + "step": 13171 + }, + { + "epoch": 0.67, + "grad_norm": 0.9055829113764653, + "learning_rate": 5.193291303555208e-06, + "loss": 0.1645, + "step": 13172 + }, + { + "epoch": 0.67, + "grad_norm": 3.360427470770462, + "learning_rate": 5.191847140175923e-06, + "loss": 0.1538, + "step": 13173 + }, + { + "epoch": 0.67, + "grad_norm": 1.2417065582611897, + "learning_rate": 5.190403107218056e-06, + "loss": 0.1882, + "step": 13174 + }, + { + "epoch": 0.67, + "grad_norm": 1.0556889798216127, + "learning_rate": 5.188959204720776e-06, + "loss": 0.1933, + "step": 13175 + }, + { + "epoch": 0.67, + "grad_norm": 2.736411415481473, + "learning_rate": 5.1875154327232534e-06, + "loss": 0.1805, + "step": 13176 + }, + { + "epoch": 0.67, + "grad_norm": 0.9560392244403962, + "learning_rate": 5.186071791264642e-06, + "loss": 0.1685, + "step": 13177 + }, + { + "epoch": 0.67, + "grad_norm": 0.854930259475486, + "learning_rate": 5.1846282803841095e-06, + "loss": 0.1833, + "step": 13178 + }, + { + "epoch": 0.67, + "grad_norm": 0.8505934212998812, + "learning_rate": 5.183184900120807e-06, + "loss": 0.1485, + "step": 13179 + }, + { + "epoch": 0.67, + "grad_norm": 1.1737667148898814, + "learning_rate": 5.181741650513883e-06, + "loss": 0.1629, + "step": 13180 + }, + { + "epoch": 0.67, + "grad_norm": 1.2484661380408568, + "learning_rate": 5.180298531602491e-06, + "loss": 0.1718, + "step": 13181 + }, + { + "epoch": 0.67, + "grad_norm": 1.2998613299876511, + "learning_rate": 5.178855543425771e-06, + "loss": 0.1617, + "step": 13182 + }, + { + "epoch": 0.67, + "grad_norm": 0.9322637885141669, + "learning_rate": 5.177412686022866e-06, + "loss": 0.185, + "step": 13183 + }, + { + "epoch": 0.67, + "grad_norm": 1.215022711148766, + "learning_rate": 5.1759699594329135e-06, + "loss": 0.1681, + "step": 13184 + }, + { + "epoch": 0.67, + "grad_norm": 1.4889001656465048, + "learning_rate": 5.1745273636950545e-06, + "loss": 0.1709, + "step": 13185 + }, + { + "epoch": 0.67, + "grad_norm": 1.0293914136113915, + "learning_rate": 5.173084898848408e-06, + "loss": 0.1693, + "step": 13186 + }, + { + "epoch": 0.67, + "grad_norm": 1.6068083591065103, + "learning_rate": 5.17164256493211e-06, + "loss": 0.1796, + "step": 13187 + }, + { + "epoch": 0.67, + "grad_norm": 1.0840075598306749, + "learning_rate": 5.170200361985277e-06, + "loss": 0.1684, + "step": 13188 + }, + { + "epoch": 0.67, + "grad_norm": 1.0189090314842295, + "learning_rate": 5.168758290047035e-06, + "loss": 0.1501, + "step": 13189 + }, + { + "epoch": 0.67, + "grad_norm": 0.9682687528749686, + "learning_rate": 5.167316349156495e-06, + "loss": 0.1988, + "step": 13190 + }, + { + "epoch": 0.67, + "grad_norm": 1.0825631032236098, + "learning_rate": 5.16587453935277e-06, + "loss": 0.1704, + "step": 13191 + }, + { + "epoch": 0.67, + "grad_norm": 1.2869878171587137, + "learning_rate": 5.164432860674972e-06, + "loss": 0.1723, + "step": 13192 + }, + { + "epoch": 0.67, + "grad_norm": 1.1248074085681445, + "learning_rate": 5.162991313162209e-06, + "loss": 0.1684, + "step": 13193 + }, + { + "epoch": 0.67, + "grad_norm": 1.7456815987690477, + "learning_rate": 5.161549896853577e-06, + "loss": 0.2443, + "step": 13194 + }, + { + "epoch": 0.67, + "grad_norm": 0.8367021529866165, + "learning_rate": 5.16010861178818e-06, + "loss": 0.1567, + "step": 13195 + }, + { + "epoch": 0.67, + "grad_norm": 1.1886731283547356, + "learning_rate": 5.158667458005111e-06, + "loss": 0.1787, + "step": 13196 + }, + { + "epoch": 0.67, + "grad_norm": 0.9412907591588243, + "learning_rate": 5.157226435543456e-06, + "loss": 0.1637, + "step": 13197 + }, + { + "epoch": 0.67, + "grad_norm": 1.008260727572345, + "learning_rate": 5.155785544442313e-06, + "loss": 0.1919, + "step": 13198 + }, + { + "epoch": 0.67, + "grad_norm": 1.4114423893472183, + "learning_rate": 5.154344784740757e-06, + "loss": 0.1833, + "step": 13199 + }, + { + "epoch": 0.67, + "grad_norm": 0.8491369164972039, + "learning_rate": 5.15290415647787e-06, + "loss": 0.1701, + "step": 13200 + }, + { + "epoch": 0.67, + "grad_norm": 0.9168263593528301, + "learning_rate": 5.1514636596927325e-06, + "loss": 0.1723, + "step": 13201 + }, + { + "epoch": 0.67, + "grad_norm": 0.7817434712222572, + "learning_rate": 5.150023294424422e-06, + "loss": 0.175, + "step": 13202 + }, + { + "epoch": 0.67, + "grad_norm": 0.8371772526988961, + "learning_rate": 5.148583060711999e-06, + "loss": 0.1654, + "step": 13203 + }, + { + "epoch": 0.67, + "grad_norm": 0.8143341031199058, + "learning_rate": 5.147142958594538e-06, + "loss": 0.1649, + "step": 13204 + }, + { + "epoch": 0.67, + "grad_norm": 1.6114298088143855, + "learning_rate": 5.145702988111095e-06, + "loss": 0.1901, + "step": 13205 + }, + { + "epoch": 0.67, + "grad_norm": 1.0781789312399832, + "learning_rate": 5.144263149300737e-06, + "loss": 0.1656, + "step": 13206 + }, + { + "epoch": 0.67, + "grad_norm": 0.7097433857989437, + "learning_rate": 5.142823442202511e-06, + "loss": 0.1549, + "step": 13207 + }, + { + "epoch": 0.67, + "grad_norm": 1.7835889847646613, + "learning_rate": 5.141383866855476e-06, + "loss": 0.191, + "step": 13208 + }, + { + "epoch": 0.67, + "grad_norm": 1.6006004708707962, + "learning_rate": 5.139944423298675e-06, + "loss": 0.1878, + "step": 13209 + }, + { + "epoch": 0.67, + "grad_norm": 2.797442171502045, + "learning_rate": 5.138505111571157e-06, + "loss": 0.1586, + "step": 13210 + }, + { + "epoch": 0.67, + "grad_norm": 0.8320194440956922, + "learning_rate": 5.137065931711962e-06, + "loss": 0.1753, + "step": 13211 + }, + { + "epoch": 0.67, + "grad_norm": 0.9070023242298215, + "learning_rate": 5.135626883760132e-06, + "loss": 0.1673, + "step": 13212 + }, + { + "epoch": 0.67, + "grad_norm": 0.9764642294741012, + "learning_rate": 5.134187967754694e-06, + "loss": 0.1603, + "step": 13213 + }, + { + "epoch": 0.67, + "grad_norm": 0.9232328856993673, + "learning_rate": 5.132749183734684e-06, + "loss": 0.1797, + "step": 13214 + }, + { + "epoch": 0.67, + "grad_norm": 1.220088273132286, + "learning_rate": 5.131310531739129e-06, + "loss": 0.1743, + "step": 13215 + }, + { + "epoch": 0.67, + "grad_norm": 1.1545015554406468, + "learning_rate": 5.129872011807046e-06, + "loss": 0.1894, + "step": 13216 + }, + { + "epoch": 0.67, + "grad_norm": 1.115588263462691, + "learning_rate": 5.128433623977461e-06, + "loss": 0.1794, + "step": 13217 + }, + { + "epoch": 0.67, + "grad_norm": 1.0539783738675352, + "learning_rate": 5.126995368289389e-06, + "loss": 0.1848, + "step": 13218 + }, + { + "epoch": 0.67, + "grad_norm": 1.5524460056178087, + "learning_rate": 5.125557244781847e-06, + "loss": 0.1635, + "step": 13219 + }, + { + "epoch": 0.67, + "grad_norm": 1.1596802547778315, + "learning_rate": 5.1241192534938355e-06, + "loss": 0.1671, + "step": 13220 + }, + { + "epoch": 0.67, + "grad_norm": 1.0529195047149937, + "learning_rate": 5.122681394464368e-06, + "loss": 0.1995, + "step": 13221 + }, + { + "epoch": 0.67, + "grad_norm": 1.0346863465392042, + "learning_rate": 5.12124366773244e-06, + "loss": 0.1613, + "step": 13222 + }, + { + "epoch": 0.67, + "grad_norm": 1.0604730200598846, + "learning_rate": 5.119806073337057e-06, + "loss": 0.1567, + "step": 13223 + }, + { + "epoch": 0.67, + "grad_norm": 0.8402646199803202, + "learning_rate": 5.118368611317205e-06, + "loss": 0.1591, + "step": 13224 + }, + { + "epoch": 0.67, + "grad_norm": 0.9687228752658418, + "learning_rate": 5.116931281711886e-06, + "loss": 0.1733, + "step": 13225 + }, + { + "epoch": 0.67, + "grad_norm": 1.3458085182463966, + "learning_rate": 5.115494084560076e-06, + "loss": 0.1732, + "step": 13226 + }, + { + "epoch": 0.67, + "grad_norm": 0.8374535462370374, + "learning_rate": 5.114057019900764e-06, + "loss": 0.1602, + "step": 13227 + }, + { + "epoch": 0.67, + "grad_norm": 0.9107738081284421, + "learning_rate": 5.112620087772933e-06, + "loss": 0.1593, + "step": 13228 + }, + { + "epoch": 0.67, + "grad_norm": 0.8671955054266972, + "learning_rate": 5.111183288215562e-06, + "loss": 0.1573, + "step": 13229 + }, + { + "epoch": 0.67, + "grad_norm": 1.3376215180747362, + "learning_rate": 5.1097466212676175e-06, + "loss": 0.1686, + "step": 13230 + }, + { + "epoch": 0.67, + "grad_norm": 0.8096623233751569, + "learning_rate": 5.108310086968075e-06, + "loss": 0.1765, + "step": 13231 + }, + { + "epoch": 0.67, + "grad_norm": 0.9195070385921977, + "learning_rate": 5.106873685355897e-06, + "loss": 0.1723, + "step": 13232 + }, + { + "epoch": 0.67, + "grad_norm": 0.8939499616027785, + "learning_rate": 5.105437416470043e-06, + "loss": 0.1765, + "step": 13233 + }, + { + "epoch": 0.67, + "grad_norm": 0.8221497410960499, + "learning_rate": 5.10400128034948e-06, + "loss": 0.1832, + "step": 13234 + }, + { + "epoch": 0.67, + "grad_norm": 0.9561186940936613, + "learning_rate": 5.102565277033155e-06, + "loss": 0.1513, + "step": 13235 + }, + { + "epoch": 0.67, + "grad_norm": 0.872333575342079, + "learning_rate": 5.101129406560023e-06, + "loss": 0.1904, + "step": 13236 + }, + { + "epoch": 0.67, + "grad_norm": 1.6989686795106012, + "learning_rate": 5.099693668969033e-06, + "loss": 0.1791, + "step": 13237 + }, + { + "epoch": 0.67, + "grad_norm": 10.59026638608574, + "learning_rate": 5.098258064299132e-06, + "loss": 0.1643, + "step": 13238 + }, + { + "epoch": 0.67, + "grad_norm": 1.1675096098809408, + "learning_rate": 5.096822592589254e-06, + "loss": 0.1732, + "step": 13239 + }, + { + "epoch": 0.67, + "grad_norm": 1.0224568286169364, + "learning_rate": 5.095387253878346e-06, + "loss": 0.2192, + "step": 13240 + }, + { + "epoch": 0.67, + "grad_norm": 1.4791810466091335, + "learning_rate": 5.09395204820533e-06, + "loss": 0.1682, + "step": 13241 + }, + { + "epoch": 0.67, + "grad_norm": 1.503518493377402, + "learning_rate": 5.092516975609146e-06, + "loss": 0.1833, + "step": 13242 + }, + { + "epoch": 0.67, + "grad_norm": 0.9971546165789382, + "learning_rate": 5.091082036128712e-06, + "loss": 0.173, + "step": 13243 + }, + { + "epoch": 0.67, + "grad_norm": 1.0692436202332236, + "learning_rate": 5.0896472298029555e-06, + "loss": 0.1666, + "step": 13244 + }, + { + "epoch": 0.67, + "grad_norm": 1.4054718829745738, + "learning_rate": 5.088212556670795e-06, + "loss": 0.1733, + "step": 13245 + }, + { + "epoch": 0.67, + "grad_norm": 0.8824887512289968, + "learning_rate": 5.08677801677115e-06, + "loss": 0.1785, + "step": 13246 + }, + { + "epoch": 0.67, + "grad_norm": 1.1406611285731336, + "learning_rate": 5.085343610142926e-06, + "loss": 0.1765, + "step": 13247 + }, + { + "epoch": 0.67, + "grad_norm": 1.2010164866264619, + "learning_rate": 5.083909336825037e-06, + "loss": 0.1716, + "step": 13248 + }, + { + "epoch": 0.67, + "grad_norm": 1.1463388525562261, + "learning_rate": 5.0824751968563845e-06, + "loss": 0.1839, + "step": 13249 + }, + { + "epoch": 0.67, + "grad_norm": 1.8175246259217428, + "learning_rate": 5.0810411902758675e-06, + "loss": 0.1701, + "step": 13250 + }, + { + "epoch": 0.67, + "grad_norm": 1.3857489048773455, + "learning_rate": 5.0796073171223884e-06, + "loss": 0.1901, + "step": 13251 + }, + { + "epoch": 0.67, + "grad_norm": 1.0770489342555571, + "learning_rate": 5.078173577434836e-06, + "loss": 0.156, + "step": 13252 + }, + { + "epoch": 0.67, + "grad_norm": 1.0569178307565479, + "learning_rate": 5.076739971252103e-06, + "loss": 0.1667, + "step": 13253 + }, + { + "epoch": 0.67, + "grad_norm": 0.8588401381650115, + "learning_rate": 5.0753064986130765e-06, + "loss": 0.1783, + "step": 13254 + }, + { + "epoch": 0.67, + "grad_norm": 1.0696363407064873, + "learning_rate": 5.073873159556643e-06, + "loss": 0.1749, + "step": 13255 + }, + { + "epoch": 0.67, + "grad_norm": 2.0143686351956207, + "learning_rate": 5.072439954121675e-06, + "loss": 0.1777, + "step": 13256 + }, + { + "epoch": 0.67, + "grad_norm": 1.5402276037649723, + "learning_rate": 5.0710068823470535e-06, + "loss": 0.1656, + "step": 13257 + }, + { + "epoch": 0.67, + "grad_norm": 1.0663757615170488, + "learning_rate": 5.069573944271646e-06, + "loss": 0.1785, + "step": 13258 + }, + { + "epoch": 0.67, + "grad_norm": 1.0825248239599943, + "learning_rate": 5.068141139934328e-06, + "loss": 0.1927, + "step": 13259 + }, + { + "epoch": 0.67, + "grad_norm": 1.0449836392755059, + "learning_rate": 5.066708469373958e-06, + "loss": 0.17, + "step": 13260 + }, + { + "epoch": 0.67, + "grad_norm": 1.2647662251334955, + "learning_rate": 5.065275932629401e-06, + "loss": 0.1636, + "step": 13261 + }, + { + "epoch": 0.67, + "grad_norm": 0.8529323655944855, + "learning_rate": 5.063843529739509e-06, + "loss": 0.1719, + "step": 13262 + }, + { + "epoch": 0.67, + "grad_norm": 2.026661229104386, + "learning_rate": 5.062411260743141e-06, + "loss": 0.1884, + "step": 13263 + }, + { + "epoch": 0.67, + "grad_norm": 1.2294290210006409, + "learning_rate": 5.060979125679147e-06, + "loss": 0.1672, + "step": 13264 + }, + { + "epoch": 0.67, + "grad_norm": 2.0393714847360274, + "learning_rate": 5.0595471245863745e-06, + "loss": 0.1797, + "step": 13265 + }, + { + "epoch": 0.67, + "grad_norm": 1.209605688386021, + "learning_rate": 5.058115257503667e-06, + "loss": 0.1614, + "step": 13266 + }, + { + "epoch": 0.67, + "grad_norm": 1.0857832204663802, + "learning_rate": 5.056683524469859e-06, + "loss": 0.1927, + "step": 13267 + }, + { + "epoch": 0.67, + "grad_norm": 0.9911732545186153, + "learning_rate": 5.055251925523792e-06, + "loss": 0.1655, + "step": 13268 + }, + { + "epoch": 0.67, + "grad_norm": 1.624387785149277, + "learning_rate": 5.0538204607042925e-06, + "loss": 0.1607, + "step": 13269 + }, + { + "epoch": 0.67, + "grad_norm": 0.9864496852717262, + "learning_rate": 5.052389130050193e-06, + "loss": 0.1615, + "step": 13270 + }, + { + "epoch": 0.67, + "grad_norm": 1.2432216270091936, + "learning_rate": 5.050957933600317e-06, + "loss": 0.1989, + "step": 13271 + }, + { + "epoch": 0.67, + "grad_norm": 1.0533200560620144, + "learning_rate": 5.049526871393491e-06, + "loss": 0.171, + "step": 13272 + }, + { + "epoch": 0.67, + "grad_norm": 0.7796060605558551, + "learning_rate": 5.048095943468524e-06, + "loss": 0.1742, + "step": 13273 + }, + { + "epoch": 0.68, + "grad_norm": 0.9142403208282665, + "learning_rate": 5.046665149864238e-06, + "loss": 0.1552, + "step": 13274 + }, + { + "epoch": 0.68, + "grad_norm": 0.9049380952582469, + "learning_rate": 5.045234490619435e-06, + "loss": 0.1755, + "step": 13275 + }, + { + "epoch": 0.68, + "grad_norm": 1.2962081391225797, + "learning_rate": 5.043803965772932e-06, + "loss": 0.1896, + "step": 13276 + }, + { + "epoch": 0.68, + "grad_norm": 1.101749651971222, + "learning_rate": 5.042373575363522e-06, + "loss": 0.1798, + "step": 13277 + }, + { + "epoch": 0.68, + "grad_norm": 0.951802944183445, + "learning_rate": 5.040943319430012e-06, + "loss": 0.1939, + "step": 13278 + }, + { + "epoch": 0.68, + "grad_norm": 1.0555114435186326, + "learning_rate": 5.03951319801119e-06, + "loss": 0.1626, + "step": 13279 + }, + { + "epoch": 0.68, + "grad_norm": 2.350767178690501, + "learning_rate": 5.038083211145854e-06, + "loss": 0.1766, + "step": 13280 + }, + { + "epoch": 0.68, + "grad_norm": 1.1201796798396992, + "learning_rate": 5.03665335887279e-06, + "loss": 0.2076, + "step": 13281 + }, + { + "epoch": 0.68, + "grad_norm": 0.9327089524446213, + "learning_rate": 5.035223641230789e-06, + "loss": 0.189, + "step": 13282 + }, + { + "epoch": 0.68, + "grad_norm": 0.8261115940786845, + "learning_rate": 5.033794058258623e-06, + "loss": 0.1659, + "step": 13283 + }, + { + "epoch": 0.68, + "grad_norm": 1.4459582919012035, + "learning_rate": 5.0323646099950775e-06, + "loss": 0.1576, + "step": 13284 + }, + { + "epoch": 0.68, + "grad_norm": 0.8754760385383721, + "learning_rate": 5.030935296478922e-06, + "loss": 0.1734, + "step": 13285 + }, + { + "epoch": 0.68, + "grad_norm": 1.0686975505660197, + "learning_rate": 5.029506117748924e-06, + "loss": 0.1728, + "step": 13286 + }, + { + "epoch": 0.68, + "grad_norm": 1.282629243662813, + "learning_rate": 5.0280770738438535e-06, + "loss": 0.1656, + "step": 13287 + }, + { + "epoch": 0.68, + "grad_norm": 1.013316881874263, + "learning_rate": 5.026648164802472e-06, + "loss": 0.1848, + "step": 13288 + }, + { + "epoch": 0.68, + "grad_norm": 1.0771000819802043, + "learning_rate": 5.025219390663545e-06, + "loss": 0.1599, + "step": 13289 + }, + { + "epoch": 0.68, + "grad_norm": 1.3450360397275072, + "learning_rate": 5.023790751465818e-06, + "loss": 0.1638, + "step": 13290 + }, + { + "epoch": 0.68, + "grad_norm": 1.1537028984901865, + "learning_rate": 5.022362247248052e-06, + "loss": 0.2023, + "step": 13291 + }, + { + "epoch": 0.68, + "grad_norm": 0.9600143888983569, + "learning_rate": 5.020933878048988e-06, + "loss": 0.1778, + "step": 13292 + }, + { + "epoch": 0.68, + "grad_norm": 1.347995113401063, + "learning_rate": 5.0195056439073775e-06, + "loss": 0.2065, + "step": 13293 + }, + { + "epoch": 0.68, + "grad_norm": 0.8692116195281121, + "learning_rate": 5.018077544861954e-06, + "loss": 0.1683, + "step": 13294 + }, + { + "epoch": 0.68, + "grad_norm": 1.4525317319667932, + "learning_rate": 5.016649580951462e-06, + "loss": 0.1714, + "step": 13295 + }, + { + "epoch": 0.68, + "grad_norm": 1.0646484388308648, + "learning_rate": 5.015221752214627e-06, + "loss": 0.1597, + "step": 13296 + }, + { + "epoch": 0.68, + "grad_norm": 1.0205582628493528, + "learning_rate": 5.013794058690185e-06, + "loss": 0.1797, + "step": 13297 + }, + { + "epoch": 0.68, + "grad_norm": 0.8514951899742292, + "learning_rate": 5.01236650041686e-06, + "loss": 0.1743, + "step": 13298 + }, + { + "epoch": 0.68, + "grad_norm": 0.8689347769419848, + "learning_rate": 5.010939077433378e-06, + "loss": 0.1852, + "step": 13299 + }, + { + "epoch": 0.68, + "grad_norm": 0.9292572474186874, + "learning_rate": 5.009511789778454e-06, + "loss": 0.1802, + "step": 13300 + }, + { + "epoch": 0.68, + "grad_norm": 3.692359441421138, + "learning_rate": 5.008084637490807e-06, + "loss": 0.1784, + "step": 13301 + }, + { + "epoch": 0.68, + "grad_norm": 1.2508345137841275, + "learning_rate": 5.006657620609147e-06, + "loss": 0.1911, + "step": 13302 + }, + { + "epoch": 0.68, + "grad_norm": 0.9286787172036365, + "learning_rate": 5.005230739172175e-06, + "loss": 0.1892, + "step": 13303 + }, + { + "epoch": 0.68, + "grad_norm": 0.867410754178937, + "learning_rate": 5.003803993218608e-06, + "loss": 0.1789, + "step": 13304 + }, + { + "epoch": 0.68, + "grad_norm": 1.1372677523807884, + "learning_rate": 5.002377382787135e-06, + "loss": 0.1997, + "step": 13305 + }, + { + "epoch": 0.68, + "grad_norm": 0.8645106706756592, + "learning_rate": 5.000950907916457e-06, + "loss": 0.1848, + "step": 13306 + }, + { + "epoch": 0.68, + "grad_norm": 0.9028957991781709, + "learning_rate": 4.999524568645268e-06, + "loss": 0.1791, + "step": 13307 + }, + { + "epoch": 0.68, + "grad_norm": 1.653303769921554, + "learning_rate": 4.998098365012263e-06, + "loss": 0.1944, + "step": 13308 + }, + { + "epoch": 0.68, + "grad_norm": 3.581598586863471, + "learning_rate": 4.9966722970561165e-06, + "loss": 0.1645, + "step": 13309 + }, + { + "epoch": 0.68, + "grad_norm": 1.0273155328412358, + "learning_rate": 4.995246364815522e-06, + "loss": 0.1739, + "step": 13310 + }, + { + "epoch": 0.68, + "grad_norm": 1.233151772861096, + "learning_rate": 4.993820568329147e-06, + "loss": 0.1814, + "step": 13311 + }, + { + "epoch": 0.68, + "grad_norm": 0.8748578904892358, + "learning_rate": 4.992394907635677e-06, + "loss": 0.1754, + "step": 13312 + }, + { + "epoch": 0.68, + "grad_norm": 1.3279718652025159, + "learning_rate": 4.990969382773773e-06, + "loss": 0.1784, + "step": 13313 + }, + { + "epoch": 0.68, + "grad_norm": 0.8481820795080676, + "learning_rate": 4.989543993782109e-06, + "loss": 0.1647, + "step": 13314 + }, + { + "epoch": 0.68, + "grad_norm": 0.8523082068024254, + "learning_rate": 4.9881187406993455e-06, + "loss": 0.1654, + "step": 13315 + }, + { + "epoch": 0.68, + "grad_norm": 1.0876009925692094, + "learning_rate": 4.98669362356415e-06, + "loss": 0.1692, + "step": 13316 + }, + { + "epoch": 0.68, + "grad_norm": 1.2189864873069765, + "learning_rate": 4.985268642415167e-06, + "loss": 0.2011, + "step": 13317 + }, + { + "epoch": 0.68, + "grad_norm": 1.0037139617465891, + "learning_rate": 4.98384379729106e-06, + "loss": 0.1682, + "step": 13318 + }, + { + "epoch": 0.68, + "grad_norm": 1.1469562922992695, + "learning_rate": 4.982419088230473e-06, + "loss": 0.1668, + "step": 13319 + }, + { + "epoch": 0.68, + "grad_norm": 0.8763162550492711, + "learning_rate": 4.98099451527205e-06, + "loss": 0.188, + "step": 13320 + }, + { + "epoch": 0.68, + "grad_norm": 0.9031762114085561, + "learning_rate": 4.9795700784544355e-06, + "loss": 0.1561, + "step": 13321 + }, + { + "epoch": 0.68, + "grad_norm": 1.4303154523574806, + "learning_rate": 4.978145777816264e-06, + "loss": 0.1728, + "step": 13322 + }, + { + "epoch": 0.68, + "grad_norm": 1.063977785990633, + "learning_rate": 4.9767216133961705e-06, + "loss": 0.1552, + "step": 13323 + }, + { + "epoch": 0.68, + "grad_norm": 1.1334620235258557, + "learning_rate": 4.975297585232788e-06, + "loss": 0.1845, + "step": 13324 + }, + { + "epoch": 0.68, + "grad_norm": 1.0121310102918408, + "learning_rate": 4.973873693364746e-06, + "loss": 0.1838, + "step": 13325 + }, + { + "epoch": 0.68, + "grad_norm": 0.8874720693087358, + "learning_rate": 4.972449937830659e-06, + "loss": 0.1392, + "step": 13326 + }, + { + "epoch": 0.68, + "grad_norm": 0.8405872199516397, + "learning_rate": 4.971026318669156e-06, + "loss": 0.2046, + "step": 13327 + }, + { + "epoch": 0.68, + "grad_norm": 0.9140677436739643, + "learning_rate": 4.9696028359188444e-06, + "loss": 0.1817, + "step": 13328 + }, + { + "epoch": 0.68, + "grad_norm": 1.122027630302311, + "learning_rate": 4.968179489618345e-06, + "loss": 0.1768, + "step": 13329 + }, + { + "epoch": 0.68, + "grad_norm": 0.9179887064369858, + "learning_rate": 4.966756279806255e-06, + "loss": 0.1599, + "step": 13330 + }, + { + "epoch": 0.68, + "grad_norm": 1.2229010455954328, + "learning_rate": 4.9653332065211905e-06, + "loss": 0.1692, + "step": 13331 + }, + { + "epoch": 0.68, + "grad_norm": 0.9100285797496191, + "learning_rate": 4.963910269801743e-06, + "loss": 0.1708, + "step": 13332 + }, + { + "epoch": 0.68, + "grad_norm": 1.108220029993447, + "learning_rate": 4.962487469686513e-06, + "loss": 0.1887, + "step": 13333 + }, + { + "epoch": 0.68, + "grad_norm": 1.1734441516261298, + "learning_rate": 4.961064806214096e-06, + "loss": 0.1804, + "step": 13334 + }, + { + "epoch": 0.68, + "grad_norm": 1.071162594836847, + "learning_rate": 4.959642279423085e-06, + "loss": 0.1738, + "step": 13335 + }, + { + "epoch": 0.68, + "grad_norm": 1.1985132657468434, + "learning_rate": 4.958219889352061e-06, + "loss": 0.1652, + "step": 13336 + }, + { + "epoch": 0.68, + "grad_norm": 0.8772492189650118, + "learning_rate": 4.956797636039603e-06, + "loss": 0.1721, + "step": 13337 + }, + { + "epoch": 0.68, + "grad_norm": 1.2597169048117127, + "learning_rate": 4.955375519524299e-06, + "loss": 0.2005, + "step": 13338 + }, + { + "epoch": 0.68, + "grad_norm": 2.4108297618878187, + "learning_rate": 4.953953539844715e-06, + "loss": 0.1738, + "step": 13339 + }, + { + "epoch": 0.68, + "grad_norm": 0.8353302923538877, + "learning_rate": 4.952531697039424e-06, + "loss": 0.1796, + "step": 13340 + }, + { + "epoch": 0.68, + "grad_norm": 3.45598031294388, + "learning_rate": 4.951109991146999e-06, + "loss": 0.1664, + "step": 13341 + }, + { + "epoch": 0.68, + "grad_norm": 3.890865755028747, + "learning_rate": 4.949688422206003e-06, + "loss": 0.1665, + "step": 13342 + }, + { + "epoch": 0.68, + "grad_norm": 1.035408609462023, + "learning_rate": 4.9482669902549896e-06, + "loss": 0.1791, + "step": 13343 + }, + { + "epoch": 0.68, + "grad_norm": 1.0615178010266604, + "learning_rate": 4.946845695332524e-06, + "loss": 0.1934, + "step": 13344 + }, + { + "epoch": 0.68, + "grad_norm": 1.1510058838639408, + "learning_rate": 4.945424537477149e-06, + "loss": 0.1732, + "step": 13345 + }, + { + "epoch": 0.68, + "grad_norm": 0.8626483346119986, + "learning_rate": 4.944003516727424e-06, + "loss": 0.164, + "step": 13346 + }, + { + "epoch": 0.68, + "grad_norm": 0.9788758357224554, + "learning_rate": 4.942582633121885e-06, + "loss": 0.1781, + "step": 13347 + }, + { + "epoch": 0.68, + "grad_norm": 1.2100884087094943, + "learning_rate": 4.941161886699082e-06, + "loss": 0.1607, + "step": 13348 + }, + { + "epoch": 0.68, + "grad_norm": 0.7699738187325713, + "learning_rate": 4.939741277497545e-06, + "loss": 0.1842, + "step": 13349 + }, + { + "epoch": 0.68, + "grad_norm": 0.88949621414364, + "learning_rate": 4.938320805555811e-06, + "loss": 0.1579, + "step": 13350 + }, + { + "epoch": 0.68, + "grad_norm": 0.9083718699578254, + "learning_rate": 4.9369004709124115e-06, + "loss": 0.1797, + "step": 13351 + }, + { + "epoch": 0.68, + "grad_norm": 3.122080049815317, + "learning_rate": 4.935480273605876e-06, + "loss": 0.1666, + "step": 13352 + }, + { + "epoch": 0.68, + "grad_norm": 1.1730433331504413, + "learning_rate": 4.93406021367472e-06, + "loss": 0.1693, + "step": 13353 + }, + { + "epoch": 0.68, + "grad_norm": 1.1792075254193641, + "learning_rate": 4.932640291157471e-06, + "loss": 0.1599, + "step": 13354 + }, + { + "epoch": 0.68, + "grad_norm": 0.9867221227698497, + "learning_rate": 4.931220506092641e-06, + "loss": 0.1657, + "step": 13355 + }, + { + "epoch": 0.68, + "grad_norm": 0.9272440666065497, + "learning_rate": 4.929800858518736e-06, + "loss": 0.1678, + "step": 13356 + }, + { + "epoch": 0.68, + "grad_norm": 1.3593501320246062, + "learning_rate": 4.928381348474274e-06, + "loss": 0.1864, + "step": 13357 + }, + { + "epoch": 0.68, + "grad_norm": 1.2641556486628795, + "learning_rate": 4.926961975997749e-06, + "loss": 0.2181, + "step": 13358 + }, + { + "epoch": 0.68, + "grad_norm": 1.0468224571302156, + "learning_rate": 4.925542741127669e-06, + "loss": 0.177, + "step": 13359 + }, + { + "epoch": 0.68, + "grad_norm": 1.0279548360084887, + "learning_rate": 4.9241236439025275e-06, + "loss": 0.1573, + "step": 13360 + }, + { + "epoch": 0.68, + "grad_norm": 1.4210161107545294, + "learning_rate": 4.9227046843608224e-06, + "loss": 0.1559, + "step": 13361 + }, + { + "epoch": 0.68, + "grad_norm": 1.0368899818961608, + "learning_rate": 4.921285862541037e-06, + "loss": 0.1776, + "step": 13362 + }, + { + "epoch": 0.68, + "grad_norm": 0.8609128185278291, + "learning_rate": 4.919867178481662e-06, + "loss": 0.1789, + "step": 13363 + }, + { + "epoch": 0.68, + "grad_norm": 1.0819398376435012, + "learning_rate": 4.9184486322211734e-06, + "loss": 0.1582, + "step": 13364 + }, + { + "epoch": 0.68, + "grad_norm": 2.186110252214916, + "learning_rate": 4.917030223798057e-06, + "loss": 0.1507, + "step": 13365 + }, + { + "epoch": 0.68, + "grad_norm": 0.8508139881204698, + "learning_rate": 4.915611953250778e-06, + "loss": 0.179, + "step": 13366 + }, + { + "epoch": 0.68, + "grad_norm": 2.8510715693991586, + "learning_rate": 4.914193820617813e-06, + "loss": 0.1841, + "step": 13367 + }, + { + "epoch": 0.68, + "grad_norm": 0.8557222228620209, + "learning_rate": 4.912775825937627e-06, + "loss": 0.181, + "step": 13368 + }, + { + "epoch": 0.68, + "grad_norm": 1.0329111875129768, + "learning_rate": 4.911357969248688e-06, + "loss": 0.1757, + "step": 13369 + }, + { + "epoch": 0.68, + "grad_norm": 1.0033361006799728, + "learning_rate": 4.909940250589448e-06, + "loss": 0.174, + "step": 13370 + }, + { + "epoch": 0.68, + "grad_norm": 0.9562282908872947, + "learning_rate": 4.90852266999837e-06, + "loss": 0.1636, + "step": 13371 + }, + { + "epoch": 0.68, + "grad_norm": 1.6260398737326067, + "learning_rate": 4.907105227513902e-06, + "loss": 0.1776, + "step": 13372 + }, + { + "epoch": 0.68, + "grad_norm": 0.9791958386788515, + "learning_rate": 4.905687923174488e-06, + "loss": 0.149, + "step": 13373 + }, + { + "epoch": 0.68, + "grad_norm": 0.8983703155733312, + "learning_rate": 4.904270757018581e-06, + "loss": 0.1563, + "step": 13374 + }, + { + "epoch": 0.68, + "grad_norm": 0.8223754890980843, + "learning_rate": 4.902853729084615e-06, + "loss": 0.1608, + "step": 13375 + }, + { + "epoch": 0.68, + "grad_norm": 0.8900805982829235, + "learning_rate": 4.9014368394110275e-06, + "loss": 0.1645, + "step": 13376 + }, + { + "epoch": 0.68, + "grad_norm": 1.17004263604402, + "learning_rate": 4.900020088036254e-06, + "loss": 0.157, + "step": 13377 + }, + { + "epoch": 0.68, + "grad_norm": 1.2806837428129272, + "learning_rate": 4.898603474998729e-06, + "loss": 0.1875, + "step": 13378 + }, + { + "epoch": 0.68, + "grad_norm": 0.894743260473036, + "learning_rate": 4.897187000336867e-06, + "loss": 0.1517, + "step": 13379 + }, + { + "epoch": 0.68, + "grad_norm": 1.0252274864395192, + "learning_rate": 4.895770664089101e-06, + "loss": 0.199, + "step": 13380 + }, + { + "epoch": 0.68, + "grad_norm": 1.1158333518389476, + "learning_rate": 4.89435446629384e-06, + "loss": 0.1824, + "step": 13381 + }, + { + "epoch": 0.68, + "grad_norm": 0.9805177074634117, + "learning_rate": 4.892938406989507e-06, + "loss": 0.1725, + "step": 13382 + }, + { + "epoch": 0.68, + "grad_norm": 0.998938905422329, + "learning_rate": 4.891522486214503e-06, + "loss": 0.1641, + "step": 13383 + }, + { + "epoch": 0.68, + "grad_norm": 0.9151046573689942, + "learning_rate": 4.89010670400724e-06, + "loss": 0.1815, + "step": 13384 + }, + { + "epoch": 0.68, + "grad_norm": 1.1280548607462288, + "learning_rate": 4.888691060406122e-06, + "loss": 0.1789, + "step": 13385 + }, + { + "epoch": 0.68, + "grad_norm": 1.0898656353463203, + "learning_rate": 4.887275555449552e-06, + "loss": 0.1374, + "step": 13386 + }, + { + "epoch": 0.68, + "grad_norm": 1.0457794371285571, + "learning_rate": 4.885860189175917e-06, + "loss": 0.1746, + "step": 13387 + }, + { + "epoch": 0.68, + "grad_norm": 0.8664565229805467, + "learning_rate": 4.884444961623616e-06, + "loss": 0.1848, + "step": 13388 + }, + { + "epoch": 0.68, + "grad_norm": 1.4847731662977808, + "learning_rate": 4.8830298728310355e-06, + "loss": 0.2005, + "step": 13389 + }, + { + "epoch": 0.68, + "grad_norm": 0.9869762959474485, + "learning_rate": 4.881614922836555e-06, + "loss": 0.1632, + "step": 13390 + }, + { + "epoch": 0.68, + "grad_norm": 1.1011081086417145, + "learning_rate": 4.880200111678563e-06, + "loss": 0.1541, + "step": 13391 + }, + { + "epoch": 0.68, + "grad_norm": 0.8733096551389045, + "learning_rate": 4.878785439395427e-06, + "loss": 0.1746, + "step": 13392 + }, + { + "epoch": 0.68, + "grad_norm": 1.7062110329950042, + "learning_rate": 4.8773709060255256e-06, + "loss": 0.1809, + "step": 13393 + }, + { + "epoch": 0.68, + "grad_norm": 1.4984101977385287, + "learning_rate": 4.8759565116072285e-06, + "loss": 0.196, + "step": 13394 + }, + { + "epoch": 0.68, + "grad_norm": 0.9800320483035345, + "learning_rate": 4.874542256178903e-06, + "loss": 0.1675, + "step": 13395 + }, + { + "epoch": 0.68, + "grad_norm": 1.0454689807754207, + "learning_rate": 4.873128139778906e-06, + "loss": 0.1897, + "step": 13396 + }, + { + "epoch": 0.68, + "grad_norm": 0.9859535430018859, + "learning_rate": 4.8717141624456e-06, + "loss": 0.1584, + "step": 13397 + }, + { + "epoch": 0.68, + "grad_norm": 1.0988300467051104, + "learning_rate": 4.870300324217334e-06, + "loss": 0.1787, + "step": 13398 + }, + { + "epoch": 0.68, + "grad_norm": 0.9376020847074387, + "learning_rate": 4.868886625132465e-06, + "loss": 0.1755, + "step": 13399 + }, + { + "epoch": 0.68, + "grad_norm": 0.7033458031810654, + "learning_rate": 4.867473065229332e-06, + "loss": 0.135, + "step": 13400 + }, + { + "epoch": 0.68, + "grad_norm": 1.063844911772989, + "learning_rate": 4.866059644546287e-06, + "loss": 0.1879, + "step": 13401 + }, + { + "epoch": 0.68, + "grad_norm": 1.5097869097695407, + "learning_rate": 4.864646363121659e-06, + "loss": 0.182, + "step": 13402 + }, + { + "epoch": 0.68, + "grad_norm": 1.369818989617672, + "learning_rate": 4.863233220993789e-06, + "loss": 0.1657, + "step": 13403 + }, + { + "epoch": 0.68, + "grad_norm": 0.9667709687107531, + "learning_rate": 4.861820218201009e-06, + "loss": 0.1761, + "step": 13404 + }, + { + "epoch": 0.68, + "grad_norm": 1.1342468133599284, + "learning_rate": 4.860407354781647e-06, + "loss": 0.1535, + "step": 13405 + }, + { + "epoch": 0.68, + "grad_norm": 1.1973300465476358, + "learning_rate": 4.858994630774028e-06, + "loss": 0.1678, + "step": 13406 + }, + { + "epoch": 0.68, + "grad_norm": 0.8670715244476761, + "learning_rate": 4.857582046216465e-06, + "loss": 0.1865, + "step": 13407 + }, + { + "epoch": 0.68, + "grad_norm": 0.9690513997511496, + "learning_rate": 4.856169601147285e-06, + "loss": 0.1735, + "step": 13408 + }, + { + "epoch": 0.68, + "grad_norm": 1.0865856176368371, + "learning_rate": 4.8547572956047894e-06, + "loss": 0.1875, + "step": 13409 + }, + { + "epoch": 0.68, + "grad_norm": 1.0067725107937646, + "learning_rate": 4.8533451296272934e-06, + "loss": 0.1698, + "step": 13410 + }, + { + "epoch": 0.68, + "grad_norm": 0.8611937224662363, + "learning_rate": 4.8519331032531015e-06, + "loss": 0.1621, + "step": 13411 + }, + { + "epoch": 0.68, + "grad_norm": 1.2179372902706707, + "learning_rate": 4.85052121652052e-06, + "loss": 0.1737, + "step": 13412 + }, + { + "epoch": 0.68, + "grad_norm": 1.0646786227946774, + "learning_rate": 4.849109469467835e-06, + "loss": 0.1744, + "step": 13413 + }, + { + "epoch": 0.68, + "grad_norm": 1.3153447789011714, + "learning_rate": 4.847697862133351e-06, + "loss": 0.1599, + "step": 13414 + }, + { + "epoch": 0.68, + "grad_norm": 1.139802471031512, + "learning_rate": 4.846286394555352e-06, + "loss": 0.1461, + "step": 13415 + }, + { + "epoch": 0.68, + "grad_norm": 0.8781096489001784, + "learning_rate": 4.844875066772126e-06, + "loss": 0.1754, + "step": 13416 + }, + { + "epoch": 0.68, + "grad_norm": 0.9085196484189163, + "learning_rate": 4.843463878821955e-06, + "loss": 0.1776, + "step": 13417 + }, + { + "epoch": 0.68, + "grad_norm": 0.9693730021548259, + "learning_rate": 4.842052830743118e-06, + "loss": 0.1638, + "step": 13418 + }, + { + "epoch": 0.68, + "grad_norm": 0.911581147326447, + "learning_rate": 4.840641922573888e-06, + "loss": 0.1603, + "step": 13419 + }, + { + "epoch": 0.68, + "grad_norm": 2.1832359251033444, + "learning_rate": 4.839231154352535e-06, + "loss": 0.181, + "step": 13420 + }, + { + "epoch": 0.68, + "grad_norm": 0.9944004541687458, + "learning_rate": 4.837820526117329e-06, + "loss": 0.1786, + "step": 13421 + }, + { + "epoch": 0.68, + "grad_norm": 0.9643622262243208, + "learning_rate": 4.836410037906537e-06, + "loss": 0.1863, + "step": 13422 + }, + { + "epoch": 0.68, + "grad_norm": 1.798376813456311, + "learning_rate": 4.834999689758412e-06, + "loss": 0.1683, + "step": 13423 + }, + { + "epoch": 0.68, + "grad_norm": 1.0119099256446815, + "learning_rate": 4.833589481711214e-06, + "loss": 0.1666, + "step": 13424 + }, + { + "epoch": 0.68, + "grad_norm": 1.1561921433167999, + "learning_rate": 4.832179413803193e-06, + "loss": 0.1566, + "step": 13425 + }, + { + "epoch": 0.68, + "grad_norm": 1.135528996676528, + "learning_rate": 4.830769486072594e-06, + "loss": 0.191, + "step": 13426 + }, + { + "epoch": 0.68, + "grad_norm": 1.0686848952484496, + "learning_rate": 4.829359698557669e-06, + "loss": 0.201, + "step": 13427 + }, + { + "epoch": 0.68, + "grad_norm": 0.9368366479450847, + "learning_rate": 4.827950051296651e-06, + "loss": 0.1854, + "step": 13428 + }, + { + "epoch": 0.68, + "grad_norm": 1.055241925769985, + "learning_rate": 4.826540544327778e-06, + "loss": 0.1735, + "step": 13429 + }, + { + "epoch": 0.68, + "grad_norm": 1.4929546877284332, + "learning_rate": 4.825131177689286e-06, + "loss": 0.1832, + "step": 13430 + }, + { + "epoch": 0.68, + "grad_norm": 0.9835157565703031, + "learning_rate": 4.8237219514194064e-06, + "loss": 0.1614, + "step": 13431 + }, + { + "epoch": 0.68, + "grad_norm": 0.8935105225693067, + "learning_rate": 4.8223128655563574e-06, + "loss": 0.1619, + "step": 13432 + }, + { + "epoch": 0.68, + "grad_norm": 0.9941601481300318, + "learning_rate": 4.820903920138369e-06, + "loss": 0.1813, + "step": 13433 + }, + { + "epoch": 0.68, + "grad_norm": 1.2681225658498505, + "learning_rate": 4.819495115203651e-06, + "loss": 0.1727, + "step": 13434 + }, + { + "epoch": 0.68, + "grad_norm": 1.0083145685072834, + "learning_rate": 4.818086450790423e-06, + "loss": 0.1838, + "step": 13435 + }, + { + "epoch": 0.68, + "grad_norm": 1.002803369604112, + "learning_rate": 4.816677926936889e-06, + "loss": 0.1649, + "step": 13436 + }, + { + "epoch": 0.68, + "grad_norm": 0.8970907695821947, + "learning_rate": 4.815269543681259e-06, + "loss": 0.1644, + "step": 13437 + }, + { + "epoch": 0.68, + "grad_norm": 1.7536868504864365, + "learning_rate": 4.813861301061737e-06, + "loss": 0.168, + "step": 13438 + }, + { + "epoch": 0.68, + "grad_norm": 1.287016327629495, + "learning_rate": 4.812453199116522e-06, + "loss": 0.1568, + "step": 13439 + }, + { + "epoch": 0.68, + "grad_norm": 2.365945925124035, + "learning_rate": 4.811045237883803e-06, + "loss": 0.1613, + "step": 13440 + }, + { + "epoch": 0.68, + "grad_norm": 0.8182433301573657, + "learning_rate": 4.80963741740178e-06, + "loss": 0.1671, + "step": 13441 + }, + { + "epoch": 0.68, + "grad_norm": 1.2762692421010002, + "learning_rate": 4.808229737708635e-06, + "loss": 0.183, + "step": 13442 + }, + { + "epoch": 0.68, + "grad_norm": 1.1055784334758063, + "learning_rate": 4.806822198842548e-06, + "loss": 0.1614, + "step": 13443 + }, + { + "epoch": 0.68, + "grad_norm": 1.6299468631717564, + "learning_rate": 4.805414800841706e-06, + "loss": 0.182, + "step": 13444 + }, + { + "epoch": 0.68, + "grad_norm": 0.9921407678152764, + "learning_rate": 4.804007543744277e-06, + "loss": 0.1731, + "step": 13445 + }, + { + "epoch": 0.68, + "grad_norm": 1.226900393560294, + "learning_rate": 4.802600427588437e-06, + "loss": 0.1875, + "step": 13446 + }, + { + "epoch": 0.68, + "grad_norm": 0.8771639659433222, + "learning_rate": 4.801193452412353e-06, + "loss": 0.1828, + "step": 13447 + }, + { + "epoch": 0.68, + "grad_norm": 1.2076103968998635, + "learning_rate": 4.799786618254194e-06, + "loss": 0.1824, + "step": 13448 + }, + { + "epoch": 0.68, + "grad_norm": 1.165591143836002, + "learning_rate": 4.798379925152113e-06, + "loss": 0.1688, + "step": 13449 + }, + { + "epoch": 0.68, + "grad_norm": 0.9737739548462891, + "learning_rate": 4.796973373144276e-06, + "loss": 0.1849, + "step": 13450 + }, + { + "epoch": 0.68, + "grad_norm": 0.9713615342015935, + "learning_rate": 4.795566962268824e-06, + "loss": 0.1916, + "step": 13451 + }, + { + "epoch": 0.68, + "grad_norm": 1.033946714466998, + "learning_rate": 4.794160692563917e-06, + "loss": 0.2029, + "step": 13452 + }, + { + "epoch": 0.68, + "grad_norm": 1.2749100365224257, + "learning_rate": 4.792754564067691e-06, + "loss": 0.1633, + "step": 13453 + }, + { + "epoch": 0.68, + "grad_norm": 0.806561978231977, + "learning_rate": 4.791348576818296e-06, + "loss": 0.162, + "step": 13454 + }, + { + "epoch": 0.68, + "grad_norm": 1.6382894362878446, + "learning_rate": 4.78994273085386e-06, + "loss": 0.1802, + "step": 13455 + }, + { + "epoch": 0.68, + "grad_norm": 0.9880680606393522, + "learning_rate": 4.788537026212523e-06, + "loss": 0.1718, + "step": 13456 + }, + { + "epoch": 0.68, + "grad_norm": 1.3179714810536076, + "learning_rate": 4.7871314629324125e-06, + "loss": 0.1637, + "step": 13457 + }, + { + "epoch": 0.68, + "grad_norm": 0.8938350389586178, + "learning_rate": 4.78572604105166e-06, + "loss": 0.1625, + "step": 13458 + }, + { + "epoch": 0.68, + "grad_norm": 1.2134710981285515, + "learning_rate": 4.784320760608384e-06, + "loss": 0.1706, + "step": 13459 + }, + { + "epoch": 0.68, + "grad_norm": 1.4017759179084095, + "learning_rate": 4.782915621640697e-06, + "loss": 0.1752, + "step": 13460 + }, + { + "epoch": 0.68, + "grad_norm": 1.1175978308846175, + "learning_rate": 4.781510624186723e-06, + "loss": 0.1664, + "step": 13461 + }, + { + "epoch": 0.68, + "grad_norm": 1.0035509318285942, + "learning_rate": 4.780105768284563e-06, + "loss": 0.1715, + "step": 13462 + }, + { + "epoch": 0.68, + "grad_norm": 0.7894408453332361, + "learning_rate": 4.778701053972329e-06, + "loss": 0.1549, + "step": 13463 + }, + { + "epoch": 0.68, + "grad_norm": 6.307745294733831, + "learning_rate": 4.777296481288125e-06, + "loss": 0.1649, + "step": 13464 + }, + { + "epoch": 0.68, + "grad_norm": 1.050140125564269, + "learning_rate": 4.775892050270051e-06, + "loss": 0.1903, + "step": 13465 + }, + { + "epoch": 0.68, + "grad_norm": 0.8613949065020686, + "learning_rate": 4.774487760956198e-06, + "loss": 0.169, + "step": 13466 + }, + { + "epoch": 0.68, + "grad_norm": 0.7835237230319492, + "learning_rate": 4.773083613384663e-06, + "loss": 0.1403, + "step": 13467 + }, + { + "epoch": 0.68, + "grad_norm": 1.3627385906428595, + "learning_rate": 4.771679607593526e-06, + "loss": 0.1839, + "step": 13468 + }, + { + "epoch": 0.68, + "grad_norm": 1.0347857730772498, + "learning_rate": 4.770275743620879e-06, + "loss": 0.183, + "step": 13469 + }, + { + "epoch": 0.68, + "grad_norm": 0.9220173253856068, + "learning_rate": 4.768872021504795e-06, + "loss": 0.1678, + "step": 13470 + }, + { + "epoch": 0.69, + "grad_norm": 1.468227027681512, + "learning_rate": 4.767468441283355e-06, + "loss": 0.1754, + "step": 13471 + }, + { + "epoch": 0.69, + "grad_norm": 0.953328021389925, + "learning_rate": 4.766065002994626e-06, + "loss": 0.2112, + "step": 13472 + }, + { + "epoch": 0.69, + "grad_norm": 0.8236174244301655, + "learning_rate": 4.764661706676679e-06, + "loss": 0.1659, + "step": 13473 + }, + { + "epoch": 0.69, + "grad_norm": 1.07470258531206, + "learning_rate": 4.763258552367579e-06, + "loss": 0.159, + "step": 13474 + }, + { + "epoch": 0.69, + "grad_norm": 1.4178247476696861, + "learning_rate": 4.761855540105391e-06, + "loss": 0.1845, + "step": 13475 + }, + { + "epoch": 0.69, + "grad_norm": 1.0258835616788802, + "learning_rate": 4.760452669928167e-06, + "loss": 0.1675, + "step": 13476 + }, + { + "epoch": 0.69, + "grad_norm": 0.8569243751405146, + "learning_rate": 4.759049941873957e-06, + "loss": 0.1865, + "step": 13477 + }, + { + "epoch": 0.69, + "grad_norm": 0.9533173851533268, + "learning_rate": 4.757647355980816e-06, + "loss": 0.1824, + "step": 13478 + }, + { + "epoch": 0.69, + "grad_norm": 0.775607340301293, + "learning_rate": 4.756244912286782e-06, + "loss": 0.1543, + "step": 13479 + }, + { + "epoch": 0.69, + "grad_norm": 0.8975477515315292, + "learning_rate": 4.754842610829908e-06, + "loss": 0.1811, + "step": 13480 + }, + { + "epoch": 0.69, + "grad_norm": 0.8611261096224713, + "learning_rate": 4.753440451648218e-06, + "loss": 0.1522, + "step": 13481 + }, + { + "epoch": 0.69, + "grad_norm": 1.2630671689125583, + "learning_rate": 4.752038434779752e-06, + "loss": 0.1542, + "step": 13482 + }, + { + "epoch": 0.69, + "grad_norm": 1.3134315148743594, + "learning_rate": 4.750636560262542e-06, + "loss": 0.1737, + "step": 13483 + }, + { + "epoch": 0.69, + "grad_norm": 1.6569673148435342, + "learning_rate": 4.749234828134614e-06, + "loss": 0.1738, + "step": 13484 + }, + { + "epoch": 0.69, + "grad_norm": 0.843456888349568, + "learning_rate": 4.7478332384339834e-06, + "loss": 0.1788, + "step": 13485 + }, + { + "epoch": 0.69, + "grad_norm": 1.1360067662500801, + "learning_rate": 4.746431791198678e-06, + "loss": 0.1787, + "step": 13486 + }, + { + "epoch": 0.69, + "grad_norm": 0.9441431557652228, + "learning_rate": 4.745030486466702e-06, + "loss": 0.1614, + "step": 13487 + }, + { + "epoch": 0.69, + "grad_norm": 1.217989906596319, + "learning_rate": 4.743629324276076e-06, + "loss": 0.1607, + "step": 13488 + }, + { + "epoch": 0.69, + "grad_norm": 1.2598036556797265, + "learning_rate": 4.742228304664797e-06, + "loss": 0.1863, + "step": 13489 + }, + { + "epoch": 0.69, + "grad_norm": 0.9377746170865727, + "learning_rate": 4.740827427670871e-06, + "loss": 0.1719, + "step": 13490 + }, + { + "epoch": 0.69, + "grad_norm": 0.7620754681797829, + "learning_rate": 4.7394266933322995e-06, + "loss": 0.155, + "step": 13491 + }, + { + "epoch": 0.69, + "grad_norm": 1.446752341984604, + "learning_rate": 4.73802610168708e-06, + "loss": 0.1637, + "step": 13492 + }, + { + "epoch": 0.69, + "grad_norm": 1.3227834294404783, + "learning_rate": 4.736625652773195e-06, + "loss": 0.1758, + "step": 13493 + }, + { + "epoch": 0.69, + "grad_norm": 1.2196491773894043, + "learning_rate": 4.735225346628641e-06, + "loss": 0.1779, + "step": 13494 + }, + { + "epoch": 0.69, + "grad_norm": 0.972599455443251, + "learning_rate": 4.733825183291396e-06, + "loss": 0.1582, + "step": 13495 + }, + { + "epoch": 0.69, + "grad_norm": 2.1493289407215332, + "learning_rate": 4.7324251627994375e-06, + "loss": 0.1861, + "step": 13496 + }, + { + "epoch": 0.69, + "grad_norm": 1.213066587740853, + "learning_rate": 4.731025285190748e-06, + "loss": 0.1705, + "step": 13497 + }, + { + "epoch": 0.69, + "grad_norm": 0.9963341122388324, + "learning_rate": 4.729625550503291e-06, + "loss": 0.1576, + "step": 13498 + }, + { + "epoch": 0.69, + "grad_norm": 0.928854673356536, + "learning_rate": 4.728225958775038e-06, + "loss": 0.1631, + "step": 13499 + }, + { + "epoch": 0.69, + "grad_norm": 1.1516730033725135, + "learning_rate": 4.726826510043953e-06, + "loss": 0.1566, + "step": 13500 + }, + { + "epoch": 0.69, + "grad_norm": 1.0721723247090889, + "learning_rate": 4.725427204348002e-06, + "loss": 0.1706, + "step": 13501 + }, + { + "epoch": 0.69, + "grad_norm": 1.0402834885592727, + "learning_rate": 4.724028041725132e-06, + "loss": 0.1372, + "step": 13502 + }, + { + "epoch": 0.69, + "grad_norm": 1.2585969099620888, + "learning_rate": 4.722629022213303e-06, + "loss": 0.1784, + "step": 13503 + }, + { + "epoch": 0.69, + "grad_norm": 1.211687513437, + "learning_rate": 4.721230145850456e-06, + "loss": 0.1699, + "step": 13504 + }, + { + "epoch": 0.69, + "grad_norm": 0.8774157171124775, + "learning_rate": 4.7198314126745424e-06, + "loss": 0.1476, + "step": 13505 + }, + { + "epoch": 0.69, + "grad_norm": 1.1104455447501893, + "learning_rate": 4.718432822723498e-06, + "loss": 0.1737, + "step": 13506 + }, + { + "epoch": 0.69, + "grad_norm": 0.8279150288922937, + "learning_rate": 4.7170343760352595e-06, + "loss": 0.1626, + "step": 13507 + }, + { + "epoch": 0.69, + "grad_norm": 1.3927560254857045, + "learning_rate": 4.715636072647763e-06, + "loss": 0.1732, + "step": 13508 + }, + { + "epoch": 0.69, + "grad_norm": 0.9894871996707575, + "learning_rate": 4.714237912598941e-06, + "loss": 0.1672, + "step": 13509 + }, + { + "epoch": 0.69, + "grad_norm": 1.001968051076667, + "learning_rate": 4.7128398959267095e-06, + "loss": 0.1654, + "step": 13510 + }, + { + "epoch": 0.69, + "grad_norm": 1.2122031088329057, + "learning_rate": 4.711442022668998e-06, + "loss": 0.1674, + "step": 13511 + }, + { + "epoch": 0.69, + "grad_norm": 0.7976776552459861, + "learning_rate": 4.710044292863721e-06, + "loss": 0.1469, + "step": 13512 + }, + { + "epoch": 0.69, + "grad_norm": 0.9277944585899132, + "learning_rate": 4.7086467065487875e-06, + "loss": 0.1499, + "step": 13513 + }, + { + "epoch": 0.69, + "grad_norm": 1.2169821076074399, + "learning_rate": 4.707249263762115e-06, + "loss": 0.1595, + "step": 13514 + }, + { + "epoch": 0.69, + "grad_norm": 1.6496269445057359, + "learning_rate": 4.7058519645416004e-06, + "loss": 0.1722, + "step": 13515 + }, + { + "epoch": 0.69, + "grad_norm": 1.0799766772357269, + "learning_rate": 4.7044548089251505e-06, + "loss": 0.161, + "step": 13516 + }, + { + "epoch": 0.69, + "grad_norm": 1.7183745714562393, + "learning_rate": 4.703057796950663e-06, + "loss": 0.1632, + "step": 13517 + }, + { + "epoch": 0.69, + "grad_norm": 1.2890048452136667, + "learning_rate": 4.701660928656036e-06, + "loss": 0.1841, + "step": 13518 + }, + { + "epoch": 0.69, + "grad_norm": 1.299031005709721, + "learning_rate": 4.7002642040791526e-06, + "loss": 0.1627, + "step": 13519 + }, + { + "epoch": 0.69, + "grad_norm": 1.0199202670304646, + "learning_rate": 4.698867623257905e-06, + "loss": 0.1509, + "step": 13520 + }, + { + "epoch": 0.69, + "grad_norm": 1.0061548078404354, + "learning_rate": 4.697471186230168e-06, + "loss": 0.1723, + "step": 13521 + }, + { + "epoch": 0.69, + "grad_norm": 1.6402715115473485, + "learning_rate": 4.696074893033828e-06, + "loss": 0.1612, + "step": 13522 + }, + { + "epoch": 0.69, + "grad_norm": 0.9338752918297215, + "learning_rate": 4.694678743706754e-06, + "loss": 0.1874, + "step": 13523 + }, + { + "epoch": 0.69, + "grad_norm": 1.3107696335607468, + "learning_rate": 4.693282738286822e-06, + "loss": 0.1601, + "step": 13524 + }, + { + "epoch": 0.69, + "grad_norm": 1.0287906491985137, + "learning_rate": 4.6918868768118906e-06, + "loss": 0.1608, + "step": 13525 + }, + { + "epoch": 0.69, + "grad_norm": 1.3655937539532998, + "learning_rate": 4.690491159319829e-06, + "loss": 0.1741, + "step": 13526 + }, + { + "epoch": 0.69, + "grad_norm": 1.0816495827850183, + "learning_rate": 4.689095585848494e-06, + "loss": 0.1555, + "step": 13527 + }, + { + "epoch": 0.69, + "grad_norm": 1.2177066010221815, + "learning_rate": 4.687700156435745e-06, + "loss": 0.1835, + "step": 13528 + }, + { + "epoch": 0.69, + "grad_norm": 0.8894544378116847, + "learning_rate": 4.686304871119429e-06, + "loss": 0.1577, + "step": 13529 + }, + { + "epoch": 0.69, + "grad_norm": 1.4856797398142272, + "learning_rate": 4.68490972993739e-06, + "loss": 0.1922, + "step": 13530 + }, + { + "epoch": 0.69, + "grad_norm": 0.842186615856397, + "learning_rate": 4.683514732927479e-06, + "loss": 0.1726, + "step": 13531 + }, + { + "epoch": 0.69, + "grad_norm": 1.0661252290878187, + "learning_rate": 4.682119880127526e-06, + "loss": 0.181, + "step": 13532 + }, + { + "epoch": 0.69, + "grad_norm": 1.1548888139770868, + "learning_rate": 4.680725171575373e-06, + "loss": 0.1751, + "step": 13533 + }, + { + "epoch": 0.69, + "grad_norm": 0.9473059849013943, + "learning_rate": 4.679330607308849e-06, + "loss": 0.195, + "step": 13534 + }, + { + "epoch": 0.69, + "grad_norm": 1.4304758217482247, + "learning_rate": 4.677936187365787e-06, + "loss": 0.1815, + "step": 13535 + }, + { + "epoch": 0.69, + "grad_norm": 0.9854967453610628, + "learning_rate": 4.676541911784004e-06, + "loss": 0.1773, + "step": 13536 + }, + { + "epoch": 0.69, + "grad_norm": 1.2275356983406849, + "learning_rate": 4.675147780601324e-06, + "loss": 0.1621, + "step": 13537 + }, + { + "epoch": 0.69, + "grad_norm": 0.8949481564151959, + "learning_rate": 4.673753793855559e-06, + "loss": 0.173, + "step": 13538 + }, + { + "epoch": 0.69, + "grad_norm": 1.0624308576100148, + "learning_rate": 4.672359951584526e-06, + "loss": 0.1774, + "step": 13539 + }, + { + "epoch": 0.69, + "grad_norm": 0.9060232614534275, + "learning_rate": 4.670966253826027e-06, + "loss": 0.1626, + "step": 13540 + }, + { + "epoch": 0.69, + "grad_norm": 0.8764413985940274, + "learning_rate": 4.669572700617872e-06, + "loss": 0.1696, + "step": 13541 + }, + { + "epoch": 0.69, + "grad_norm": 1.1743336547666958, + "learning_rate": 4.6681792919978565e-06, + "loss": 0.1732, + "step": 13542 + }, + { + "epoch": 0.69, + "grad_norm": 0.9143877752093164, + "learning_rate": 4.666786028003778e-06, + "loss": 0.1569, + "step": 13543 + }, + { + "epoch": 0.69, + "grad_norm": 1.0888796487708368, + "learning_rate": 4.66539290867343e-06, + "loss": 0.157, + "step": 13544 + }, + { + "epoch": 0.69, + "grad_norm": 1.25089617021715, + "learning_rate": 4.6639999340446045e-06, + "loss": 0.1631, + "step": 13545 + }, + { + "epoch": 0.69, + "grad_norm": 1.4660317954025943, + "learning_rate": 4.662607104155081e-06, + "loss": 0.1864, + "step": 13546 + }, + { + "epoch": 0.69, + "grad_norm": 0.7761331672489337, + "learning_rate": 4.661214419042639e-06, + "loss": 0.1929, + "step": 13547 + }, + { + "epoch": 0.69, + "grad_norm": 1.0568586982203227, + "learning_rate": 4.65982187874506e-06, + "loss": 0.1751, + "step": 13548 + }, + { + "epoch": 0.69, + "grad_norm": 0.8537179892331328, + "learning_rate": 4.658429483300111e-06, + "loss": 0.1648, + "step": 13549 + }, + { + "epoch": 0.69, + "grad_norm": 1.3181985905550566, + "learning_rate": 4.6570372327455686e-06, + "loss": 0.1788, + "step": 13550 + }, + { + "epoch": 0.69, + "grad_norm": 1.8328073860100116, + "learning_rate": 4.6556451271191875e-06, + "loss": 0.1745, + "step": 13551 + }, + { + "epoch": 0.69, + "grad_norm": 1.2966089598592383, + "learning_rate": 4.6542531664587355e-06, + "loss": 0.1703, + "step": 13552 + }, + { + "epoch": 0.69, + "grad_norm": 1.1042830729817357, + "learning_rate": 4.652861350801967e-06, + "loss": 0.1756, + "step": 13553 + }, + { + "epoch": 0.69, + "grad_norm": 0.8945608077369172, + "learning_rate": 4.651469680186641e-06, + "loss": 0.1402, + "step": 13554 + }, + { + "epoch": 0.69, + "grad_norm": 1.672402165651304, + "learning_rate": 4.650078154650498e-06, + "loss": 0.1837, + "step": 13555 + }, + { + "epoch": 0.69, + "grad_norm": 1.071037699984741, + "learning_rate": 4.648686774231291e-06, + "loss": 0.1725, + "step": 13556 + }, + { + "epoch": 0.69, + "grad_norm": 0.8953953526421431, + "learning_rate": 4.647295538966754e-06, + "loss": 0.1629, + "step": 13557 + }, + { + "epoch": 0.69, + "grad_norm": 1.0473247996790758, + "learning_rate": 4.645904448894632e-06, + "loss": 0.1598, + "step": 13558 + }, + { + "epoch": 0.69, + "grad_norm": 0.9166293227738351, + "learning_rate": 4.644513504052649e-06, + "loss": 0.1726, + "step": 13559 + }, + { + "epoch": 0.69, + "grad_norm": 0.8631086924575336, + "learning_rate": 4.643122704478541e-06, + "loss": 0.1433, + "step": 13560 + }, + { + "epoch": 0.69, + "grad_norm": 1.6481431810406317, + "learning_rate": 4.641732050210032e-06, + "loss": 0.1535, + "step": 13561 + }, + { + "epoch": 0.69, + "grad_norm": 2.250059379509954, + "learning_rate": 4.640341541284847e-06, + "loss": 0.1824, + "step": 13562 + }, + { + "epoch": 0.69, + "grad_norm": 0.8370577808863652, + "learning_rate": 4.638951177740697e-06, + "loss": 0.1811, + "step": 13563 + }, + { + "epoch": 0.69, + "grad_norm": 1.0688233014826511, + "learning_rate": 4.637560959615302e-06, + "loss": 0.1542, + "step": 13564 + }, + { + "epoch": 0.69, + "grad_norm": 1.165288677744048, + "learning_rate": 4.636170886946371e-06, + "loss": 0.1886, + "step": 13565 + }, + { + "epoch": 0.69, + "grad_norm": 0.9040930500201682, + "learning_rate": 4.634780959771602e-06, + "loss": 0.1591, + "step": 13566 + }, + { + "epoch": 0.69, + "grad_norm": 1.0498273932508568, + "learning_rate": 4.633391178128707e-06, + "loss": 0.1724, + "step": 13567 + }, + { + "epoch": 0.69, + "grad_norm": 1.5616589553427944, + "learning_rate": 4.632001542055375e-06, + "loss": 0.1837, + "step": 13568 + }, + { + "epoch": 0.69, + "grad_norm": 1.1889390923160186, + "learning_rate": 4.630612051589305e-06, + "loss": 0.1698, + "step": 13569 + }, + { + "epoch": 0.69, + "grad_norm": 1.1318211001505287, + "learning_rate": 4.629222706768186e-06, + "loss": 0.1799, + "step": 13570 + }, + { + "epoch": 0.69, + "grad_norm": 1.3790972285522614, + "learning_rate": 4.6278335076297085e-06, + "loss": 0.1619, + "step": 13571 + }, + { + "epoch": 0.69, + "grad_norm": 1.7173679176378776, + "learning_rate": 4.626444454211547e-06, + "loss": 0.1688, + "step": 13572 + }, + { + "epoch": 0.69, + "grad_norm": 1.1035216925111442, + "learning_rate": 4.6250555465513866e-06, + "loss": 0.1538, + "step": 13573 + }, + { + "epoch": 0.69, + "grad_norm": 0.9786599408298793, + "learning_rate": 4.623666784686895e-06, + "loss": 0.1866, + "step": 13574 + }, + { + "epoch": 0.69, + "grad_norm": 1.421386305458509, + "learning_rate": 4.6222781686557485e-06, + "loss": 0.1817, + "step": 13575 + }, + { + "epoch": 0.69, + "grad_norm": 1.4270252922303626, + "learning_rate": 4.620889698495606e-06, + "loss": 0.1885, + "step": 13576 + }, + { + "epoch": 0.69, + "grad_norm": 1.2922180581849296, + "learning_rate": 4.619501374244138e-06, + "loss": 0.1724, + "step": 13577 + }, + { + "epoch": 0.69, + "grad_norm": 0.8167681017098267, + "learning_rate": 4.618113195938997e-06, + "loss": 0.1858, + "step": 13578 + }, + { + "epoch": 0.69, + "grad_norm": 1.004602648431441, + "learning_rate": 4.616725163617838e-06, + "loss": 0.1783, + "step": 13579 + }, + { + "epoch": 0.69, + "grad_norm": 0.984136134542311, + "learning_rate": 4.615337277318313e-06, + "loss": 0.1699, + "step": 13580 + }, + { + "epoch": 0.69, + "grad_norm": 0.8063277723505651, + "learning_rate": 4.613949537078074e-06, + "loss": 0.1626, + "step": 13581 + }, + { + "epoch": 0.69, + "grad_norm": 1.6604415687659848, + "learning_rate": 4.612561942934757e-06, + "loss": 0.1848, + "step": 13582 + }, + { + "epoch": 0.69, + "grad_norm": 1.60353535338271, + "learning_rate": 4.611174494925998e-06, + "loss": 0.1684, + "step": 13583 + }, + { + "epoch": 0.69, + "grad_norm": 1.5032736790328078, + "learning_rate": 4.609787193089438e-06, + "loss": 0.1871, + "step": 13584 + }, + { + "epoch": 0.69, + "grad_norm": 2.9766809056291623, + "learning_rate": 4.608400037462702e-06, + "loss": 0.1659, + "step": 13585 + }, + { + "epoch": 0.69, + "grad_norm": 0.8504763258567766, + "learning_rate": 4.607013028083419e-06, + "loss": 0.1633, + "step": 13586 + }, + { + "epoch": 0.69, + "grad_norm": 0.9512887618479131, + "learning_rate": 4.605626164989212e-06, + "loss": 0.1586, + "step": 13587 + }, + { + "epoch": 0.69, + "grad_norm": 0.9440239043274677, + "learning_rate": 4.604239448217704e-06, + "loss": 0.1931, + "step": 13588 + }, + { + "epoch": 0.69, + "grad_norm": 0.94407399820329, + "learning_rate": 4.602852877806502e-06, + "loss": 0.1728, + "step": 13589 + }, + { + "epoch": 0.69, + "grad_norm": 0.8781022672939047, + "learning_rate": 4.601466453793224e-06, + "loss": 0.16, + "step": 13590 + }, + { + "epoch": 0.69, + "grad_norm": 1.02587555658374, + "learning_rate": 4.60008017621547e-06, + "loss": 0.1712, + "step": 13591 + }, + { + "epoch": 0.69, + "grad_norm": 0.8299062732482788, + "learning_rate": 4.598694045110851e-06, + "loss": 0.1637, + "step": 13592 + }, + { + "epoch": 0.69, + "grad_norm": 1.6775253264414962, + "learning_rate": 4.597308060516956e-06, + "loss": 0.178, + "step": 13593 + }, + { + "epoch": 0.69, + "grad_norm": 1.0173767614248344, + "learning_rate": 4.595922222471388e-06, + "loss": 0.1702, + "step": 13594 + }, + { + "epoch": 0.69, + "grad_norm": 1.15134970328035, + "learning_rate": 4.5945365310117325e-06, + "loss": 0.2012, + "step": 13595 + }, + { + "epoch": 0.69, + "grad_norm": 1.2736440643117173, + "learning_rate": 4.593150986175578e-06, + "loss": 0.1611, + "step": 13596 + }, + { + "epoch": 0.69, + "grad_norm": 0.9160802750515318, + "learning_rate": 4.59176558800051e-06, + "loss": 0.1854, + "step": 13597 + }, + { + "epoch": 0.69, + "grad_norm": 1.0618565771701247, + "learning_rate": 4.590380336524108e-06, + "loss": 0.1863, + "step": 13598 + }, + { + "epoch": 0.69, + "grad_norm": 1.4095872357508041, + "learning_rate": 4.588995231783946e-06, + "loss": 0.1574, + "step": 13599 + }, + { + "epoch": 0.69, + "grad_norm": 1.4633830074142442, + "learning_rate": 4.5876102738175895e-06, + "loss": 0.1711, + "step": 13600 + }, + { + "epoch": 0.69, + "grad_norm": 1.1496702197396802, + "learning_rate": 4.586225462662615e-06, + "loss": 0.1694, + "step": 13601 + }, + { + "epoch": 0.69, + "grad_norm": 1.2995727514266986, + "learning_rate": 4.584840798356574e-06, + "loss": 0.1774, + "step": 13602 + }, + { + "epoch": 0.69, + "grad_norm": 1.2066523787933985, + "learning_rate": 4.583456280937035e-06, + "loss": 0.178, + "step": 13603 + }, + { + "epoch": 0.69, + "grad_norm": 1.0492721475579148, + "learning_rate": 4.582071910441549e-06, + "loss": 0.159, + "step": 13604 + }, + { + "epoch": 0.69, + "grad_norm": 1.9537120737237095, + "learning_rate": 4.580687686907673e-06, + "loss": 0.1438, + "step": 13605 + }, + { + "epoch": 0.69, + "grad_norm": 2.1519038592765245, + "learning_rate": 4.579303610372945e-06, + "loss": 0.165, + "step": 13606 + }, + { + "epoch": 0.69, + "grad_norm": 0.9498047286022816, + "learning_rate": 4.577919680874917e-06, + "loss": 0.1709, + "step": 13607 + }, + { + "epoch": 0.69, + "grad_norm": 1.250374359466655, + "learning_rate": 4.57653589845112e-06, + "loss": 0.1443, + "step": 13608 + }, + { + "epoch": 0.69, + "grad_norm": 0.918021914540636, + "learning_rate": 4.575152263139096e-06, + "loss": 0.1468, + "step": 13609 + }, + { + "epoch": 0.69, + "grad_norm": 1.416579009119526, + "learning_rate": 4.573768774976371e-06, + "loss": 0.1792, + "step": 13610 + }, + { + "epoch": 0.69, + "grad_norm": 1.2171797058734255, + "learning_rate": 4.572385434000477e-06, + "loss": 0.1675, + "step": 13611 + }, + { + "epoch": 0.69, + "grad_norm": 1.3487422467380303, + "learning_rate": 4.5710022402489316e-06, + "loss": 0.1774, + "step": 13612 + }, + { + "epoch": 0.69, + "grad_norm": 0.8390122329378098, + "learning_rate": 4.569619193759257e-06, + "loss": 0.1526, + "step": 13613 + }, + { + "epoch": 0.69, + "grad_norm": 1.0920505118260386, + "learning_rate": 4.5682362945689684e-06, + "loss": 0.1821, + "step": 13614 + }, + { + "epoch": 0.69, + "grad_norm": 1.2520630500194392, + "learning_rate": 4.5668535427155816e-06, + "loss": 0.1818, + "step": 13615 + }, + { + "epoch": 0.69, + "grad_norm": 0.9270273436084522, + "learning_rate": 4.565470938236598e-06, + "loss": 0.1543, + "step": 13616 + }, + { + "epoch": 0.69, + "grad_norm": 1.0954459642480405, + "learning_rate": 4.5640884811695185e-06, + "loss": 0.1644, + "step": 13617 + }, + { + "epoch": 0.69, + "grad_norm": 0.939343554571579, + "learning_rate": 4.5627061715518504e-06, + "loss": 0.1832, + "step": 13618 + }, + { + "epoch": 0.69, + "grad_norm": 0.9068057167860933, + "learning_rate": 4.561324009421081e-06, + "loss": 0.1756, + "step": 13619 + }, + { + "epoch": 0.69, + "grad_norm": 1.21653027952336, + "learning_rate": 4.5599419948147075e-06, + "loss": 0.1592, + "step": 13620 + }, + { + "epoch": 0.69, + "grad_norm": 0.9564302335647423, + "learning_rate": 4.558560127770212e-06, + "loss": 0.1713, + "step": 13621 + }, + { + "epoch": 0.69, + "grad_norm": 0.8784787604845598, + "learning_rate": 4.55717840832508e-06, + "loss": 0.155, + "step": 13622 + }, + { + "epoch": 0.69, + "grad_norm": 0.8785266321659584, + "learning_rate": 4.5557968365167905e-06, + "loss": 0.1812, + "step": 13623 + }, + { + "epoch": 0.69, + "grad_norm": 1.9026291345633957, + "learning_rate": 4.5544154123828246e-06, + "loss": 0.1774, + "step": 13624 + }, + { + "epoch": 0.69, + "grad_norm": 1.1734358448640543, + "learning_rate": 4.553034135960643e-06, + "loss": 0.1724, + "step": 13625 + }, + { + "epoch": 0.69, + "grad_norm": 1.3709598780484173, + "learning_rate": 4.551653007287722e-06, + "loss": 0.155, + "step": 13626 + }, + { + "epoch": 0.69, + "grad_norm": 1.0501572052064885, + "learning_rate": 4.550272026401518e-06, + "loss": 0.1617, + "step": 13627 + }, + { + "epoch": 0.69, + "grad_norm": 1.106664806576892, + "learning_rate": 4.548891193339496e-06, + "loss": 0.1843, + "step": 13628 + }, + { + "epoch": 0.69, + "grad_norm": 1.4397585434339168, + "learning_rate": 4.5475105081391045e-06, + "loss": 0.1819, + "step": 13629 + }, + { + "epoch": 0.69, + "grad_norm": 6.77510083667638, + "learning_rate": 4.546129970837799e-06, + "loss": 0.1791, + "step": 13630 + }, + { + "epoch": 0.69, + "grad_norm": 0.8667437504260953, + "learning_rate": 4.544749581473026e-06, + "loss": 0.1788, + "step": 13631 + }, + { + "epoch": 0.69, + "grad_norm": 1.0402234284927807, + "learning_rate": 4.543369340082232e-06, + "loss": 0.1657, + "step": 13632 + }, + { + "epoch": 0.69, + "grad_norm": 1.2141343623878411, + "learning_rate": 4.54198924670285e-06, + "loss": 0.1783, + "step": 13633 + }, + { + "epoch": 0.69, + "grad_norm": 0.9137213467530735, + "learning_rate": 4.540609301372321e-06, + "loss": 0.1905, + "step": 13634 + }, + { + "epoch": 0.69, + "grad_norm": 1.0482645074998238, + "learning_rate": 4.539229504128073e-06, + "loss": 0.1715, + "step": 13635 + }, + { + "epoch": 0.69, + "grad_norm": 1.3001918628637983, + "learning_rate": 4.53784985500753e-06, + "loss": 0.1828, + "step": 13636 + }, + { + "epoch": 0.69, + "grad_norm": 1.4116670746126074, + "learning_rate": 4.536470354048121e-06, + "loss": 0.1869, + "step": 13637 + }, + { + "epoch": 0.69, + "grad_norm": 0.8045872138382979, + "learning_rate": 4.535091001287259e-06, + "loss": 0.1484, + "step": 13638 + }, + { + "epoch": 0.69, + "grad_norm": 1.0206620077990305, + "learning_rate": 4.533711796762362e-06, + "loss": 0.1993, + "step": 13639 + }, + { + "epoch": 0.69, + "grad_norm": 1.4379884999398949, + "learning_rate": 4.532332740510842e-06, + "loss": 0.1645, + "step": 13640 + }, + { + "epoch": 0.69, + "grad_norm": 1.6563946729632992, + "learning_rate": 4.530953832570109e-06, + "loss": 0.1724, + "step": 13641 + }, + { + "epoch": 0.69, + "grad_norm": 1.1491454430323254, + "learning_rate": 4.5295750729775565e-06, + "loss": 0.1714, + "step": 13642 + }, + { + "epoch": 0.69, + "grad_norm": 0.805474854958942, + "learning_rate": 4.528196461770596e-06, + "loss": 0.1534, + "step": 13643 + }, + { + "epoch": 0.69, + "grad_norm": 0.9306763153096663, + "learning_rate": 4.526817998986609e-06, + "loss": 0.1653, + "step": 13644 + }, + { + "epoch": 0.69, + "grad_norm": 1.0826607768088683, + "learning_rate": 4.5254396846629975e-06, + "loss": 0.1562, + "step": 13645 + }, + { + "epoch": 0.69, + "grad_norm": 1.0587769923842847, + "learning_rate": 4.5240615188371404e-06, + "loss": 0.1681, + "step": 13646 + }, + { + "epoch": 0.69, + "grad_norm": 1.0287482429144468, + "learning_rate": 4.522683501546428e-06, + "loss": 0.1685, + "step": 13647 + }, + { + "epoch": 0.69, + "grad_norm": 0.9475189022165657, + "learning_rate": 4.52130563282823e-06, + "loss": 0.1533, + "step": 13648 + }, + { + "epoch": 0.69, + "grad_norm": 0.9559323826822024, + "learning_rate": 4.519927912719927e-06, + "loss": 0.2016, + "step": 13649 + }, + { + "epoch": 0.69, + "grad_norm": 0.9539906601319694, + "learning_rate": 4.518550341258888e-06, + "loss": 0.2025, + "step": 13650 + }, + { + "epoch": 0.69, + "grad_norm": 0.7717629165163703, + "learning_rate": 4.517172918482485e-06, + "loss": 0.1606, + "step": 13651 + }, + { + "epoch": 0.69, + "grad_norm": 1.5207736020799518, + "learning_rate": 4.515795644428076e-06, + "loss": 0.191, + "step": 13652 + }, + { + "epoch": 0.69, + "grad_norm": 2.051008527005148, + "learning_rate": 4.514418519133017e-06, + "loss": 0.1737, + "step": 13653 + }, + { + "epoch": 0.69, + "grad_norm": 1.3983662117819151, + "learning_rate": 4.513041542634668e-06, + "loss": 0.1852, + "step": 13654 + }, + { + "epoch": 0.69, + "grad_norm": 1.1120072938260044, + "learning_rate": 4.511664714970374e-06, + "loss": 0.2028, + "step": 13655 + }, + { + "epoch": 0.69, + "grad_norm": 1.0610598459717966, + "learning_rate": 4.510288036177485e-06, + "loss": 0.1831, + "step": 13656 + }, + { + "epoch": 0.69, + "grad_norm": 1.2201168795069803, + "learning_rate": 4.508911506293343e-06, + "loss": 0.1497, + "step": 13657 + }, + { + "epoch": 0.69, + "grad_norm": 1.019804582026071, + "learning_rate": 4.50753512535529e-06, + "loss": 0.1689, + "step": 13658 + }, + { + "epoch": 0.69, + "grad_norm": 1.5116796780430295, + "learning_rate": 4.5061588934006525e-06, + "loss": 0.169, + "step": 13659 + }, + { + "epoch": 0.69, + "grad_norm": 0.9856392022738341, + "learning_rate": 4.50478281046677e-06, + "loss": 0.152, + "step": 13660 + }, + { + "epoch": 0.69, + "grad_norm": 1.6570900891806821, + "learning_rate": 4.50340687659096e-06, + "loss": 0.1617, + "step": 13661 + }, + { + "epoch": 0.69, + "grad_norm": 1.283983702559894, + "learning_rate": 4.502031091810553e-06, + "loss": 0.1772, + "step": 13662 + }, + { + "epoch": 0.69, + "grad_norm": 0.8647829813983922, + "learning_rate": 4.500655456162859e-06, + "loss": 0.1605, + "step": 13663 + }, + { + "epoch": 0.69, + "grad_norm": 0.8856314954674944, + "learning_rate": 4.4992799696852e-06, + "loss": 0.1602, + "step": 13664 + }, + { + "epoch": 0.69, + "grad_norm": 0.8324634660473998, + "learning_rate": 4.497904632414879e-06, + "loss": 0.1672, + "step": 13665 + }, + { + "epoch": 0.69, + "grad_norm": 1.2142499210913824, + "learning_rate": 4.496529444389206e-06, + "loss": 0.1883, + "step": 13666 + }, + { + "epoch": 0.69, + "grad_norm": 1.0431450995646872, + "learning_rate": 4.495154405645482e-06, + "loss": 0.1901, + "step": 13667 + }, + { + "epoch": 0.7, + "grad_norm": 1.0691190749628912, + "learning_rate": 4.493779516221009e-06, + "loss": 0.1844, + "step": 13668 + }, + { + "epoch": 0.7, + "grad_norm": 1.7628766475524442, + "learning_rate": 4.492404776153078e-06, + "loss": 0.1729, + "step": 13669 + }, + { + "epoch": 0.7, + "grad_norm": 0.9033434819234581, + "learning_rate": 4.491030185478976e-06, + "loss": 0.1811, + "step": 13670 + }, + { + "epoch": 0.7, + "grad_norm": 1.296385993849624, + "learning_rate": 4.489655744235994e-06, + "loss": 0.1705, + "step": 13671 + }, + { + "epoch": 0.7, + "grad_norm": 1.2469644716777546, + "learning_rate": 4.488281452461407e-06, + "loss": 0.1694, + "step": 13672 + }, + { + "epoch": 0.7, + "grad_norm": 0.8311193064293749, + "learning_rate": 4.4869073101925024e-06, + "loss": 0.1622, + "step": 13673 + }, + { + "epoch": 0.7, + "grad_norm": 1.8013546615744547, + "learning_rate": 4.4855333174665425e-06, + "loss": 0.1701, + "step": 13674 + }, + { + "epoch": 0.7, + "grad_norm": 3.628093865865405, + "learning_rate": 4.484159474320804e-06, + "loss": 0.1751, + "step": 13675 + }, + { + "epoch": 0.7, + "grad_norm": 1.3232699202577978, + "learning_rate": 4.482785780792551e-06, + "loss": 0.169, + "step": 13676 + }, + { + "epoch": 0.7, + "grad_norm": 1.5385823443408615, + "learning_rate": 4.481412236919049e-06, + "loss": 0.1629, + "step": 13677 + }, + { + "epoch": 0.7, + "grad_norm": 0.9031756800976332, + "learning_rate": 4.480038842737548e-06, + "loss": 0.1479, + "step": 13678 + }, + { + "epoch": 0.7, + "grad_norm": 1.257354476225968, + "learning_rate": 4.47866559828531e-06, + "loss": 0.1869, + "step": 13679 + }, + { + "epoch": 0.7, + "grad_norm": 0.9714070942255482, + "learning_rate": 4.477292503599574e-06, + "loss": 0.1822, + "step": 13680 + }, + { + "epoch": 0.7, + "grad_norm": 1.1593533989119276, + "learning_rate": 4.475919558717596e-06, + "loss": 0.1904, + "step": 13681 + }, + { + "epoch": 0.7, + "grad_norm": 1.5104136020876684, + "learning_rate": 4.474546763676607e-06, + "loss": 0.1784, + "step": 13682 + }, + { + "epoch": 0.7, + "grad_norm": 1.2655799392585738, + "learning_rate": 4.47317411851385e-06, + "loss": 0.1744, + "step": 13683 + }, + { + "epoch": 0.7, + "grad_norm": 1.2385869323659042, + "learning_rate": 4.471801623266558e-06, + "loss": 0.1445, + "step": 13684 + }, + { + "epoch": 0.7, + "grad_norm": 0.7638791201707295, + "learning_rate": 4.470429277971961e-06, + "loss": 0.1627, + "step": 13685 + }, + { + "epoch": 0.7, + "grad_norm": 1.0250060836714399, + "learning_rate": 4.469057082667283e-06, + "loss": 0.196, + "step": 13686 + }, + { + "epoch": 0.7, + "grad_norm": 2.077268257744968, + "learning_rate": 4.46768503738974e-06, + "loss": 0.158, + "step": 13687 + }, + { + "epoch": 0.7, + "grad_norm": 0.9390479740377202, + "learning_rate": 4.466313142176557e-06, + "loss": 0.17, + "step": 13688 + }, + { + "epoch": 0.7, + "grad_norm": 0.799024685884593, + "learning_rate": 4.464941397064938e-06, + "loss": 0.1583, + "step": 13689 + }, + { + "epoch": 0.7, + "grad_norm": 1.49710107221133, + "learning_rate": 4.4635698020921016e-06, + "loss": 0.183, + "step": 13690 + }, + { + "epoch": 0.7, + "grad_norm": 2.5396832424734, + "learning_rate": 4.462198357295242e-06, + "loss": 0.1738, + "step": 13691 + }, + { + "epoch": 0.7, + "grad_norm": 0.8603738965306661, + "learning_rate": 4.460827062711564e-06, + "loss": 0.1618, + "step": 13692 + }, + { + "epoch": 0.7, + "grad_norm": 0.8633705184933675, + "learning_rate": 4.459455918378266e-06, + "loss": 0.1624, + "step": 13693 + }, + { + "epoch": 0.7, + "grad_norm": 1.315956016828563, + "learning_rate": 4.458084924332543e-06, + "loss": 0.1703, + "step": 13694 + }, + { + "epoch": 0.7, + "grad_norm": 0.860348062193069, + "learning_rate": 4.456714080611575e-06, + "loss": 0.1672, + "step": 13695 + }, + { + "epoch": 0.7, + "grad_norm": 0.9190060904210984, + "learning_rate": 4.455343387252555e-06, + "loss": 0.1822, + "step": 13696 + }, + { + "epoch": 0.7, + "grad_norm": 0.7940459239761528, + "learning_rate": 4.453972844292654e-06, + "loss": 0.1737, + "step": 13697 + }, + { + "epoch": 0.7, + "grad_norm": 1.097721016626529, + "learning_rate": 4.452602451769058e-06, + "loss": 0.1628, + "step": 13698 + }, + { + "epoch": 0.7, + "grad_norm": 1.0759386647310396, + "learning_rate": 4.4512322097189295e-06, + "loss": 0.167, + "step": 13699 + }, + { + "epoch": 0.7, + "grad_norm": 1.3701935230414497, + "learning_rate": 4.449862118179444e-06, + "loss": 0.1584, + "step": 13700 + }, + { + "epoch": 0.7, + "grad_norm": 0.9541634410686334, + "learning_rate": 4.44849217718776e-06, + "loss": 0.1795, + "step": 13701 + }, + { + "epoch": 0.7, + "grad_norm": 0.9153389222149987, + "learning_rate": 4.447122386781038e-06, + "loss": 0.1872, + "step": 13702 + }, + { + "epoch": 0.7, + "grad_norm": 0.9417906727918101, + "learning_rate": 4.445752746996438e-06, + "loss": 0.1725, + "step": 13703 + }, + { + "epoch": 0.7, + "grad_norm": 2.1049319774043718, + "learning_rate": 4.4443832578711055e-06, + "loss": 0.1989, + "step": 13704 + }, + { + "epoch": 0.7, + "grad_norm": 0.9056218269926907, + "learning_rate": 4.443013919442194e-06, + "loss": 0.1826, + "step": 13705 + }, + { + "epoch": 0.7, + "grad_norm": 0.984890726452985, + "learning_rate": 4.4416447317468405e-06, + "loss": 0.1716, + "step": 13706 + }, + { + "epoch": 0.7, + "grad_norm": 1.0207518181035065, + "learning_rate": 4.440275694822192e-06, + "loss": 0.166, + "step": 13707 + }, + { + "epoch": 0.7, + "grad_norm": 1.0303227419266743, + "learning_rate": 4.438906808705374e-06, + "loss": 0.1637, + "step": 13708 + }, + { + "epoch": 0.7, + "grad_norm": 1.0069749895417077, + "learning_rate": 4.437538073433524e-06, + "loss": 0.1834, + "step": 13709 + }, + { + "epoch": 0.7, + "grad_norm": 1.11647684150671, + "learning_rate": 4.436169489043768e-06, + "loss": 0.2011, + "step": 13710 + }, + { + "epoch": 0.7, + "grad_norm": 1.034624038537078, + "learning_rate": 4.434801055573232e-06, + "loss": 0.1871, + "step": 13711 + }, + { + "epoch": 0.7, + "grad_norm": 1.5391380793012959, + "learning_rate": 4.433432773059028e-06, + "loss": 0.155, + "step": 13712 + }, + { + "epoch": 0.7, + "grad_norm": 1.0803455955807053, + "learning_rate": 4.432064641538279e-06, + "loss": 0.1773, + "step": 13713 + }, + { + "epoch": 0.7, + "grad_norm": 0.9100790739475726, + "learning_rate": 4.430696661048086e-06, + "loss": 0.1683, + "step": 13714 + }, + { + "epoch": 0.7, + "grad_norm": 1.0718263998332211, + "learning_rate": 4.429328831625565e-06, + "loss": 0.1694, + "step": 13715 + }, + { + "epoch": 0.7, + "grad_norm": 1.1892483023347649, + "learning_rate": 4.427961153307811e-06, + "loss": 0.1673, + "step": 13716 + }, + { + "epoch": 0.7, + "grad_norm": 1.207942887979543, + "learning_rate": 4.426593626131928e-06, + "loss": 0.1533, + "step": 13717 + }, + { + "epoch": 0.7, + "grad_norm": 0.888967603029378, + "learning_rate": 4.425226250135005e-06, + "loss": 0.1736, + "step": 13718 + }, + { + "epoch": 0.7, + "grad_norm": 1.6935255519516728, + "learning_rate": 4.4238590253541335e-06, + "loss": 0.183, + "step": 13719 + }, + { + "epoch": 0.7, + "grad_norm": 3.4630156900562032, + "learning_rate": 4.422491951826402e-06, + "loss": 0.1638, + "step": 13720 + }, + { + "epoch": 0.7, + "grad_norm": 1.2770553329985392, + "learning_rate": 4.421125029588895e-06, + "loss": 0.166, + "step": 13721 + }, + { + "epoch": 0.7, + "grad_norm": 1.5433447524383987, + "learning_rate": 4.419758258678687e-06, + "loss": 0.173, + "step": 13722 + }, + { + "epoch": 0.7, + "grad_norm": 1.0737467851992872, + "learning_rate": 4.418391639132847e-06, + "loss": 0.1707, + "step": 13723 + }, + { + "epoch": 0.7, + "grad_norm": 0.9946446709389992, + "learning_rate": 4.4170251709884526e-06, + "loss": 0.1785, + "step": 13724 + }, + { + "epoch": 0.7, + "grad_norm": 1.4318930340410951, + "learning_rate": 4.4156588542825625e-06, + "loss": 0.1586, + "step": 13725 + }, + { + "epoch": 0.7, + "grad_norm": 3.510405383262583, + "learning_rate": 4.414292689052241e-06, + "loss": 0.1779, + "step": 13726 + }, + { + "epoch": 0.7, + "grad_norm": 0.9444011738801847, + "learning_rate": 4.412926675334546e-06, + "loss": 0.1636, + "step": 13727 + }, + { + "epoch": 0.7, + "grad_norm": 1.2076138811368429, + "learning_rate": 4.411560813166535e-06, + "loss": 0.1708, + "step": 13728 + }, + { + "epoch": 0.7, + "grad_norm": 2.370268298456576, + "learning_rate": 4.410195102585247e-06, + "loss": 0.1799, + "step": 13729 + }, + { + "epoch": 0.7, + "grad_norm": 0.9673169171945579, + "learning_rate": 4.408829543627737e-06, + "loss": 0.1746, + "step": 13730 + }, + { + "epoch": 0.7, + "grad_norm": 0.8898649802675859, + "learning_rate": 4.407464136331039e-06, + "loss": 0.169, + "step": 13731 + }, + { + "epoch": 0.7, + "grad_norm": 1.8779739505312871, + "learning_rate": 4.406098880732195e-06, + "loss": 0.1844, + "step": 13732 + }, + { + "epoch": 0.7, + "grad_norm": 1.0552422789470997, + "learning_rate": 4.404733776868231e-06, + "loss": 0.1786, + "step": 13733 + }, + { + "epoch": 0.7, + "grad_norm": 1.3731759447314973, + "learning_rate": 4.403368824776183e-06, + "loss": 0.1718, + "step": 13734 + }, + { + "epoch": 0.7, + "grad_norm": 2.8969742110147156, + "learning_rate": 4.402004024493069e-06, + "loss": 0.1735, + "step": 13735 + }, + { + "epoch": 0.7, + "grad_norm": 1.8454532379142259, + "learning_rate": 4.4006393760559105e-06, + "loss": 0.1647, + "step": 13736 + }, + { + "epoch": 0.7, + "grad_norm": 1.719264383388367, + "learning_rate": 4.399274879501726e-06, + "loss": 0.1765, + "step": 13737 + }, + { + "epoch": 0.7, + "grad_norm": 1.1110548054408023, + "learning_rate": 4.39791053486753e-06, + "loss": 0.1862, + "step": 13738 + }, + { + "epoch": 0.7, + "grad_norm": 2.8295730102929992, + "learning_rate": 4.396546342190327e-06, + "loss": 0.1518, + "step": 13739 + }, + { + "epoch": 0.7, + "grad_norm": 0.904612012615748, + "learning_rate": 4.3951823015071186e-06, + "loss": 0.1683, + "step": 13740 + }, + { + "epoch": 0.7, + "grad_norm": 1.0040002316631793, + "learning_rate": 4.393818412854909e-06, + "loss": 0.1811, + "step": 13741 + }, + { + "epoch": 0.7, + "grad_norm": 1.2752164591376416, + "learning_rate": 4.392454676270687e-06, + "loss": 0.1604, + "step": 13742 + }, + { + "epoch": 0.7, + "grad_norm": 1.0096690577838259, + "learning_rate": 4.391091091791455e-06, + "loss": 0.1636, + "step": 13743 + }, + { + "epoch": 0.7, + "grad_norm": 1.3945369984716742, + "learning_rate": 4.389727659454189e-06, + "loss": 0.1741, + "step": 13744 + }, + { + "epoch": 0.7, + "grad_norm": 0.8191888837385805, + "learning_rate": 4.388364379295878e-06, + "loss": 0.1619, + "step": 13745 + }, + { + "epoch": 0.7, + "grad_norm": 1.4411158575624718, + "learning_rate": 4.387001251353499e-06, + "loss": 0.1612, + "step": 13746 + }, + { + "epoch": 0.7, + "grad_norm": 1.6639282345204542, + "learning_rate": 4.3856382756640315e-06, + "loss": 0.1821, + "step": 13747 + }, + { + "epoch": 0.7, + "grad_norm": 0.9922552014129387, + "learning_rate": 4.3842754522644394e-06, + "loss": 0.1863, + "step": 13748 + }, + { + "epoch": 0.7, + "grad_norm": 1.1021385598823878, + "learning_rate": 4.382912781191697e-06, + "loss": 0.1895, + "step": 13749 + }, + { + "epoch": 0.7, + "grad_norm": 1.0179098567527967, + "learning_rate": 4.381550262482759e-06, + "loss": 0.1586, + "step": 13750 + }, + { + "epoch": 0.7, + "grad_norm": 0.999589172516189, + "learning_rate": 4.380187896174591e-06, + "loss": 0.191, + "step": 13751 + }, + { + "epoch": 0.7, + "grad_norm": 1.0142183508137548, + "learning_rate": 4.37882568230414e-06, + "loss": 0.1714, + "step": 13752 + }, + { + "epoch": 0.7, + "grad_norm": 1.7396820691494044, + "learning_rate": 4.37746362090836e-06, + "loss": 0.1879, + "step": 13753 + }, + { + "epoch": 0.7, + "grad_norm": 1.7150075562181797, + "learning_rate": 4.376101712024197e-06, + "loss": 0.1967, + "step": 13754 + }, + { + "epoch": 0.7, + "grad_norm": 0.9169604172984819, + "learning_rate": 4.374739955688595e-06, + "loss": 0.1517, + "step": 13755 + }, + { + "epoch": 0.7, + "grad_norm": 1.5748245219247363, + "learning_rate": 4.373378351938491e-06, + "loss": 0.1449, + "step": 13756 + }, + { + "epoch": 0.7, + "grad_norm": 0.9426798859246521, + "learning_rate": 4.372016900810813e-06, + "loss": 0.1728, + "step": 13757 + }, + { + "epoch": 0.7, + "grad_norm": 0.9292767642651203, + "learning_rate": 4.370655602342497e-06, + "loss": 0.1665, + "step": 13758 + }, + { + "epoch": 0.7, + "grad_norm": 0.9681826868018988, + "learning_rate": 4.369294456570463e-06, + "loss": 0.1758, + "step": 13759 + }, + { + "epoch": 0.7, + "grad_norm": 1.1272505686047567, + "learning_rate": 4.3679334635316395e-06, + "loss": 0.183, + "step": 13760 + }, + { + "epoch": 0.7, + "grad_norm": 1.3020722492673074, + "learning_rate": 4.366572623262934e-06, + "loss": 0.1595, + "step": 13761 + }, + { + "epoch": 0.7, + "grad_norm": 2.7390029556224693, + "learning_rate": 4.365211935801264e-06, + "loss": 0.1916, + "step": 13762 + }, + { + "epoch": 0.7, + "grad_norm": 1.075142392697466, + "learning_rate": 4.363851401183539e-06, + "loss": 0.1826, + "step": 13763 + }, + { + "epoch": 0.7, + "grad_norm": 1.937147420005713, + "learning_rate": 4.3624910194466675e-06, + "loss": 0.1843, + "step": 13764 + }, + { + "epoch": 0.7, + "grad_norm": 10.87810639949517, + "learning_rate": 4.361130790627541e-06, + "loss": 0.1532, + "step": 13765 + }, + { + "epoch": 0.7, + "grad_norm": 1.308672685903403, + "learning_rate": 4.3597707147630645e-06, + "loss": 0.1758, + "step": 13766 + }, + { + "epoch": 0.7, + "grad_norm": 1.514058633514923, + "learning_rate": 4.358410791890122e-06, + "loss": 0.1718, + "step": 13767 + }, + { + "epoch": 0.7, + "grad_norm": 1.2272785034178764, + "learning_rate": 4.357051022045608e-06, + "loss": 0.171, + "step": 13768 + }, + { + "epoch": 0.7, + "grad_norm": 0.8642054250805404, + "learning_rate": 4.355691405266401e-06, + "loss": 0.1507, + "step": 13769 + }, + { + "epoch": 0.7, + "grad_norm": 1.0457190185214238, + "learning_rate": 4.354331941589387e-06, + "loss": 0.1679, + "step": 13770 + }, + { + "epoch": 0.7, + "grad_norm": 1.2799911203581829, + "learning_rate": 4.352972631051435e-06, + "loss": 0.1879, + "step": 13771 + }, + { + "epoch": 0.7, + "grad_norm": 1.6635745340448762, + "learning_rate": 4.351613473689419e-06, + "loss": 0.1617, + "step": 13772 + }, + { + "epoch": 0.7, + "grad_norm": 0.8447529424528541, + "learning_rate": 4.350254469540209e-06, + "loss": 0.1517, + "step": 13773 + }, + { + "epoch": 0.7, + "grad_norm": 1.5466512179947478, + "learning_rate": 4.348895618640663e-06, + "loss": 0.1748, + "step": 13774 + }, + { + "epoch": 0.7, + "grad_norm": 0.8568594087610589, + "learning_rate": 4.347536921027646e-06, + "loss": 0.1654, + "step": 13775 + }, + { + "epoch": 0.7, + "grad_norm": 1.1145052751272784, + "learning_rate": 4.346178376738006e-06, + "loss": 0.193, + "step": 13776 + }, + { + "epoch": 0.7, + "grad_norm": 0.8522218692305981, + "learning_rate": 4.344819985808601e-06, + "loss": 0.1649, + "step": 13777 + }, + { + "epoch": 0.7, + "grad_norm": 1.0989297052512519, + "learning_rate": 4.343461748276267e-06, + "loss": 0.1849, + "step": 13778 + }, + { + "epoch": 0.7, + "grad_norm": 1.2810290352563842, + "learning_rate": 4.342103664177856e-06, + "loss": 0.172, + "step": 13779 + }, + { + "epoch": 0.7, + "grad_norm": 2.72999235370836, + "learning_rate": 4.3407457335502e-06, + "loss": 0.1643, + "step": 13780 + }, + { + "epoch": 0.7, + "grad_norm": 0.9451376817683268, + "learning_rate": 4.339387956430141e-06, + "loss": 0.1646, + "step": 13781 + }, + { + "epoch": 0.7, + "grad_norm": 1.321991051912222, + "learning_rate": 4.3380303328545e-06, + "loss": 0.1706, + "step": 13782 + }, + { + "epoch": 0.7, + "grad_norm": 1.1961186962090051, + "learning_rate": 4.336672862860107e-06, + "loss": 0.185, + "step": 13783 + }, + { + "epoch": 0.7, + "grad_norm": 0.9379183664300133, + "learning_rate": 4.335315546483781e-06, + "loss": 0.1697, + "step": 13784 + }, + { + "epoch": 0.7, + "grad_norm": 0.9396690803575051, + "learning_rate": 4.333958383762345e-06, + "loss": 0.1493, + "step": 13785 + }, + { + "epoch": 0.7, + "grad_norm": 1.143837166113807, + "learning_rate": 4.332601374732602e-06, + "loss": 0.1744, + "step": 13786 + }, + { + "epoch": 0.7, + "grad_norm": 0.9022516815447942, + "learning_rate": 4.331244519431371e-06, + "loss": 0.1565, + "step": 13787 + }, + { + "epoch": 0.7, + "grad_norm": 1.034186295116794, + "learning_rate": 4.329887817895451e-06, + "loss": 0.1756, + "step": 13788 + }, + { + "epoch": 0.7, + "grad_norm": 0.9630885441659184, + "learning_rate": 4.328531270161642e-06, + "loss": 0.1712, + "step": 13789 + }, + { + "epoch": 0.7, + "grad_norm": 1.0721431552611012, + "learning_rate": 4.327174876266743e-06, + "loss": 0.1594, + "step": 13790 + }, + { + "epoch": 0.7, + "grad_norm": 1.225581281392638, + "learning_rate": 4.325818636247549e-06, + "loss": 0.1556, + "step": 13791 + }, + { + "epoch": 0.7, + "grad_norm": 1.0627692078353024, + "learning_rate": 4.324462550140847e-06, + "loss": 0.1749, + "step": 13792 + }, + { + "epoch": 0.7, + "grad_norm": 0.9279764998021093, + "learning_rate": 4.323106617983414e-06, + "loss": 0.1599, + "step": 13793 + }, + { + "epoch": 0.7, + "grad_norm": 1.1209255773185283, + "learning_rate": 4.321750839812038e-06, + "loss": 0.1773, + "step": 13794 + }, + { + "epoch": 0.7, + "grad_norm": 1.2147773049589803, + "learning_rate": 4.320395215663488e-06, + "loss": 0.1608, + "step": 13795 + }, + { + "epoch": 0.7, + "grad_norm": 1.355886040723802, + "learning_rate": 4.319039745574543e-06, + "loss": 0.1587, + "step": 13796 + }, + { + "epoch": 0.7, + "grad_norm": 0.889548101640906, + "learning_rate": 4.317684429581961e-06, + "loss": 0.1651, + "step": 13797 + }, + { + "epoch": 0.7, + "grad_norm": 0.9541565186730563, + "learning_rate": 4.316329267722509e-06, + "loss": 0.1481, + "step": 13798 + }, + { + "epoch": 0.7, + "grad_norm": 0.9623375218408832, + "learning_rate": 4.314974260032948e-06, + "loss": 0.1656, + "step": 13799 + }, + { + "epoch": 0.7, + "grad_norm": 0.8715958990471303, + "learning_rate": 4.313619406550034e-06, + "loss": 0.167, + "step": 13800 + }, + { + "epoch": 0.7, + "grad_norm": 0.9084593575369279, + "learning_rate": 4.3122647073105114e-06, + "loss": 0.182, + "step": 13801 + }, + { + "epoch": 0.7, + "grad_norm": 1.3108811802318745, + "learning_rate": 4.310910162351134e-06, + "loss": 0.1538, + "step": 13802 + }, + { + "epoch": 0.7, + "grad_norm": 1.3121975048987127, + "learning_rate": 4.309555771708637e-06, + "loss": 0.1871, + "step": 13803 + }, + { + "epoch": 0.7, + "grad_norm": 1.0499623681533017, + "learning_rate": 4.308201535419762e-06, + "loss": 0.1691, + "step": 13804 + }, + { + "epoch": 0.7, + "grad_norm": 1.0081547694953512, + "learning_rate": 4.306847453521241e-06, + "loss": 0.1726, + "step": 13805 + }, + { + "epoch": 0.7, + "grad_norm": 1.5058682236514365, + "learning_rate": 4.305493526049803e-06, + "loss": 0.1697, + "step": 13806 + }, + { + "epoch": 0.7, + "grad_norm": 0.9462720026788233, + "learning_rate": 4.304139753042174e-06, + "loss": 0.186, + "step": 13807 + }, + { + "epoch": 0.7, + "grad_norm": 0.8945622051574326, + "learning_rate": 4.3027861345350805e-06, + "loss": 0.1733, + "step": 13808 + }, + { + "epoch": 0.7, + "grad_norm": 0.9428319169479271, + "learning_rate": 4.301432670565235e-06, + "loss": 0.1587, + "step": 13809 + }, + { + "epoch": 0.7, + "grad_norm": 0.8628655152606859, + "learning_rate": 4.300079361169347e-06, + "loss": 0.1919, + "step": 13810 + }, + { + "epoch": 0.7, + "grad_norm": 0.9129736896773106, + "learning_rate": 4.2987262063841316e-06, + "loss": 0.1778, + "step": 13811 + }, + { + "epoch": 0.7, + "grad_norm": 1.0113974153294598, + "learning_rate": 4.297373206246286e-06, + "loss": 0.1683, + "step": 13812 + }, + { + "epoch": 0.7, + "grad_norm": 1.0648951024839919, + "learning_rate": 4.296020360792518e-06, + "loss": 0.1514, + "step": 13813 + }, + { + "epoch": 0.7, + "grad_norm": 2.073923935323422, + "learning_rate": 4.2946676700595155e-06, + "loss": 0.16, + "step": 13814 + }, + { + "epoch": 0.7, + "grad_norm": 1.3478803045424044, + "learning_rate": 4.293315134083975e-06, + "loss": 0.195, + "step": 13815 + }, + { + "epoch": 0.7, + "grad_norm": 1.5146714760895041, + "learning_rate": 4.291962752902584e-06, + "loss": 0.1863, + "step": 13816 + }, + { + "epoch": 0.7, + "grad_norm": 6.2258691029557385, + "learning_rate": 4.2906105265520295e-06, + "loss": 0.1881, + "step": 13817 + }, + { + "epoch": 0.7, + "grad_norm": 1.4088017704480014, + "learning_rate": 4.289258455068983e-06, + "loss": 0.1969, + "step": 13818 + }, + { + "epoch": 0.7, + "grad_norm": 0.9642670080882169, + "learning_rate": 4.287906538490128e-06, + "loss": 0.1634, + "step": 13819 + }, + { + "epoch": 0.7, + "grad_norm": 2.0240549553314455, + "learning_rate": 4.286554776852125e-06, + "loss": 0.1666, + "step": 13820 + }, + { + "epoch": 0.7, + "grad_norm": 1.1796958409744949, + "learning_rate": 4.285203170191652e-06, + "loss": 0.1694, + "step": 13821 + }, + { + "epoch": 0.7, + "grad_norm": 0.9443459190506037, + "learning_rate": 4.283851718545362e-06, + "loss": 0.1679, + "step": 13822 + }, + { + "epoch": 0.7, + "grad_norm": 0.9646991551786035, + "learning_rate": 4.282500421949917e-06, + "loss": 0.1596, + "step": 13823 + }, + { + "epoch": 0.7, + "grad_norm": 1.2808704572171308, + "learning_rate": 4.2811492804419695e-06, + "loss": 0.1662, + "step": 13824 + }, + { + "epoch": 0.7, + "grad_norm": 1.347312322023975, + "learning_rate": 4.279798294058176e-06, + "loss": 0.1772, + "step": 13825 + }, + { + "epoch": 0.7, + "grad_norm": 1.784693189124559, + "learning_rate": 4.278447462835176e-06, + "loss": 0.186, + "step": 13826 + }, + { + "epoch": 0.7, + "grad_norm": 1.4600284370723755, + "learning_rate": 4.277096786809608e-06, + "loss": 0.1915, + "step": 13827 + }, + { + "epoch": 0.7, + "grad_norm": 1.2121081339526243, + "learning_rate": 4.275746266018117e-06, + "loss": 0.1782, + "step": 13828 + }, + { + "epoch": 0.7, + "grad_norm": 1.0950423022369236, + "learning_rate": 4.274395900497328e-06, + "loss": 0.1945, + "step": 13829 + }, + { + "epoch": 0.7, + "grad_norm": 1.1221579992549777, + "learning_rate": 4.273045690283878e-06, + "loss": 0.1806, + "step": 13830 + }, + { + "epoch": 0.7, + "grad_norm": 1.0292814625435251, + "learning_rate": 4.2716956354143826e-06, + "loss": 0.1599, + "step": 13831 + }, + { + "epoch": 0.7, + "grad_norm": 1.0910953828917187, + "learning_rate": 4.2703457359254665e-06, + "loss": 0.1721, + "step": 13832 + }, + { + "epoch": 0.7, + "grad_norm": 0.9996097948773663, + "learning_rate": 4.268995991853746e-06, + "loss": 0.1676, + "step": 13833 + }, + { + "epoch": 0.7, + "grad_norm": 1.1543389514703013, + "learning_rate": 4.267646403235836e-06, + "loss": 0.1964, + "step": 13834 + }, + { + "epoch": 0.7, + "grad_norm": 1.5130531821660502, + "learning_rate": 4.266296970108339e-06, + "loss": 0.1613, + "step": 13835 + }, + { + "epoch": 0.7, + "grad_norm": 0.8845460390820609, + "learning_rate": 4.264947692507863e-06, + "loss": 0.1708, + "step": 13836 + }, + { + "epoch": 0.7, + "grad_norm": 0.9841852132957352, + "learning_rate": 4.263598570471003e-06, + "loss": 0.1861, + "step": 13837 + }, + { + "epoch": 0.7, + "grad_norm": 0.9041726518279823, + "learning_rate": 4.262249604034356e-06, + "loss": 0.1699, + "step": 13838 + }, + { + "epoch": 0.7, + "grad_norm": 1.0345479895144427, + "learning_rate": 4.260900793234511e-06, + "loss": 0.1663, + "step": 13839 + }, + { + "epoch": 0.7, + "grad_norm": 0.8941284391965324, + "learning_rate": 4.259552138108061e-06, + "loss": 0.1577, + "step": 13840 + }, + { + "epoch": 0.7, + "grad_norm": 0.9192186532010812, + "learning_rate": 4.258203638691578e-06, + "loss": 0.1976, + "step": 13841 + }, + { + "epoch": 0.7, + "grad_norm": 0.9715839954624031, + "learning_rate": 4.256855295021646e-06, + "loss": 0.1523, + "step": 13842 + }, + { + "epoch": 0.7, + "grad_norm": 0.9026243535838973, + "learning_rate": 4.255507107134842e-06, + "loss": 0.1917, + "step": 13843 + }, + { + "epoch": 0.7, + "grad_norm": 0.8797153394563227, + "learning_rate": 4.2541590750677285e-06, + "loss": 0.1531, + "step": 13844 + }, + { + "epoch": 0.7, + "grad_norm": 0.9451326989379703, + "learning_rate": 4.252811198856878e-06, + "loss": 0.1708, + "step": 13845 + }, + { + "epoch": 0.7, + "grad_norm": 1.0833224058149244, + "learning_rate": 4.251463478538846e-06, + "loss": 0.2052, + "step": 13846 + }, + { + "epoch": 0.7, + "grad_norm": 1.3103829188590623, + "learning_rate": 4.250115914150194e-06, + "loss": 0.1823, + "step": 13847 + }, + { + "epoch": 0.7, + "grad_norm": 0.94943048980084, + "learning_rate": 4.2487685057274695e-06, + "loss": 0.1938, + "step": 13848 + }, + { + "epoch": 0.7, + "grad_norm": 1.559673614053583, + "learning_rate": 4.247421253307225e-06, + "loss": 0.1897, + "step": 13849 + }, + { + "epoch": 0.7, + "grad_norm": 0.9774268887965165, + "learning_rate": 4.246074156926002e-06, + "loss": 0.1596, + "step": 13850 + }, + { + "epoch": 0.7, + "grad_norm": 1.4407846971193052, + "learning_rate": 4.244727216620348e-06, + "loss": 0.1621, + "step": 13851 + }, + { + "epoch": 0.7, + "grad_norm": 1.0361937161045285, + "learning_rate": 4.2433804324267895e-06, + "loss": 0.1804, + "step": 13852 + }, + { + "epoch": 0.7, + "grad_norm": 1.0412298692371762, + "learning_rate": 4.242033804381864e-06, + "loss": 0.1917, + "step": 13853 + }, + { + "epoch": 0.7, + "grad_norm": 0.9114379786274113, + "learning_rate": 4.240687332522094e-06, + "loss": 0.1873, + "step": 13854 + }, + { + "epoch": 0.7, + "grad_norm": 1.6741798793588893, + "learning_rate": 4.239341016884008e-06, + "loss": 0.1743, + "step": 13855 + }, + { + "epoch": 0.7, + "grad_norm": 0.9870271064861393, + "learning_rate": 4.237994857504121e-06, + "loss": 0.1615, + "step": 13856 + }, + { + "epoch": 0.7, + "grad_norm": 1.4861495315901574, + "learning_rate": 4.236648854418951e-06, + "loss": 0.1603, + "step": 13857 + }, + { + "epoch": 0.7, + "grad_norm": 1.035190561600465, + "learning_rate": 4.2353030076650025e-06, + "loss": 0.1593, + "step": 13858 + }, + { + "epoch": 0.7, + "grad_norm": 1.1910756738313253, + "learning_rate": 4.233957317278786e-06, + "loss": 0.16, + "step": 13859 + }, + { + "epoch": 0.7, + "grad_norm": 1.5194077790731053, + "learning_rate": 4.232611783296804e-06, + "loss": 0.1933, + "step": 13860 + }, + { + "epoch": 0.7, + "grad_norm": 0.9646802011211995, + "learning_rate": 4.2312664057555556e-06, + "loss": 0.1781, + "step": 13861 + }, + { + "epoch": 0.7, + "grad_norm": 1.4234201229578838, + "learning_rate": 4.229921184691531e-06, + "loss": 0.1787, + "step": 13862 + }, + { + "epoch": 0.7, + "grad_norm": 1.4329485564112754, + "learning_rate": 4.228576120141218e-06, + "loss": 0.1733, + "step": 13863 + }, + { + "epoch": 0.71, + "grad_norm": 0.7625806077048197, + "learning_rate": 4.2272312121411065e-06, + "loss": 0.1707, + "step": 13864 + }, + { + "epoch": 0.71, + "grad_norm": 1.0778457587652996, + "learning_rate": 4.225886460727671e-06, + "loss": 0.1932, + "step": 13865 + }, + { + "epoch": 0.71, + "grad_norm": 1.398154670499273, + "learning_rate": 4.224541865937395e-06, + "loss": 0.158, + "step": 13866 + }, + { + "epoch": 0.71, + "grad_norm": 1.156600814601921, + "learning_rate": 4.2231974278067436e-06, + "loss": 0.1709, + "step": 13867 + }, + { + "epoch": 0.71, + "grad_norm": 1.1124745144642676, + "learning_rate": 4.221853146372188e-06, + "loss": 0.195, + "step": 13868 + }, + { + "epoch": 0.71, + "grad_norm": 0.9143329997002765, + "learning_rate": 4.220509021670193e-06, + "loss": 0.1533, + "step": 13869 + }, + { + "epoch": 0.71, + "grad_norm": 1.2854654668501229, + "learning_rate": 4.21916505373722e-06, + "loss": 0.1698, + "step": 13870 + }, + { + "epoch": 0.71, + "grad_norm": 1.7644388099664177, + "learning_rate": 4.2178212426097175e-06, + "loss": 0.1726, + "step": 13871 + }, + { + "epoch": 0.71, + "grad_norm": 2.4484549246924208, + "learning_rate": 4.216477588324144e-06, + "loss": 0.1723, + "step": 13872 + }, + { + "epoch": 0.71, + "grad_norm": 1.8021444948614584, + "learning_rate": 4.215134090916939e-06, + "loss": 0.1833, + "step": 13873 + }, + { + "epoch": 0.71, + "grad_norm": 0.9335939453621243, + "learning_rate": 4.213790750424553e-06, + "loss": 0.1612, + "step": 13874 + }, + { + "epoch": 0.71, + "grad_norm": 1.0977518688232104, + "learning_rate": 4.212447566883415e-06, + "loss": 0.1641, + "step": 13875 + }, + { + "epoch": 0.71, + "grad_norm": 1.220894291615603, + "learning_rate": 4.211104540329964e-06, + "loss": 0.1739, + "step": 13876 + }, + { + "epoch": 0.71, + "grad_norm": 0.9058485613346287, + "learning_rate": 4.209761670800631e-06, + "loss": 0.1364, + "step": 13877 + }, + { + "epoch": 0.71, + "grad_norm": 3.94948120515998, + "learning_rate": 4.208418958331841e-06, + "loss": 0.1656, + "step": 13878 + }, + { + "epoch": 0.71, + "grad_norm": 1.0817260499443833, + "learning_rate": 4.207076402960015e-06, + "loss": 0.1685, + "step": 13879 + }, + { + "epoch": 0.71, + "grad_norm": 1.4737839935785986, + "learning_rate": 4.205734004721565e-06, + "loss": 0.1674, + "step": 13880 + }, + { + "epoch": 0.71, + "grad_norm": 1.2886994072038556, + "learning_rate": 4.204391763652911e-06, + "loss": 0.1618, + "step": 13881 + }, + { + "epoch": 0.71, + "grad_norm": 1.0956350599041518, + "learning_rate": 4.2030496797904526e-06, + "loss": 0.1589, + "step": 13882 + }, + { + "epoch": 0.71, + "grad_norm": 1.5778971792081893, + "learning_rate": 4.2017077531706056e-06, + "loss": 0.1754, + "step": 13883 + }, + { + "epoch": 0.71, + "grad_norm": 1.280024551622798, + "learning_rate": 4.200365983829757e-06, + "loss": 0.1679, + "step": 13884 + }, + { + "epoch": 0.71, + "grad_norm": 1.0989316304641512, + "learning_rate": 4.19902437180431e-06, + "loss": 0.179, + "step": 13885 + }, + { + "epoch": 0.71, + "grad_norm": 2.0262960765847735, + "learning_rate": 4.197682917130654e-06, + "loss": 0.1701, + "step": 13886 + }, + { + "epoch": 0.71, + "grad_norm": 1.3649534886612573, + "learning_rate": 4.196341619845182e-06, + "loss": 0.1682, + "step": 13887 + }, + { + "epoch": 0.71, + "grad_norm": 0.9768879481607011, + "learning_rate": 4.195000479984264e-06, + "loss": 0.1555, + "step": 13888 + }, + { + "epoch": 0.71, + "grad_norm": 1.2674514431589723, + "learning_rate": 4.193659497584293e-06, + "loss": 0.1718, + "step": 13889 + }, + { + "epoch": 0.71, + "grad_norm": 0.9301726568541404, + "learning_rate": 4.192318672681631e-06, + "loss": 0.1667, + "step": 13890 + }, + { + "epoch": 0.71, + "grad_norm": 1.6447104967367812, + "learning_rate": 4.190978005312657e-06, + "loss": 0.1883, + "step": 13891 + }, + { + "epoch": 0.71, + "grad_norm": 1.0557492480294897, + "learning_rate": 4.189637495513729e-06, + "loss": 0.1608, + "step": 13892 + }, + { + "epoch": 0.71, + "grad_norm": 1.1176874894671438, + "learning_rate": 4.188297143321215e-06, + "loss": 0.1801, + "step": 13893 + }, + { + "epoch": 0.71, + "grad_norm": 0.9404386916814441, + "learning_rate": 4.186956948771467e-06, + "loss": 0.1617, + "step": 13894 + }, + { + "epoch": 0.71, + "grad_norm": 1.1853230444085585, + "learning_rate": 4.1856169119008384e-06, + "loss": 0.19, + "step": 13895 + }, + { + "epoch": 0.71, + "grad_norm": 1.2015223083588877, + "learning_rate": 4.184277032745685e-06, + "loss": 0.1627, + "step": 13896 + }, + { + "epoch": 0.71, + "grad_norm": 0.8969995228287397, + "learning_rate": 4.18293731134234e-06, + "loss": 0.1409, + "step": 13897 + }, + { + "epoch": 0.71, + "grad_norm": 2.7125517882768655, + "learning_rate": 4.181597747727154e-06, + "loss": 0.1916, + "step": 13898 + }, + { + "epoch": 0.71, + "grad_norm": 1.090728912294368, + "learning_rate": 4.180258341936454e-06, + "loss": 0.1636, + "step": 13899 + }, + { + "epoch": 0.71, + "grad_norm": 1.5086909915665916, + "learning_rate": 4.178919094006578e-06, + "loss": 0.1764, + "step": 13900 + }, + { + "epoch": 0.71, + "grad_norm": 1.0281672216497417, + "learning_rate": 4.1775800039738465e-06, + "loss": 0.1694, + "step": 13901 + }, + { + "epoch": 0.71, + "grad_norm": 0.9611875987842856, + "learning_rate": 4.176241071874587e-06, + "loss": 0.1667, + "step": 13902 + }, + { + "epoch": 0.71, + "grad_norm": 1.1189889394442891, + "learning_rate": 4.174902297745118e-06, + "loss": 0.1845, + "step": 13903 + }, + { + "epoch": 0.71, + "grad_norm": 1.0347532611945578, + "learning_rate": 4.173563681621756e-06, + "loss": 0.1723, + "step": 13904 + }, + { + "epoch": 0.71, + "grad_norm": 1.4840919450107504, + "learning_rate": 4.1722252235408045e-06, + "loss": 0.1786, + "step": 13905 + }, + { + "epoch": 0.71, + "grad_norm": 1.7848343757219844, + "learning_rate": 4.170886923538576e-06, + "loss": 0.1658, + "step": 13906 + }, + { + "epoch": 0.71, + "grad_norm": 1.1563399247561783, + "learning_rate": 4.169548781651367e-06, + "loss": 0.1772, + "step": 13907 + }, + { + "epoch": 0.71, + "grad_norm": 1.231024214373564, + "learning_rate": 4.168210797915479e-06, + "loss": 0.1483, + "step": 13908 + }, + { + "epoch": 0.71, + "grad_norm": 1.2235509447303758, + "learning_rate": 4.1668729723671994e-06, + "loss": 0.163, + "step": 13909 + }, + { + "epoch": 0.71, + "grad_norm": 0.9748722624842394, + "learning_rate": 4.165535305042822e-06, + "loss": 0.1544, + "step": 13910 + }, + { + "epoch": 0.71, + "grad_norm": 1.1483576747353885, + "learning_rate": 4.164197795978628e-06, + "loss": 0.1721, + "step": 13911 + }, + { + "epoch": 0.71, + "grad_norm": 1.011189049498867, + "learning_rate": 4.162860445210897e-06, + "loss": 0.1866, + "step": 13912 + }, + { + "epoch": 0.71, + "grad_norm": 1.3078625368117252, + "learning_rate": 4.16152325277591e-06, + "loss": 0.1789, + "step": 13913 + }, + { + "epoch": 0.71, + "grad_norm": 0.9965484103475903, + "learning_rate": 4.16018621870993e-06, + "loss": 0.2024, + "step": 13914 + }, + { + "epoch": 0.71, + "grad_norm": 1.021616309600682, + "learning_rate": 4.158849343049233e-06, + "loss": 0.1891, + "step": 13915 + }, + { + "epoch": 0.71, + "grad_norm": 1.1905640579670889, + "learning_rate": 4.157512625830074e-06, + "loss": 0.1679, + "step": 13916 + }, + { + "epoch": 0.71, + "grad_norm": 0.9737795035987967, + "learning_rate": 4.156176067088717e-06, + "loss": 0.1768, + "step": 13917 + }, + { + "epoch": 0.71, + "grad_norm": 0.9511517663436382, + "learning_rate": 4.154839666861413e-06, + "loss": 0.1708, + "step": 13918 + }, + { + "epoch": 0.71, + "grad_norm": 1.1916910017235933, + "learning_rate": 4.153503425184415e-06, + "loss": 0.1807, + "step": 13919 + }, + { + "epoch": 0.71, + "grad_norm": 1.2415657986375965, + "learning_rate": 4.152167342093965e-06, + "loss": 0.1531, + "step": 13920 + }, + { + "epoch": 0.71, + "grad_norm": 1.4514494878822788, + "learning_rate": 4.150831417626304e-06, + "loss": 0.1882, + "step": 13921 + }, + { + "epoch": 0.71, + "grad_norm": 0.9120073512123998, + "learning_rate": 4.149495651817673e-06, + "loss": 0.1664, + "step": 13922 + }, + { + "epoch": 0.71, + "grad_norm": 0.9726956622707574, + "learning_rate": 4.148160044704306e-06, + "loss": 0.1741, + "step": 13923 + }, + { + "epoch": 0.71, + "grad_norm": 0.9938501765351079, + "learning_rate": 4.1468245963224245e-06, + "loss": 0.1629, + "step": 13924 + }, + { + "epoch": 0.71, + "grad_norm": 1.102826239226895, + "learning_rate": 4.1454893067082605e-06, + "loss": 0.1704, + "step": 13925 + }, + { + "epoch": 0.71, + "grad_norm": 0.8498416122497809, + "learning_rate": 4.1441541758980256e-06, + "loss": 0.1589, + "step": 13926 + }, + { + "epoch": 0.71, + "grad_norm": 1.5947741621857945, + "learning_rate": 4.142819203927942e-06, + "loss": 0.2137, + "step": 13927 + }, + { + "epoch": 0.71, + "grad_norm": 0.994381854280893, + "learning_rate": 4.141484390834216e-06, + "loss": 0.1654, + "step": 13928 + }, + { + "epoch": 0.71, + "grad_norm": 0.9255592604899666, + "learning_rate": 4.140149736653056e-06, + "loss": 0.1751, + "step": 13929 + }, + { + "epoch": 0.71, + "grad_norm": 1.5580640598570554, + "learning_rate": 4.138815241420666e-06, + "loss": 0.1859, + "step": 13930 + }, + { + "epoch": 0.71, + "grad_norm": 0.8700202663936708, + "learning_rate": 4.137480905173248e-06, + "loss": 0.1566, + "step": 13931 + }, + { + "epoch": 0.71, + "grad_norm": 1.5028060839405217, + "learning_rate": 4.13614672794699e-06, + "loss": 0.1481, + "step": 13932 + }, + { + "epoch": 0.71, + "grad_norm": 1.2787088758138214, + "learning_rate": 4.13481270977808e-06, + "loss": 0.1806, + "step": 13933 + }, + { + "epoch": 0.71, + "grad_norm": 1.0328407095066583, + "learning_rate": 4.133478850702711e-06, + "loss": 0.1802, + "step": 13934 + }, + { + "epoch": 0.71, + "grad_norm": 0.9213280279429286, + "learning_rate": 4.1321451507570555e-06, + "loss": 0.1738, + "step": 13935 + }, + { + "epoch": 0.71, + "grad_norm": 1.13141968709934, + "learning_rate": 4.130811609977297e-06, + "loss": 0.1698, + "step": 13936 + }, + { + "epoch": 0.71, + "grad_norm": 0.9087622452263413, + "learning_rate": 4.1294782283996024e-06, + "loss": 0.189, + "step": 13937 + }, + { + "epoch": 0.71, + "grad_norm": 0.7743718454839085, + "learning_rate": 4.128145006060141e-06, + "loss": 0.1608, + "step": 13938 + }, + { + "epoch": 0.71, + "grad_norm": 1.008167328082802, + "learning_rate": 4.12681194299508e-06, + "loss": 0.1724, + "step": 13939 + }, + { + "epoch": 0.71, + "grad_norm": 1.3185350228359154, + "learning_rate": 4.12547903924058e-06, + "loss": 0.1587, + "step": 13940 + }, + { + "epoch": 0.71, + "grad_norm": 1.2549184563659543, + "learning_rate": 4.124146294832788e-06, + "loss": 0.1831, + "step": 13941 + }, + { + "epoch": 0.71, + "grad_norm": 1.252213411815056, + "learning_rate": 4.122813709807864e-06, + "loss": 0.177, + "step": 13942 + }, + { + "epoch": 0.71, + "grad_norm": 1.0130254442875402, + "learning_rate": 4.121481284201946e-06, + "loss": 0.1875, + "step": 13943 + }, + { + "epoch": 0.71, + "grad_norm": 1.0870914766560396, + "learning_rate": 4.120149018051184e-06, + "loss": 0.1803, + "step": 13944 + }, + { + "epoch": 0.71, + "grad_norm": 1.0470809970557806, + "learning_rate": 4.118816911391709e-06, + "loss": 0.1668, + "step": 13945 + }, + { + "epoch": 0.71, + "grad_norm": 0.943477282672622, + "learning_rate": 4.117484964259657e-06, + "loss": 0.1635, + "step": 13946 + }, + { + "epoch": 0.71, + "grad_norm": 1.101166936528451, + "learning_rate": 4.116153176691158e-06, + "loss": 0.1645, + "step": 13947 + }, + { + "epoch": 0.71, + "grad_norm": 3.306797221498104, + "learning_rate": 4.1148215487223385e-06, + "loss": 0.1637, + "step": 13948 + }, + { + "epoch": 0.71, + "grad_norm": 0.9415598236086736, + "learning_rate": 4.1134900803893185e-06, + "loss": 0.1946, + "step": 13949 + }, + { + "epoch": 0.71, + "grad_norm": 4.200965620690179, + "learning_rate": 4.1121587717282085e-06, + "loss": 0.1801, + "step": 13950 + }, + { + "epoch": 0.71, + "grad_norm": 1.141433023565065, + "learning_rate": 4.110827622775128e-06, + "loss": 0.1821, + "step": 13951 + }, + { + "epoch": 0.71, + "grad_norm": 1.305752632035842, + "learning_rate": 4.1094966335661765e-06, + "loss": 0.1645, + "step": 13952 + }, + { + "epoch": 0.71, + "grad_norm": 1.1922977922503717, + "learning_rate": 4.108165804137466e-06, + "loss": 0.1694, + "step": 13953 + }, + { + "epoch": 0.71, + "grad_norm": 2.0802621937995167, + "learning_rate": 4.106835134525087e-06, + "loss": 0.1686, + "step": 13954 + }, + { + "epoch": 0.71, + "grad_norm": 1.0049350043646883, + "learning_rate": 4.105504624765137e-06, + "loss": 0.176, + "step": 13955 + }, + { + "epoch": 0.71, + "grad_norm": 1.762105872660763, + "learning_rate": 4.104174274893709e-06, + "loss": 0.168, + "step": 13956 + }, + { + "epoch": 0.71, + "grad_norm": 1.30451208977044, + "learning_rate": 4.102844084946889e-06, + "loss": 0.1608, + "step": 13957 + }, + { + "epoch": 0.71, + "grad_norm": 1.106594812369558, + "learning_rate": 4.101514054960752e-06, + "loss": 0.1842, + "step": 13958 + }, + { + "epoch": 0.71, + "grad_norm": 1.0997050675526354, + "learning_rate": 4.1001841849713845e-06, + "loss": 0.1939, + "step": 13959 + }, + { + "epoch": 0.71, + "grad_norm": 1.0702076106090137, + "learning_rate": 4.098854475014849e-06, + "loss": 0.2071, + "step": 13960 + }, + { + "epoch": 0.71, + "grad_norm": 0.9761023790628669, + "learning_rate": 4.097524925127224e-06, + "loss": 0.1787, + "step": 13961 + }, + { + "epoch": 0.71, + "grad_norm": 1.0409487600060539, + "learning_rate": 4.096195535344565e-06, + "loss": 0.1671, + "step": 13962 + }, + { + "epoch": 0.71, + "grad_norm": 1.1941185796143974, + "learning_rate": 4.0948663057029395e-06, + "loss": 0.1842, + "step": 13963 + }, + { + "epoch": 0.71, + "grad_norm": 1.0081634661008763, + "learning_rate": 4.093537236238394e-06, + "loss": 0.1734, + "step": 13964 + }, + { + "epoch": 0.71, + "grad_norm": 2.1712157900084783, + "learning_rate": 4.092208326986986e-06, + "loss": 0.1735, + "step": 13965 + }, + { + "epoch": 0.71, + "grad_norm": 1.493318392656283, + "learning_rate": 4.090879577984763e-06, + "loss": 0.1534, + "step": 13966 + }, + { + "epoch": 0.71, + "grad_norm": 0.76818294481959, + "learning_rate": 4.089550989267763e-06, + "loss": 0.1598, + "step": 13967 + }, + { + "epoch": 0.71, + "grad_norm": 2.3472354685096652, + "learning_rate": 4.0882225608720295e-06, + "loss": 0.1624, + "step": 13968 + }, + { + "epoch": 0.71, + "grad_norm": 0.9148780436687676, + "learning_rate": 4.086894292833589e-06, + "loss": 0.1638, + "step": 13969 + }, + { + "epoch": 0.71, + "grad_norm": 1.7247586560020707, + "learning_rate": 4.085566185188478e-06, + "loss": 0.1865, + "step": 13970 + }, + { + "epoch": 0.71, + "grad_norm": 0.8492698486093321, + "learning_rate": 4.084238237972715e-06, + "loss": 0.169, + "step": 13971 + }, + { + "epoch": 0.71, + "grad_norm": 1.1954560908592897, + "learning_rate": 4.082910451222325e-06, + "loss": 0.1825, + "step": 13972 + }, + { + "epoch": 0.71, + "grad_norm": 1.0239596161957574, + "learning_rate": 4.0815828249733226e-06, + "loss": 0.1693, + "step": 13973 + }, + { + "epoch": 0.71, + "grad_norm": 0.9171027277912834, + "learning_rate": 4.080255359261723e-06, + "loss": 0.1612, + "step": 13974 + }, + { + "epoch": 0.71, + "grad_norm": 0.9608743201810395, + "learning_rate": 4.078928054123529e-06, + "loss": 0.1791, + "step": 13975 + }, + { + "epoch": 0.71, + "grad_norm": 0.9961691965383497, + "learning_rate": 4.077600909594748e-06, + "loss": 0.1706, + "step": 13976 + }, + { + "epoch": 0.71, + "grad_norm": 1.336294130271254, + "learning_rate": 4.0762739257113734e-06, + "loss": 0.1851, + "step": 13977 + }, + { + "epoch": 0.71, + "grad_norm": 1.0139466768340508, + "learning_rate": 4.074947102509408e-06, + "loss": 0.1655, + "step": 13978 + }, + { + "epoch": 0.71, + "grad_norm": 1.2893579384169815, + "learning_rate": 4.073620440024832e-06, + "loss": 0.1941, + "step": 13979 + }, + { + "epoch": 0.71, + "grad_norm": 0.9832599128423222, + "learning_rate": 4.072293938293641e-06, + "loss": 0.1756, + "step": 13980 + }, + { + "epoch": 0.71, + "grad_norm": 1.0238164537959138, + "learning_rate": 4.070967597351808e-06, + "loss": 0.1858, + "step": 13981 + }, + { + "epoch": 0.71, + "grad_norm": 1.0183251618475806, + "learning_rate": 4.069641417235314e-06, + "loss": 0.1804, + "step": 13982 + }, + { + "epoch": 0.71, + "grad_norm": 0.9810496013205331, + "learning_rate": 4.068315397980135e-06, + "loss": 0.1718, + "step": 13983 + }, + { + "epoch": 0.71, + "grad_norm": 1.082129763356115, + "learning_rate": 4.066989539622232e-06, + "loss": 0.1605, + "step": 13984 + }, + { + "epoch": 0.71, + "grad_norm": 0.7518739008925989, + "learning_rate": 4.065663842197576e-06, + "loss": 0.1535, + "step": 13985 + }, + { + "epoch": 0.71, + "grad_norm": 1.1024784812536153, + "learning_rate": 4.06433830574212e-06, + "loss": 0.1693, + "step": 13986 + }, + { + "epoch": 0.71, + "grad_norm": 1.4415271687351996, + "learning_rate": 4.0630129302918266e-06, + "loss": 0.145, + "step": 13987 + }, + { + "epoch": 0.71, + "grad_norm": 1.139274381955693, + "learning_rate": 4.06168771588264e-06, + "loss": 0.1895, + "step": 13988 + }, + { + "epoch": 0.71, + "grad_norm": 1.269776868117072, + "learning_rate": 4.0603626625505125e-06, + "loss": 0.1632, + "step": 13989 + }, + { + "epoch": 0.71, + "grad_norm": 0.9230653132887967, + "learning_rate": 4.059037770331379e-06, + "loss": 0.168, + "step": 13990 + }, + { + "epoch": 0.71, + "grad_norm": 1.0253563738907223, + "learning_rate": 4.057713039261182e-06, + "loss": 0.1611, + "step": 13991 + }, + { + "epoch": 0.71, + "grad_norm": 1.298699424162711, + "learning_rate": 4.056388469375853e-06, + "loss": 0.1808, + "step": 13992 + }, + { + "epoch": 0.71, + "grad_norm": 1.1550292532198296, + "learning_rate": 4.055064060711328e-06, + "loss": 0.1677, + "step": 13993 + }, + { + "epoch": 0.71, + "grad_norm": 1.1411457383366315, + "learning_rate": 4.0537398133035225e-06, + "loss": 0.1628, + "step": 13994 + }, + { + "epoch": 0.71, + "grad_norm": 0.9678233487865195, + "learning_rate": 4.0524157271883635e-06, + "loss": 0.1843, + "step": 13995 + }, + { + "epoch": 0.71, + "grad_norm": 1.0057253640193635, + "learning_rate": 4.05109180240176e-06, + "loss": 0.1664, + "step": 13996 + }, + { + "epoch": 0.71, + "grad_norm": 1.8006213175726598, + "learning_rate": 4.049768038979631e-06, + "loss": 0.1781, + "step": 13997 + }, + { + "epoch": 0.71, + "grad_norm": 1.2773225084954483, + "learning_rate": 4.0484444369578764e-06, + "loss": 0.1591, + "step": 13998 + }, + { + "epoch": 0.71, + "grad_norm": 1.18123877918416, + "learning_rate": 4.047120996372403e-06, + "loss": 0.1706, + "step": 13999 + }, + { + "epoch": 0.71, + "grad_norm": 1.6529702566124738, + "learning_rate": 4.045797717259109e-06, + "loss": 0.1905, + "step": 14000 + }, + { + "epoch": 0.71, + "grad_norm": 1.3302918737464733, + "learning_rate": 4.044474599653891e-06, + "loss": 0.1764, + "step": 14001 + }, + { + "epoch": 0.71, + "grad_norm": 1.9162525803200723, + "learning_rate": 4.0431516435926365e-06, + "loss": 0.1754, + "step": 14002 + }, + { + "epoch": 0.71, + "grad_norm": 1.316548792491148, + "learning_rate": 4.0418288491112255e-06, + "loss": 0.1725, + "step": 14003 + }, + { + "epoch": 0.71, + "grad_norm": 0.9532510067195563, + "learning_rate": 4.0405062162455474e-06, + "loss": 0.184, + "step": 14004 + }, + { + "epoch": 0.71, + "grad_norm": 1.2904459935716295, + "learning_rate": 4.0391837450314715e-06, + "loss": 0.1592, + "step": 14005 + }, + { + "epoch": 0.71, + "grad_norm": 0.831784096015568, + "learning_rate": 4.037861435504877e-06, + "loss": 0.1922, + "step": 14006 + }, + { + "epoch": 0.71, + "grad_norm": 0.9590948888956858, + "learning_rate": 4.0365392877016255e-06, + "loss": 0.1639, + "step": 14007 + }, + { + "epoch": 0.71, + "grad_norm": 0.7950257845294156, + "learning_rate": 4.03521730165758e-06, + "loss": 0.1488, + "step": 14008 + }, + { + "epoch": 0.71, + "grad_norm": 0.8696092538942773, + "learning_rate": 4.033895477408604e-06, + "loss": 0.1593, + "step": 14009 + }, + { + "epoch": 0.71, + "grad_norm": 1.1425549882601498, + "learning_rate": 4.032573814990553e-06, + "loss": 0.1876, + "step": 14010 + }, + { + "epoch": 0.71, + "grad_norm": 0.9434310996016758, + "learning_rate": 4.03125231443927e-06, + "loss": 0.1984, + "step": 14011 + }, + { + "epoch": 0.71, + "grad_norm": 1.0730244990456292, + "learning_rate": 4.02993097579061e-06, + "loss": 0.181, + "step": 14012 + }, + { + "epoch": 0.71, + "grad_norm": 1.322629645251612, + "learning_rate": 4.028609799080405e-06, + "loss": 0.1739, + "step": 14013 + }, + { + "epoch": 0.71, + "grad_norm": 1.5905595740156597, + "learning_rate": 4.0272887843445005e-06, + "loss": 0.1468, + "step": 14014 + }, + { + "epoch": 0.71, + "grad_norm": 1.4081861558909625, + "learning_rate": 4.025967931618722e-06, + "loss": 0.1725, + "step": 14015 + }, + { + "epoch": 0.71, + "grad_norm": 0.9231296337146756, + "learning_rate": 4.024647240938904e-06, + "loss": 0.1671, + "step": 14016 + }, + { + "epoch": 0.71, + "grad_norm": 1.0159252032635526, + "learning_rate": 4.0233267123408626e-06, + "loss": 0.1688, + "step": 14017 + }, + { + "epoch": 0.71, + "grad_norm": 0.9687980046675957, + "learning_rate": 4.022006345860422e-06, + "loss": 0.1691, + "step": 14018 + }, + { + "epoch": 0.71, + "grad_norm": 1.656629955728825, + "learning_rate": 4.020686141533401e-06, + "loss": 0.1753, + "step": 14019 + }, + { + "epoch": 0.71, + "grad_norm": 1.0112391428165952, + "learning_rate": 4.019366099395602e-06, + "loss": 0.1519, + "step": 14020 + }, + { + "epoch": 0.71, + "grad_norm": 0.8022414046323324, + "learning_rate": 4.01804621948284e-06, + "loss": 0.163, + "step": 14021 + }, + { + "epoch": 0.71, + "grad_norm": 0.8621081733595416, + "learning_rate": 4.0167265018309075e-06, + "loss": 0.162, + "step": 14022 + }, + { + "epoch": 0.71, + "grad_norm": 0.9482586410500872, + "learning_rate": 4.01540694647561e-06, + "loss": 0.1671, + "step": 14023 + }, + { + "epoch": 0.71, + "grad_norm": 0.8599979611209102, + "learning_rate": 4.014087553452734e-06, + "loss": 0.1569, + "step": 14024 + }, + { + "epoch": 0.71, + "grad_norm": 1.224541117890549, + "learning_rate": 4.012768322798072e-06, + "loss": 0.1801, + "step": 14025 + }, + { + "epoch": 0.71, + "grad_norm": 1.1503432737597172, + "learning_rate": 4.011449254547406e-06, + "loss": 0.1693, + "step": 14026 + }, + { + "epoch": 0.71, + "grad_norm": 0.8289679623687345, + "learning_rate": 4.010130348736522e-06, + "loss": 0.1473, + "step": 14027 + }, + { + "epoch": 0.71, + "grad_norm": 0.9223129355750407, + "learning_rate": 4.008811605401186e-06, + "loss": 0.1674, + "step": 14028 + }, + { + "epoch": 0.71, + "grad_norm": 1.1534141512590803, + "learning_rate": 4.007493024577177e-06, + "loss": 0.1977, + "step": 14029 + }, + { + "epoch": 0.71, + "grad_norm": 0.8860937852779077, + "learning_rate": 4.006174606300255e-06, + "loss": 0.1612, + "step": 14030 + }, + { + "epoch": 0.71, + "grad_norm": 0.8518361547364689, + "learning_rate": 4.00485635060619e-06, + "loss": 0.1679, + "step": 14031 + }, + { + "epoch": 0.71, + "grad_norm": 0.9889257574197703, + "learning_rate": 4.0035382575307306e-06, + "loss": 0.1586, + "step": 14032 + }, + { + "epoch": 0.71, + "grad_norm": 1.1061626270404616, + "learning_rate": 4.0022203271096375e-06, + "loss": 0.1824, + "step": 14033 + }, + { + "epoch": 0.71, + "grad_norm": 0.9897553297519481, + "learning_rate": 4.000902559378654e-06, + "loss": 0.1967, + "step": 14034 + }, + { + "epoch": 0.71, + "grad_norm": 0.8693727300851246, + "learning_rate": 3.999584954373528e-06, + "loss": 0.1628, + "step": 14035 + }, + { + "epoch": 0.71, + "grad_norm": 1.0355612812061994, + "learning_rate": 3.998267512130001e-06, + "loss": 0.1976, + "step": 14036 + }, + { + "epoch": 0.71, + "grad_norm": 0.8755614439478262, + "learning_rate": 3.996950232683804e-06, + "loss": 0.1737, + "step": 14037 + }, + { + "epoch": 0.71, + "grad_norm": 1.8701663683866392, + "learning_rate": 3.995633116070675e-06, + "loss": 0.1635, + "step": 14038 + }, + { + "epoch": 0.71, + "grad_norm": 1.364662616775321, + "learning_rate": 3.994316162326333e-06, + "loss": 0.1468, + "step": 14039 + }, + { + "epoch": 0.71, + "grad_norm": 1.0885834864885342, + "learning_rate": 3.992999371486508e-06, + "loss": 0.1716, + "step": 14040 + }, + { + "epoch": 0.71, + "grad_norm": 1.1753816593544923, + "learning_rate": 3.99168274358691e-06, + "loss": 0.1937, + "step": 14041 + }, + { + "epoch": 0.71, + "grad_norm": 1.0768423153042457, + "learning_rate": 3.990366278663258e-06, + "loss": 0.1579, + "step": 14042 + }, + { + "epoch": 0.71, + "grad_norm": 0.9510173102034292, + "learning_rate": 3.989049976751259e-06, + "loss": 0.1694, + "step": 14043 + }, + { + "epoch": 0.71, + "grad_norm": 1.0030856247148077, + "learning_rate": 3.987733837886622e-06, + "loss": 0.1723, + "step": 14044 + }, + { + "epoch": 0.71, + "grad_norm": 0.9019917277961534, + "learning_rate": 3.986417862105043e-06, + "loss": 0.174, + "step": 14045 + }, + { + "epoch": 0.71, + "grad_norm": 0.9420535261557448, + "learning_rate": 3.985102049442221e-06, + "loss": 0.1612, + "step": 14046 + }, + { + "epoch": 0.71, + "grad_norm": 0.9970341470186542, + "learning_rate": 3.983786399933842e-06, + "loss": 0.1801, + "step": 14047 + }, + { + "epoch": 0.71, + "grad_norm": 1.1198941603246906, + "learning_rate": 3.982470913615602e-06, + "loss": 0.1944, + "step": 14048 + }, + { + "epoch": 0.71, + "grad_norm": 1.031101344226233, + "learning_rate": 3.981155590523173e-06, + "loss": 0.1442, + "step": 14049 + }, + { + "epoch": 0.71, + "grad_norm": 2.4822485364807862, + "learning_rate": 3.979840430692242e-06, + "loss": 0.2225, + "step": 14050 + }, + { + "epoch": 0.71, + "grad_norm": 0.9894481492357582, + "learning_rate": 3.978525434158476e-06, + "loss": 0.1638, + "step": 14051 + }, + { + "epoch": 0.71, + "grad_norm": 1.0964076308612705, + "learning_rate": 3.977210600957548e-06, + "loss": 0.1754, + "step": 14052 + }, + { + "epoch": 0.71, + "grad_norm": 1.6665709589666882, + "learning_rate": 3.975895931125126e-06, + "loss": 0.1697, + "step": 14053 + }, + { + "epoch": 0.71, + "grad_norm": 2.000045807372957, + "learning_rate": 3.9745814246968654e-06, + "loss": 0.1786, + "step": 14054 + }, + { + "epoch": 0.71, + "grad_norm": 0.8953795174978475, + "learning_rate": 3.973267081708425e-06, + "loss": 0.1689, + "step": 14055 + }, + { + "epoch": 0.71, + "grad_norm": 3.3286514942564485, + "learning_rate": 3.971952902195453e-06, + "loss": 0.1663, + "step": 14056 + }, + { + "epoch": 0.71, + "grad_norm": 1.1203493039554666, + "learning_rate": 3.970638886193603e-06, + "loss": 0.1955, + "step": 14057 + }, + { + "epoch": 0.71, + "grad_norm": 0.8888136001531051, + "learning_rate": 3.969325033738509e-06, + "loss": 0.1964, + "step": 14058 + }, + { + "epoch": 0.71, + "grad_norm": 0.795777988010517, + "learning_rate": 3.968011344865819e-06, + "loss": 0.1531, + "step": 14059 + }, + { + "epoch": 0.71, + "grad_norm": 0.7770520518118822, + "learning_rate": 3.9666978196111575e-06, + "loss": 0.1632, + "step": 14060 + }, + { + "epoch": 0.72, + "grad_norm": 0.8263785189625037, + "learning_rate": 3.965384458010157e-06, + "loss": 0.1834, + "step": 14061 + }, + { + "epoch": 0.72, + "grad_norm": 0.854320039391229, + "learning_rate": 3.964071260098446e-06, + "loss": 0.1508, + "step": 14062 + }, + { + "epoch": 0.72, + "grad_norm": 1.3807512839982112, + "learning_rate": 3.962758225911646e-06, + "loss": 0.1575, + "step": 14063 + }, + { + "epoch": 0.72, + "grad_norm": 1.1555728871664732, + "learning_rate": 3.961445355485366e-06, + "loss": 0.1758, + "step": 14064 + }, + { + "epoch": 0.72, + "grad_norm": 0.8123345828753771, + "learning_rate": 3.960132648855226e-06, + "loss": 0.1758, + "step": 14065 + }, + { + "epoch": 0.72, + "grad_norm": 0.9460660456671849, + "learning_rate": 3.958820106056826e-06, + "loss": 0.156, + "step": 14066 + }, + { + "epoch": 0.72, + "grad_norm": 1.0004887583666053, + "learning_rate": 3.957507727125775e-06, + "loss": 0.1608, + "step": 14067 + }, + { + "epoch": 0.72, + "grad_norm": 1.3035912785558115, + "learning_rate": 3.956195512097664e-06, + "loss": 0.1651, + "step": 14068 + }, + { + "epoch": 0.72, + "grad_norm": 1.0046179848207974, + "learning_rate": 3.954883461008091e-06, + "loss": 0.1829, + "step": 14069 + }, + { + "epoch": 0.72, + "grad_norm": 0.990047289247749, + "learning_rate": 3.953571573892646e-06, + "loss": 0.1758, + "step": 14070 + }, + { + "epoch": 0.72, + "grad_norm": 0.8181189145002394, + "learning_rate": 3.9522598507869166e-06, + "loss": 0.1539, + "step": 14071 + }, + { + "epoch": 0.72, + "grad_norm": 0.8046299088957247, + "learning_rate": 3.95094829172648e-06, + "loss": 0.1725, + "step": 14072 + }, + { + "epoch": 0.72, + "grad_norm": 1.5116421607576094, + "learning_rate": 3.949636896746911e-06, + "loss": 0.1514, + "step": 14073 + }, + { + "epoch": 0.72, + "grad_norm": 1.1942340373605995, + "learning_rate": 3.948325665883785e-06, + "loss": 0.1796, + "step": 14074 + }, + { + "epoch": 0.72, + "grad_norm": 1.0544986056562093, + "learning_rate": 3.947014599172664e-06, + "loss": 0.1786, + "step": 14075 + }, + { + "epoch": 0.72, + "grad_norm": 1.336394823926345, + "learning_rate": 3.945703696649117e-06, + "loss": 0.1519, + "step": 14076 + }, + { + "epoch": 0.72, + "grad_norm": 0.944020747222741, + "learning_rate": 3.944392958348696e-06, + "loss": 0.1733, + "step": 14077 + }, + { + "epoch": 0.72, + "grad_norm": 1.8344168292689214, + "learning_rate": 3.943082384306958e-06, + "loss": 0.1558, + "step": 14078 + }, + { + "epoch": 0.72, + "grad_norm": 0.9462982215603182, + "learning_rate": 3.941771974559453e-06, + "loss": 0.1849, + "step": 14079 + }, + { + "epoch": 0.72, + "grad_norm": 1.1478756672396018, + "learning_rate": 3.940461729141728e-06, + "loss": 0.1716, + "step": 14080 + }, + { + "epoch": 0.72, + "grad_norm": 1.291699795537521, + "learning_rate": 3.939151648089317e-06, + "loss": 0.138, + "step": 14081 + }, + { + "epoch": 0.72, + "grad_norm": 1.4757223942660185, + "learning_rate": 3.937841731437765e-06, + "loss": 0.176, + "step": 14082 + }, + { + "epoch": 0.72, + "grad_norm": 0.9795969693563553, + "learning_rate": 3.936531979222593e-06, + "loss": 0.1532, + "step": 14083 + }, + { + "epoch": 0.72, + "grad_norm": 1.7695703125447169, + "learning_rate": 3.935222391479339e-06, + "loss": 0.1525, + "step": 14084 + }, + { + "epoch": 0.72, + "grad_norm": 0.8807274886889117, + "learning_rate": 3.933912968243515e-06, + "loss": 0.1697, + "step": 14085 + }, + { + "epoch": 0.72, + "grad_norm": 1.17677908764498, + "learning_rate": 3.9326037095506486e-06, + "loss": 0.1619, + "step": 14086 + }, + { + "epoch": 0.72, + "grad_norm": 0.8695497729024724, + "learning_rate": 3.931294615436245e-06, + "loss": 0.1745, + "step": 14087 + }, + { + "epoch": 0.72, + "grad_norm": 1.015175994237581, + "learning_rate": 3.929985685935819e-06, + "loss": 0.1685, + "step": 14088 + }, + { + "epoch": 0.72, + "grad_norm": 1.6135670353009806, + "learning_rate": 3.928676921084877e-06, + "loss": 0.177, + "step": 14089 + }, + { + "epoch": 0.72, + "grad_norm": 1.0624513862242058, + "learning_rate": 3.9273683209189115e-06, + "loss": 0.1712, + "step": 14090 + }, + { + "epoch": 0.72, + "grad_norm": 0.9178673325700115, + "learning_rate": 3.926059885473429e-06, + "loss": 0.1617, + "step": 14091 + }, + { + "epoch": 0.72, + "grad_norm": 1.0849630238704142, + "learning_rate": 3.9247516147839105e-06, + "loss": 0.1735, + "step": 14092 + }, + { + "epoch": 0.72, + "grad_norm": 0.9923777691551956, + "learning_rate": 3.923443508885851e-06, + "loss": 0.1583, + "step": 14093 + }, + { + "epoch": 0.72, + "grad_norm": 1.2803548963051437, + "learning_rate": 3.922135567814726e-06, + "loss": 0.1708, + "step": 14094 + }, + { + "epoch": 0.72, + "grad_norm": 0.9130041610458673, + "learning_rate": 3.920827791606018e-06, + "loss": 0.1616, + "step": 14095 + }, + { + "epoch": 0.72, + "grad_norm": 1.0466901162138669, + "learning_rate": 3.919520180295199e-06, + "loss": 0.1615, + "step": 14096 + }, + { + "epoch": 0.72, + "grad_norm": 1.0599024632729042, + "learning_rate": 3.918212733917742e-06, + "loss": 0.1568, + "step": 14097 + }, + { + "epoch": 0.72, + "grad_norm": 1.1164968866332436, + "learning_rate": 3.9169054525091045e-06, + "loss": 0.1734, + "step": 14098 + }, + { + "epoch": 0.72, + "grad_norm": 0.9097229462116942, + "learning_rate": 3.915598336104754e-06, + "loss": 0.1647, + "step": 14099 + }, + { + "epoch": 0.72, + "grad_norm": 0.9607652166214575, + "learning_rate": 3.914291384740139e-06, + "loss": 0.1484, + "step": 14100 + }, + { + "epoch": 0.72, + "grad_norm": 1.1161152426737857, + "learning_rate": 3.912984598450716e-06, + "loss": 0.1711, + "step": 14101 + }, + { + "epoch": 0.72, + "grad_norm": 1.0838039029190736, + "learning_rate": 3.9116779772719274e-06, + "loss": 0.1828, + "step": 14102 + }, + { + "epoch": 0.72, + "grad_norm": 1.396753142997566, + "learning_rate": 3.91037152123922e-06, + "loss": 0.1637, + "step": 14103 + }, + { + "epoch": 0.72, + "grad_norm": 1.299828907754022, + "learning_rate": 3.9090652303880265e-06, + "loss": 0.1842, + "step": 14104 + }, + { + "epoch": 0.72, + "grad_norm": 1.5028157315567607, + "learning_rate": 3.907759104753782e-06, + "loss": 0.1568, + "step": 14105 + }, + { + "epoch": 0.72, + "grad_norm": 1.1985317318332738, + "learning_rate": 3.90645314437192e-06, + "loss": 0.1913, + "step": 14106 + }, + { + "epoch": 0.72, + "grad_norm": 1.251572368953231, + "learning_rate": 3.905147349277857e-06, + "loss": 0.1689, + "step": 14107 + }, + { + "epoch": 0.72, + "grad_norm": 1.3102618168442373, + "learning_rate": 3.9038417195070196e-06, + "loss": 0.1952, + "step": 14108 + }, + { + "epoch": 0.72, + "grad_norm": 1.0520810466721868, + "learning_rate": 3.902536255094816e-06, + "loss": 0.1655, + "step": 14109 + }, + { + "epoch": 0.72, + "grad_norm": 0.7655259367920346, + "learning_rate": 3.901230956076665e-06, + "loss": 0.165, + "step": 14110 + }, + { + "epoch": 0.72, + "grad_norm": 1.2413659929656236, + "learning_rate": 3.899925822487965e-06, + "loss": 0.1652, + "step": 14111 + }, + { + "epoch": 0.72, + "grad_norm": 0.9601156709495291, + "learning_rate": 3.898620854364126e-06, + "loss": 0.182, + "step": 14112 + }, + { + "epoch": 0.72, + "grad_norm": 1.0182150466816644, + "learning_rate": 3.897316051740536e-06, + "loss": 0.1802, + "step": 14113 + }, + { + "epoch": 0.72, + "grad_norm": 0.9465330640666286, + "learning_rate": 3.896011414652593e-06, + "loss": 0.1558, + "step": 14114 + }, + { + "epoch": 0.72, + "grad_norm": 0.8495037191629528, + "learning_rate": 3.894706943135686e-06, + "loss": 0.1619, + "step": 14115 + }, + { + "epoch": 0.72, + "grad_norm": 3.749177420998462, + "learning_rate": 3.893402637225201e-06, + "loss": 0.1738, + "step": 14116 + }, + { + "epoch": 0.72, + "grad_norm": 1.0153253780913176, + "learning_rate": 3.892098496956511e-06, + "loss": 0.1617, + "step": 14117 + }, + { + "epoch": 0.72, + "grad_norm": 1.2844861536349312, + "learning_rate": 3.890794522364998e-06, + "loss": 0.1844, + "step": 14118 + }, + { + "epoch": 0.72, + "grad_norm": 1.1477401726044283, + "learning_rate": 3.8894907134860236e-06, + "loss": 0.1761, + "step": 14119 + }, + { + "epoch": 0.72, + "grad_norm": 0.9576356068272893, + "learning_rate": 3.888187070354964e-06, + "loss": 0.1847, + "step": 14120 + }, + { + "epoch": 0.72, + "grad_norm": 0.8866710011707144, + "learning_rate": 3.886883593007171e-06, + "loss": 0.1629, + "step": 14121 + }, + { + "epoch": 0.72, + "grad_norm": 0.853715620160077, + "learning_rate": 3.885580281478007e-06, + "loss": 0.1636, + "step": 14122 + }, + { + "epoch": 0.72, + "grad_norm": 1.0522413642603576, + "learning_rate": 3.8842771358028254e-06, + "loss": 0.1828, + "step": 14123 + }, + { + "epoch": 0.72, + "grad_norm": 0.7705310953095824, + "learning_rate": 3.882974156016968e-06, + "loss": 0.146, + "step": 14124 + }, + { + "epoch": 0.72, + "grad_norm": 0.9935591406214743, + "learning_rate": 3.881671342155786e-06, + "loss": 0.1927, + "step": 14125 + }, + { + "epoch": 0.72, + "grad_norm": 1.0289400849388903, + "learning_rate": 3.880368694254612e-06, + "loss": 0.1944, + "step": 14126 + }, + { + "epoch": 0.72, + "grad_norm": 1.075369092240746, + "learning_rate": 3.879066212348786e-06, + "loss": 0.1874, + "step": 14127 + }, + { + "epoch": 0.72, + "grad_norm": 1.1412571748621356, + "learning_rate": 3.877763896473629e-06, + "loss": 0.1736, + "step": 14128 + }, + { + "epoch": 0.72, + "grad_norm": 0.8994203625034884, + "learning_rate": 3.876461746664478e-06, + "loss": 0.1513, + "step": 14129 + }, + { + "epoch": 0.72, + "grad_norm": 0.8771349317185486, + "learning_rate": 3.875159762956644e-06, + "loss": 0.178, + "step": 14130 + }, + { + "epoch": 0.72, + "grad_norm": 1.0027658517118203, + "learning_rate": 3.873857945385447e-06, + "loss": 0.1679, + "step": 14131 + }, + { + "epoch": 0.72, + "grad_norm": 0.87315016577878, + "learning_rate": 3.8725562939862e-06, + "loss": 0.1711, + "step": 14132 + }, + { + "epoch": 0.72, + "grad_norm": 0.8900586829521311, + "learning_rate": 3.871254808794213e-06, + "loss": 0.1691, + "step": 14133 + }, + { + "epoch": 0.72, + "grad_norm": 0.8966203204273094, + "learning_rate": 3.869953489844781e-06, + "loss": 0.1407, + "step": 14134 + }, + { + "epoch": 0.72, + "grad_norm": 0.998730627617709, + "learning_rate": 3.868652337173211e-06, + "loss": 0.1883, + "step": 14135 + }, + { + "epoch": 0.72, + "grad_norm": 1.7719003043359163, + "learning_rate": 3.8673513508147885e-06, + "loss": 0.161, + "step": 14136 + }, + { + "epoch": 0.72, + "grad_norm": 0.8156765100824103, + "learning_rate": 3.866050530804811e-06, + "loss": 0.141, + "step": 14137 + }, + { + "epoch": 0.72, + "grad_norm": 0.8042549971576102, + "learning_rate": 3.864749877178556e-06, + "loss": 0.1637, + "step": 14138 + }, + { + "epoch": 0.72, + "grad_norm": 0.8790603683540212, + "learning_rate": 3.86344938997131e-06, + "loss": 0.1742, + "step": 14139 + }, + { + "epoch": 0.72, + "grad_norm": 1.54209637857875, + "learning_rate": 3.862149069218343e-06, + "loss": 0.176, + "step": 14140 + }, + { + "epoch": 0.72, + "grad_norm": 1.0170286994576827, + "learning_rate": 3.8608489149549286e-06, + "loss": 0.1591, + "step": 14141 + }, + { + "epoch": 0.72, + "grad_norm": 0.9255760660296493, + "learning_rate": 3.8595489272163375e-06, + "loss": 0.181, + "step": 14142 + }, + { + "epoch": 0.72, + "grad_norm": 0.9384850020253076, + "learning_rate": 3.858249106037826e-06, + "loss": 0.1547, + "step": 14143 + }, + { + "epoch": 0.72, + "grad_norm": 1.1219354082933488, + "learning_rate": 3.856949451454658e-06, + "loss": 0.1847, + "step": 14144 + }, + { + "epoch": 0.72, + "grad_norm": 1.3725720945269748, + "learning_rate": 3.855649963502078e-06, + "loss": 0.2026, + "step": 14145 + }, + { + "epoch": 0.72, + "grad_norm": 2.826362384690696, + "learning_rate": 3.854350642215344e-06, + "loss": 0.2124, + "step": 14146 + }, + { + "epoch": 0.72, + "grad_norm": 1.0420657943593228, + "learning_rate": 3.853051487629693e-06, + "loss": 0.1555, + "step": 14147 + }, + { + "epoch": 0.72, + "grad_norm": 1.1153510092050685, + "learning_rate": 3.851752499780368e-06, + "loss": 0.1651, + "step": 14148 + }, + { + "epoch": 0.72, + "grad_norm": 0.9459742492520735, + "learning_rate": 3.8504536787026025e-06, + "loss": 0.1678, + "step": 14149 + }, + { + "epoch": 0.72, + "grad_norm": 1.0397145526569862, + "learning_rate": 3.8491550244316326e-06, + "loss": 0.171, + "step": 14150 + }, + { + "epoch": 0.72, + "grad_norm": 0.9655573592566931, + "learning_rate": 3.847856537002677e-06, + "loss": 0.1736, + "step": 14151 + }, + { + "epoch": 0.72, + "grad_norm": 0.9220548799507059, + "learning_rate": 3.846558216450962e-06, + "loss": 0.1524, + "step": 14152 + }, + { + "epoch": 0.72, + "grad_norm": 1.0456814200250961, + "learning_rate": 3.845260062811701e-06, + "loss": 0.1764, + "step": 14153 + }, + { + "epoch": 0.72, + "grad_norm": 0.9412885631888973, + "learning_rate": 3.843962076120111e-06, + "loss": 0.1722, + "step": 14154 + }, + { + "epoch": 0.72, + "grad_norm": 0.8769782231890678, + "learning_rate": 3.842664256411393e-06, + "loss": 0.1711, + "step": 14155 + }, + { + "epoch": 0.72, + "grad_norm": 0.9941968030192654, + "learning_rate": 3.841366603720761e-06, + "loss": 0.1581, + "step": 14156 + }, + { + "epoch": 0.72, + "grad_norm": 0.81692305819075, + "learning_rate": 3.840069118083403e-06, + "loss": 0.1729, + "step": 14157 + }, + { + "epoch": 0.72, + "grad_norm": 1.5179566558436612, + "learning_rate": 3.838771799534518e-06, + "loss": 0.1832, + "step": 14158 + }, + { + "epoch": 0.72, + "grad_norm": 0.9901563215065169, + "learning_rate": 3.837474648109298e-06, + "loss": 0.1637, + "step": 14159 + }, + { + "epoch": 0.72, + "grad_norm": 1.6951338937784872, + "learning_rate": 3.836177663842925e-06, + "loss": 0.1659, + "step": 14160 + }, + { + "epoch": 0.72, + "grad_norm": 0.9560269248745517, + "learning_rate": 3.834880846770584e-06, + "loss": 0.1683, + "step": 14161 + }, + { + "epoch": 0.72, + "grad_norm": 1.1981393142829408, + "learning_rate": 3.833584196927443e-06, + "loss": 0.1925, + "step": 14162 + }, + { + "epoch": 0.72, + "grad_norm": 0.737439399910469, + "learning_rate": 3.8322877143486835e-06, + "loss": 0.1732, + "step": 14163 + }, + { + "epoch": 0.72, + "grad_norm": 0.9970214572536786, + "learning_rate": 3.830991399069466e-06, + "loss": 0.1662, + "step": 14164 + }, + { + "epoch": 0.72, + "grad_norm": 0.9038070805127413, + "learning_rate": 3.829695251124953e-06, + "loss": 0.1659, + "step": 14165 + }, + { + "epoch": 0.72, + "grad_norm": 0.8343905841105865, + "learning_rate": 3.828399270550306e-06, + "loss": 0.1765, + "step": 14166 + }, + { + "epoch": 0.72, + "grad_norm": 0.7514551033421455, + "learning_rate": 3.827103457380681e-06, + "loss": 0.1569, + "step": 14167 + }, + { + "epoch": 0.72, + "grad_norm": 1.1434878349475175, + "learning_rate": 3.82580781165122e-06, + "loss": 0.1675, + "step": 14168 + }, + { + "epoch": 0.72, + "grad_norm": 1.0057817293164968, + "learning_rate": 3.824512333397073e-06, + "loss": 0.185, + "step": 14169 + }, + { + "epoch": 0.72, + "grad_norm": 1.2729546710261415, + "learning_rate": 3.823217022653376e-06, + "loss": 0.1975, + "step": 14170 + }, + { + "epoch": 0.72, + "grad_norm": 0.81564027380783, + "learning_rate": 3.821921879455268e-06, + "loss": 0.1631, + "step": 14171 + }, + { + "epoch": 0.72, + "grad_norm": 0.8976442491961399, + "learning_rate": 3.820626903837875e-06, + "loss": 0.1569, + "step": 14172 + }, + { + "epoch": 0.72, + "grad_norm": 0.9607703608651237, + "learning_rate": 3.81933209583633e-06, + "loss": 0.1783, + "step": 14173 + }, + { + "epoch": 0.72, + "grad_norm": 0.98830563137594, + "learning_rate": 3.818037455485748e-06, + "loss": 0.195, + "step": 14174 + }, + { + "epoch": 0.72, + "grad_norm": 0.967548832430743, + "learning_rate": 3.816742982821249e-06, + "loss": 0.1873, + "step": 14175 + }, + { + "epoch": 0.72, + "grad_norm": 0.9725562983600868, + "learning_rate": 3.815448677877949e-06, + "loss": 0.1637, + "step": 14176 + }, + { + "epoch": 0.72, + "grad_norm": 0.9534466925619756, + "learning_rate": 3.8141545406909486e-06, + "loss": 0.2168, + "step": 14177 + }, + { + "epoch": 0.72, + "grad_norm": 0.9022686390988373, + "learning_rate": 3.8128605712953606e-06, + "loss": 0.1771, + "step": 14178 + }, + { + "epoch": 0.72, + "grad_norm": 0.7895997669461096, + "learning_rate": 3.811566769726275e-06, + "loss": 0.1788, + "step": 14179 + }, + { + "epoch": 0.72, + "grad_norm": 1.026042586429769, + "learning_rate": 3.810273136018793e-06, + "loss": 0.1561, + "step": 14180 + }, + { + "epoch": 0.72, + "grad_norm": 0.7893571675039319, + "learning_rate": 3.8089796702079996e-06, + "loss": 0.1781, + "step": 14181 + }, + { + "epoch": 0.72, + "grad_norm": 1.6919757359686876, + "learning_rate": 3.8076863723289847e-06, + "loss": 0.1631, + "step": 14182 + }, + { + "epoch": 0.72, + "grad_norm": 0.862175568391283, + "learning_rate": 3.8063932424168236e-06, + "loss": 0.1673, + "step": 14183 + }, + { + "epoch": 0.72, + "grad_norm": 0.8485517752972715, + "learning_rate": 3.8051002805065964e-06, + "loss": 0.1934, + "step": 14184 + }, + { + "epoch": 0.72, + "grad_norm": 1.9233190973713077, + "learning_rate": 3.803807486633373e-06, + "loss": 0.1653, + "step": 14185 + }, + { + "epoch": 0.72, + "grad_norm": 1.1319349294113454, + "learning_rate": 3.802514860832225e-06, + "loss": 0.1909, + "step": 14186 + }, + { + "epoch": 0.72, + "grad_norm": 0.8653577480393039, + "learning_rate": 3.8012224031382084e-06, + "loss": 0.1779, + "step": 14187 + }, + { + "epoch": 0.72, + "grad_norm": 1.1929076075384082, + "learning_rate": 3.7999301135863875e-06, + "loss": 0.1731, + "step": 14188 + }, + { + "epoch": 0.72, + "grad_norm": 0.9413376467983017, + "learning_rate": 3.7986379922118087e-06, + "loss": 0.1551, + "step": 14189 + }, + { + "epoch": 0.72, + "grad_norm": 0.9052220902235042, + "learning_rate": 3.797346039049529e-06, + "loss": 0.1828, + "step": 14190 + }, + { + "epoch": 0.72, + "grad_norm": 0.9246863882687407, + "learning_rate": 3.7960542541345836e-06, + "loss": 0.1973, + "step": 14191 + }, + { + "epoch": 0.72, + "grad_norm": 0.8486898931706738, + "learning_rate": 3.7947626375020173e-06, + "loss": 0.1641, + "step": 14192 + }, + { + "epoch": 0.72, + "grad_norm": 1.2499576012040685, + "learning_rate": 3.793471189186869e-06, + "loss": 0.1742, + "step": 14193 + }, + { + "epoch": 0.72, + "grad_norm": 0.8500478491305098, + "learning_rate": 3.792179909224162e-06, + "loss": 0.1686, + "step": 14194 + }, + { + "epoch": 0.72, + "grad_norm": 0.9980383111334427, + "learning_rate": 3.7908887976489284e-06, + "loss": 0.2013, + "step": 14195 + }, + { + "epoch": 0.72, + "grad_norm": 1.557717958905487, + "learning_rate": 3.789597854496183e-06, + "loss": 0.1621, + "step": 14196 + }, + { + "epoch": 0.72, + "grad_norm": 1.172579101340141, + "learning_rate": 3.7883070798009503e-06, + "loss": 0.1838, + "step": 14197 + }, + { + "epoch": 0.72, + "grad_norm": 1.5335676877341493, + "learning_rate": 3.7870164735982363e-06, + "loss": 0.1741, + "step": 14198 + }, + { + "epoch": 0.72, + "grad_norm": 1.236872824666607, + "learning_rate": 3.7857260359230543e-06, + "loss": 0.1668, + "step": 14199 + }, + { + "epoch": 0.72, + "grad_norm": 1.0325130618195033, + "learning_rate": 3.7844357668104005e-06, + "loss": 0.1567, + "step": 14200 + }, + { + "epoch": 0.72, + "grad_norm": 1.1342609581997496, + "learning_rate": 3.7831456662952783e-06, + "loss": 0.1772, + "step": 14201 + }, + { + "epoch": 0.72, + "grad_norm": 1.4083831164497156, + "learning_rate": 3.7818557344126807e-06, + "loss": 0.1642, + "step": 14202 + }, + { + "epoch": 0.72, + "grad_norm": 0.8579470681672271, + "learning_rate": 3.7805659711976007e-06, + "loss": 0.1697, + "step": 14203 + }, + { + "epoch": 0.72, + "grad_norm": 1.0276154408565137, + "learning_rate": 3.779276376685017e-06, + "loss": 0.1614, + "step": 14204 + }, + { + "epoch": 0.72, + "grad_norm": 1.1292194703813447, + "learning_rate": 3.7779869509099166e-06, + "loss": 0.1716, + "step": 14205 + }, + { + "epoch": 0.72, + "grad_norm": 0.9358439759067537, + "learning_rate": 3.7766976939072673e-06, + "loss": 0.173, + "step": 14206 + }, + { + "epoch": 0.72, + "grad_norm": 0.7796355039284816, + "learning_rate": 3.7754086057120486e-06, + "loss": 0.1449, + "step": 14207 + }, + { + "epoch": 0.72, + "grad_norm": 1.056734691141821, + "learning_rate": 3.77411968635922e-06, + "loss": 0.1628, + "step": 14208 + }, + { + "epoch": 0.72, + "grad_norm": 0.9221052651830727, + "learning_rate": 3.772830935883749e-06, + "loss": 0.1563, + "step": 14209 + }, + { + "epoch": 0.72, + "grad_norm": 1.0631227692867489, + "learning_rate": 3.7715423543205875e-06, + "loss": 0.1683, + "step": 14210 + }, + { + "epoch": 0.72, + "grad_norm": 0.7410763969252387, + "learning_rate": 3.7702539417046923e-06, + "loss": 0.1603, + "step": 14211 + }, + { + "epoch": 0.72, + "grad_norm": 0.9452639562381826, + "learning_rate": 3.7689656980710132e-06, + "loss": 0.189, + "step": 14212 + }, + { + "epoch": 0.72, + "grad_norm": 1.6908879201876998, + "learning_rate": 3.7676776234544876e-06, + "loss": 0.1638, + "step": 14213 + }, + { + "epoch": 0.72, + "grad_norm": 1.0159422655316006, + "learning_rate": 3.7663897178900634e-06, + "loss": 0.1612, + "step": 14214 + }, + { + "epoch": 0.72, + "grad_norm": 0.870214529701722, + "learning_rate": 3.7651019814126656e-06, + "loss": 0.1632, + "step": 14215 + }, + { + "epoch": 0.72, + "grad_norm": 1.3077317836042965, + "learning_rate": 3.763814414057233e-06, + "loss": 0.1788, + "step": 14216 + }, + { + "epoch": 0.72, + "grad_norm": 2.0002530570640173, + "learning_rate": 3.7625270158586824e-06, + "loss": 0.1536, + "step": 14217 + }, + { + "epoch": 0.72, + "grad_norm": 1.3738436004166406, + "learning_rate": 3.761239786851939e-06, + "loss": 0.1561, + "step": 14218 + }, + { + "epoch": 0.72, + "grad_norm": 1.354949019454525, + "learning_rate": 3.7599527270719183e-06, + "loss": 0.1437, + "step": 14219 + }, + { + "epoch": 0.72, + "grad_norm": 1.5472545692675839, + "learning_rate": 3.7586658365535367e-06, + "loss": 0.1568, + "step": 14220 + }, + { + "epoch": 0.72, + "grad_norm": 0.9656835326636025, + "learning_rate": 3.757379115331693e-06, + "loss": 0.1802, + "step": 14221 + }, + { + "epoch": 0.72, + "grad_norm": 0.9776349541649698, + "learning_rate": 3.756092563441297e-06, + "loss": 0.1656, + "step": 14222 + }, + { + "epoch": 0.72, + "grad_norm": 0.8852204987916643, + "learning_rate": 3.754806180917239e-06, + "loss": 0.1921, + "step": 14223 + }, + { + "epoch": 0.72, + "grad_norm": 0.9167565711720689, + "learning_rate": 3.75351996779442e-06, + "loss": 0.1775, + "step": 14224 + }, + { + "epoch": 0.72, + "grad_norm": 0.8952438830379588, + "learning_rate": 3.752233924107721e-06, + "loss": 0.16, + "step": 14225 + }, + { + "epoch": 0.72, + "grad_norm": 1.2955265466290538, + "learning_rate": 3.7509480498920325e-06, + "loss": 0.1617, + "step": 14226 + }, + { + "epoch": 0.72, + "grad_norm": 0.9378117311047315, + "learning_rate": 3.749662345182229e-06, + "loss": 0.1589, + "step": 14227 + }, + { + "epoch": 0.72, + "grad_norm": 1.1886155582753781, + "learning_rate": 3.7483768100131857e-06, + "loss": 0.1915, + "step": 14228 + }, + { + "epoch": 0.72, + "grad_norm": 1.0826107082122094, + "learning_rate": 3.7470914444197793e-06, + "loss": 0.1709, + "step": 14229 + }, + { + "epoch": 0.72, + "grad_norm": 1.0393811372304311, + "learning_rate": 3.745806248436866e-06, + "loss": 0.1554, + "step": 14230 + }, + { + "epoch": 0.72, + "grad_norm": 1.236921987649103, + "learning_rate": 3.7445212220993167e-06, + "loss": 0.1815, + "step": 14231 + }, + { + "epoch": 0.72, + "grad_norm": 1.2104292988068672, + "learning_rate": 3.743236365441978e-06, + "loss": 0.2004, + "step": 14232 + }, + { + "epoch": 0.72, + "grad_norm": 1.6439370614882838, + "learning_rate": 3.7419516784997102e-06, + "loss": 0.1716, + "step": 14233 + }, + { + "epoch": 0.72, + "grad_norm": 0.8458498080848184, + "learning_rate": 3.740667161307352e-06, + "loss": 0.1576, + "step": 14234 + }, + { + "epoch": 0.72, + "grad_norm": 1.1550376497930346, + "learning_rate": 3.7393828138997543e-06, + "loss": 0.1706, + "step": 14235 + }, + { + "epoch": 0.72, + "grad_norm": 1.3056922913399747, + "learning_rate": 3.7380986363117488e-06, + "loss": 0.1802, + "step": 14236 + }, + { + "epoch": 0.72, + "grad_norm": 1.0611836357952285, + "learning_rate": 3.7368146285781716e-06, + "loss": 0.1553, + "step": 14237 + }, + { + "epoch": 0.72, + "grad_norm": 1.5617880165016602, + "learning_rate": 3.73553079073385e-06, + "loss": 0.173, + "step": 14238 + }, + { + "epoch": 0.72, + "grad_norm": 1.1187420114648672, + "learning_rate": 3.7342471228136148e-06, + "loss": 0.1655, + "step": 14239 + }, + { + "epoch": 0.72, + "grad_norm": 0.8656618888307698, + "learning_rate": 3.732963624852275e-06, + "loss": 0.1645, + "step": 14240 + }, + { + "epoch": 0.72, + "grad_norm": 1.0527586977075543, + "learning_rate": 3.7316802968846555e-06, + "loss": 0.1905, + "step": 14241 + }, + { + "epoch": 0.72, + "grad_norm": 0.9653891403786222, + "learning_rate": 3.7303971389455584e-06, + "loss": 0.1785, + "step": 14242 + }, + { + "epoch": 0.72, + "grad_norm": 0.8007572110573549, + "learning_rate": 3.7291141510697957e-06, + "loss": 0.1631, + "step": 14243 + }, + { + "epoch": 0.72, + "grad_norm": 1.0979067825574946, + "learning_rate": 3.7278313332921634e-06, + "loss": 0.168, + "step": 14244 + }, + { + "epoch": 0.72, + "grad_norm": 1.0376817166504873, + "learning_rate": 3.72654868564746e-06, + "loss": 0.2146, + "step": 14245 + }, + { + "epoch": 0.72, + "grad_norm": 0.870826236426534, + "learning_rate": 3.7252662081704806e-06, + "loss": 0.1576, + "step": 14246 + }, + { + "epoch": 0.72, + "grad_norm": 1.1024186957206004, + "learning_rate": 3.7239839008960066e-06, + "loss": 0.1845, + "step": 14247 + }, + { + "epoch": 0.72, + "grad_norm": 0.9461416873285, + "learning_rate": 3.722701763858828e-06, + "loss": 0.1605, + "step": 14248 + }, + { + "epoch": 0.72, + "grad_norm": 0.8203311355747611, + "learning_rate": 3.721419797093715e-06, + "loss": 0.1727, + "step": 14249 + }, + { + "epoch": 0.72, + "grad_norm": 0.9325230397827498, + "learning_rate": 3.720138000635447e-06, + "loss": 0.1751, + "step": 14250 + }, + { + "epoch": 0.72, + "grad_norm": 1.115454803046617, + "learning_rate": 3.718856374518788e-06, + "loss": 0.1702, + "step": 14251 + }, + { + "epoch": 0.72, + "grad_norm": 0.8502188295421723, + "learning_rate": 3.717574918778507e-06, + "loss": 0.1666, + "step": 14252 + }, + { + "epoch": 0.72, + "grad_norm": 1.1228808366873253, + "learning_rate": 3.7162936334493594e-06, + "loss": 0.1701, + "step": 14253 + }, + { + "epoch": 0.72, + "grad_norm": 0.8689147468875956, + "learning_rate": 3.7150125185661e-06, + "loss": 0.1704, + "step": 14254 + }, + { + "epoch": 0.72, + "grad_norm": 0.9488898299242475, + "learning_rate": 3.7137315741634825e-06, + "loss": 0.1882, + "step": 14255 + }, + { + "epoch": 0.72, + "grad_norm": 1.0817092141613454, + "learning_rate": 3.7124508002762537e-06, + "loss": 0.1599, + "step": 14256 + }, + { + "epoch": 0.72, + "grad_norm": 1.166908832486912, + "learning_rate": 3.711170196939149e-06, + "loss": 0.1724, + "step": 14257 + }, + { + "epoch": 0.73, + "grad_norm": 0.8814828284152649, + "learning_rate": 3.7098897641869113e-06, + "loss": 0.1781, + "step": 14258 + }, + { + "epoch": 0.73, + "grad_norm": 0.9455121000726662, + "learning_rate": 3.7086095020542655e-06, + "loss": 0.1781, + "step": 14259 + }, + { + "epoch": 0.73, + "grad_norm": 1.1232332362340367, + "learning_rate": 3.7073294105759462e-06, + "loss": 0.1884, + "step": 14260 + }, + { + "epoch": 0.73, + "grad_norm": 0.8402922663453951, + "learning_rate": 3.706049489786667e-06, + "loss": 0.1541, + "step": 14261 + }, + { + "epoch": 0.73, + "grad_norm": 0.9859218401322686, + "learning_rate": 3.704769739721156e-06, + "loss": 0.1641, + "step": 14262 + }, + { + "epoch": 0.73, + "grad_norm": 1.110810403466431, + "learning_rate": 3.703490160414117e-06, + "loss": 0.187, + "step": 14263 + }, + { + "epoch": 0.73, + "grad_norm": 1.0070633935527822, + "learning_rate": 3.7022107519002635e-06, + "loss": 0.1602, + "step": 14264 + }, + { + "epoch": 0.73, + "grad_norm": 2.5144713975719286, + "learning_rate": 3.7009315142143033e-06, + "loss": 0.1777, + "step": 14265 + }, + { + "epoch": 0.73, + "grad_norm": 1.3375853269180695, + "learning_rate": 3.6996524473909268e-06, + "loss": 0.1619, + "step": 14266 + }, + { + "epoch": 0.73, + "grad_norm": 1.0919991003667495, + "learning_rate": 3.6983735514648376e-06, + "loss": 0.174, + "step": 14267 + }, + { + "epoch": 0.73, + "grad_norm": 1.1546100421700132, + "learning_rate": 3.697094826470717e-06, + "loss": 0.176, + "step": 14268 + }, + { + "epoch": 0.73, + "grad_norm": 0.8533756977482386, + "learning_rate": 3.6958162724432612e-06, + "loss": 0.1706, + "step": 14269 + }, + { + "epoch": 0.73, + "grad_norm": 0.8114168158613203, + "learning_rate": 3.6945378894171392e-06, + "loss": 0.174, + "step": 14270 + }, + { + "epoch": 0.73, + "grad_norm": 1.139833257653544, + "learning_rate": 3.6932596774270346e-06, + "loss": 0.1579, + "step": 14271 + }, + { + "epoch": 0.73, + "grad_norm": 1.011143112623799, + "learning_rate": 3.6919816365076165e-06, + "loss": 0.1912, + "step": 14272 + }, + { + "epoch": 0.73, + "grad_norm": 1.599161063664042, + "learning_rate": 3.6907037666935565e-06, + "loss": 0.18, + "step": 14273 + }, + { + "epoch": 0.73, + "grad_norm": 2.111474925295134, + "learning_rate": 3.6894260680195105e-06, + "loss": 0.1687, + "step": 14274 + }, + { + "epoch": 0.73, + "grad_norm": 2.071490665473469, + "learning_rate": 3.688148540520141e-06, + "loss": 0.1563, + "step": 14275 + }, + { + "epoch": 0.73, + "grad_norm": 0.9001205884583297, + "learning_rate": 3.6868711842300964e-06, + "loss": 0.1532, + "step": 14276 + }, + { + "epoch": 0.73, + "grad_norm": 1.2590964346673659, + "learning_rate": 3.6855939991840305e-06, + "loss": 0.1857, + "step": 14277 + }, + { + "epoch": 0.73, + "grad_norm": 1.4732035688485705, + "learning_rate": 3.6843169854165807e-06, + "loss": 0.176, + "step": 14278 + }, + { + "epoch": 0.73, + "grad_norm": 1.284662221148433, + "learning_rate": 3.683040142962393e-06, + "loss": 0.1924, + "step": 14279 + }, + { + "epoch": 0.73, + "grad_norm": 1.2777057044689084, + "learning_rate": 3.6817634718560947e-06, + "loss": 0.1679, + "step": 14280 + }, + { + "epoch": 0.73, + "grad_norm": 0.9109142054246077, + "learning_rate": 3.6804869721323187e-06, + "loss": 0.1671, + "step": 14281 + }, + { + "epoch": 0.73, + "grad_norm": 1.0592463944026813, + "learning_rate": 3.6792106438256937e-06, + "loss": 0.1739, + "step": 14282 + }, + { + "epoch": 0.73, + "grad_norm": 0.810647659156529, + "learning_rate": 3.6779344869708344e-06, + "loss": 0.1507, + "step": 14283 + }, + { + "epoch": 0.73, + "grad_norm": 0.9415308598837328, + "learning_rate": 3.6766585016023624e-06, + "loss": 0.1671, + "step": 14284 + }, + { + "epoch": 0.73, + "grad_norm": 1.1358383503657274, + "learning_rate": 3.6753826877548817e-06, + "loss": 0.1595, + "step": 14285 + }, + { + "epoch": 0.73, + "grad_norm": 0.9346530761399574, + "learning_rate": 3.674107045463007e-06, + "loss": 0.191, + "step": 14286 + }, + { + "epoch": 0.73, + "grad_norm": 1.466131046517711, + "learning_rate": 3.672831574761332e-06, + "loss": 0.1527, + "step": 14287 + }, + { + "epoch": 0.73, + "grad_norm": 1.2540878334454677, + "learning_rate": 3.671556275684458e-06, + "loss": 0.1754, + "step": 14288 + }, + { + "epoch": 0.73, + "grad_norm": 1.2123331434905684, + "learning_rate": 3.6702811482669776e-06, + "loss": 0.1581, + "step": 14289 + }, + { + "epoch": 0.73, + "grad_norm": 0.8083395757328803, + "learning_rate": 3.6690061925434817e-06, + "loss": 0.1594, + "step": 14290 + }, + { + "epoch": 0.73, + "grad_norm": 0.9007273446819845, + "learning_rate": 3.667731408548547e-06, + "loss": 0.1777, + "step": 14291 + }, + { + "epoch": 0.73, + "grad_norm": 1.1764483153180103, + "learning_rate": 3.6664567963167598e-06, + "loss": 0.1855, + "step": 14292 + }, + { + "epoch": 0.73, + "grad_norm": 1.0257463654360448, + "learning_rate": 3.6651823558826847e-06, + "loss": 0.1775, + "step": 14293 + }, + { + "epoch": 0.73, + "grad_norm": 1.708378143859542, + "learning_rate": 3.6639080872809007e-06, + "loss": 0.1595, + "step": 14294 + }, + { + "epoch": 0.73, + "grad_norm": 1.063857693334117, + "learning_rate": 3.662633990545964e-06, + "loss": 0.1604, + "step": 14295 + }, + { + "epoch": 0.73, + "grad_norm": 1.015383341678732, + "learning_rate": 3.6613600657124416e-06, + "loss": 0.1715, + "step": 14296 + }, + { + "epoch": 0.73, + "grad_norm": 1.2290575016512464, + "learning_rate": 3.6600863128148823e-06, + "loss": 0.1729, + "step": 14297 + }, + { + "epoch": 0.73, + "grad_norm": 0.937283228394862, + "learning_rate": 3.6588127318878398e-06, + "loss": 0.1839, + "step": 14298 + }, + { + "epoch": 0.73, + "grad_norm": 1.0972501980810836, + "learning_rate": 3.657539322965863e-06, + "loss": 0.151, + "step": 14299 + }, + { + "epoch": 0.73, + "grad_norm": 1.2100849301088383, + "learning_rate": 3.6562660860834866e-06, + "loss": 0.1771, + "step": 14300 + }, + { + "epoch": 0.73, + "grad_norm": 0.9269535849349061, + "learning_rate": 3.654993021275255e-06, + "loss": 0.1659, + "step": 14301 + }, + { + "epoch": 0.73, + "grad_norm": 0.989396120662106, + "learning_rate": 3.6537201285756927e-06, + "loss": 0.1663, + "step": 14302 + }, + { + "epoch": 0.73, + "grad_norm": 0.9574458380861439, + "learning_rate": 3.652447408019334e-06, + "loss": 0.1689, + "step": 14303 + }, + { + "epoch": 0.73, + "grad_norm": 1.7094279189074484, + "learning_rate": 3.651174859640694e-06, + "loss": 0.1878, + "step": 14304 + }, + { + "epoch": 0.73, + "grad_norm": 1.1206309986972018, + "learning_rate": 3.6499024834742967e-06, + "loss": 0.1781, + "step": 14305 + }, + { + "epoch": 0.73, + "grad_norm": 1.9028885348627, + "learning_rate": 3.6486302795546515e-06, + "loss": 0.1713, + "step": 14306 + }, + { + "epoch": 0.73, + "grad_norm": 0.9185930338847872, + "learning_rate": 3.6473582479162684e-06, + "loss": 0.1493, + "step": 14307 + }, + { + "epoch": 0.73, + "grad_norm": 0.8589657424822319, + "learning_rate": 3.6460863885936514e-06, + "loss": 0.1684, + "step": 14308 + }, + { + "epoch": 0.73, + "grad_norm": 0.9463367078690121, + "learning_rate": 3.644814701621303e-06, + "loss": 0.1705, + "step": 14309 + }, + { + "epoch": 0.73, + "grad_norm": 0.9896721370691746, + "learning_rate": 3.6435431870337123e-06, + "loss": 0.1743, + "step": 14310 + }, + { + "epoch": 0.73, + "grad_norm": 1.1614616682775758, + "learning_rate": 3.642271844865375e-06, + "loss": 0.1781, + "step": 14311 + }, + { + "epoch": 0.73, + "grad_norm": 0.9824090286288907, + "learning_rate": 3.641000675150769e-06, + "loss": 0.1646, + "step": 14312 + }, + { + "epoch": 0.73, + "grad_norm": 0.7977130453754009, + "learning_rate": 3.639729677924382e-06, + "loss": 0.1507, + "step": 14313 + }, + { + "epoch": 0.73, + "grad_norm": 0.8735573137115935, + "learning_rate": 3.638458853220683e-06, + "loss": 0.1643, + "step": 14314 + }, + { + "epoch": 0.73, + "grad_norm": 0.9780952797888448, + "learning_rate": 3.637188201074149e-06, + "loss": 0.1755, + "step": 14315 + }, + { + "epoch": 0.73, + "grad_norm": 0.9590069219781341, + "learning_rate": 3.635917721519245e-06, + "loss": 0.1515, + "step": 14316 + }, + { + "epoch": 0.73, + "grad_norm": 0.7895245555191475, + "learning_rate": 3.634647414590431e-06, + "loss": 0.1827, + "step": 14317 + }, + { + "epoch": 0.73, + "grad_norm": 0.844476378491929, + "learning_rate": 3.6333772803221677e-06, + "loss": 0.1677, + "step": 14318 + }, + { + "epoch": 0.73, + "grad_norm": 1.05269931311854, + "learning_rate": 3.632107318748903e-06, + "loss": 0.1904, + "step": 14319 + }, + { + "epoch": 0.73, + "grad_norm": 1.636618686759367, + "learning_rate": 3.630837529905089e-06, + "loss": 0.1683, + "step": 14320 + }, + { + "epoch": 0.73, + "grad_norm": 0.8617969918168314, + "learning_rate": 3.6295679138251637e-06, + "loss": 0.1592, + "step": 14321 + }, + { + "epoch": 0.73, + "grad_norm": 0.9486332624925874, + "learning_rate": 3.628298470543572e-06, + "loss": 0.1892, + "step": 14322 + }, + { + "epoch": 0.73, + "grad_norm": 1.4453375339022392, + "learning_rate": 3.6270292000947417e-06, + "loss": 0.1761, + "step": 14323 + }, + { + "epoch": 0.73, + "grad_norm": 1.146975579000541, + "learning_rate": 3.625760102513103e-06, + "loss": 0.1698, + "step": 14324 + }, + { + "epoch": 0.73, + "grad_norm": 1.6093006977976625, + "learning_rate": 3.6244911778330826e-06, + "loss": 0.1641, + "step": 14325 + }, + { + "epoch": 0.73, + "grad_norm": 1.1152540194062124, + "learning_rate": 3.6232224260891012e-06, + "loss": 0.1529, + "step": 14326 + }, + { + "epoch": 0.73, + "grad_norm": 0.8843950514675752, + "learning_rate": 3.621953847315569e-06, + "loss": 0.1527, + "step": 14327 + }, + { + "epoch": 0.73, + "grad_norm": 1.096107420351601, + "learning_rate": 3.620685441546903e-06, + "loss": 0.1987, + "step": 14328 + }, + { + "epoch": 0.73, + "grad_norm": 1.1006170106896094, + "learning_rate": 3.6194172088175005e-06, + "loss": 0.1886, + "step": 14329 + }, + { + "epoch": 0.73, + "grad_norm": 1.026145337870011, + "learning_rate": 3.6181491491617706e-06, + "loss": 0.1496, + "step": 14330 + }, + { + "epoch": 0.73, + "grad_norm": 1.029899205795693, + "learning_rate": 3.6168812626141e-06, + "loss": 0.1987, + "step": 14331 + }, + { + "epoch": 0.73, + "grad_norm": 1.0966232569516159, + "learning_rate": 3.6156135492088915e-06, + "loss": 0.1475, + "step": 14332 + }, + { + "epoch": 0.73, + "grad_norm": 1.1319156136893418, + "learning_rate": 3.6143460089805214e-06, + "loss": 0.1609, + "step": 14333 + }, + { + "epoch": 0.73, + "grad_norm": 1.3430977218507882, + "learning_rate": 3.613078641963377e-06, + "loss": 0.1658, + "step": 14334 + }, + { + "epoch": 0.73, + "grad_norm": 0.8104722022322258, + "learning_rate": 3.611811448191839e-06, + "loss": 0.1578, + "step": 14335 + }, + { + "epoch": 0.73, + "grad_norm": 1.3388835404316106, + "learning_rate": 3.610544427700272e-06, + "loss": 0.1761, + "step": 14336 + }, + { + "epoch": 0.73, + "grad_norm": 0.8960803836041868, + "learning_rate": 3.6092775805230516e-06, + "loss": 0.1594, + "step": 14337 + }, + { + "epoch": 0.73, + "grad_norm": 0.8458899384765411, + "learning_rate": 3.6080109066945357e-06, + "loss": 0.1749, + "step": 14338 + }, + { + "epoch": 0.73, + "grad_norm": 0.8441196825783069, + "learning_rate": 3.6067444062490875e-06, + "loss": 0.18, + "step": 14339 + }, + { + "epoch": 0.73, + "grad_norm": 0.7732150398842218, + "learning_rate": 3.6054780792210542e-06, + "loss": 0.1627, + "step": 14340 + }, + { + "epoch": 0.73, + "grad_norm": 1.1086405766696839, + "learning_rate": 3.6042119256447904e-06, + "loss": 0.1675, + "step": 14341 + }, + { + "epoch": 0.73, + "grad_norm": 0.8720343885082179, + "learning_rate": 3.602945945554639e-06, + "loss": 0.1747, + "step": 14342 + }, + { + "epoch": 0.73, + "grad_norm": 0.8623553602417626, + "learning_rate": 3.6016801389849434e-06, + "loss": 0.1829, + "step": 14343 + }, + { + "epoch": 0.73, + "grad_norm": 1.2966082489742936, + "learning_rate": 3.6004145059700313e-06, + "loss": 0.1676, + "step": 14344 + }, + { + "epoch": 0.73, + "grad_norm": 1.157181184083306, + "learning_rate": 3.5991490465442413e-06, + "loss": 0.1957, + "step": 14345 + }, + { + "epoch": 0.73, + "grad_norm": 0.8556719271462281, + "learning_rate": 3.5978837607418914e-06, + "loss": 0.1431, + "step": 14346 + }, + { + "epoch": 0.73, + "grad_norm": 0.9223430518682103, + "learning_rate": 3.5966186485973097e-06, + "loss": 0.1711, + "step": 14347 + }, + { + "epoch": 0.73, + "grad_norm": 1.5919130161900463, + "learning_rate": 3.5953537101448053e-06, + "loss": 0.1598, + "step": 14348 + }, + { + "epoch": 0.73, + "grad_norm": 2.5950698902412257, + "learning_rate": 3.5940889454186965e-06, + "loss": 0.1786, + "step": 14349 + }, + { + "epoch": 0.73, + "grad_norm": 1.05794885305146, + "learning_rate": 3.5928243544532835e-06, + "loss": 0.1447, + "step": 14350 + }, + { + "epoch": 0.73, + "grad_norm": 0.9346209259921906, + "learning_rate": 3.5915599372828725e-06, + "loss": 0.1607, + "step": 14351 + }, + { + "epoch": 0.73, + "grad_norm": 1.0338066235146515, + "learning_rate": 3.590295693941763e-06, + "loss": 0.1827, + "step": 14352 + }, + { + "epoch": 0.73, + "grad_norm": 1.0959471715990214, + "learning_rate": 3.5890316244642408e-06, + "loss": 0.1626, + "step": 14353 + }, + { + "epoch": 0.73, + "grad_norm": 2.165534101280814, + "learning_rate": 3.5877677288846023e-06, + "loss": 0.158, + "step": 14354 + }, + { + "epoch": 0.73, + "grad_norm": 0.947660632419828, + "learning_rate": 3.5865040072371228e-06, + "loss": 0.1776, + "step": 14355 + }, + { + "epoch": 0.73, + "grad_norm": 0.8461107495185389, + "learning_rate": 3.5852404595560876e-06, + "loss": 0.1931, + "step": 14356 + }, + { + "epoch": 0.73, + "grad_norm": 2.1027775274177167, + "learning_rate": 3.5839770858757627e-06, + "loss": 0.1729, + "step": 14357 + }, + { + "epoch": 0.73, + "grad_norm": 0.8846669049565868, + "learning_rate": 3.5827138862304266e-06, + "loss": 0.1689, + "step": 14358 + }, + { + "epoch": 0.73, + "grad_norm": 1.2012854026999105, + "learning_rate": 3.581450860654335e-06, + "loss": 0.1838, + "step": 14359 + }, + { + "epoch": 0.73, + "grad_norm": 1.24401535072047, + "learning_rate": 3.580188009181751e-06, + "loss": 0.1878, + "step": 14360 + }, + { + "epoch": 0.73, + "grad_norm": 1.4983760604381224, + "learning_rate": 3.57892533184693e-06, + "loss": 0.1877, + "step": 14361 + }, + { + "epoch": 0.73, + "grad_norm": 1.0436754053513289, + "learning_rate": 3.577662828684125e-06, + "loss": 0.1759, + "step": 14362 + }, + { + "epoch": 0.73, + "grad_norm": 1.258726118552222, + "learning_rate": 3.576400499727576e-06, + "loss": 0.19, + "step": 14363 + }, + { + "epoch": 0.73, + "grad_norm": 0.9488113239079438, + "learning_rate": 3.5751383450115298e-06, + "loss": 0.1769, + "step": 14364 + }, + { + "epoch": 0.73, + "grad_norm": 0.7623195980440788, + "learning_rate": 3.5738763645702145e-06, + "loss": 0.1558, + "step": 14365 + }, + { + "epoch": 0.73, + "grad_norm": 0.855507636654625, + "learning_rate": 3.572614558437869e-06, + "loss": 0.1645, + "step": 14366 + }, + { + "epoch": 0.73, + "grad_norm": 1.3798623658098415, + "learning_rate": 3.5713529266487145e-06, + "loss": 0.1922, + "step": 14367 + }, + { + "epoch": 0.73, + "grad_norm": 0.9842413518768752, + "learning_rate": 3.5700914692369738e-06, + "loss": 0.1842, + "step": 14368 + }, + { + "epoch": 0.73, + "grad_norm": 1.7150008532521748, + "learning_rate": 3.568830186236869e-06, + "loss": 0.1782, + "step": 14369 + }, + { + "epoch": 0.73, + "grad_norm": 1.4899325092697677, + "learning_rate": 3.5675690776826055e-06, + "loss": 0.187, + "step": 14370 + }, + { + "epoch": 0.73, + "grad_norm": 1.0130933708959058, + "learning_rate": 3.5663081436083967e-06, + "loss": 0.1641, + "step": 14371 + }, + { + "epoch": 0.73, + "grad_norm": 0.913539667921112, + "learning_rate": 3.5650473840484402e-06, + "loss": 0.1573, + "step": 14372 + }, + { + "epoch": 0.73, + "grad_norm": 1.0861222144561535, + "learning_rate": 3.56378679903694e-06, + "loss": 0.1931, + "step": 14373 + }, + { + "epoch": 0.73, + "grad_norm": 1.0768559467476906, + "learning_rate": 3.562526388608083e-06, + "loss": 0.1692, + "step": 14374 + }, + { + "epoch": 0.73, + "grad_norm": 1.1798169488505947, + "learning_rate": 3.5612661527960646e-06, + "loss": 0.165, + "step": 14375 + }, + { + "epoch": 0.73, + "grad_norm": 0.9430744626611828, + "learning_rate": 3.560006091635062e-06, + "loss": 0.1657, + "step": 14376 + }, + { + "epoch": 0.73, + "grad_norm": 1.0014914667902954, + "learning_rate": 3.558746205159258e-06, + "loss": 0.1796, + "step": 14377 + }, + { + "epoch": 0.73, + "grad_norm": 0.9003099615219436, + "learning_rate": 3.5574864934028275e-06, + "loss": 0.1637, + "step": 14378 + }, + { + "epoch": 0.73, + "grad_norm": 1.1470960761940354, + "learning_rate": 3.556226956399943e-06, + "loss": 0.1825, + "step": 14379 + }, + { + "epoch": 0.73, + "grad_norm": 1.7413078096043346, + "learning_rate": 3.554967594184762e-06, + "loss": 0.1853, + "step": 14380 + }, + { + "epoch": 0.73, + "grad_norm": 0.7265083010133702, + "learning_rate": 3.553708406791453e-06, + "loss": 0.1709, + "step": 14381 + }, + { + "epoch": 0.73, + "grad_norm": 1.1558964925446717, + "learning_rate": 3.552449394254165e-06, + "loss": 0.1857, + "step": 14382 + }, + { + "epoch": 0.73, + "grad_norm": 1.263887901976451, + "learning_rate": 3.5511905566070537e-06, + "loss": 0.16, + "step": 14383 + }, + { + "epoch": 0.73, + "grad_norm": 0.9311776746938797, + "learning_rate": 3.549931893884259e-06, + "loss": 0.1575, + "step": 14384 + }, + { + "epoch": 0.73, + "grad_norm": 0.8371777695890396, + "learning_rate": 3.5486734061199266e-06, + "loss": 0.1647, + "step": 14385 + }, + { + "epoch": 0.73, + "grad_norm": 0.9259804564620615, + "learning_rate": 3.5474150933481955e-06, + "loss": 0.1845, + "step": 14386 + }, + { + "epoch": 0.73, + "grad_norm": 1.4184095276211304, + "learning_rate": 3.5461569556031915e-06, + "loss": 0.1696, + "step": 14387 + }, + { + "epoch": 0.73, + "grad_norm": 0.7718611178275522, + "learning_rate": 3.544898992919048e-06, + "loss": 0.1823, + "step": 14388 + }, + { + "epoch": 0.73, + "grad_norm": 0.9339948773745854, + "learning_rate": 3.543641205329881e-06, + "loss": 0.165, + "step": 14389 + }, + { + "epoch": 0.73, + "grad_norm": 0.7963601667531222, + "learning_rate": 3.5423835928698126e-06, + "loss": 0.152, + "step": 14390 + }, + { + "epoch": 0.73, + "grad_norm": 0.9641537768577647, + "learning_rate": 3.5411261555729513e-06, + "loss": 0.1606, + "step": 14391 + }, + { + "epoch": 0.73, + "grad_norm": 1.0070060877029965, + "learning_rate": 3.5398688934734125e-06, + "loss": 0.175, + "step": 14392 + }, + { + "epoch": 0.73, + "grad_norm": 1.0048912561155683, + "learning_rate": 3.53861180660529e-06, + "loss": 0.1551, + "step": 14393 + }, + { + "epoch": 0.73, + "grad_norm": 1.0342423921380977, + "learning_rate": 3.5373548950026882e-06, + "loss": 0.1561, + "step": 14394 + }, + { + "epoch": 0.73, + "grad_norm": 1.1324394346700175, + "learning_rate": 3.536098158699699e-06, + "loss": 0.1861, + "step": 14395 + }, + { + "epoch": 0.73, + "grad_norm": 0.8373933752901664, + "learning_rate": 3.5348415977304165e-06, + "loss": 0.148, + "step": 14396 + }, + { + "epoch": 0.73, + "grad_norm": 1.4163575087438194, + "learning_rate": 3.5335852121289172e-06, + "loss": 0.1624, + "step": 14397 + }, + { + "epoch": 0.73, + "grad_norm": 1.0223524984740415, + "learning_rate": 3.5323290019292867e-06, + "loss": 0.1646, + "step": 14398 + }, + { + "epoch": 0.73, + "grad_norm": 1.0621961867309728, + "learning_rate": 3.531072967165595e-06, + "loss": 0.1641, + "step": 14399 + }, + { + "epoch": 0.73, + "grad_norm": 1.0916386681798416, + "learning_rate": 3.529817107871918e-06, + "loss": 0.2014, + "step": 14400 + }, + { + "epoch": 0.73, + "grad_norm": 1.125081982430693, + "learning_rate": 3.5285614240823128e-06, + "loss": 0.1665, + "step": 14401 + }, + { + "epoch": 0.73, + "grad_norm": 1.0002037186679038, + "learning_rate": 3.5273059158308487e-06, + "loss": 0.1696, + "step": 14402 + }, + { + "epoch": 0.73, + "grad_norm": 1.114430281565667, + "learning_rate": 3.5260505831515736e-06, + "loss": 0.1829, + "step": 14403 + }, + { + "epoch": 0.73, + "grad_norm": 0.9266988557304616, + "learning_rate": 3.5247954260785422e-06, + "loss": 0.15, + "step": 14404 + }, + { + "epoch": 0.73, + "grad_norm": 0.9081102708912858, + "learning_rate": 3.523540444645804e-06, + "loss": 0.169, + "step": 14405 + }, + { + "epoch": 0.73, + "grad_norm": 0.8531204158047896, + "learning_rate": 3.522285638887394e-06, + "loss": 0.1861, + "step": 14406 + }, + { + "epoch": 0.73, + "grad_norm": 0.8542913650553401, + "learning_rate": 3.5210310088373544e-06, + "loss": 0.1799, + "step": 14407 + }, + { + "epoch": 0.73, + "grad_norm": 1.2123191910112574, + "learning_rate": 3.5197765545297124e-06, + "loss": 0.1628, + "step": 14408 + }, + { + "epoch": 0.73, + "grad_norm": 1.0883468660358517, + "learning_rate": 3.5185222759984993e-06, + "loss": 0.1844, + "step": 14409 + }, + { + "epoch": 0.73, + "grad_norm": 0.9928291607993566, + "learning_rate": 3.5172681732777335e-06, + "loss": 0.1583, + "step": 14410 + }, + { + "epoch": 0.73, + "grad_norm": 0.8216944156970188, + "learning_rate": 3.5160142464014336e-06, + "loss": 0.155, + "step": 14411 + }, + { + "epoch": 0.73, + "grad_norm": 1.0854388217429343, + "learning_rate": 3.514760495403614e-06, + "loss": 0.1634, + "step": 14412 + }, + { + "epoch": 0.73, + "grad_norm": 0.8440810900533375, + "learning_rate": 3.5135069203182858e-06, + "loss": 0.1536, + "step": 14413 + }, + { + "epoch": 0.73, + "grad_norm": 0.8591150792858231, + "learning_rate": 3.512253521179445e-06, + "loss": 0.1564, + "step": 14414 + }, + { + "epoch": 0.73, + "grad_norm": 2.701393035668696, + "learning_rate": 3.511000298021098e-06, + "loss": 0.1776, + "step": 14415 + }, + { + "epoch": 0.73, + "grad_norm": 1.0878208543064773, + "learning_rate": 3.5097472508772302e-06, + "loss": 0.1722, + "step": 14416 + }, + { + "epoch": 0.73, + "grad_norm": 0.9959521739940412, + "learning_rate": 3.508494379781838e-06, + "loss": 0.1483, + "step": 14417 + }, + { + "epoch": 0.73, + "grad_norm": 1.2945797851190817, + "learning_rate": 3.5072416847688993e-06, + "loss": 0.1735, + "step": 14418 + }, + { + "epoch": 0.73, + "grad_norm": 0.8609757094301027, + "learning_rate": 3.505989165872401e-06, + "loss": 0.1569, + "step": 14419 + }, + { + "epoch": 0.73, + "grad_norm": 0.9284580215713893, + "learning_rate": 3.504736823126309e-06, + "loss": 0.1708, + "step": 14420 + }, + { + "epoch": 0.73, + "grad_norm": 0.8923394895343267, + "learning_rate": 3.5034846565645973e-06, + "loss": 0.1718, + "step": 14421 + }, + { + "epoch": 0.73, + "grad_norm": 1.4204732071601351, + "learning_rate": 3.5022326662212347e-06, + "loss": 0.1712, + "step": 14422 + }, + { + "epoch": 0.73, + "grad_norm": 1.4474184116828497, + "learning_rate": 3.5009808521301746e-06, + "loss": 0.1592, + "step": 14423 + }, + { + "epoch": 0.73, + "grad_norm": 0.8142217738799143, + "learning_rate": 3.49972921432538e-06, + "loss": 0.1545, + "step": 14424 + }, + { + "epoch": 0.73, + "grad_norm": 0.9384311497554553, + "learning_rate": 3.4984777528407944e-06, + "loss": 0.1645, + "step": 14425 + }, + { + "epoch": 0.73, + "grad_norm": 0.947750884547131, + "learning_rate": 3.4972264677103694e-06, + "loss": 0.1805, + "step": 14426 + }, + { + "epoch": 0.73, + "grad_norm": 0.8440441374202798, + "learning_rate": 3.495975358968041e-06, + "loss": 0.1713, + "step": 14427 + }, + { + "epoch": 0.73, + "grad_norm": 1.4807376614383012, + "learning_rate": 3.4947244266477507e-06, + "loss": 0.1677, + "step": 14428 + }, + { + "epoch": 0.73, + "grad_norm": 0.9859634373605151, + "learning_rate": 3.493473670783426e-06, + "loss": 0.1568, + "step": 14429 + }, + { + "epoch": 0.73, + "grad_norm": 0.8734431242446054, + "learning_rate": 3.492223091408994e-06, + "loss": 0.1804, + "step": 14430 + }, + { + "epoch": 0.73, + "grad_norm": 1.2104883056005962, + "learning_rate": 3.4909726885583782e-06, + "loss": 0.1562, + "step": 14431 + }, + { + "epoch": 0.73, + "grad_norm": 1.1914531731251647, + "learning_rate": 3.4897224622655e-06, + "loss": 0.1766, + "step": 14432 + }, + { + "epoch": 0.73, + "grad_norm": 1.219156978564906, + "learning_rate": 3.4884724125642646e-06, + "loss": 0.1753, + "step": 14433 + }, + { + "epoch": 0.73, + "grad_norm": 0.880056487254269, + "learning_rate": 3.487222539488586e-06, + "loss": 0.1731, + "step": 14434 + }, + { + "epoch": 0.73, + "grad_norm": 0.9074694159953327, + "learning_rate": 3.4859728430723595e-06, + "loss": 0.161, + "step": 14435 + }, + { + "epoch": 0.73, + "grad_norm": 0.8519788084340726, + "learning_rate": 3.4847233233494916e-06, + "loss": 0.1958, + "step": 14436 + }, + { + "epoch": 0.73, + "grad_norm": 1.034003312618856, + "learning_rate": 3.4834739803538686e-06, + "loss": 0.1773, + "step": 14437 + }, + { + "epoch": 0.73, + "grad_norm": 0.9177306598389084, + "learning_rate": 3.4822248141193816e-06, + "loss": 0.1585, + "step": 14438 + }, + { + "epoch": 0.73, + "grad_norm": 0.8999315788165344, + "learning_rate": 3.4809758246799173e-06, + "loss": 0.166, + "step": 14439 + }, + { + "epoch": 0.73, + "grad_norm": 0.800926136705932, + "learning_rate": 3.479727012069349e-06, + "loss": 0.1541, + "step": 14440 + }, + { + "epoch": 0.73, + "grad_norm": 1.5135594002739012, + "learning_rate": 3.478478376321558e-06, + "loss": 0.1798, + "step": 14441 + }, + { + "epoch": 0.73, + "grad_norm": 1.6799241455933929, + "learning_rate": 3.4772299174704048e-06, + "loss": 0.1728, + "step": 14442 + }, + { + "epoch": 0.73, + "grad_norm": 0.9344063548247316, + "learning_rate": 3.475981635549763e-06, + "loss": 0.1697, + "step": 14443 + }, + { + "epoch": 0.73, + "grad_norm": 0.9087585500193363, + "learning_rate": 3.4747335305934836e-06, + "loss": 0.1703, + "step": 14444 + }, + { + "epoch": 0.73, + "grad_norm": 0.874986045657428, + "learning_rate": 3.47348560263543e-06, + "loss": 0.1694, + "step": 14445 + }, + { + "epoch": 0.73, + "grad_norm": 0.8998223181947868, + "learning_rate": 3.4722378517094436e-06, + "loss": 0.1631, + "step": 14446 + }, + { + "epoch": 0.73, + "grad_norm": 0.950896245133745, + "learning_rate": 3.4709902778493742e-06, + "loss": 0.185, + "step": 14447 + }, + { + "epoch": 0.73, + "grad_norm": 1.5040341211529928, + "learning_rate": 3.4697428810890634e-06, + "loss": 0.1536, + "step": 14448 + }, + { + "epoch": 0.73, + "grad_norm": 1.430844340350886, + "learning_rate": 3.4684956614623476e-06, + "loss": 0.1675, + "step": 14449 + }, + { + "epoch": 0.73, + "grad_norm": 1.3461237917573672, + "learning_rate": 3.4672486190030543e-06, + "loss": 0.1732, + "step": 14450 + }, + { + "epoch": 0.73, + "grad_norm": 0.9382187760505567, + "learning_rate": 3.466001753745013e-06, + "loss": 0.1739, + "step": 14451 + }, + { + "epoch": 0.73, + "grad_norm": 1.571179375982628, + "learning_rate": 3.4647550657220407e-06, + "loss": 0.1646, + "step": 14452 + }, + { + "epoch": 0.73, + "grad_norm": 1.1049373626773231, + "learning_rate": 3.463508554967959e-06, + "loss": 0.1782, + "step": 14453 + }, + { + "epoch": 0.74, + "grad_norm": 1.371196296814422, + "learning_rate": 3.462262221516575e-06, + "loss": 0.1718, + "step": 14454 + }, + { + "epoch": 0.74, + "grad_norm": 1.4900025372462213, + "learning_rate": 3.4610160654016987e-06, + "loss": 0.1915, + "step": 14455 + }, + { + "epoch": 0.74, + "grad_norm": 1.7389687154100937, + "learning_rate": 3.4597700866571294e-06, + "loss": 0.1759, + "step": 14456 + }, + { + "epoch": 0.74, + "grad_norm": 0.9413972328791125, + "learning_rate": 3.4585242853166657e-06, + "loss": 0.1771, + "step": 14457 + }, + { + "epoch": 0.74, + "grad_norm": 0.9685053769263471, + "learning_rate": 3.457278661414103e-06, + "loss": 0.1851, + "step": 14458 + }, + { + "epoch": 0.74, + "grad_norm": 1.1409849072794902, + "learning_rate": 3.456033214983222e-06, + "loss": 0.1857, + "step": 14459 + }, + { + "epoch": 0.74, + "grad_norm": 1.1307581819553465, + "learning_rate": 3.454787946057814e-06, + "loss": 0.1539, + "step": 14460 + }, + { + "epoch": 0.74, + "grad_norm": 1.105159104219784, + "learning_rate": 3.45354285467165e-06, + "loss": 0.1663, + "step": 14461 + }, + { + "epoch": 0.74, + "grad_norm": 1.0820023214794368, + "learning_rate": 3.452297940858508e-06, + "loss": 0.1862, + "step": 14462 + }, + { + "epoch": 0.74, + "grad_norm": 1.7615077258540803, + "learning_rate": 3.4510532046521505e-06, + "loss": 0.1792, + "step": 14463 + }, + { + "epoch": 0.74, + "grad_norm": 1.0554686371860205, + "learning_rate": 3.4498086460863455e-06, + "loss": 0.1387, + "step": 14464 + }, + { + "epoch": 0.74, + "grad_norm": 0.9781012241290394, + "learning_rate": 3.4485642651948516e-06, + "loss": 0.1652, + "step": 14465 + }, + { + "epoch": 0.74, + "grad_norm": 0.85615413732514, + "learning_rate": 3.4473200620114245e-06, + "loss": 0.1726, + "step": 14466 + }, + { + "epoch": 0.74, + "grad_norm": 1.0215857868340161, + "learning_rate": 3.4460760365698078e-06, + "loss": 0.1717, + "step": 14467 + }, + { + "epoch": 0.74, + "grad_norm": 1.1543118477931984, + "learning_rate": 3.444832188903752e-06, + "loss": 0.1546, + "step": 14468 + }, + { + "epoch": 0.74, + "grad_norm": 0.7340101051610519, + "learning_rate": 3.4435885190469886e-06, + "loss": 0.1799, + "step": 14469 + }, + { + "epoch": 0.74, + "grad_norm": 0.8684807031919314, + "learning_rate": 3.4423450270332626e-06, + "loss": 0.1616, + "step": 14470 + }, + { + "epoch": 0.74, + "grad_norm": 1.0967064609498811, + "learning_rate": 3.4411017128962932e-06, + "loss": 0.1609, + "step": 14471 + }, + { + "epoch": 0.74, + "grad_norm": 0.902430777464027, + "learning_rate": 3.4398585766698146e-06, + "loss": 0.1776, + "step": 14472 + }, + { + "epoch": 0.74, + "grad_norm": 0.9113227208746828, + "learning_rate": 3.4386156183875384e-06, + "loss": 0.1683, + "step": 14473 + }, + { + "epoch": 0.74, + "grad_norm": 0.8535988903463739, + "learning_rate": 3.437372838083184e-06, + "loss": 0.1612, + "step": 14474 + }, + { + "epoch": 0.74, + "grad_norm": 1.1787012390630862, + "learning_rate": 3.4361302357904657e-06, + "loss": 0.1583, + "step": 14475 + }, + { + "epoch": 0.74, + "grad_norm": 1.3044864076379246, + "learning_rate": 3.4348878115430827e-06, + "loss": 0.1637, + "step": 14476 + }, + { + "epoch": 0.74, + "grad_norm": 1.1927482457591698, + "learning_rate": 3.4336455653747414e-06, + "loss": 0.1669, + "step": 14477 + }, + { + "epoch": 0.74, + "grad_norm": 0.8094448998728605, + "learning_rate": 3.432403497319132e-06, + "loss": 0.1597, + "step": 14478 + }, + { + "epoch": 0.74, + "grad_norm": 1.0774702938327079, + "learning_rate": 3.4311616074099517e-06, + "loss": 0.1621, + "step": 14479 + }, + { + "epoch": 0.74, + "grad_norm": 0.9542580107319009, + "learning_rate": 3.429919895680881e-06, + "loss": 0.1632, + "step": 14480 + }, + { + "epoch": 0.74, + "grad_norm": 0.9611707086909844, + "learning_rate": 3.428678362165607e-06, + "loss": 0.1443, + "step": 14481 + }, + { + "epoch": 0.74, + "grad_norm": 1.116401520493127, + "learning_rate": 3.4274370068978013e-06, + "loss": 0.1587, + "step": 14482 + }, + { + "epoch": 0.74, + "grad_norm": 2.304449259136624, + "learning_rate": 3.4261958299111363e-06, + "loss": 0.1763, + "step": 14483 + }, + { + "epoch": 0.74, + "grad_norm": 1.1314091816500065, + "learning_rate": 3.424954831239282e-06, + "loss": 0.1945, + "step": 14484 + }, + { + "epoch": 0.74, + "grad_norm": 1.0671220035516777, + "learning_rate": 3.4237140109159015e-06, + "loss": 0.2072, + "step": 14485 + }, + { + "epoch": 0.74, + "grad_norm": 1.1149443924818006, + "learning_rate": 3.422473368974648e-06, + "loss": 0.1663, + "step": 14486 + }, + { + "epoch": 0.74, + "grad_norm": 0.7938422789239769, + "learning_rate": 3.4212329054491775e-06, + "loss": 0.1876, + "step": 14487 + }, + { + "epoch": 0.74, + "grad_norm": 0.8671460207650743, + "learning_rate": 3.419992620373134e-06, + "loss": 0.1636, + "step": 14488 + }, + { + "epoch": 0.74, + "grad_norm": 1.2237197498575294, + "learning_rate": 3.418752513780166e-06, + "loss": 0.2031, + "step": 14489 + }, + { + "epoch": 0.74, + "grad_norm": 0.7849908273815244, + "learning_rate": 3.4175125857039027e-06, + "loss": 0.1407, + "step": 14490 + }, + { + "epoch": 0.74, + "grad_norm": 1.1118225123332441, + "learning_rate": 3.416272836177984e-06, + "loss": 0.1758, + "step": 14491 + }, + { + "epoch": 0.74, + "grad_norm": 1.4168430278920894, + "learning_rate": 3.4150332652360386e-06, + "loss": 0.169, + "step": 14492 + }, + { + "epoch": 0.74, + "grad_norm": 0.8217197474909118, + "learning_rate": 3.413793872911685e-06, + "loss": 0.1589, + "step": 14493 + }, + { + "epoch": 0.74, + "grad_norm": 1.0719618045102857, + "learning_rate": 3.4125546592385483e-06, + "loss": 0.1777, + "step": 14494 + }, + { + "epoch": 0.74, + "grad_norm": 1.0332384870154756, + "learning_rate": 3.4113156242502345e-06, + "loss": 0.1622, + "step": 14495 + }, + { + "epoch": 0.74, + "grad_norm": 1.4227475903179927, + "learning_rate": 3.4100767679803605e-06, + "loss": 0.1686, + "step": 14496 + }, + { + "epoch": 0.74, + "grad_norm": 1.2006207316704358, + "learning_rate": 3.4088380904625217e-06, + "loss": 0.1633, + "step": 14497 + }, + { + "epoch": 0.74, + "grad_norm": 1.0980208062719765, + "learning_rate": 3.4075995917303263e-06, + "loss": 0.175, + "step": 14498 + }, + { + "epoch": 0.74, + "grad_norm": 1.3096078988390378, + "learning_rate": 3.4063612718173613e-06, + "loss": 0.1759, + "step": 14499 + }, + { + "epoch": 0.74, + "grad_norm": 1.1581398109088064, + "learning_rate": 3.4051231307572187e-06, + "loss": 0.1498, + "step": 14500 + }, + { + "epoch": 0.74, + "grad_norm": 0.8573471344318443, + "learning_rate": 3.403885168583484e-06, + "loss": 0.1721, + "step": 14501 + }, + { + "epoch": 0.74, + "grad_norm": 0.8507531056569723, + "learning_rate": 3.4026473853297394e-06, + "loss": 0.15, + "step": 14502 + }, + { + "epoch": 0.74, + "grad_norm": 1.1802770854199611, + "learning_rate": 3.4014097810295542e-06, + "loss": 0.1805, + "step": 14503 + }, + { + "epoch": 0.74, + "grad_norm": 1.322535646862338, + "learning_rate": 3.4001723557165046e-06, + "loss": 0.1681, + "step": 14504 + }, + { + "epoch": 0.74, + "grad_norm": 0.8631282651600433, + "learning_rate": 3.3989351094241496e-06, + "loss": 0.1683, + "step": 14505 + }, + { + "epoch": 0.74, + "grad_norm": 5.157037757388056, + "learning_rate": 3.3976980421860563e-06, + "loss": 0.1839, + "step": 14506 + }, + { + "epoch": 0.74, + "grad_norm": 1.4696535664183503, + "learning_rate": 3.396461154035772e-06, + "loss": 0.1608, + "step": 14507 + }, + { + "epoch": 0.74, + "grad_norm": 0.9476656230693181, + "learning_rate": 3.3952244450068527e-06, + "loss": 0.1616, + "step": 14508 + }, + { + "epoch": 0.74, + "grad_norm": 1.1449168374763314, + "learning_rate": 3.393987915132846e-06, + "loss": 0.1748, + "step": 14509 + }, + { + "epoch": 0.74, + "grad_norm": 0.9197974944526394, + "learning_rate": 3.3927515644472876e-06, + "loss": 0.1768, + "step": 14510 + }, + { + "epoch": 0.74, + "grad_norm": 0.86307723423347, + "learning_rate": 3.3915153929837186e-06, + "loss": 0.174, + "step": 14511 + }, + { + "epoch": 0.74, + "grad_norm": 0.9776784552407588, + "learning_rate": 3.3902794007756655e-06, + "loss": 0.1774, + "step": 14512 + }, + { + "epoch": 0.74, + "grad_norm": 1.0365769962900846, + "learning_rate": 3.38904358785666e-06, + "loss": 0.1589, + "step": 14513 + }, + { + "epoch": 0.74, + "grad_norm": 0.917243783214414, + "learning_rate": 3.3878079542602172e-06, + "loss": 0.1795, + "step": 14514 + }, + { + "epoch": 0.74, + "grad_norm": 0.9496799831594078, + "learning_rate": 3.38657250001986e-06, + "loss": 0.1648, + "step": 14515 + }, + { + "epoch": 0.74, + "grad_norm": 4.645315205313346, + "learning_rate": 3.3853372251690943e-06, + "loss": 0.1571, + "step": 14516 + }, + { + "epoch": 0.74, + "grad_norm": 1.0114640248745363, + "learning_rate": 3.38410212974143e-06, + "loss": 0.1705, + "step": 14517 + }, + { + "epoch": 0.74, + "grad_norm": 1.1624848313084417, + "learning_rate": 3.382867213770369e-06, + "loss": 0.1802, + "step": 14518 + }, + { + "epoch": 0.74, + "grad_norm": 0.9069656129089444, + "learning_rate": 3.3816324772894116e-06, + "loss": 0.1599, + "step": 14519 + }, + { + "epoch": 0.74, + "grad_norm": 1.1188970379680807, + "learning_rate": 3.380397920332045e-06, + "loss": 0.1778, + "step": 14520 + }, + { + "epoch": 0.74, + "grad_norm": 0.8696674808427313, + "learning_rate": 3.3791635429317602e-06, + "loss": 0.1655, + "step": 14521 + }, + { + "epoch": 0.74, + "grad_norm": 1.2652170450369873, + "learning_rate": 3.377929345122036e-06, + "loss": 0.152, + "step": 14522 + }, + { + "epoch": 0.74, + "grad_norm": 0.8047940293282426, + "learning_rate": 3.3766953269363555e-06, + "loss": 0.1647, + "step": 14523 + }, + { + "epoch": 0.74, + "grad_norm": 1.0025267726557285, + "learning_rate": 3.375461488408185e-06, + "loss": 0.1743, + "step": 14524 + }, + { + "epoch": 0.74, + "grad_norm": 0.9846755541578239, + "learning_rate": 3.3742278295709996e-06, + "loss": 0.1684, + "step": 14525 + }, + { + "epoch": 0.74, + "grad_norm": 1.4390956632534884, + "learning_rate": 3.372994350458254e-06, + "loss": 0.1666, + "step": 14526 + }, + { + "epoch": 0.74, + "grad_norm": 1.097163868094141, + "learning_rate": 3.3717610511034116e-06, + "loss": 0.1888, + "step": 14527 + }, + { + "epoch": 0.74, + "grad_norm": 1.4382002789137311, + "learning_rate": 3.370527931539929e-06, + "loss": 0.156, + "step": 14528 + }, + { + "epoch": 0.74, + "grad_norm": 0.8815583376865935, + "learning_rate": 3.3692949918012464e-06, + "loss": 0.1724, + "step": 14529 + }, + { + "epoch": 0.74, + "grad_norm": 3.213918545715467, + "learning_rate": 3.3680622319208158e-06, + "loss": 0.1833, + "step": 14530 + }, + { + "epoch": 0.74, + "grad_norm": 1.6967342650065715, + "learning_rate": 3.3668296519320676e-06, + "loss": 0.1699, + "step": 14531 + }, + { + "epoch": 0.74, + "grad_norm": 1.0196662112296344, + "learning_rate": 3.3655972518684433e-06, + "loss": 0.1757, + "step": 14532 + }, + { + "epoch": 0.74, + "grad_norm": 1.9265938286137334, + "learning_rate": 3.3643650317633645e-06, + "loss": 0.1703, + "step": 14533 + }, + { + "epoch": 0.74, + "grad_norm": 1.0321780931122735, + "learning_rate": 3.36313299165026e-06, + "loss": 0.1564, + "step": 14534 + }, + { + "epoch": 0.74, + "grad_norm": 1.3583160837462, + "learning_rate": 3.361901131562547e-06, + "loss": 0.1786, + "step": 14535 + }, + { + "epoch": 0.74, + "grad_norm": 1.0028582835706312, + "learning_rate": 3.3606694515336457e-06, + "loss": 0.1837, + "step": 14536 + }, + { + "epoch": 0.74, + "grad_norm": 1.2658375346336457, + "learning_rate": 3.3594379515969555e-06, + "loss": 0.1654, + "step": 14537 + }, + { + "epoch": 0.74, + "grad_norm": 0.9410506696252773, + "learning_rate": 3.3582066317858898e-06, + "loss": 0.1474, + "step": 14538 + }, + { + "epoch": 0.74, + "grad_norm": 1.256438474519618, + "learning_rate": 3.3569754921338416e-06, + "loss": 0.1898, + "step": 14539 + }, + { + "epoch": 0.74, + "grad_norm": 1.0700580104442847, + "learning_rate": 3.355744532674211e-06, + "loss": 0.1682, + "step": 14540 + }, + { + "epoch": 0.74, + "grad_norm": 0.9123242811741862, + "learning_rate": 3.3545137534403814e-06, + "loss": 0.1786, + "step": 14541 + }, + { + "epoch": 0.74, + "grad_norm": 0.9746035844674514, + "learning_rate": 3.3532831544657464e-06, + "loss": 0.1773, + "step": 14542 + }, + { + "epoch": 0.74, + "grad_norm": 1.647138127338118, + "learning_rate": 3.3520527357836764e-06, + "loss": 0.1675, + "step": 14543 + }, + { + "epoch": 0.74, + "grad_norm": 0.9351064187096528, + "learning_rate": 3.3508224974275517e-06, + "loss": 0.1534, + "step": 14544 + }, + { + "epoch": 0.74, + "grad_norm": 1.317826242356967, + "learning_rate": 3.3495924394307466e-06, + "loss": 0.1794, + "step": 14545 + }, + { + "epoch": 0.74, + "grad_norm": 0.8461449815181312, + "learning_rate": 3.348362561826618e-06, + "loss": 0.1895, + "step": 14546 + }, + { + "epoch": 0.74, + "grad_norm": 0.9737778528397362, + "learning_rate": 3.3471328646485345e-06, + "loss": 0.1576, + "step": 14547 + }, + { + "epoch": 0.74, + "grad_norm": 0.890239126549413, + "learning_rate": 3.3459033479298444e-06, + "loss": 0.1635, + "step": 14548 + }, + { + "epoch": 0.74, + "grad_norm": 1.1067314555766055, + "learning_rate": 3.3446740117039045e-06, + "loss": 0.1694, + "step": 14549 + }, + { + "epoch": 0.74, + "grad_norm": 0.7909680510638762, + "learning_rate": 3.3434448560040544e-06, + "loss": 0.1624, + "step": 14550 + }, + { + "epoch": 0.74, + "grad_norm": 1.8687447740217409, + "learning_rate": 3.342215880863643e-06, + "loss": 0.1763, + "step": 14551 + }, + { + "epoch": 0.74, + "grad_norm": 0.7685163826835121, + "learning_rate": 3.3409870863159977e-06, + "loss": 0.1599, + "step": 14552 + }, + { + "epoch": 0.74, + "grad_norm": 2.4860196312297997, + "learning_rate": 3.3397584723944542e-06, + "loss": 0.1447, + "step": 14553 + }, + { + "epoch": 0.74, + "grad_norm": 1.0828676969745032, + "learning_rate": 3.3385300391323384e-06, + "loss": 0.1508, + "step": 14554 + }, + { + "epoch": 0.74, + "grad_norm": 0.9196225158622006, + "learning_rate": 3.3373017865629742e-06, + "loss": 0.1647, + "step": 14555 + }, + { + "epoch": 0.74, + "grad_norm": 1.1366321195647922, + "learning_rate": 3.336073714719673e-06, + "loss": 0.1741, + "step": 14556 + }, + { + "epoch": 0.74, + "grad_norm": 0.9609351328524763, + "learning_rate": 3.3348458236357517e-06, + "loss": 0.1374, + "step": 14557 + }, + { + "epoch": 0.74, + "grad_norm": 1.5918165884053737, + "learning_rate": 3.33361811334451e-06, + "loss": 0.1787, + "step": 14558 + }, + { + "epoch": 0.74, + "grad_norm": 1.6087642687901853, + "learning_rate": 3.332390583879257e-06, + "loss": 0.1831, + "step": 14559 + }, + { + "epoch": 0.74, + "grad_norm": 0.954159681884898, + "learning_rate": 3.3311632352732826e-06, + "loss": 0.1633, + "step": 14560 + }, + { + "epoch": 0.74, + "grad_norm": 1.2349214257106662, + "learning_rate": 3.3299360675598826e-06, + "loss": 0.1772, + "step": 14561 + }, + { + "epoch": 0.74, + "grad_norm": 1.3786250169409255, + "learning_rate": 3.3287090807723466e-06, + "loss": 0.1524, + "step": 14562 + }, + { + "epoch": 0.74, + "grad_norm": 1.9314633192128212, + "learning_rate": 3.3274822749439506e-06, + "loss": 0.1697, + "step": 14563 + }, + { + "epoch": 0.74, + "grad_norm": 1.3280040060596827, + "learning_rate": 3.3262556501079777e-06, + "loss": 0.1545, + "step": 14564 + }, + { + "epoch": 0.74, + "grad_norm": 0.7767872809005165, + "learning_rate": 3.325029206297694e-06, + "loss": 0.1468, + "step": 14565 + }, + { + "epoch": 0.74, + "grad_norm": 1.264531729338008, + "learning_rate": 3.3238029435463727e-06, + "loss": 0.1636, + "step": 14566 + }, + { + "epoch": 0.74, + "grad_norm": 1.2727697010218713, + "learning_rate": 3.3225768618872712e-06, + "loss": 0.1635, + "step": 14567 + }, + { + "epoch": 0.74, + "grad_norm": 1.0529185194920503, + "learning_rate": 3.321350961353652e-06, + "loss": 0.1579, + "step": 14568 + }, + { + "epoch": 0.74, + "grad_norm": 1.050110099196728, + "learning_rate": 3.320125241978762e-06, + "loss": 0.1659, + "step": 14569 + }, + { + "epoch": 0.74, + "grad_norm": 1.080844483099251, + "learning_rate": 3.3188997037958535e-06, + "loss": 0.1584, + "step": 14570 + }, + { + "epoch": 0.74, + "grad_norm": 1.0033168561510555, + "learning_rate": 3.3176743468381665e-06, + "loss": 0.1654, + "step": 14571 + }, + { + "epoch": 0.74, + "grad_norm": 1.4874031465916626, + "learning_rate": 3.3164491711389434e-06, + "loss": 0.1753, + "step": 14572 + }, + { + "epoch": 0.74, + "grad_norm": 1.5101430635489643, + "learning_rate": 3.3152241767314117e-06, + "loss": 0.1632, + "step": 14573 + }, + { + "epoch": 0.74, + "grad_norm": 0.7342798075701323, + "learning_rate": 3.3139993636488042e-06, + "loss": 0.1567, + "step": 14574 + }, + { + "epoch": 0.74, + "grad_norm": 1.040576258566937, + "learning_rate": 3.3127747319243385e-06, + "loss": 0.1759, + "step": 14575 + }, + { + "epoch": 0.74, + "grad_norm": 1.0946480073485154, + "learning_rate": 3.31155028159124e-06, + "loss": 0.1689, + "step": 14576 + }, + { + "epoch": 0.74, + "grad_norm": 1.0050106610475777, + "learning_rate": 3.3103260126827143e-06, + "loss": 0.185, + "step": 14577 + }, + { + "epoch": 0.74, + "grad_norm": 1.086747451695257, + "learning_rate": 3.3091019252319755e-06, + "loss": 0.1706, + "step": 14578 + }, + { + "epoch": 0.74, + "grad_norm": 1.041621202794767, + "learning_rate": 3.3078780192722225e-06, + "loss": 0.1531, + "step": 14579 + }, + { + "epoch": 0.74, + "grad_norm": 1.0667084003881273, + "learning_rate": 3.3066542948366564e-06, + "loss": 0.1624, + "step": 14580 + }, + { + "epoch": 0.74, + "grad_norm": 1.0626094125015544, + "learning_rate": 3.3054307519584737e-06, + "loss": 0.1639, + "step": 14581 + }, + { + "epoch": 0.74, + "grad_norm": 0.9810072837276894, + "learning_rate": 3.304207390670856e-06, + "loss": 0.1596, + "step": 14582 + }, + { + "epoch": 0.74, + "grad_norm": 1.2386279164336178, + "learning_rate": 3.302984211006995e-06, + "loss": 0.175, + "step": 14583 + }, + { + "epoch": 0.74, + "grad_norm": 1.2024887775229773, + "learning_rate": 3.301761213000062e-06, + "loss": 0.1549, + "step": 14584 + }, + { + "epoch": 0.74, + "grad_norm": 1.162501635769063, + "learning_rate": 3.3005383966832383e-06, + "loss": 0.1699, + "step": 14585 + }, + { + "epoch": 0.74, + "grad_norm": 1.0021653115090114, + "learning_rate": 3.2993157620896844e-06, + "loss": 0.1644, + "step": 14586 + }, + { + "epoch": 0.74, + "grad_norm": 0.9026003490311508, + "learning_rate": 3.2980933092525704e-06, + "loss": 0.1762, + "step": 14587 + }, + { + "epoch": 0.74, + "grad_norm": 1.0016934769153616, + "learning_rate": 3.296871038205053e-06, + "loss": 0.1795, + "step": 14588 + }, + { + "epoch": 0.74, + "grad_norm": 1.1099327248410022, + "learning_rate": 3.2956489489802902e-06, + "loss": 0.176, + "step": 14589 + }, + { + "epoch": 0.74, + "grad_norm": 0.9360943590709028, + "learning_rate": 3.2944270416114256e-06, + "loss": 0.1513, + "step": 14590 + }, + { + "epoch": 0.74, + "grad_norm": 1.0377900767546822, + "learning_rate": 3.29320531613161e-06, + "loss": 0.1851, + "step": 14591 + }, + { + "epoch": 0.74, + "grad_norm": 1.1138509631808082, + "learning_rate": 3.2919837725739745e-06, + "loss": 0.1656, + "step": 14592 + }, + { + "epoch": 0.74, + "grad_norm": 0.8037614164539251, + "learning_rate": 3.290762410971663e-06, + "loss": 0.1843, + "step": 14593 + }, + { + "epoch": 0.74, + "grad_norm": 1.1417182624558486, + "learning_rate": 3.289541231357796e-06, + "loss": 0.1875, + "step": 14594 + }, + { + "epoch": 0.74, + "grad_norm": 1.1424511213698094, + "learning_rate": 3.288320233765504e-06, + "loss": 0.1674, + "step": 14595 + }, + { + "epoch": 0.74, + "grad_norm": 0.8267506888422055, + "learning_rate": 3.2870994182279036e-06, + "loss": 0.1669, + "step": 14596 + }, + { + "epoch": 0.74, + "grad_norm": 2.109347917510044, + "learning_rate": 3.2858787847781093e-06, + "loss": 0.1737, + "step": 14597 + }, + { + "epoch": 0.74, + "grad_norm": 2.4723720577426307, + "learning_rate": 3.2846583334492354e-06, + "loss": 0.1934, + "step": 14598 + }, + { + "epoch": 0.74, + "grad_norm": 1.4229642799860578, + "learning_rate": 3.2834380642743813e-06, + "loss": 0.183, + "step": 14599 + }, + { + "epoch": 0.74, + "grad_norm": 0.8882285331677954, + "learning_rate": 3.282217977286651e-06, + "loss": 0.1626, + "step": 14600 + }, + { + "epoch": 0.74, + "grad_norm": 1.040414192870245, + "learning_rate": 3.280998072519135e-06, + "loss": 0.1632, + "step": 14601 + }, + { + "epoch": 0.74, + "grad_norm": 1.5818449754137551, + "learning_rate": 3.2797783500049297e-06, + "loss": 0.1594, + "step": 14602 + }, + { + "epoch": 0.74, + "grad_norm": 1.0808796494097017, + "learning_rate": 3.2785588097771115e-06, + "loss": 0.1792, + "step": 14603 + }, + { + "epoch": 0.74, + "grad_norm": 1.8474299748024428, + "learning_rate": 3.277339451868766e-06, + "loss": 0.1518, + "step": 14604 + }, + { + "epoch": 0.74, + "grad_norm": 1.9284468258094691, + "learning_rate": 3.2761202763129674e-06, + "loss": 0.1717, + "step": 14605 + }, + { + "epoch": 0.74, + "grad_norm": 1.0873153366497863, + "learning_rate": 3.2749012831427897e-06, + "loss": 0.189, + "step": 14606 + }, + { + "epoch": 0.74, + "grad_norm": 1.30278093395129, + "learning_rate": 3.2736824723912907e-06, + "loss": 0.1631, + "step": 14607 + }, + { + "epoch": 0.74, + "grad_norm": 1.0590908505037824, + "learning_rate": 3.272463844091538e-06, + "loss": 0.1829, + "step": 14608 + }, + { + "epoch": 0.74, + "grad_norm": 0.9810362215460501, + "learning_rate": 3.2712453982765802e-06, + "loss": 0.1513, + "step": 14609 + }, + { + "epoch": 0.74, + "grad_norm": 1.8938383648760326, + "learning_rate": 3.270027134979474e-06, + "loss": 0.1808, + "step": 14610 + }, + { + "epoch": 0.74, + "grad_norm": 1.5913105125941047, + "learning_rate": 3.2688090542332597e-06, + "loss": 0.1599, + "step": 14611 + }, + { + "epoch": 0.74, + "grad_norm": 0.8290832642919869, + "learning_rate": 3.2675911560709826e-06, + "loss": 0.1719, + "step": 14612 + }, + { + "epoch": 0.74, + "grad_norm": 1.0200858042748173, + "learning_rate": 3.266373440525672e-06, + "loss": 0.1687, + "step": 14613 + }, + { + "epoch": 0.74, + "grad_norm": 0.9597410293776677, + "learning_rate": 3.265155907630363e-06, + "loss": 0.1932, + "step": 14614 + }, + { + "epoch": 0.74, + "grad_norm": 0.9905982909574356, + "learning_rate": 3.2639385574180825e-06, + "loss": 0.1726, + "step": 14615 + }, + { + "epoch": 0.74, + "grad_norm": 1.0516067557843038, + "learning_rate": 3.262721389921847e-06, + "loss": 0.1368, + "step": 14616 + }, + { + "epoch": 0.74, + "grad_norm": 1.488569177243947, + "learning_rate": 3.261504405174677e-06, + "loss": 0.1905, + "step": 14617 + }, + { + "epoch": 0.74, + "grad_norm": 1.0496740397502935, + "learning_rate": 3.2602876032095767e-06, + "loss": 0.2094, + "step": 14618 + }, + { + "epoch": 0.74, + "grad_norm": 1.2266278443699472, + "learning_rate": 3.2590709840595604e-06, + "loss": 0.1694, + "step": 14619 + }, + { + "epoch": 0.74, + "grad_norm": 3.8910979222441915, + "learning_rate": 3.2578545477576195e-06, + "loss": 0.1935, + "step": 14620 + }, + { + "epoch": 0.74, + "grad_norm": 1.1269281838780711, + "learning_rate": 3.256638294336759e-06, + "loss": 0.1769, + "step": 14621 + }, + { + "epoch": 0.74, + "grad_norm": 1.469746087556023, + "learning_rate": 3.2554222238299627e-06, + "loss": 0.1743, + "step": 14622 + }, + { + "epoch": 0.74, + "grad_norm": 1.5128575182766357, + "learning_rate": 3.2542063362702194e-06, + "loss": 0.1768, + "step": 14623 + }, + { + "epoch": 0.74, + "grad_norm": 0.9485174659917036, + "learning_rate": 3.25299063169051e-06, + "loss": 0.184, + "step": 14624 + }, + { + "epoch": 0.74, + "grad_norm": 1.819632029538112, + "learning_rate": 3.251775110123814e-06, + "loss": 0.1994, + "step": 14625 + }, + { + "epoch": 0.74, + "grad_norm": 1.1297795644354593, + "learning_rate": 3.250559771603097e-06, + "loss": 0.1556, + "step": 14626 + }, + { + "epoch": 0.74, + "grad_norm": 1.5246558499353544, + "learning_rate": 3.2493446161613297e-06, + "loss": 0.1624, + "step": 14627 + }, + { + "epoch": 0.74, + "grad_norm": 1.1439968605434216, + "learning_rate": 3.248129643831467e-06, + "loss": 0.1548, + "step": 14628 + }, + { + "epoch": 0.74, + "grad_norm": 0.9945889104991872, + "learning_rate": 3.2469148546464734e-06, + "loss": 0.1439, + "step": 14629 + }, + { + "epoch": 0.74, + "grad_norm": 1.3193732383010004, + "learning_rate": 3.245700248639292e-06, + "loss": 0.1741, + "step": 14630 + }, + { + "epoch": 0.74, + "grad_norm": 0.8404263760961086, + "learning_rate": 3.2444858258428733e-06, + "loss": 0.1655, + "step": 14631 + }, + { + "epoch": 0.74, + "grad_norm": 0.9678957974759184, + "learning_rate": 3.243271586290161e-06, + "loss": 0.1731, + "step": 14632 + }, + { + "epoch": 0.74, + "grad_norm": 1.043645278047243, + "learning_rate": 3.2420575300140855e-06, + "loss": 0.1608, + "step": 14633 + }, + { + "epoch": 0.74, + "grad_norm": 1.1111114593266385, + "learning_rate": 3.2408436570475844e-06, + "loss": 0.1684, + "step": 14634 + }, + { + "epoch": 0.74, + "grad_norm": 1.237932636905411, + "learning_rate": 3.2396299674235777e-06, + "loss": 0.1937, + "step": 14635 + }, + { + "epoch": 0.74, + "grad_norm": 0.9429836293705041, + "learning_rate": 3.2384164611749924e-06, + "loss": 0.176, + "step": 14636 + }, + { + "epoch": 0.74, + "grad_norm": 1.7866774192846304, + "learning_rate": 3.23720313833474e-06, + "loss": 0.1658, + "step": 14637 + }, + { + "epoch": 0.74, + "grad_norm": 1.1895483659518495, + "learning_rate": 3.235989998935738e-06, + "loss": 0.1748, + "step": 14638 + }, + { + "epoch": 0.74, + "grad_norm": 1.0754191928378574, + "learning_rate": 3.234777043010886e-06, + "loss": 0.175, + "step": 14639 + }, + { + "epoch": 0.74, + "grad_norm": 1.2658732859500361, + "learning_rate": 3.2335642705930883e-06, + "loss": 0.1662, + "step": 14640 + }, + { + "epoch": 0.74, + "grad_norm": 1.0116684018540003, + "learning_rate": 3.2323516817152424e-06, + "loss": 0.1697, + "step": 14641 + }, + { + "epoch": 0.74, + "grad_norm": 1.2918276885913262, + "learning_rate": 3.2311392764102422e-06, + "loss": 0.1726, + "step": 14642 + }, + { + "epoch": 0.74, + "grad_norm": 0.8766387636913678, + "learning_rate": 3.2299270547109684e-06, + "loss": 0.1728, + "step": 14643 + }, + { + "epoch": 0.74, + "grad_norm": 1.032418759945567, + "learning_rate": 3.228715016650308e-06, + "loss": 0.1813, + "step": 14644 + }, + { + "epoch": 0.74, + "grad_norm": 0.966008159927298, + "learning_rate": 3.227503162261133e-06, + "loss": 0.157, + "step": 14645 + }, + { + "epoch": 0.74, + "grad_norm": 1.2640327901478297, + "learning_rate": 3.22629149157632e-06, + "loss": 0.1604, + "step": 14646 + }, + { + "epoch": 0.74, + "grad_norm": 1.5646018269615116, + "learning_rate": 3.2250800046287303e-06, + "loss": 0.1951, + "step": 14647 + }, + { + "epoch": 0.74, + "grad_norm": 1.1532003300998375, + "learning_rate": 3.2238687014512306e-06, + "loss": 0.1482, + "step": 14648 + }, + { + "epoch": 0.74, + "grad_norm": 1.0329973344003411, + "learning_rate": 3.2226575820766727e-06, + "loss": 0.1594, + "step": 14649 + }, + { + "epoch": 0.74, + "grad_norm": 0.8869975421624843, + "learning_rate": 3.22144664653791e-06, + "loss": 0.1643, + "step": 14650 + }, + { + "epoch": 0.75, + "grad_norm": 2.0117567487631582, + "learning_rate": 3.220235894867794e-06, + "loss": 0.1664, + "step": 14651 + }, + { + "epoch": 0.75, + "grad_norm": 0.8690142384583647, + "learning_rate": 3.219025327099158e-06, + "loss": 0.1813, + "step": 14652 + }, + { + "epoch": 0.75, + "grad_norm": 1.1503442008963496, + "learning_rate": 3.2178149432648465e-06, + "loss": 0.1901, + "step": 14653 + }, + { + "epoch": 0.75, + "grad_norm": 1.072701341953258, + "learning_rate": 3.2166047433976843e-06, + "loss": 0.1462, + "step": 14654 + }, + { + "epoch": 0.75, + "grad_norm": 0.8609032986106758, + "learning_rate": 3.215394727530504e-06, + "loss": 0.1581, + "step": 14655 + }, + { + "epoch": 0.75, + "grad_norm": 0.8603901676502527, + "learning_rate": 3.214184895696123e-06, + "loss": 0.176, + "step": 14656 + }, + { + "epoch": 0.75, + "grad_norm": 0.8059552025787837, + "learning_rate": 3.21297524792736e-06, + "loss": 0.162, + "step": 14657 + }, + { + "epoch": 0.75, + "grad_norm": 0.9996015945561221, + "learning_rate": 3.211765784257026e-06, + "loss": 0.1677, + "step": 14658 + }, + { + "epoch": 0.75, + "grad_norm": 1.2233505941974974, + "learning_rate": 3.210556504717932e-06, + "loss": 0.1809, + "step": 14659 + }, + { + "epoch": 0.75, + "grad_norm": 1.0029193160617993, + "learning_rate": 3.2093474093428733e-06, + "loss": 0.1721, + "step": 14660 + }, + { + "epoch": 0.75, + "grad_norm": 1.1138558257840987, + "learning_rate": 3.2081384981646534e-06, + "loss": 0.1602, + "step": 14661 + }, + { + "epoch": 0.75, + "grad_norm": 0.8493704927249281, + "learning_rate": 3.2069297712160563e-06, + "loss": 0.1696, + "step": 14662 + }, + { + "epoch": 0.75, + "grad_norm": 0.9300782059342856, + "learning_rate": 3.2057212285298767e-06, + "loss": 0.1746, + "step": 14663 + }, + { + "epoch": 0.75, + "grad_norm": 1.6489511279917344, + "learning_rate": 3.2045128701388883e-06, + "loss": 0.1831, + "step": 14664 + }, + { + "epoch": 0.75, + "grad_norm": 0.6715377433329404, + "learning_rate": 3.2033046960758763e-06, + "loss": 0.148, + "step": 14665 + }, + { + "epoch": 0.75, + "grad_norm": 1.0328666976068182, + "learning_rate": 3.202096706373604e-06, + "loss": 0.1697, + "step": 14666 + }, + { + "epoch": 0.75, + "grad_norm": 1.553987915627782, + "learning_rate": 3.2008889010648438e-06, + "loss": 0.1515, + "step": 14667 + }, + { + "epoch": 0.75, + "grad_norm": 1.7387038180708394, + "learning_rate": 3.1996812801823585e-06, + "loss": 0.1754, + "step": 14668 + }, + { + "epoch": 0.75, + "grad_norm": 1.0705040882814145, + "learning_rate": 3.1984738437588992e-06, + "loss": 0.1509, + "step": 14669 + }, + { + "epoch": 0.75, + "grad_norm": 0.6639713788036837, + "learning_rate": 3.197266591827225e-06, + "loss": 0.1562, + "step": 14670 + }, + { + "epoch": 0.75, + "grad_norm": 1.0838213880179155, + "learning_rate": 3.1960595244200745e-06, + "loss": 0.1564, + "step": 14671 + }, + { + "epoch": 0.75, + "grad_norm": 1.1254132382331623, + "learning_rate": 3.1948526415701973e-06, + "loss": 0.1587, + "step": 14672 + }, + { + "epoch": 0.75, + "grad_norm": 1.1114188821514774, + "learning_rate": 3.1936459433103238e-06, + "loss": 0.1504, + "step": 14673 + }, + { + "epoch": 0.75, + "grad_norm": 0.8106975038011655, + "learning_rate": 3.192439429673192e-06, + "loss": 0.1733, + "step": 14674 + }, + { + "epoch": 0.75, + "grad_norm": 1.287735468603681, + "learning_rate": 3.191233100691521e-06, + "loss": 0.1759, + "step": 14675 + }, + { + "epoch": 0.75, + "grad_norm": 0.9826916736715666, + "learning_rate": 3.1900269563980368e-06, + "loss": 0.1738, + "step": 14676 + }, + { + "epoch": 0.75, + "grad_norm": 0.9326441042038448, + "learning_rate": 3.1888209968254567e-06, + "loss": 0.1624, + "step": 14677 + }, + { + "epoch": 0.75, + "grad_norm": 1.230860934384176, + "learning_rate": 3.1876152220064948e-06, + "loss": 0.1465, + "step": 14678 + }, + { + "epoch": 0.75, + "grad_norm": 1.252938672283592, + "learning_rate": 3.186409631973851e-06, + "loss": 0.1899, + "step": 14679 + }, + { + "epoch": 0.75, + "grad_norm": 0.9513008216253451, + "learning_rate": 3.1852042267602344e-06, + "loss": 0.169, + "step": 14680 + }, + { + "epoch": 0.75, + "grad_norm": 1.3737216365826161, + "learning_rate": 3.183999006398335e-06, + "loss": 0.1629, + "step": 14681 + }, + { + "epoch": 0.75, + "grad_norm": 1.0258943551734003, + "learning_rate": 3.1827939709208512e-06, + "loss": 0.1769, + "step": 14682 + }, + { + "epoch": 0.75, + "grad_norm": 1.1094208877179543, + "learning_rate": 3.181589120360462e-06, + "loss": 0.1736, + "step": 14683 + }, + { + "epoch": 0.75, + "grad_norm": 0.8428413399334154, + "learning_rate": 3.180384454749853e-06, + "loss": 0.1562, + "step": 14684 + }, + { + "epoch": 0.75, + "grad_norm": 1.0872615303852258, + "learning_rate": 3.1791799741217046e-06, + "loss": 0.1958, + "step": 14685 + }, + { + "epoch": 0.75, + "grad_norm": 1.4317740619625547, + "learning_rate": 3.17797567850868e-06, + "loss": 0.1848, + "step": 14686 + }, + { + "epoch": 0.75, + "grad_norm": 1.2333218938900938, + "learning_rate": 3.176771567943455e-06, + "loss": 0.1678, + "step": 14687 + }, + { + "epoch": 0.75, + "grad_norm": 2.200898823296642, + "learning_rate": 3.1755676424586835e-06, + "loss": 0.1623, + "step": 14688 + }, + { + "epoch": 0.75, + "grad_norm": 1.0968842840505326, + "learning_rate": 3.1743639020870277e-06, + "loss": 0.164, + "step": 14689 + }, + { + "epoch": 0.75, + "grad_norm": 1.1702189181308977, + "learning_rate": 3.173160346861134e-06, + "loss": 0.1793, + "step": 14690 + }, + { + "epoch": 0.75, + "grad_norm": 1.7023827975981258, + "learning_rate": 3.1719569768136536e-06, + "loss": 0.1598, + "step": 14691 + }, + { + "epoch": 0.75, + "grad_norm": 1.1005884174638634, + "learning_rate": 3.1707537919772236e-06, + "loss": 0.1612, + "step": 14692 + }, + { + "epoch": 0.75, + "grad_norm": 0.8893579964151115, + "learning_rate": 3.1695507923844813e-06, + "loss": 0.1508, + "step": 14693 + }, + { + "epoch": 0.75, + "grad_norm": 1.226655951675619, + "learning_rate": 3.1683479780680616e-06, + "loss": 0.1635, + "step": 14694 + }, + { + "epoch": 0.75, + "grad_norm": 0.947454141417125, + "learning_rate": 3.1671453490605906e-06, + "loss": 0.1847, + "step": 14695 + }, + { + "epoch": 0.75, + "grad_norm": 1.0113457917496576, + "learning_rate": 3.1659429053946853e-06, + "loss": 0.1553, + "step": 14696 + }, + { + "epoch": 0.75, + "grad_norm": 1.088003262151018, + "learning_rate": 3.1647406471029684e-06, + "loss": 0.1665, + "step": 14697 + }, + { + "epoch": 0.75, + "grad_norm": 1.1241668611924707, + "learning_rate": 3.1635385742180435e-06, + "loss": 0.1697, + "step": 14698 + }, + { + "epoch": 0.75, + "grad_norm": 0.8074782348500469, + "learning_rate": 3.1623366867725238e-06, + "loss": 0.1667, + "step": 14699 + }, + { + "epoch": 0.75, + "grad_norm": 0.9719281670469927, + "learning_rate": 3.1611349847990083e-06, + "loss": 0.1702, + "step": 14700 + }, + { + "epoch": 0.75, + "grad_norm": 1.115648697125075, + "learning_rate": 3.15993346833009e-06, + "loss": 0.1717, + "step": 14701 + }, + { + "epoch": 0.75, + "grad_norm": 1.1447856245133403, + "learning_rate": 3.1587321373983616e-06, + "loss": 0.1681, + "step": 14702 + }, + { + "epoch": 0.75, + "grad_norm": 1.1970897169384256, + "learning_rate": 3.1575309920364106e-06, + "loss": 0.1498, + "step": 14703 + }, + { + "epoch": 0.75, + "grad_norm": 1.4211130522484572, + "learning_rate": 3.156330032276821e-06, + "loss": 0.1797, + "step": 14704 + }, + { + "epoch": 0.75, + "grad_norm": 1.123241466425086, + "learning_rate": 3.1551292581521632e-06, + "loss": 0.1764, + "step": 14705 + }, + { + "epoch": 0.75, + "grad_norm": 1.1792557421404262, + "learning_rate": 3.1539286696950135e-06, + "loss": 0.1903, + "step": 14706 + }, + { + "epoch": 0.75, + "grad_norm": 1.0185275875438167, + "learning_rate": 3.152728266937932e-06, + "loss": 0.1541, + "step": 14707 + }, + { + "epoch": 0.75, + "grad_norm": 1.1295736663003781, + "learning_rate": 3.151528049913487e-06, + "loss": 0.183, + "step": 14708 + }, + { + "epoch": 0.75, + "grad_norm": 1.0700529459342265, + "learning_rate": 3.150328018654226e-06, + "loss": 0.1689, + "step": 14709 + }, + { + "epoch": 0.75, + "grad_norm": 1.331060876225935, + "learning_rate": 3.149128173192706e-06, + "loss": 0.1866, + "step": 14710 + }, + { + "epoch": 0.75, + "grad_norm": 1.011138547228524, + "learning_rate": 3.14792851356147e-06, + "loss": 0.1602, + "step": 14711 + }, + { + "epoch": 0.75, + "grad_norm": 0.8101396967808473, + "learning_rate": 3.1467290397930637e-06, + "loss": 0.1807, + "step": 14712 + }, + { + "epoch": 0.75, + "grad_norm": 0.8430158304766958, + "learning_rate": 3.1455297519200157e-06, + "loss": 0.1553, + "step": 14713 + }, + { + "epoch": 0.75, + "grad_norm": 1.0986957333952236, + "learning_rate": 3.144330649974864e-06, + "loss": 0.172, + "step": 14714 + }, + { + "epoch": 0.75, + "grad_norm": 0.9840712424089405, + "learning_rate": 3.1431317339901267e-06, + "loss": 0.1609, + "step": 14715 + }, + { + "epoch": 0.75, + "grad_norm": 1.1991225020841343, + "learning_rate": 3.1419330039983333e-06, + "loss": 0.1853, + "step": 14716 + }, + { + "epoch": 0.75, + "grad_norm": 1.1482875122879654, + "learning_rate": 3.14073446003199e-06, + "loss": 0.1649, + "step": 14717 + }, + { + "epoch": 0.75, + "grad_norm": 1.1271257346502312, + "learning_rate": 3.1395361021236148e-06, + "loss": 0.1825, + "step": 14718 + }, + { + "epoch": 0.75, + "grad_norm": 1.0659969335638813, + "learning_rate": 3.1383379303057084e-06, + "loss": 0.1443, + "step": 14719 + }, + { + "epoch": 0.75, + "grad_norm": 1.0191410707488946, + "learning_rate": 3.137139944610772e-06, + "loss": 0.1733, + "step": 14720 + }, + { + "epoch": 0.75, + "grad_norm": 0.76103460182858, + "learning_rate": 3.1359421450713056e-06, + "loss": 0.1548, + "step": 14721 + }, + { + "epoch": 0.75, + "grad_norm": 0.8902470007480779, + "learning_rate": 3.1347445317197935e-06, + "loss": 0.1549, + "step": 14722 + }, + { + "epoch": 0.75, + "grad_norm": 1.2089112463254226, + "learning_rate": 3.1335471045887255e-06, + "loss": 0.175, + "step": 14723 + }, + { + "epoch": 0.75, + "grad_norm": 0.973603976260908, + "learning_rate": 3.1323498637105787e-06, + "loss": 0.1607, + "step": 14724 + }, + { + "epoch": 0.75, + "grad_norm": 0.9537143962114913, + "learning_rate": 3.1311528091178324e-06, + "loss": 0.149, + "step": 14725 + }, + { + "epoch": 0.75, + "grad_norm": 0.9696065408943665, + "learning_rate": 3.12995594084295e-06, + "loss": 0.1678, + "step": 14726 + }, + { + "epoch": 0.75, + "grad_norm": 0.9559168981759395, + "learning_rate": 3.1287592589184025e-06, + "loss": 0.139, + "step": 14727 + }, + { + "epoch": 0.75, + "grad_norm": 0.9479267688834492, + "learning_rate": 3.127562763376647e-06, + "loss": 0.194, + "step": 14728 + }, + { + "epoch": 0.75, + "grad_norm": 0.9450990399734578, + "learning_rate": 3.1263664542501427e-06, + "loss": 0.2034, + "step": 14729 + }, + { + "epoch": 0.75, + "grad_norm": 0.861102551017935, + "learning_rate": 3.1251703315713333e-06, + "loss": 0.1589, + "step": 14730 + }, + { + "epoch": 0.75, + "grad_norm": 1.0664891017098699, + "learning_rate": 3.123974395372671e-06, + "loss": 0.1531, + "step": 14731 + }, + { + "epoch": 0.75, + "grad_norm": 1.0335841585823917, + "learning_rate": 3.1227786456865883e-06, + "loss": 0.1716, + "step": 14732 + }, + { + "epoch": 0.75, + "grad_norm": 2.00491646111731, + "learning_rate": 3.121583082545526e-06, + "loss": 0.1683, + "step": 14733 + }, + { + "epoch": 0.75, + "grad_norm": 1.206580417310946, + "learning_rate": 3.1203877059819077e-06, + "loss": 0.1858, + "step": 14734 + }, + { + "epoch": 0.75, + "grad_norm": 1.1377213446369543, + "learning_rate": 3.1191925160281644e-06, + "loss": 0.1572, + "step": 14735 + }, + { + "epoch": 0.75, + "grad_norm": 1.539961066898607, + "learning_rate": 3.1179975127167105e-06, + "loss": 0.1629, + "step": 14736 + }, + { + "epoch": 0.75, + "grad_norm": 0.9933739792302864, + "learning_rate": 3.1168026960799624e-06, + "loss": 0.1745, + "step": 14737 + }, + { + "epoch": 0.75, + "grad_norm": 3.090709860866361, + "learning_rate": 3.115608066150333e-06, + "loss": 0.1733, + "step": 14738 + }, + { + "epoch": 0.75, + "grad_norm": 1.1758233601106887, + "learning_rate": 3.1144136229602205e-06, + "loss": 0.1798, + "step": 14739 + }, + { + "epoch": 0.75, + "grad_norm": 0.906908514722467, + "learning_rate": 3.1132193665420306e-06, + "loss": 0.1616, + "step": 14740 + }, + { + "epoch": 0.75, + "grad_norm": 1.0543997428468896, + "learning_rate": 3.112025296928152e-06, + "loss": 0.178, + "step": 14741 + }, + { + "epoch": 0.75, + "grad_norm": 1.1950618467976908, + "learning_rate": 3.110831414150978e-06, + "loss": 0.184, + "step": 14742 + }, + { + "epoch": 0.75, + "grad_norm": 1.2784627442075167, + "learning_rate": 3.1096377182428885e-06, + "loss": 0.1458, + "step": 14743 + }, + { + "epoch": 0.75, + "grad_norm": 1.032175497273516, + "learning_rate": 3.1084442092362675e-06, + "loss": 0.1568, + "step": 14744 + }, + { + "epoch": 0.75, + "grad_norm": 1.1464911325739189, + "learning_rate": 3.1072508871634843e-06, + "loss": 0.1475, + "step": 14745 + }, + { + "epoch": 0.75, + "grad_norm": 1.1112318755193693, + "learning_rate": 3.1060577520569103e-06, + "loss": 0.1764, + "step": 14746 + }, + { + "epoch": 0.75, + "grad_norm": 1.084391657633532, + "learning_rate": 3.104864803948908e-06, + "loss": 0.1764, + "step": 14747 + }, + { + "epoch": 0.75, + "grad_norm": 1.0758458176930714, + "learning_rate": 3.1036720428718413e-06, + "loss": 0.1891, + "step": 14748 + }, + { + "epoch": 0.75, + "grad_norm": 1.1294542286974087, + "learning_rate": 3.1024794688580562e-06, + "loss": 0.176, + "step": 14749 + }, + { + "epoch": 0.75, + "grad_norm": 0.9721057798346413, + "learning_rate": 3.1012870819399087e-06, + "loss": 0.1683, + "step": 14750 + }, + { + "epoch": 0.75, + "grad_norm": 1.0807184805474042, + "learning_rate": 3.1000948821497356e-06, + "loss": 0.1756, + "step": 14751 + }, + { + "epoch": 0.75, + "grad_norm": 0.8688973454793225, + "learning_rate": 3.098902869519882e-06, + "loss": 0.1598, + "step": 14752 + }, + { + "epoch": 0.75, + "grad_norm": 0.6859639098812923, + "learning_rate": 3.097711044082674e-06, + "loss": 0.1586, + "step": 14753 + }, + { + "epoch": 0.75, + "grad_norm": 0.9853578787351401, + "learning_rate": 3.096519405870444e-06, + "loss": 0.1573, + "step": 14754 + }, + { + "epoch": 0.75, + "grad_norm": 0.9852364539415878, + "learning_rate": 3.095327954915519e-06, + "loss": 0.1824, + "step": 14755 + }, + { + "epoch": 0.75, + "grad_norm": 0.9889064845105907, + "learning_rate": 3.0941366912502092e-06, + "loss": 0.1829, + "step": 14756 + }, + { + "epoch": 0.75, + "grad_norm": 1.2036898306677362, + "learning_rate": 3.092945614906835e-06, + "loss": 0.1591, + "step": 14757 + }, + { + "epoch": 0.75, + "grad_norm": 1.0620697656612537, + "learning_rate": 3.0917547259176973e-06, + "loss": 0.1856, + "step": 14758 + }, + { + "epoch": 0.75, + "grad_norm": 1.2864163935068007, + "learning_rate": 3.090564024315107e-06, + "loss": 0.1736, + "step": 14759 + }, + { + "epoch": 0.75, + "grad_norm": 1.189074879260085, + "learning_rate": 3.089373510131354e-06, + "loss": 0.1811, + "step": 14760 + }, + { + "epoch": 0.75, + "grad_norm": 1.345609459317833, + "learning_rate": 3.0881831833987387e-06, + "loss": 0.1497, + "step": 14761 + }, + { + "epoch": 0.75, + "grad_norm": 0.8882311019034608, + "learning_rate": 3.086993044149541e-06, + "loss": 0.1618, + "step": 14762 + }, + { + "epoch": 0.75, + "grad_norm": 1.0149783463440294, + "learning_rate": 3.0858030924160477e-06, + "loss": 0.1901, + "step": 14763 + }, + { + "epoch": 0.75, + "grad_norm": 0.8790084933384287, + "learning_rate": 3.084613328230536e-06, + "loss": 0.1813, + "step": 14764 + }, + { + "epoch": 0.75, + "grad_norm": 0.8785959266232123, + "learning_rate": 3.0834237516252817e-06, + "loss": 0.1542, + "step": 14765 + }, + { + "epoch": 0.75, + "grad_norm": 1.1549721382274187, + "learning_rate": 3.082234362632546e-06, + "loss": 0.1645, + "step": 14766 + }, + { + "epoch": 0.75, + "grad_norm": 0.9054447399940857, + "learning_rate": 3.081045161284596e-06, + "loss": 0.1561, + "step": 14767 + }, + { + "epoch": 0.75, + "grad_norm": 1.0804171216263059, + "learning_rate": 3.0798561476136845e-06, + "loss": 0.1687, + "step": 14768 + }, + { + "epoch": 0.75, + "grad_norm": 1.0504575861996284, + "learning_rate": 3.078667321652069e-06, + "loss": 0.1494, + "step": 14769 + }, + { + "epoch": 0.75, + "grad_norm": 1.6349417730106812, + "learning_rate": 3.0774786834319923e-06, + "loss": 0.1834, + "step": 14770 + }, + { + "epoch": 0.75, + "grad_norm": 1.4004228489340422, + "learning_rate": 3.076290232985696e-06, + "loss": 0.1997, + "step": 14771 + }, + { + "epoch": 0.75, + "grad_norm": 1.1630110956367583, + "learning_rate": 3.0751019703454164e-06, + "loss": 0.1626, + "step": 14772 + }, + { + "epoch": 0.75, + "grad_norm": 0.9623326057353061, + "learning_rate": 3.0739138955433878e-06, + "loss": 0.1676, + "step": 14773 + }, + { + "epoch": 0.75, + "grad_norm": 1.006629466980831, + "learning_rate": 3.072726008611839e-06, + "loss": 0.1617, + "step": 14774 + }, + { + "epoch": 0.75, + "grad_norm": 1.2838886926972721, + "learning_rate": 3.0715383095829853e-06, + "loss": 0.1548, + "step": 14775 + }, + { + "epoch": 0.75, + "grad_norm": 1.0454608588604146, + "learning_rate": 3.070350798489049e-06, + "loss": 0.1815, + "step": 14776 + }, + { + "epoch": 0.75, + "grad_norm": 0.821807555146335, + "learning_rate": 3.069163475362237e-06, + "loss": 0.1492, + "step": 14777 + }, + { + "epoch": 0.75, + "grad_norm": 0.8491864566705534, + "learning_rate": 3.0679763402347584e-06, + "loss": 0.1792, + "step": 14778 + }, + { + "epoch": 0.75, + "grad_norm": 1.3916599840610384, + "learning_rate": 3.0667893931388117e-06, + "loss": 0.1748, + "step": 14779 + }, + { + "epoch": 0.75, + "grad_norm": 1.0579779934680873, + "learning_rate": 3.0656026341065927e-06, + "loss": 0.141, + "step": 14780 + }, + { + "epoch": 0.75, + "grad_norm": 1.0207426330020186, + "learning_rate": 3.0644160631702945e-06, + "loss": 0.1737, + "step": 14781 + }, + { + "epoch": 0.75, + "grad_norm": 0.8869670421806797, + "learning_rate": 3.0632296803621064e-06, + "loss": 0.1637, + "step": 14782 + }, + { + "epoch": 0.75, + "grad_norm": 0.9987869626382742, + "learning_rate": 3.0620434857142e-06, + "loss": 0.1618, + "step": 14783 + }, + { + "epoch": 0.75, + "grad_norm": 0.9076548216511201, + "learning_rate": 3.0608574792587607e-06, + "loss": 0.168, + "step": 14784 + }, + { + "epoch": 0.75, + "grad_norm": 0.8639606682030742, + "learning_rate": 3.059671661027951e-06, + "loss": 0.1503, + "step": 14785 + }, + { + "epoch": 0.75, + "grad_norm": 0.902688952256046, + "learning_rate": 3.0584860310539423e-06, + "loss": 0.15, + "step": 14786 + }, + { + "epoch": 0.75, + "grad_norm": 0.830731620498615, + "learning_rate": 3.057300589368891e-06, + "loss": 0.1611, + "step": 14787 + }, + { + "epoch": 0.75, + "grad_norm": 1.3496131252289139, + "learning_rate": 3.0561153360049513e-06, + "loss": 0.1781, + "step": 14788 + }, + { + "epoch": 0.75, + "grad_norm": 1.1472205477231077, + "learning_rate": 3.0549302709942753e-06, + "loss": 0.1688, + "step": 14789 + }, + { + "epoch": 0.75, + "grad_norm": 1.2001411056709832, + "learning_rate": 3.0537453943690076e-06, + "loss": 0.1766, + "step": 14790 + }, + { + "epoch": 0.75, + "grad_norm": 0.8679364218872742, + "learning_rate": 3.0525607061612918e-06, + "loss": 0.1603, + "step": 14791 + }, + { + "epoch": 0.75, + "grad_norm": 0.8910635946636004, + "learning_rate": 3.051376206403256e-06, + "loss": 0.1595, + "step": 14792 + }, + { + "epoch": 0.75, + "grad_norm": 0.9196162615395724, + "learning_rate": 3.050191895127036e-06, + "loss": 0.1735, + "step": 14793 + }, + { + "epoch": 0.75, + "grad_norm": 0.8051345823549313, + "learning_rate": 3.0490077723647504e-06, + "loss": 0.1763, + "step": 14794 + }, + { + "epoch": 0.75, + "grad_norm": 1.781799246737779, + "learning_rate": 3.047823838148525e-06, + "loss": 0.1682, + "step": 14795 + }, + { + "epoch": 0.75, + "grad_norm": 1.2158586699703775, + "learning_rate": 3.0466400925104665e-06, + "loss": 0.1896, + "step": 14796 + }, + { + "epoch": 0.75, + "grad_norm": 1.6484925242012631, + "learning_rate": 3.0454565354826926e-06, + "loss": 0.1647, + "step": 14797 + }, + { + "epoch": 0.75, + "grad_norm": 1.1769066884903636, + "learning_rate": 3.044273167097299e-06, + "loss": 0.1789, + "step": 14798 + }, + { + "epoch": 0.75, + "grad_norm": 0.9788467179153428, + "learning_rate": 3.0430899873863897e-06, + "loss": 0.1666, + "step": 14799 + }, + { + "epoch": 0.75, + "grad_norm": 1.161712363023313, + "learning_rate": 3.041906996382056e-06, + "loss": 0.1538, + "step": 14800 + }, + { + "epoch": 0.75, + "grad_norm": 1.5549606547832509, + "learning_rate": 3.0407241941163923e-06, + "loss": 0.159, + "step": 14801 + }, + { + "epoch": 0.75, + "grad_norm": 1.505694443799723, + "learning_rate": 3.0395415806214735e-06, + "loss": 0.193, + "step": 14802 + }, + { + "epoch": 0.75, + "grad_norm": 0.8924218114947182, + "learning_rate": 3.0383591559293867e-06, + "loss": 0.1531, + "step": 14803 + }, + { + "epoch": 0.75, + "grad_norm": 0.9852185215373485, + "learning_rate": 3.0371769200721977e-06, + "loss": 0.1806, + "step": 14804 + }, + { + "epoch": 0.75, + "grad_norm": 3.2306235996053707, + "learning_rate": 3.035994873081981e-06, + "loss": 0.1845, + "step": 14805 + }, + { + "epoch": 0.75, + "grad_norm": 0.906085883851588, + "learning_rate": 3.0348130149907928e-06, + "loss": 0.1791, + "step": 14806 + }, + { + "epoch": 0.75, + "grad_norm": 1.6349173864537299, + "learning_rate": 3.0336313458306964e-06, + "loss": 0.1894, + "step": 14807 + }, + { + "epoch": 0.75, + "grad_norm": 0.8587567982498606, + "learning_rate": 3.0324498656337453e-06, + "loss": 0.1749, + "step": 14808 + }, + { + "epoch": 0.75, + "grad_norm": 0.9227183523843092, + "learning_rate": 3.0312685744319824e-06, + "loss": 0.162, + "step": 14809 + }, + { + "epoch": 0.75, + "grad_norm": 1.015572029239059, + "learning_rate": 3.030087472257456e-06, + "loss": 0.1698, + "step": 14810 + }, + { + "epoch": 0.75, + "grad_norm": 0.9089076200824454, + "learning_rate": 3.0289065591421974e-06, + "loss": 0.1732, + "step": 14811 + }, + { + "epoch": 0.75, + "grad_norm": 0.9013864412772916, + "learning_rate": 3.027725835118245e-06, + "loss": 0.1719, + "step": 14812 + }, + { + "epoch": 0.75, + "grad_norm": 0.9075117656813344, + "learning_rate": 3.0265453002176203e-06, + "loss": 0.189, + "step": 14813 + }, + { + "epoch": 0.75, + "grad_norm": 0.9636418655739848, + "learning_rate": 3.0253649544723517e-06, + "loss": 0.1516, + "step": 14814 + }, + { + "epoch": 0.75, + "grad_norm": 0.9890103632519298, + "learning_rate": 3.024184797914449e-06, + "loss": 0.1742, + "step": 14815 + }, + { + "epoch": 0.75, + "grad_norm": 0.9107581267039456, + "learning_rate": 3.0230048305759274e-06, + "loss": 0.1547, + "step": 14816 + }, + { + "epoch": 0.75, + "grad_norm": 1.351786077343159, + "learning_rate": 3.021825052488795e-06, + "loss": 0.1514, + "step": 14817 + }, + { + "epoch": 0.75, + "grad_norm": 1.066387189414217, + "learning_rate": 3.0206454636850546e-06, + "loss": 0.1591, + "step": 14818 + }, + { + "epoch": 0.75, + "grad_norm": 1.0678313553067573, + "learning_rate": 3.0194660641966965e-06, + "loss": 0.1619, + "step": 14819 + }, + { + "epoch": 0.75, + "grad_norm": 0.8015416936308438, + "learning_rate": 3.01828685405572e-06, + "loss": 0.1536, + "step": 14820 + }, + { + "epoch": 0.75, + "grad_norm": 1.1124837008765287, + "learning_rate": 3.0171078332941028e-06, + "loss": 0.1707, + "step": 14821 + }, + { + "epoch": 0.75, + "grad_norm": 0.9763316659811963, + "learning_rate": 3.015929001943834e-06, + "loss": 0.1748, + "step": 14822 + }, + { + "epoch": 0.75, + "grad_norm": 0.9133392214693982, + "learning_rate": 3.014750360036881e-06, + "loss": 0.138, + "step": 14823 + }, + { + "epoch": 0.75, + "grad_norm": 1.1592682615722274, + "learning_rate": 3.0135719076052195e-06, + "loss": 0.187, + "step": 14824 + }, + { + "epoch": 0.75, + "grad_norm": 1.6634091590839288, + "learning_rate": 3.012393644680819e-06, + "loss": 0.1678, + "step": 14825 + }, + { + "epoch": 0.75, + "grad_norm": 0.8682168822734787, + "learning_rate": 3.0112155712956305e-06, + "loss": 0.1535, + "step": 14826 + }, + { + "epoch": 0.75, + "grad_norm": 0.9829329473504432, + "learning_rate": 3.0100376874816183e-06, + "loss": 0.1736, + "step": 14827 + }, + { + "epoch": 0.75, + "grad_norm": 0.9083554632551712, + "learning_rate": 3.0088599932707263e-06, + "loss": 0.1657, + "step": 14828 + }, + { + "epoch": 0.75, + "grad_norm": 0.9103203674379348, + "learning_rate": 3.007682488694904e-06, + "loss": 0.1599, + "step": 14829 + }, + { + "epoch": 0.75, + "grad_norm": 0.8617723421597876, + "learning_rate": 3.0065051737860863e-06, + "loss": 0.1631, + "step": 14830 + }, + { + "epoch": 0.75, + "grad_norm": 1.5295785801478599, + "learning_rate": 3.0053280485762136e-06, + "loss": 0.168, + "step": 14831 + }, + { + "epoch": 0.75, + "grad_norm": 1.010684464650659, + "learning_rate": 3.00415111309721e-06, + "loss": 0.1756, + "step": 14832 + }, + { + "epoch": 0.75, + "grad_norm": 1.1799490977267986, + "learning_rate": 3.0029743673810032e-06, + "loss": 0.2012, + "step": 14833 + }, + { + "epoch": 0.75, + "grad_norm": 1.353078753744399, + "learning_rate": 3.0017978114595103e-06, + "loss": 0.1659, + "step": 14834 + }, + { + "epoch": 0.75, + "grad_norm": 3.3560744486295873, + "learning_rate": 3.000621445364651e-06, + "loss": 0.167, + "step": 14835 + }, + { + "epoch": 0.75, + "grad_norm": 1.3890085822404705, + "learning_rate": 2.999445269128327e-06, + "loss": 0.1639, + "step": 14836 + }, + { + "epoch": 0.75, + "grad_norm": 1.0976117458451753, + "learning_rate": 2.9982692827824487e-06, + "loss": 0.1728, + "step": 14837 + }, + { + "epoch": 0.75, + "grad_norm": 1.079496472102237, + "learning_rate": 2.9970934863589086e-06, + "loss": 0.1678, + "step": 14838 + }, + { + "epoch": 0.75, + "grad_norm": 0.842838928379633, + "learning_rate": 2.9959178798896062e-06, + "loss": 0.1422, + "step": 14839 + }, + { + "epoch": 0.75, + "grad_norm": 0.9306792323418289, + "learning_rate": 2.994742463406427e-06, + "loss": 0.1736, + "step": 14840 + }, + { + "epoch": 0.75, + "grad_norm": 0.8812207056004026, + "learning_rate": 2.9935672369412507e-06, + "loss": 0.1755, + "step": 14841 + }, + { + "epoch": 0.75, + "grad_norm": 1.0940408765288119, + "learning_rate": 2.9923922005259588e-06, + "loss": 0.1793, + "step": 14842 + }, + { + "epoch": 0.75, + "grad_norm": 0.8373831947473562, + "learning_rate": 2.991217354192425e-06, + "loss": 0.1765, + "step": 14843 + }, + { + "epoch": 0.75, + "grad_norm": 2.3787908438981957, + "learning_rate": 2.9900426979725183e-06, + "loss": 0.1743, + "step": 14844 + }, + { + "epoch": 0.75, + "grad_norm": 1.1502864105435349, + "learning_rate": 2.9888682318980975e-06, + "loss": 0.1636, + "step": 14845 + }, + { + "epoch": 0.75, + "grad_norm": 1.044664524211976, + "learning_rate": 2.9876939560010244e-06, + "loss": 0.1583, + "step": 14846 + }, + { + "epoch": 0.75, + "grad_norm": 0.9738137676351397, + "learning_rate": 2.986519870313146e-06, + "loss": 0.172, + "step": 14847 + }, + { + "epoch": 0.76, + "grad_norm": 1.8642222069852814, + "learning_rate": 2.985345974866315e-06, + "loss": 0.1423, + "step": 14848 + }, + { + "epoch": 0.76, + "grad_norm": 0.810328271150909, + "learning_rate": 2.984172269692367e-06, + "loss": 0.1637, + "step": 14849 + }, + { + "epoch": 0.76, + "grad_norm": 1.1340679026784688, + "learning_rate": 2.9829987548231432e-06, + "loss": 0.1611, + "step": 14850 + }, + { + "epoch": 0.76, + "grad_norm": 1.070376228632435, + "learning_rate": 2.9818254302904735e-06, + "loss": 0.1681, + "step": 14851 + }, + { + "epoch": 0.76, + "grad_norm": 1.0497269808800511, + "learning_rate": 2.9806522961261896e-06, + "loss": 0.1623, + "step": 14852 + }, + { + "epoch": 0.76, + "grad_norm": 1.4840784587493108, + "learning_rate": 2.979479352362106e-06, + "loss": 0.1548, + "step": 14853 + }, + { + "epoch": 0.76, + "grad_norm": 0.9688261626228124, + "learning_rate": 2.9783065990300432e-06, + "loss": 0.1758, + "step": 14854 + }, + { + "epoch": 0.76, + "grad_norm": 0.9197866418393205, + "learning_rate": 2.9771340361618075e-06, + "loss": 0.1787, + "step": 14855 + }, + { + "epoch": 0.76, + "grad_norm": 1.6718879704798661, + "learning_rate": 2.975961663789212e-06, + "loss": 0.1573, + "step": 14856 + }, + { + "epoch": 0.76, + "grad_norm": 0.9578131340707566, + "learning_rate": 2.9747894819440514e-06, + "loss": 0.172, + "step": 14857 + }, + { + "epoch": 0.76, + "grad_norm": 0.930379079874052, + "learning_rate": 2.9736174906581216e-06, + "loss": 0.1655, + "step": 14858 + }, + { + "epoch": 0.76, + "grad_norm": 0.7535419036472633, + "learning_rate": 2.9724456899632126e-06, + "loss": 0.1555, + "step": 14859 + }, + { + "epoch": 0.76, + "grad_norm": 1.2210499593610467, + "learning_rate": 2.971274079891112e-06, + "loss": 0.1867, + "step": 14860 + }, + { + "epoch": 0.76, + "grad_norm": 1.1231733960892714, + "learning_rate": 2.970102660473603e-06, + "loss": 0.1519, + "step": 14861 + }, + { + "epoch": 0.76, + "grad_norm": 1.111326106758271, + "learning_rate": 2.9689314317424513e-06, + "loss": 0.1765, + "step": 14862 + }, + { + "epoch": 0.76, + "grad_norm": 4.061444520286897, + "learning_rate": 2.9677603937294364e-06, + "loss": 0.1544, + "step": 14863 + }, + { + "epoch": 0.76, + "grad_norm": 1.0349430214778528, + "learning_rate": 2.966589546466314e-06, + "loss": 0.1518, + "step": 14864 + }, + { + "epoch": 0.76, + "grad_norm": 1.0634051726805134, + "learning_rate": 2.96541888998485e-06, + "loss": 0.1696, + "step": 14865 + }, + { + "epoch": 0.76, + "grad_norm": 1.1166888090548637, + "learning_rate": 2.964248424316795e-06, + "loss": 0.1612, + "step": 14866 + }, + { + "epoch": 0.76, + "grad_norm": 1.3298552807639807, + "learning_rate": 2.9630781494938997e-06, + "loss": 0.1833, + "step": 14867 + }, + { + "epoch": 0.76, + "grad_norm": 1.236808824422805, + "learning_rate": 2.961908065547905e-06, + "loss": 0.1529, + "step": 14868 + }, + { + "epoch": 0.76, + "grad_norm": 1.4039050352062317, + "learning_rate": 2.960738172510551e-06, + "loss": 0.1812, + "step": 14869 + }, + { + "epoch": 0.76, + "grad_norm": 1.1263685791612776, + "learning_rate": 2.959568470413572e-06, + "loss": 0.1675, + "step": 14870 + }, + { + "epoch": 0.76, + "grad_norm": 1.2335040738408105, + "learning_rate": 2.9583989592886985e-06, + "loss": 0.1848, + "step": 14871 + }, + { + "epoch": 0.76, + "grad_norm": 1.0305781290114908, + "learning_rate": 2.957229639167648e-06, + "loss": 0.1641, + "step": 14872 + }, + { + "epoch": 0.76, + "grad_norm": 0.8071061008183066, + "learning_rate": 2.956060510082145e-06, + "loss": 0.1619, + "step": 14873 + }, + { + "epoch": 0.76, + "grad_norm": 1.0075024836831434, + "learning_rate": 2.954891572063895e-06, + "loss": 0.1607, + "step": 14874 + }, + { + "epoch": 0.76, + "grad_norm": 1.1847496166015081, + "learning_rate": 2.9537228251446125e-06, + "loss": 0.1713, + "step": 14875 + }, + { + "epoch": 0.76, + "grad_norm": 1.177493790578264, + "learning_rate": 2.9525542693559926e-06, + "loss": 0.1785, + "step": 14876 + }, + { + "epoch": 0.76, + "grad_norm": 1.0309264603516655, + "learning_rate": 2.9513859047297366e-06, + "loss": 0.178, + "step": 14877 + }, + { + "epoch": 0.76, + "grad_norm": 1.493847712149435, + "learning_rate": 2.9502177312975387e-06, + "loss": 0.1551, + "step": 14878 + }, + { + "epoch": 0.76, + "grad_norm": 1.1904513588353383, + "learning_rate": 2.9490497490910806e-06, + "loss": 0.1753, + "step": 14879 + }, + { + "epoch": 0.76, + "grad_norm": 0.8749388948121452, + "learning_rate": 2.9478819581420493e-06, + "loss": 0.1617, + "step": 14880 + }, + { + "epoch": 0.76, + "grad_norm": 1.2930194430877306, + "learning_rate": 2.9467143584821145e-06, + "loss": 0.1655, + "step": 14881 + }, + { + "epoch": 0.76, + "grad_norm": 0.9587201065916398, + "learning_rate": 2.9455469501429557e-06, + "loss": 0.1638, + "step": 14882 + }, + { + "epoch": 0.76, + "grad_norm": 1.031977326446882, + "learning_rate": 2.9443797331562295e-06, + "loss": 0.1904, + "step": 14883 + }, + { + "epoch": 0.76, + "grad_norm": 1.1442108789360643, + "learning_rate": 2.9432127075536056e-06, + "loss": 0.1624, + "step": 14884 + }, + { + "epoch": 0.76, + "grad_norm": 1.5933491740422148, + "learning_rate": 2.942045873366731e-06, + "loss": 0.169, + "step": 14885 + }, + { + "epoch": 0.76, + "grad_norm": 1.243282759250556, + "learning_rate": 2.9408792306272625e-06, + "loss": 0.17, + "step": 14886 + }, + { + "epoch": 0.76, + "grad_norm": 0.8418987826724236, + "learning_rate": 2.9397127793668435e-06, + "loss": 0.1578, + "step": 14887 + }, + { + "epoch": 0.76, + "grad_norm": 0.9680157927451623, + "learning_rate": 2.938546519617116e-06, + "loss": 0.1646, + "step": 14888 + }, + { + "epoch": 0.76, + "grad_norm": 1.483610722538024, + "learning_rate": 2.93738045140971e-06, + "loss": 0.1769, + "step": 14889 + }, + { + "epoch": 0.76, + "grad_norm": 0.7884359484807628, + "learning_rate": 2.9362145747762626e-06, + "loss": 0.1601, + "step": 14890 + }, + { + "epoch": 0.76, + "grad_norm": 1.121494941591859, + "learning_rate": 2.9350488897483897e-06, + "loss": 0.1894, + "step": 14891 + }, + { + "epoch": 0.76, + "grad_norm": 1.2684480232810904, + "learning_rate": 2.9338833963577184e-06, + "loss": 0.1585, + "step": 14892 + }, + { + "epoch": 0.76, + "grad_norm": 1.3305637146146216, + "learning_rate": 2.932718094635858e-06, + "loss": 0.2072, + "step": 14893 + }, + { + "epoch": 0.76, + "grad_norm": 1.3770407432912448, + "learning_rate": 2.9315529846144162e-06, + "loss": 0.169, + "step": 14894 + }, + { + "epoch": 0.76, + "grad_norm": 0.9470699738751776, + "learning_rate": 2.9303880663249985e-06, + "loss": 0.1744, + "step": 14895 + }, + { + "epoch": 0.76, + "grad_norm": 1.0590649066073299, + "learning_rate": 2.9292233397992043e-06, + "loss": 0.1538, + "step": 14896 + }, + { + "epoch": 0.76, + "grad_norm": 1.0300673069287118, + "learning_rate": 2.9280588050686287e-06, + "loss": 0.1912, + "step": 14897 + }, + { + "epoch": 0.76, + "grad_norm": 1.0014920304678387, + "learning_rate": 2.9268944621648554e-06, + "loss": 0.1973, + "step": 14898 + }, + { + "epoch": 0.76, + "grad_norm": 1.6558249390820992, + "learning_rate": 2.925730311119471e-06, + "loss": 0.1602, + "step": 14899 + }, + { + "epoch": 0.76, + "grad_norm": 1.1327455208047599, + "learning_rate": 2.92456635196405e-06, + "loss": 0.1705, + "step": 14900 + }, + { + "epoch": 0.76, + "grad_norm": 1.0468497436593192, + "learning_rate": 2.9234025847301685e-06, + "loss": 0.1679, + "step": 14901 + }, + { + "epoch": 0.76, + "grad_norm": 0.8848502370869115, + "learning_rate": 2.922239009449388e-06, + "loss": 0.1557, + "step": 14902 + }, + { + "epoch": 0.76, + "grad_norm": 8.23931656859147, + "learning_rate": 2.9210756261532746e-06, + "loss": 0.1499, + "step": 14903 + }, + { + "epoch": 0.76, + "grad_norm": 1.5678878010715138, + "learning_rate": 2.919912434873385e-06, + "loss": 0.1665, + "step": 14904 + }, + { + "epoch": 0.76, + "grad_norm": 0.8378693122157262, + "learning_rate": 2.918749435641274e-06, + "loss": 0.1657, + "step": 14905 + }, + { + "epoch": 0.76, + "grad_norm": 0.856513967247774, + "learning_rate": 2.91758662848848e-06, + "loss": 0.1677, + "step": 14906 + }, + { + "epoch": 0.76, + "grad_norm": 1.5243921986143543, + "learning_rate": 2.9164240134465527e-06, + "loss": 0.1661, + "step": 14907 + }, + { + "epoch": 0.76, + "grad_norm": 1.8660583832399535, + "learning_rate": 2.9152615905470216e-06, + "loss": 0.1686, + "step": 14908 + }, + { + "epoch": 0.76, + "grad_norm": 1.1635873514860704, + "learning_rate": 2.9140993598214217e-06, + "loss": 0.1904, + "step": 14909 + }, + { + "epoch": 0.76, + "grad_norm": 2.3164720376059806, + "learning_rate": 2.912937321301278e-06, + "loss": 0.1663, + "step": 14910 + }, + { + "epoch": 0.76, + "grad_norm": 1.0820820698019624, + "learning_rate": 2.911775475018106e-06, + "loss": 0.1674, + "step": 14911 + }, + { + "epoch": 0.76, + "grad_norm": 0.9985268905374507, + "learning_rate": 2.910613821003425e-06, + "loss": 0.1577, + "step": 14912 + }, + { + "epoch": 0.76, + "grad_norm": 1.1918499534603288, + "learning_rate": 2.9094523592887446e-06, + "loss": 0.182, + "step": 14913 + }, + { + "epoch": 0.76, + "grad_norm": 1.2000467515295787, + "learning_rate": 2.9082910899055717e-06, + "loss": 0.1647, + "step": 14914 + }, + { + "epoch": 0.76, + "grad_norm": 1.425609422227406, + "learning_rate": 2.9071300128854007e-06, + "loss": 0.1677, + "step": 14915 + }, + { + "epoch": 0.76, + "grad_norm": 2.4507515531337645, + "learning_rate": 2.9059691282597325e-06, + "loss": 0.1733, + "step": 14916 + }, + { + "epoch": 0.76, + "grad_norm": 1.3031539416406344, + "learning_rate": 2.9048084360600494e-06, + "loss": 0.1608, + "step": 14917 + }, + { + "epoch": 0.76, + "grad_norm": 0.8005481370728588, + "learning_rate": 2.9036479363178405e-06, + "loss": 0.1748, + "step": 14918 + }, + { + "epoch": 0.76, + "grad_norm": 1.4637440704208484, + "learning_rate": 2.9024876290645787e-06, + "loss": 0.1587, + "step": 14919 + }, + { + "epoch": 0.76, + "grad_norm": 1.022621538622618, + "learning_rate": 2.9013275143317453e-06, + "loss": 0.1641, + "step": 14920 + }, + { + "epoch": 0.76, + "grad_norm": 1.5938994367208572, + "learning_rate": 2.9001675921507998e-06, + "loss": 0.1825, + "step": 14921 + }, + { + "epoch": 0.76, + "grad_norm": 1.5829965650281022, + "learning_rate": 2.8990078625532104e-06, + "loss": 0.147, + "step": 14922 + }, + { + "epoch": 0.76, + "grad_norm": 1.609257440006581, + "learning_rate": 2.8978483255704325e-06, + "loss": 0.1568, + "step": 14923 + }, + { + "epoch": 0.76, + "grad_norm": 1.4008987644802413, + "learning_rate": 2.8966889812339237e-06, + "loss": 0.1795, + "step": 14924 + }, + { + "epoch": 0.76, + "grad_norm": 1.3581868558224195, + "learning_rate": 2.8955298295751245e-06, + "loss": 0.1589, + "step": 14925 + }, + { + "epoch": 0.76, + "grad_norm": 0.9135404012632948, + "learning_rate": 2.8943708706254824e-06, + "loss": 0.1602, + "step": 14926 + }, + { + "epoch": 0.76, + "grad_norm": 1.0471223644901613, + "learning_rate": 2.893212104416432e-06, + "loss": 0.1847, + "step": 14927 + }, + { + "epoch": 0.76, + "grad_norm": 0.853204413537538, + "learning_rate": 2.8920535309794018e-06, + "loss": 0.1418, + "step": 14928 + }, + { + "epoch": 0.76, + "grad_norm": 1.1207432193111928, + "learning_rate": 2.8908951503458217e-06, + "loss": 0.1736, + "step": 14929 + }, + { + "epoch": 0.76, + "grad_norm": 1.4981305908868932, + "learning_rate": 2.8897369625471105e-06, + "loss": 0.1729, + "step": 14930 + }, + { + "epoch": 0.76, + "grad_norm": 1.1182424351020628, + "learning_rate": 2.8885789676146903e-06, + "loss": 0.1718, + "step": 14931 + }, + { + "epoch": 0.76, + "grad_norm": 1.354916786961543, + "learning_rate": 2.887421165579963e-06, + "loss": 0.1484, + "step": 14932 + }, + { + "epoch": 0.76, + "grad_norm": 1.5863329120631235, + "learning_rate": 2.8862635564743424e-06, + "loss": 0.1692, + "step": 14933 + }, + { + "epoch": 0.76, + "grad_norm": 0.9166221221801297, + "learning_rate": 2.8851061403292213e-06, + "loss": 0.1578, + "step": 14934 + }, + { + "epoch": 0.76, + "grad_norm": 2.035905778944065, + "learning_rate": 2.8839489171760015e-06, + "loss": 0.1755, + "step": 14935 + }, + { + "epoch": 0.76, + "grad_norm": 0.8688123045023516, + "learning_rate": 2.882791887046066e-06, + "loss": 0.1514, + "step": 14936 + }, + { + "epoch": 0.76, + "grad_norm": 0.9109323250386723, + "learning_rate": 2.8816350499708044e-06, + "loss": 0.1676, + "step": 14937 + }, + { + "epoch": 0.76, + "grad_norm": 1.116979008834176, + "learning_rate": 2.8804784059815914e-06, + "loss": 0.1597, + "step": 14938 + }, + { + "epoch": 0.76, + "grad_norm": 0.9080606464625794, + "learning_rate": 2.879321955109805e-06, + "loss": 0.1694, + "step": 14939 + }, + { + "epoch": 0.76, + "grad_norm": 1.4528067891010092, + "learning_rate": 2.878165697386812e-06, + "loss": 0.1516, + "step": 14940 + }, + { + "epoch": 0.76, + "grad_norm": 1.0721172933704726, + "learning_rate": 2.87700963284398e-06, + "loss": 0.1821, + "step": 14941 + }, + { + "epoch": 0.76, + "grad_norm": 1.0226958352661308, + "learning_rate": 2.87585376151266e-06, + "loss": 0.1745, + "step": 14942 + }, + { + "epoch": 0.76, + "grad_norm": 0.973993706553113, + "learning_rate": 2.8746980834242133e-06, + "loss": 0.1899, + "step": 14943 + }, + { + "epoch": 0.76, + "grad_norm": 1.0773078735796286, + "learning_rate": 2.8735425986099796e-06, + "loss": 0.1737, + "step": 14944 + }, + { + "epoch": 0.76, + "grad_norm": 0.9840464557781372, + "learning_rate": 2.87238730710131e-06, + "loss": 0.172, + "step": 14945 + }, + { + "epoch": 0.76, + "grad_norm": 0.9383012071885413, + "learning_rate": 2.871232208929533e-06, + "loss": 0.1812, + "step": 14946 + }, + { + "epoch": 0.76, + "grad_norm": 1.2462761175413661, + "learning_rate": 2.8700773041259844e-06, + "loss": 0.1845, + "step": 14947 + }, + { + "epoch": 0.76, + "grad_norm": 1.32041091829035, + "learning_rate": 2.8689225927219956e-06, + "loss": 0.1949, + "step": 14948 + }, + { + "epoch": 0.76, + "grad_norm": 1.1985684244279526, + "learning_rate": 2.8677680747488812e-06, + "loss": 0.1804, + "step": 14949 + }, + { + "epoch": 0.76, + "grad_norm": 1.1156791419115983, + "learning_rate": 2.8666137502379632e-06, + "loss": 0.199, + "step": 14950 + }, + { + "epoch": 0.76, + "grad_norm": 1.149339106701358, + "learning_rate": 2.8654596192205476e-06, + "loss": 0.1742, + "step": 14951 + }, + { + "epoch": 0.76, + "grad_norm": 0.9928395353613638, + "learning_rate": 2.8643056817279448e-06, + "loss": 0.1598, + "step": 14952 + }, + { + "epoch": 0.76, + "grad_norm": 1.0961845244112949, + "learning_rate": 2.863151937791452e-06, + "loss": 0.1732, + "step": 14953 + }, + { + "epoch": 0.76, + "grad_norm": 0.8936819931072454, + "learning_rate": 2.8619983874423672e-06, + "loss": 0.1709, + "step": 14954 + }, + { + "epoch": 0.76, + "grad_norm": 1.083003245432732, + "learning_rate": 2.8608450307119772e-06, + "loss": 0.1599, + "step": 14955 + }, + { + "epoch": 0.76, + "grad_norm": 1.0890360019663619, + "learning_rate": 2.8596918676315687e-06, + "loss": 0.1804, + "step": 14956 + }, + { + "epoch": 0.76, + "grad_norm": 0.921592315801085, + "learning_rate": 2.8585388982324226e-06, + "loss": 0.1718, + "step": 14957 + }, + { + "epoch": 0.76, + "grad_norm": 1.0191836834087407, + "learning_rate": 2.8573861225458143e-06, + "loss": 0.1588, + "step": 14958 + }, + { + "epoch": 0.76, + "grad_norm": 1.049360441030582, + "learning_rate": 2.8562335406030074e-06, + "loss": 0.1789, + "step": 14959 + }, + { + "epoch": 0.76, + "grad_norm": 1.021851023037416, + "learning_rate": 2.8550811524352727e-06, + "loss": 0.1371, + "step": 14960 + }, + { + "epoch": 0.76, + "grad_norm": 1.0470103475321244, + "learning_rate": 2.8539289580738627e-06, + "loss": 0.1542, + "step": 14961 + }, + { + "epoch": 0.76, + "grad_norm": 1.033767752087917, + "learning_rate": 2.8527769575500363e-06, + "loss": 0.1691, + "step": 14962 + }, + { + "epoch": 0.76, + "grad_norm": 1.7569499057857303, + "learning_rate": 2.851625150895039e-06, + "loss": 0.1773, + "step": 14963 + }, + { + "epoch": 0.76, + "grad_norm": 1.9601401947005221, + "learning_rate": 2.850473538140108e-06, + "loss": 0.1709, + "step": 14964 + }, + { + "epoch": 0.76, + "grad_norm": 0.9617429520069206, + "learning_rate": 2.8493221193164886e-06, + "loss": 0.1505, + "step": 14965 + }, + { + "epoch": 0.76, + "grad_norm": 1.6305919387900003, + "learning_rate": 2.84817089445541e-06, + "loss": 0.1857, + "step": 14966 + }, + { + "epoch": 0.76, + "grad_norm": 0.992601166343874, + "learning_rate": 2.847019863588102e-06, + "loss": 0.1686, + "step": 14967 + }, + { + "epoch": 0.76, + "grad_norm": 1.213736998481106, + "learning_rate": 2.845869026745781e-06, + "loss": 0.196, + "step": 14968 + }, + { + "epoch": 0.76, + "grad_norm": 0.9976238049114304, + "learning_rate": 2.8447183839596705e-06, + "loss": 0.1558, + "step": 14969 + }, + { + "epoch": 0.76, + "grad_norm": 0.8547754154366524, + "learning_rate": 2.8435679352609747e-06, + "loss": 0.1787, + "step": 14970 + }, + { + "epoch": 0.76, + "grad_norm": 2.2194540168258508, + "learning_rate": 2.8424176806809068e-06, + "loss": 0.1417, + "step": 14971 + }, + { + "epoch": 0.76, + "grad_norm": 1.14402890444547, + "learning_rate": 2.8412676202506596e-06, + "loss": 0.1685, + "step": 14972 + }, + { + "epoch": 0.76, + "grad_norm": 0.9557704528886504, + "learning_rate": 2.8401177540014323e-06, + "loss": 0.173, + "step": 14973 + }, + { + "epoch": 0.76, + "grad_norm": 0.920417194706065, + "learning_rate": 2.838968081964416e-06, + "loss": 0.1653, + "step": 14974 + }, + { + "epoch": 0.76, + "grad_norm": 1.1526702838319869, + "learning_rate": 2.8378186041707977e-06, + "loss": 0.1799, + "step": 14975 + }, + { + "epoch": 0.76, + "grad_norm": 1.0949852077610864, + "learning_rate": 2.8366693206517503e-06, + "loss": 0.1584, + "step": 14976 + }, + { + "epoch": 0.76, + "grad_norm": 1.2539237468301876, + "learning_rate": 2.835520231438457e-06, + "loss": 0.1617, + "step": 14977 + }, + { + "epoch": 0.76, + "grad_norm": 1.015585176023755, + "learning_rate": 2.834371336562077e-06, + "loss": 0.1508, + "step": 14978 + }, + { + "epoch": 0.76, + "grad_norm": 1.0288380937596868, + "learning_rate": 2.833222636053784e-06, + "loss": 0.2067, + "step": 14979 + }, + { + "epoch": 0.76, + "grad_norm": 1.4883629291388756, + "learning_rate": 2.8320741299447306e-06, + "loss": 0.1725, + "step": 14980 + }, + { + "epoch": 0.76, + "grad_norm": 0.8305302577491667, + "learning_rate": 2.8309258182660693e-06, + "loss": 0.1578, + "step": 14981 + }, + { + "epoch": 0.76, + "grad_norm": 1.9041171256480882, + "learning_rate": 2.829777701048949e-06, + "loss": 0.1483, + "step": 14982 + }, + { + "epoch": 0.76, + "grad_norm": 1.1448825644487437, + "learning_rate": 2.828629778324514e-06, + "loss": 0.1614, + "step": 14983 + }, + { + "epoch": 0.76, + "grad_norm": 1.3660210089224283, + "learning_rate": 2.827482050123905e-06, + "loss": 0.1702, + "step": 14984 + }, + { + "epoch": 0.76, + "grad_norm": 1.3638108245767226, + "learning_rate": 2.8263345164782473e-06, + "loss": 0.1741, + "step": 14985 + }, + { + "epoch": 0.76, + "grad_norm": 1.2411375513006744, + "learning_rate": 2.8251871774186736e-06, + "loss": 0.1566, + "step": 14986 + }, + { + "epoch": 0.76, + "grad_norm": 0.9342897147302336, + "learning_rate": 2.8240400329762994e-06, + "loss": 0.1805, + "step": 14987 + }, + { + "epoch": 0.76, + "grad_norm": 1.0304143141046265, + "learning_rate": 2.8228930831822486e-06, + "loss": 0.1634, + "step": 14988 + }, + { + "epoch": 0.76, + "grad_norm": 0.775467905274122, + "learning_rate": 2.821746328067625e-06, + "loss": 0.1691, + "step": 14989 + }, + { + "epoch": 0.76, + "grad_norm": 0.9910104080265708, + "learning_rate": 2.82059976766354e-06, + "loss": 0.1616, + "step": 14990 + }, + { + "epoch": 0.76, + "grad_norm": 1.6828123750215387, + "learning_rate": 2.81945340200109e-06, + "loss": 0.1768, + "step": 14991 + }, + { + "epoch": 0.76, + "grad_norm": 1.0531142469858463, + "learning_rate": 2.818307231111371e-06, + "loss": 0.197, + "step": 14992 + }, + { + "epoch": 0.76, + "grad_norm": 1.2279946254505785, + "learning_rate": 2.8171612550254746e-06, + "loss": 0.1581, + "step": 14993 + }, + { + "epoch": 0.76, + "grad_norm": 1.2522759008118522, + "learning_rate": 2.816015473774487e-06, + "loss": 0.1314, + "step": 14994 + }, + { + "epoch": 0.76, + "grad_norm": 10.995273571793144, + "learning_rate": 2.814869887389483e-06, + "loss": 0.172, + "step": 14995 + }, + { + "epoch": 0.76, + "grad_norm": 1.0064004224764271, + "learning_rate": 2.813724495901543e-06, + "loss": 0.1722, + "step": 14996 + }, + { + "epoch": 0.76, + "grad_norm": 1.0028392290053534, + "learning_rate": 2.812579299341731e-06, + "loss": 0.1667, + "step": 14997 + }, + { + "epoch": 0.76, + "grad_norm": 0.9677404251213896, + "learning_rate": 2.811434297741108e-06, + "loss": 0.1836, + "step": 14998 + }, + { + "epoch": 0.76, + "grad_norm": 0.9845281773459438, + "learning_rate": 2.8102894911307367e-06, + "loss": 0.1618, + "step": 14999 + }, + { + "epoch": 0.76, + "grad_norm": 0.8616214642095336, + "learning_rate": 2.809144879541669e-06, + "loss": 0.162, + "step": 15000 + }, + { + "epoch": 0.76, + "grad_norm": 1.6859524501238814, + "learning_rate": 2.808000463004954e-06, + "loss": 0.1528, + "step": 15001 + }, + { + "epoch": 0.76, + "grad_norm": 2.160218092080421, + "learning_rate": 2.8068562415516308e-06, + "loss": 0.1746, + "step": 15002 + }, + { + "epoch": 0.76, + "grad_norm": 1.1401600807234633, + "learning_rate": 2.8057122152127413e-06, + "loss": 0.1789, + "step": 15003 + }, + { + "epoch": 0.76, + "grad_norm": 1.032889345595397, + "learning_rate": 2.804568384019312e-06, + "loss": 0.1571, + "step": 15004 + }, + { + "epoch": 0.76, + "grad_norm": 0.8592553578925399, + "learning_rate": 2.8034247480023735e-06, + "loss": 0.1717, + "step": 15005 + }, + { + "epoch": 0.76, + "grad_norm": 1.1253400311781885, + "learning_rate": 2.8022813071929434e-06, + "loss": 0.1502, + "step": 15006 + }, + { + "epoch": 0.76, + "grad_norm": 0.9738577653061913, + "learning_rate": 2.8011380616220407e-06, + "loss": 0.1847, + "step": 15007 + }, + { + "epoch": 0.76, + "grad_norm": 1.3238903583814505, + "learning_rate": 2.7999950113206732e-06, + "loss": 0.1883, + "step": 15008 + }, + { + "epoch": 0.76, + "grad_norm": 0.8885027709396675, + "learning_rate": 2.798852156319847e-06, + "loss": 0.1562, + "step": 15009 + }, + { + "epoch": 0.76, + "grad_norm": 1.0721210227832059, + "learning_rate": 2.7977094966505624e-06, + "loss": 0.1961, + "step": 15010 + }, + { + "epoch": 0.76, + "grad_norm": 1.1721695304850344, + "learning_rate": 2.7965670323438178e-06, + "loss": 0.179, + "step": 15011 + }, + { + "epoch": 0.76, + "grad_norm": 1.3159953959620643, + "learning_rate": 2.7954247634305965e-06, + "loss": 0.1562, + "step": 15012 + }, + { + "epoch": 0.76, + "grad_norm": 0.9882891878751142, + "learning_rate": 2.7942826899418886e-06, + "loss": 0.1814, + "step": 15013 + }, + { + "epoch": 0.76, + "grad_norm": 1.071625820697916, + "learning_rate": 2.7931408119086668e-06, + "loss": 0.1998, + "step": 15014 + }, + { + "epoch": 0.76, + "grad_norm": 1.1167208938383686, + "learning_rate": 2.79199912936191e-06, + "loss": 0.1751, + "step": 15015 + }, + { + "epoch": 0.76, + "grad_norm": 1.0858374521812366, + "learning_rate": 2.790857642332584e-06, + "loss": 0.1723, + "step": 15016 + }, + { + "epoch": 0.76, + "grad_norm": 0.9124234370105371, + "learning_rate": 2.789716350851649e-06, + "loss": 0.1581, + "step": 15017 + }, + { + "epoch": 0.76, + "grad_norm": 1.3363800719743475, + "learning_rate": 2.7885752549500644e-06, + "loss": 0.1549, + "step": 15018 + }, + { + "epoch": 0.76, + "grad_norm": 1.0255205464470902, + "learning_rate": 2.7874343546587846e-06, + "loss": 0.1846, + "step": 15019 + }, + { + "epoch": 0.76, + "grad_norm": 0.9317828134518765, + "learning_rate": 2.7862936500087566e-06, + "loss": 0.164, + "step": 15020 + }, + { + "epoch": 0.76, + "grad_norm": 0.8632663766608207, + "learning_rate": 2.7851531410309194e-06, + "loss": 0.1741, + "step": 15021 + }, + { + "epoch": 0.76, + "grad_norm": 1.1484226086902631, + "learning_rate": 2.784012827756213e-06, + "loss": 0.1755, + "step": 15022 + }, + { + "epoch": 0.76, + "grad_norm": 0.8400458929925192, + "learning_rate": 2.7828727102155627e-06, + "loss": 0.1696, + "step": 15023 + }, + { + "epoch": 0.76, + "grad_norm": 0.9106478377375347, + "learning_rate": 2.7817327884399014e-06, + "loss": 0.1882, + "step": 15024 + }, + { + "epoch": 0.76, + "grad_norm": 1.3584652549796459, + "learning_rate": 2.7805930624601427e-06, + "loss": 0.147, + "step": 15025 + }, + { + "epoch": 0.76, + "grad_norm": 0.9858682454425882, + "learning_rate": 2.779453532307206e-06, + "loss": 0.1864, + "step": 15026 + }, + { + "epoch": 0.76, + "grad_norm": 1.9690793197636596, + "learning_rate": 2.778314198011999e-06, + "loss": 0.1641, + "step": 15027 + }, + { + "epoch": 0.76, + "grad_norm": 0.9040955497405057, + "learning_rate": 2.7771750596054305e-06, + "loss": 0.1608, + "step": 15028 + }, + { + "epoch": 0.76, + "grad_norm": 1.2710265204702358, + "learning_rate": 2.7760361171183934e-06, + "loss": 0.1689, + "step": 15029 + }, + { + "epoch": 0.76, + "grad_norm": 1.1442824752872833, + "learning_rate": 2.774897370581787e-06, + "loss": 0.1542, + "step": 15030 + }, + { + "epoch": 0.76, + "grad_norm": 1.0493120502774447, + "learning_rate": 2.7737588200264953e-06, + "loss": 0.1577, + "step": 15031 + }, + { + "epoch": 0.76, + "grad_norm": 1.0022591482167709, + "learning_rate": 2.7726204654834067e-06, + "loss": 0.1754, + "step": 15032 + }, + { + "epoch": 0.76, + "grad_norm": 2.844231737981002, + "learning_rate": 2.7714823069833964e-06, + "loss": 0.1736, + "step": 15033 + }, + { + "epoch": 0.76, + "grad_norm": 1.4792458478526416, + "learning_rate": 2.770344344557333e-06, + "loss": 0.1683, + "step": 15034 + }, + { + "epoch": 0.76, + "grad_norm": 1.2239895492597026, + "learning_rate": 2.7692065782360876e-06, + "loss": 0.1632, + "step": 15035 + }, + { + "epoch": 0.76, + "grad_norm": 0.991586953334937, + "learning_rate": 2.7680690080505234e-06, + "loss": 0.1479, + "step": 15036 + }, + { + "epoch": 0.76, + "grad_norm": 1.2104851538907433, + "learning_rate": 2.7669316340314977e-06, + "loss": 0.1594, + "step": 15037 + }, + { + "epoch": 0.76, + "grad_norm": 0.9131732170375813, + "learning_rate": 2.765794456209857e-06, + "loss": 0.1563, + "step": 15038 + }, + { + "epoch": 0.76, + "grad_norm": 2.228509649935169, + "learning_rate": 2.7646574746164533e-06, + "loss": 0.1604, + "step": 15039 + }, + { + "epoch": 0.76, + "grad_norm": 0.9544290582199312, + "learning_rate": 2.763520689282122e-06, + "loss": 0.1653, + "step": 15040 + }, + { + "epoch": 0.76, + "grad_norm": 0.9012892594587696, + "learning_rate": 2.762384100237703e-06, + "loss": 0.1657, + "step": 15041 + }, + { + "epoch": 0.76, + "grad_norm": 1.0570949666178724, + "learning_rate": 2.761247707514021e-06, + "loss": 0.1753, + "step": 15042 + }, + { + "epoch": 0.76, + "grad_norm": 1.0871063071718943, + "learning_rate": 2.7601115111419043e-06, + "loss": 0.1647, + "step": 15043 + }, + { + "epoch": 0.77, + "grad_norm": 0.8618285709761413, + "learning_rate": 2.758975511152171e-06, + "loss": 0.1569, + "step": 15044 + }, + { + "epoch": 0.77, + "grad_norm": 1.547419540473778, + "learning_rate": 2.7578397075756404e-06, + "loss": 0.1648, + "step": 15045 + }, + { + "epoch": 0.77, + "grad_norm": 1.0384641122566953, + "learning_rate": 2.756704100443113e-06, + "loss": 0.1722, + "step": 15046 + }, + { + "epoch": 0.77, + "grad_norm": 0.8121363605603122, + "learning_rate": 2.7555686897853983e-06, + "loss": 0.1751, + "step": 15047 + }, + { + "epoch": 0.77, + "grad_norm": 0.7748124221997982, + "learning_rate": 2.754433475633289e-06, + "loss": 0.1717, + "step": 15048 + }, + { + "epoch": 0.77, + "grad_norm": 1.2459613623991952, + "learning_rate": 2.753298458017585e-06, + "loss": 0.1689, + "step": 15049 + }, + { + "epoch": 0.77, + "grad_norm": 0.7775055960696998, + "learning_rate": 2.7521636369690687e-06, + "loss": 0.1628, + "step": 15050 + }, + { + "epoch": 0.77, + "grad_norm": 1.0486466173882525, + "learning_rate": 2.7510290125185203e-06, + "loss": 0.172, + "step": 15051 + }, + { + "epoch": 0.77, + "grad_norm": 1.0057226924940363, + "learning_rate": 2.7498945846967197e-06, + "loss": 0.1626, + "step": 15052 + }, + { + "epoch": 0.77, + "grad_norm": 0.7624894425040479, + "learning_rate": 2.7487603535344375e-06, + "loss": 0.1775, + "step": 15053 + }, + { + "epoch": 0.77, + "grad_norm": 1.1466543189425418, + "learning_rate": 2.747626319062444e-06, + "loss": 0.1886, + "step": 15054 + }, + { + "epoch": 0.77, + "grad_norm": 0.8899801858769728, + "learning_rate": 2.7464924813114926e-06, + "loss": 0.1682, + "step": 15055 + }, + { + "epoch": 0.77, + "grad_norm": 1.07558876667753, + "learning_rate": 2.7453588403123453e-06, + "loss": 0.1724, + "step": 15056 + }, + { + "epoch": 0.77, + "grad_norm": 1.0008619053830783, + "learning_rate": 2.7442253960957466e-06, + "loss": 0.1681, + "step": 15057 + }, + { + "epoch": 0.77, + "grad_norm": 1.0007558626762345, + "learning_rate": 2.743092148692447e-06, + "loss": 0.1731, + "step": 15058 + }, + { + "epoch": 0.77, + "grad_norm": 1.289803694977765, + "learning_rate": 2.741959098133179e-06, + "loss": 0.1628, + "step": 15059 + }, + { + "epoch": 0.77, + "grad_norm": 1.4436849600973654, + "learning_rate": 2.7408262444486844e-06, + "loss": 0.1721, + "step": 15060 + }, + { + "epoch": 0.77, + "grad_norm": 0.9500321666703038, + "learning_rate": 2.739693587669684e-06, + "loss": 0.1584, + "step": 15061 + }, + { + "epoch": 0.77, + "grad_norm": 0.902479170382834, + "learning_rate": 2.7385611278269054e-06, + "loss": 0.1546, + "step": 15062 + }, + { + "epoch": 0.77, + "grad_norm": 0.9355144421315881, + "learning_rate": 2.737428864951066e-06, + "loss": 0.1703, + "step": 15063 + }, + { + "epoch": 0.77, + "grad_norm": 0.8297513558457941, + "learning_rate": 2.736296799072883e-06, + "loss": 0.1633, + "step": 15064 + }, + { + "epoch": 0.77, + "grad_norm": 0.9871101636958638, + "learning_rate": 2.7351649302230553e-06, + "loss": 0.157, + "step": 15065 + }, + { + "epoch": 0.77, + "grad_norm": 1.167113858545876, + "learning_rate": 2.7340332584322927e-06, + "loss": 0.1538, + "step": 15066 + }, + { + "epoch": 0.77, + "grad_norm": 0.8959136261904793, + "learning_rate": 2.7329017837312875e-06, + "loss": 0.1492, + "step": 15067 + }, + { + "epoch": 0.77, + "grad_norm": 1.0774540888808917, + "learning_rate": 2.7317705061507306e-06, + "loss": 0.166, + "step": 15068 + }, + { + "epoch": 0.77, + "grad_norm": 1.1236019697314357, + "learning_rate": 2.7306394257213078e-06, + "loss": 0.1596, + "step": 15069 + }, + { + "epoch": 0.77, + "grad_norm": 1.474374951856886, + "learning_rate": 2.729508542473702e-06, + "loss": 0.1653, + "step": 15070 + }, + { + "epoch": 0.77, + "grad_norm": 1.7946612833079612, + "learning_rate": 2.72837785643859e-06, + "loss": 0.18, + "step": 15071 + }, + { + "epoch": 0.77, + "grad_norm": 1.8050299194968318, + "learning_rate": 2.727247367646637e-06, + "loss": 0.1701, + "step": 15072 + }, + { + "epoch": 0.77, + "grad_norm": 0.9506576876624673, + "learning_rate": 2.726117076128513e-06, + "loss": 0.1817, + "step": 15073 + }, + { + "epoch": 0.77, + "grad_norm": 1.2612952414807954, + "learning_rate": 2.72498698191487e-06, + "loss": 0.1674, + "step": 15074 + }, + { + "epoch": 0.77, + "grad_norm": 1.3330606571640877, + "learning_rate": 2.72385708503637e-06, + "loss": 0.1822, + "step": 15075 + }, + { + "epoch": 0.77, + "grad_norm": 1.2579774240785415, + "learning_rate": 2.7227273855236535e-06, + "loss": 0.1622, + "step": 15076 + }, + { + "epoch": 0.77, + "grad_norm": 0.8701657202529226, + "learning_rate": 2.721597883407372e-06, + "loss": 0.1991, + "step": 15077 + }, + { + "epoch": 0.77, + "grad_norm": 2.8020324879684333, + "learning_rate": 2.720468578718155e-06, + "loss": 0.1824, + "step": 15078 + }, + { + "epoch": 0.77, + "grad_norm": 1.0579987620792803, + "learning_rate": 2.7193394714866396e-06, + "loss": 0.1741, + "step": 15079 + }, + { + "epoch": 0.77, + "grad_norm": 1.2534880239050814, + "learning_rate": 2.7182105617434516e-06, + "loss": 0.1839, + "step": 15080 + }, + { + "epoch": 0.77, + "grad_norm": 1.070027748514126, + "learning_rate": 2.7170818495192163e-06, + "loss": 0.1591, + "step": 15081 + }, + { + "epoch": 0.77, + "grad_norm": 1.145745299916052, + "learning_rate": 2.7159533348445455e-06, + "loss": 0.1667, + "step": 15082 + }, + { + "epoch": 0.77, + "grad_norm": 1.0809253146050162, + "learning_rate": 2.7148250177500534e-06, + "loss": 0.1813, + "step": 15083 + }, + { + "epoch": 0.77, + "grad_norm": 1.0653540590236674, + "learning_rate": 2.7136968982663427e-06, + "loss": 0.1828, + "step": 15084 + }, + { + "epoch": 0.77, + "grad_norm": 1.5754998020627489, + "learning_rate": 2.7125689764240173e-06, + "loss": 0.1576, + "step": 15085 + }, + { + "epoch": 0.77, + "grad_norm": 1.740473150238454, + "learning_rate": 2.711441252253669e-06, + "loss": 0.1617, + "step": 15086 + }, + { + "epoch": 0.77, + "grad_norm": 1.5122081253911168, + "learning_rate": 2.7103137257858867e-06, + "loss": 0.1749, + "step": 15087 + }, + { + "epoch": 0.77, + "grad_norm": 2.131401733962826, + "learning_rate": 2.7091863970512564e-06, + "loss": 0.1797, + "step": 15088 + }, + { + "epoch": 0.77, + "grad_norm": 1.0242405027137653, + "learning_rate": 2.708059266080356e-06, + "loss": 0.1844, + "step": 15089 + }, + { + "epoch": 0.77, + "grad_norm": 0.8234663848030147, + "learning_rate": 2.7069323329037632e-06, + "loss": 0.1545, + "step": 15090 + }, + { + "epoch": 0.77, + "grad_norm": 1.363966608963242, + "learning_rate": 2.7058055975520405e-06, + "loss": 0.1817, + "step": 15091 + }, + { + "epoch": 0.77, + "grad_norm": 1.4655481719724168, + "learning_rate": 2.704679060055755e-06, + "loss": 0.1859, + "step": 15092 + }, + { + "epoch": 0.77, + "grad_norm": 1.3100547717657192, + "learning_rate": 2.703552720445459e-06, + "loss": 0.1922, + "step": 15093 + }, + { + "epoch": 0.77, + "grad_norm": 0.968730998319128, + "learning_rate": 2.702426578751711e-06, + "loss": 0.1686, + "step": 15094 + }, + { + "epoch": 0.77, + "grad_norm": 2.464987708079324, + "learning_rate": 2.701300635005052e-06, + "loss": 0.1654, + "step": 15095 + }, + { + "epoch": 0.77, + "grad_norm": 1.1518904234958036, + "learning_rate": 2.7001748892360247e-06, + "loss": 0.1543, + "step": 15096 + }, + { + "epoch": 0.77, + "grad_norm": 1.1980341783397113, + "learning_rate": 2.6990493414751652e-06, + "loss": 0.152, + "step": 15097 + }, + { + "epoch": 0.77, + "grad_norm": 1.0117771613701103, + "learning_rate": 2.697923991753009e-06, + "loss": 0.1548, + "step": 15098 + }, + { + "epoch": 0.77, + "grad_norm": 1.3844556881797818, + "learning_rate": 2.696798840100072e-06, + "loss": 0.1779, + "step": 15099 + }, + { + "epoch": 0.77, + "grad_norm": 1.2282852794525476, + "learning_rate": 2.6956738865468832e-06, + "loss": 0.1638, + "step": 15100 + }, + { + "epoch": 0.77, + "grad_norm": 0.9476410124228332, + "learning_rate": 2.6945491311239504e-06, + "loss": 0.1522, + "step": 15101 + }, + { + "epoch": 0.77, + "grad_norm": 0.8403668096547272, + "learning_rate": 2.693424573861787e-06, + "loss": 0.1584, + "step": 15102 + }, + { + "epoch": 0.77, + "grad_norm": 1.0560591291079178, + "learning_rate": 2.692300214790895e-06, + "loss": 0.1808, + "step": 15103 + }, + { + "epoch": 0.77, + "grad_norm": 0.9424701654290392, + "learning_rate": 2.6911760539417698e-06, + "loss": 0.1649, + "step": 15104 + }, + { + "epoch": 0.77, + "grad_norm": 1.54019391223232, + "learning_rate": 2.690052091344907e-06, + "loss": 0.1624, + "step": 15105 + }, + { + "epoch": 0.77, + "grad_norm": 1.1614143182985948, + "learning_rate": 2.688928327030793e-06, + "loss": 0.175, + "step": 15106 + }, + { + "epoch": 0.77, + "grad_norm": 1.184879417210331, + "learning_rate": 2.6878047610299152e-06, + "loss": 0.1578, + "step": 15107 + }, + { + "epoch": 0.77, + "grad_norm": 1.531118791624129, + "learning_rate": 2.686681393372743e-06, + "loss": 0.1774, + "step": 15108 + }, + { + "epoch": 0.77, + "grad_norm": 0.9591536192176581, + "learning_rate": 2.6855582240897536e-06, + "loss": 0.1653, + "step": 15109 + }, + { + "epoch": 0.77, + "grad_norm": 1.0411427116028142, + "learning_rate": 2.6844352532114084e-06, + "loss": 0.1697, + "step": 15110 + }, + { + "epoch": 0.77, + "grad_norm": 0.956752950405443, + "learning_rate": 2.6833124807681722e-06, + "loss": 0.1704, + "step": 15111 + }, + { + "epoch": 0.77, + "grad_norm": 1.111203752316407, + "learning_rate": 2.6821899067904956e-06, + "loss": 0.1614, + "step": 15112 + }, + { + "epoch": 0.77, + "grad_norm": 0.9663017267242348, + "learning_rate": 2.6810675313088343e-06, + "loss": 0.1689, + "step": 15113 + }, + { + "epoch": 0.77, + "grad_norm": 1.0200424958227927, + "learning_rate": 2.6799453543536256e-06, + "loss": 0.1824, + "step": 15114 + }, + { + "epoch": 0.77, + "grad_norm": 1.0230622487319652, + "learning_rate": 2.678823375955314e-06, + "loss": 0.1733, + "step": 15115 + }, + { + "epoch": 0.77, + "grad_norm": 0.9057815915974087, + "learning_rate": 2.677701596144331e-06, + "loss": 0.1597, + "step": 15116 + }, + { + "epoch": 0.77, + "grad_norm": 1.3771708626699875, + "learning_rate": 2.6765800149511088e-06, + "loss": 0.1747, + "step": 15117 + }, + { + "epoch": 0.77, + "grad_norm": 1.141733838833967, + "learning_rate": 2.6754586324060637e-06, + "loss": 0.1738, + "step": 15118 + }, + { + "epoch": 0.77, + "grad_norm": 1.0691457001073312, + "learning_rate": 2.6743374485396212e-06, + "loss": 0.1564, + "step": 15119 + }, + { + "epoch": 0.77, + "grad_norm": 1.082285544415921, + "learning_rate": 2.673216463382189e-06, + "loss": 0.1542, + "step": 15120 + }, + { + "epoch": 0.77, + "grad_norm": 2.0461481068809784, + "learning_rate": 2.67209567696417e-06, + "loss": 0.1884, + "step": 15121 + }, + { + "epoch": 0.77, + "grad_norm": 1.5995436617467997, + "learning_rate": 2.6709750893159705e-06, + "loss": 0.1822, + "step": 15122 + }, + { + "epoch": 0.77, + "grad_norm": 1.1283157762236709, + "learning_rate": 2.6698547004679853e-06, + "loss": 0.1556, + "step": 15123 + }, + { + "epoch": 0.77, + "grad_norm": 1.2046144453330978, + "learning_rate": 2.6687345104506092e-06, + "loss": 0.152, + "step": 15124 + }, + { + "epoch": 0.77, + "grad_norm": 1.1382689858156538, + "learning_rate": 2.6676145192942194e-06, + "loss": 0.1657, + "step": 15125 + }, + { + "epoch": 0.77, + "grad_norm": 0.9200882739660786, + "learning_rate": 2.666494727029203e-06, + "loss": 0.1579, + "step": 15126 + }, + { + "epoch": 0.77, + "grad_norm": 1.029441048435042, + "learning_rate": 2.6653751336859292e-06, + "loss": 0.1602, + "step": 15127 + }, + { + "epoch": 0.77, + "grad_norm": 0.9313604753550834, + "learning_rate": 2.6642557392947722e-06, + "loss": 0.1747, + "step": 15128 + }, + { + "epoch": 0.77, + "grad_norm": 1.6679795449211656, + "learning_rate": 2.663136543886089e-06, + "loss": 0.1757, + "step": 15129 + }, + { + "epoch": 0.77, + "grad_norm": 1.0950242043797351, + "learning_rate": 2.6620175474902444e-06, + "loss": 0.1506, + "step": 15130 + }, + { + "epoch": 0.77, + "grad_norm": 1.1122224424154379, + "learning_rate": 2.660898750137585e-06, + "loss": 0.1647, + "step": 15131 + }, + { + "epoch": 0.77, + "grad_norm": 1.0499158991950026, + "learning_rate": 2.659780151858462e-06, + "loss": 0.1696, + "step": 15132 + }, + { + "epoch": 0.77, + "grad_norm": 0.8062873238846927, + "learning_rate": 2.658661752683217e-06, + "loss": 0.1594, + "step": 15133 + }, + { + "epoch": 0.77, + "grad_norm": 1.2888923453349863, + "learning_rate": 2.657543552642189e-06, + "loss": 0.1772, + "step": 15134 + }, + { + "epoch": 0.77, + "grad_norm": 2.6795161267833896, + "learning_rate": 2.6564255517657044e-06, + "loss": 0.1526, + "step": 15135 + }, + { + "epoch": 0.77, + "grad_norm": 0.9699837684539419, + "learning_rate": 2.655307750084094e-06, + "loss": 0.1669, + "step": 15136 + }, + { + "epoch": 0.77, + "grad_norm": 0.8597368366303423, + "learning_rate": 2.6541901476276767e-06, + "loss": 0.1964, + "step": 15137 + }, + { + "epoch": 0.77, + "grad_norm": 1.2499292435659783, + "learning_rate": 2.653072744426762e-06, + "loss": 0.1752, + "step": 15138 + }, + { + "epoch": 0.77, + "grad_norm": 1.031377290048498, + "learning_rate": 2.6519555405116683e-06, + "loss": 0.1867, + "step": 15139 + }, + { + "epoch": 0.77, + "grad_norm": 0.8232779349067384, + "learning_rate": 2.650838535912692e-06, + "loss": 0.1527, + "step": 15140 + }, + { + "epoch": 0.77, + "grad_norm": 1.0424697846561075, + "learning_rate": 2.6497217306601365e-06, + "loss": 0.1901, + "step": 15141 + }, + { + "epoch": 0.77, + "grad_norm": 1.1272349519057292, + "learning_rate": 2.6486051247842935e-06, + "loss": 0.2006, + "step": 15142 + }, + { + "epoch": 0.77, + "grad_norm": 0.801356139954553, + "learning_rate": 2.647488718315454e-06, + "loss": 0.1599, + "step": 15143 + }, + { + "epoch": 0.77, + "grad_norm": 1.3660574893840298, + "learning_rate": 2.6463725112838968e-06, + "loss": 0.1848, + "step": 15144 + }, + { + "epoch": 0.77, + "grad_norm": 0.8537934209396962, + "learning_rate": 2.645256503719902e-06, + "loss": 0.1759, + "step": 15145 + }, + { + "epoch": 0.77, + "grad_norm": 1.1754110127965345, + "learning_rate": 2.6441406956537376e-06, + "loss": 0.1752, + "step": 15146 + }, + { + "epoch": 0.77, + "grad_norm": 0.7957917440483896, + "learning_rate": 2.643025087115676e-06, + "loss": 0.1595, + "step": 15147 + }, + { + "epoch": 0.77, + "grad_norm": 1.0393249746050452, + "learning_rate": 2.6419096781359698e-06, + "loss": 0.193, + "step": 15148 + }, + { + "epoch": 0.77, + "grad_norm": 1.287822483747699, + "learning_rate": 2.6407944687448804e-06, + "loss": 0.1492, + "step": 15149 + }, + { + "epoch": 0.77, + "grad_norm": 0.7834088365545043, + "learning_rate": 2.639679458972657e-06, + "loss": 0.1561, + "step": 15150 + }, + { + "epoch": 0.77, + "grad_norm": 5.253358728554841, + "learning_rate": 2.6385646488495466e-06, + "loss": 0.1622, + "step": 15151 + }, + { + "epoch": 0.77, + "grad_norm": 1.4703873122591984, + "learning_rate": 2.637450038405782e-06, + "loss": 0.1869, + "step": 15152 + }, + { + "epoch": 0.77, + "grad_norm": 0.8843757015526899, + "learning_rate": 2.6363356276716046e-06, + "loss": 0.1897, + "step": 15153 + }, + { + "epoch": 0.77, + "grad_norm": 2.082280409237539, + "learning_rate": 2.6352214166772363e-06, + "loss": 0.1589, + "step": 15154 + }, + { + "epoch": 0.77, + "grad_norm": 0.9841338075804467, + "learning_rate": 2.634107405452906e-06, + "loss": 0.1781, + "step": 15155 + }, + { + "epoch": 0.77, + "grad_norm": 0.9614771366458215, + "learning_rate": 2.6329935940288286e-06, + "loss": 0.1715, + "step": 15156 + }, + { + "epoch": 0.77, + "grad_norm": 1.2490440362855904, + "learning_rate": 2.6318799824352125e-06, + "loss": 0.1555, + "step": 15157 + }, + { + "epoch": 0.77, + "grad_norm": 1.14649656482353, + "learning_rate": 2.6307665707022678e-06, + "loss": 0.1588, + "step": 15158 + }, + { + "epoch": 0.77, + "grad_norm": 1.8545450488293231, + "learning_rate": 2.629653358860197e-06, + "loss": 0.1686, + "step": 15159 + }, + { + "epoch": 0.77, + "grad_norm": 0.7601598800170505, + "learning_rate": 2.628540346939198e-06, + "loss": 0.1584, + "step": 15160 + }, + { + "epoch": 0.77, + "grad_norm": 0.9209688037861604, + "learning_rate": 2.6274275349694544e-06, + "loss": 0.1715, + "step": 15161 + }, + { + "epoch": 0.77, + "grad_norm": 1.9961783908841748, + "learning_rate": 2.6263149229811592e-06, + "loss": 0.17, + "step": 15162 + }, + { + "epoch": 0.77, + "grad_norm": 1.0209861385594814, + "learning_rate": 2.6252025110044852e-06, + "loss": 0.1623, + "step": 15163 + }, + { + "epoch": 0.77, + "grad_norm": 1.4066217365859481, + "learning_rate": 2.6240902990696126e-06, + "loss": 0.1661, + "step": 15164 + }, + { + "epoch": 0.77, + "grad_norm": 0.7784038130987568, + "learning_rate": 2.6229782872067042e-06, + "loss": 0.183, + "step": 15165 + }, + { + "epoch": 0.77, + "grad_norm": 1.2715081433842295, + "learning_rate": 2.6218664754459267e-06, + "loss": 0.1943, + "step": 15166 + }, + { + "epoch": 0.77, + "grad_norm": 1.08772074648383, + "learning_rate": 2.6207548638174374e-06, + "loss": 0.1765, + "step": 15167 + }, + { + "epoch": 0.77, + "grad_norm": 1.6357270332436424, + "learning_rate": 2.6196434523513916e-06, + "loss": 0.1594, + "step": 15168 + }, + { + "epoch": 0.77, + "grad_norm": 1.4580920469900076, + "learning_rate": 2.6185322410779312e-06, + "loss": 0.1907, + "step": 15169 + }, + { + "epoch": 0.77, + "grad_norm": 1.5075212882245206, + "learning_rate": 2.617421230027205e-06, + "loss": 0.1922, + "step": 15170 + }, + { + "epoch": 0.77, + "grad_norm": 1.527004105493696, + "learning_rate": 2.616310419229341e-06, + "loss": 0.1636, + "step": 15171 + }, + { + "epoch": 0.77, + "grad_norm": 1.4319575105625417, + "learning_rate": 2.615199808714476e-06, + "loss": 0.1791, + "step": 15172 + }, + { + "epoch": 0.77, + "grad_norm": 1.545697916364387, + "learning_rate": 2.614089398512735e-06, + "loss": 0.1862, + "step": 15173 + }, + { + "epoch": 0.77, + "grad_norm": 1.0144438909510543, + "learning_rate": 2.6129791886542323e-06, + "loss": 0.1704, + "step": 15174 + }, + { + "epoch": 0.77, + "grad_norm": 1.1318400246614295, + "learning_rate": 2.6118691791690865e-06, + "loss": 0.1844, + "step": 15175 + }, + { + "epoch": 0.77, + "grad_norm": 2.9289726534512144, + "learning_rate": 2.6107593700874056e-06, + "loss": 0.1372, + "step": 15176 + }, + { + "epoch": 0.77, + "grad_norm": 1.1153456205612922, + "learning_rate": 2.609649761439298e-06, + "loss": 0.1693, + "step": 15177 + }, + { + "epoch": 0.77, + "grad_norm": 0.9627905346417706, + "learning_rate": 2.6085403532548547e-06, + "loss": 0.1608, + "step": 15178 + }, + { + "epoch": 0.77, + "grad_norm": 0.9862059209427578, + "learning_rate": 2.6074311455641756e-06, + "loss": 0.1863, + "step": 15179 + }, + { + "epoch": 0.77, + "grad_norm": 0.9294597573359676, + "learning_rate": 2.60632213839734e-06, + "loss": 0.1724, + "step": 15180 + }, + { + "epoch": 0.77, + "grad_norm": 0.899107516158811, + "learning_rate": 2.6052133317844387e-06, + "loss": 0.164, + "step": 15181 + }, + { + "epoch": 0.77, + "grad_norm": 1.3657017149652035, + "learning_rate": 2.6041047257555384e-06, + "loss": 0.1706, + "step": 15182 + }, + { + "epoch": 0.77, + "grad_norm": 8.480391779689135, + "learning_rate": 2.6029963203407195e-06, + "loss": 0.1561, + "step": 15183 + }, + { + "epoch": 0.77, + "grad_norm": 1.596005320977434, + "learning_rate": 2.6018881155700403e-06, + "loss": 0.1809, + "step": 15184 + }, + { + "epoch": 0.77, + "grad_norm": 3.873634574087102, + "learning_rate": 2.600780111473563e-06, + "loss": 0.1833, + "step": 15185 + }, + { + "epoch": 0.77, + "grad_norm": 4.740690382810113, + "learning_rate": 2.5996723080813433e-06, + "loss": 0.1672, + "step": 15186 + }, + { + "epoch": 0.77, + "grad_norm": 1.025542092337058, + "learning_rate": 2.5985647054234332e-06, + "loss": 0.1653, + "step": 15187 + }, + { + "epoch": 0.77, + "grad_norm": 1.02212783368464, + "learning_rate": 2.597457303529871e-06, + "loss": 0.1616, + "step": 15188 + }, + { + "epoch": 0.77, + "grad_norm": 1.548952516964932, + "learning_rate": 2.5963501024307005e-06, + "loss": 0.189, + "step": 15189 + }, + { + "epoch": 0.77, + "grad_norm": 1.3022275673627424, + "learning_rate": 2.595243102155951e-06, + "loss": 0.1571, + "step": 15190 + }, + { + "epoch": 0.77, + "grad_norm": 1.0470981966654658, + "learning_rate": 2.594136302735648e-06, + "loss": 0.1625, + "step": 15191 + }, + { + "epoch": 0.77, + "grad_norm": 1.5030610793104051, + "learning_rate": 2.5930297041998152e-06, + "loss": 0.1643, + "step": 15192 + }, + { + "epoch": 0.77, + "grad_norm": 0.8238779173041473, + "learning_rate": 2.591923306578471e-06, + "loss": 0.1498, + "step": 15193 + }, + { + "epoch": 0.77, + "grad_norm": 1.0571831014548954, + "learning_rate": 2.590817109901629e-06, + "loss": 0.1685, + "step": 15194 + }, + { + "epoch": 0.77, + "grad_norm": 0.8950212936038725, + "learning_rate": 2.589711114199287e-06, + "loss": 0.1648, + "step": 15195 + }, + { + "epoch": 0.77, + "grad_norm": 1.0896129472676792, + "learning_rate": 2.5886053195014537e-06, + "loss": 0.1841, + "step": 15196 + }, + { + "epoch": 0.77, + "grad_norm": 1.1069006521388893, + "learning_rate": 2.587499725838116e-06, + "loss": 0.1755, + "step": 15197 + }, + { + "epoch": 0.77, + "grad_norm": 1.305433030656861, + "learning_rate": 2.5863943332392703e-06, + "loss": 0.1801, + "step": 15198 + }, + { + "epoch": 0.77, + "grad_norm": 2.0925570729945107, + "learning_rate": 2.5852891417348933e-06, + "loss": 0.148, + "step": 15199 + }, + { + "epoch": 0.77, + "grad_norm": 1.1185001788765023, + "learning_rate": 2.5841841513549703e-06, + "loss": 0.1791, + "step": 15200 + }, + { + "epoch": 0.77, + "grad_norm": 1.0426008164602087, + "learning_rate": 2.583079362129469e-06, + "loss": 0.1624, + "step": 15201 + }, + { + "epoch": 0.77, + "grad_norm": 0.985214885876121, + "learning_rate": 2.5819747740883584e-06, + "loss": 0.176, + "step": 15202 + }, + { + "epoch": 0.77, + "grad_norm": 1.5640615652580436, + "learning_rate": 2.5808703872616014e-06, + "loss": 0.1742, + "step": 15203 + }, + { + "epoch": 0.77, + "grad_norm": 1.2960441256428383, + "learning_rate": 2.5797662016791556e-06, + "loss": 0.153, + "step": 15204 + }, + { + "epoch": 0.77, + "grad_norm": 1.1077933440093228, + "learning_rate": 2.5786622173709695e-06, + "loss": 0.1634, + "step": 15205 + }, + { + "epoch": 0.77, + "grad_norm": 0.9797945234196949, + "learning_rate": 2.5775584343669926e-06, + "loss": 0.1636, + "step": 15206 + }, + { + "epoch": 0.77, + "grad_norm": 0.9108251801362562, + "learning_rate": 2.576454852697161e-06, + "loss": 0.1462, + "step": 15207 + }, + { + "epoch": 0.77, + "grad_norm": 0.8377769081012556, + "learning_rate": 2.5753514723914098e-06, + "loss": 0.1479, + "step": 15208 + }, + { + "epoch": 0.77, + "grad_norm": 1.0428186371239592, + "learning_rate": 2.574248293479671e-06, + "loss": 0.1864, + "step": 15209 + }, + { + "epoch": 0.77, + "grad_norm": 1.4175156519849, + "learning_rate": 2.573145315991864e-06, + "loss": 0.1769, + "step": 15210 + }, + { + "epoch": 0.77, + "grad_norm": 1.1093744001581627, + "learning_rate": 2.5720425399579095e-06, + "loss": 0.1659, + "step": 15211 + }, + { + "epoch": 0.77, + "grad_norm": 1.180261198591606, + "learning_rate": 2.5709399654077204e-06, + "loss": 0.1641, + "step": 15212 + }, + { + "epoch": 0.77, + "grad_norm": 1.423271440155294, + "learning_rate": 2.5698375923712083e-06, + "loss": 0.1562, + "step": 15213 + }, + { + "epoch": 0.77, + "grad_norm": 1.1382874794726652, + "learning_rate": 2.568735420878268e-06, + "loss": 0.1581, + "step": 15214 + }, + { + "epoch": 0.77, + "grad_norm": 1.5057172478871992, + "learning_rate": 2.567633450958801e-06, + "loss": 0.1454, + "step": 15215 + }, + { + "epoch": 0.77, + "grad_norm": 1.190316189265075, + "learning_rate": 2.5665316826426946e-06, + "loss": 0.1836, + "step": 15216 + }, + { + "epoch": 0.77, + "grad_norm": 0.8338288755160992, + "learning_rate": 2.5654301159598384e-06, + "loss": 0.1512, + "step": 15217 + }, + { + "epoch": 0.77, + "grad_norm": 1.0685886262029858, + "learning_rate": 2.564328750940107e-06, + "loss": 0.1578, + "step": 15218 + }, + { + "epoch": 0.77, + "grad_norm": 1.0227346756920659, + "learning_rate": 2.5632275876133794e-06, + "loss": 0.1693, + "step": 15219 + }, + { + "epoch": 0.77, + "grad_norm": 1.0084711151912433, + "learning_rate": 2.562126626009522e-06, + "loss": 0.1963, + "step": 15220 + }, + { + "epoch": 0.77, + "grad_norm": 1.0826122914850413, + "learning_rate": 2.561025866158404e-06, + "loss": 0.1828, + "step": 15221 + }, + { + "epoch": 0.77, + "grad_norm": 1.2022230055717948, + "learning_rate": 2.5599253080898767e-06, + "loss": 0.1574, + "step": 15222 + }, + { + "epoch": 0.77, + "grad_norm": 1.5532502806186688, + "learning_rate": 2.558824951833798e-06, + "loss": 0.1625, + "step": 15223 + }, + { + "epoch": 0.77, + "grad_norm": 2.2549004826501027, + "learning_rate": 2.5577247974200103e-06, + "loss": 0.1733, + "step": 15224 + }, + { + "epoch": 0.77, + "grad_norm": 1.0977056913219776, + "learning_rate": 2.55662484487836e-06, + "loss": 0.1581, + "step": 15225 + }, + { + "epoch": 0.77, + "grad_norm": 1.0576616231310711, + "learning_rate": 2.555525094238682e-06, + "loss": 0.1915, + "step": 15226 + }, + { + "epoch": 0.77, + "grad_norm": 0.8492059396337894, + "learning_rate": 2.5544255455308032e-06, + "loss": 0.1763, + "step": 15227 + }, + { + "epoch": 0.77, + "grad_norm": 0.9742120885664564, + "learning_rate": 2.5533261987845525e-06, + "loss": 0.1606, + "step": 15228 + }, + { + "epoch": 0.77, + "grad_norm": 1.0710840678114584, + "learning_rate": 2.552227054029749e-06, + "loss": 0.1594, + "step": 15229 + }, + { + "epoch": 0.77, + "grad_norm": 1.1422554132137888, + "learning_rate": 2.5511281112962096e-06, + "loss": 0.1949, + "step": 15230 + }, + { + "epoch": 0.77, + "grad_norm": 0.9458474201377018, + "learning_rate": 2.550029370613738e-06, + "loss": 0.1586, + "step": 15231 + }, + { + "epoch": 0.77, + "grad_norm": 1.362543961001248, + "learning_rate": 2.548930832012143e-06, + "loss": 0.1677, + "step": 15232 + }, + { + "epoch": 0.77, + "grad_norm": 1.4492155243197786, + "learning_rate": 2.5478324955212186e-06, + "loss": 0.1613, + "step": 15233 + }, + { + "epoch": 0.77, + "grad_norm": 1.5213788798995165, + "learning_rate": 2.5467343611707607e-06, + "loss": 0.1697, + "step": 15234 + }, + { + "epoch": 0.77, + "grad_norm": 0.8861019921569067, + "learning_rate": 2.545636428990551e-06, + "loss": 0.1524, + "step": 15235 + }, + { + "epoch": 0.77, + "grad_norm": 1.0214376716215812, + "learning_rate": 2.5445386990103773e-06, + "loss": 0.1632, + "step": 15236 + }, + { + "epoch": 0.77, + "grad_norm": 2.4462903199411787, + "learning_rate": 2.5434411712600095e-06, + "loss": 0.1953, + "step": 15237 + }, + { + "epoch": 0.77, + "grad_norm": 1.088113114275449, + "learning_rate": 2.542343845769222e-06, + "loss": 0.1591, + "step": 15238 + }, + { + "epoch": 0.77, + "grad_norm": 0.9099559658873935, + "learning_rate": 2.5412467225677774e-06, + "loss": 0.1473, + "step": 15239 + }, + { + "epoch": 0.77, + "grad_norm": 1.2840902837839696, + "learning_rate": 2.540149801685441e-06, + "loss": 0.1561, + "step": 15240 + }, + { + "epoch": 0.78, + "grad_norm": 1.2896921273460544, + "learning_rate": 2.5390530831519587e-06, + "loss": 0.1589, + "step": 15241 + }, + { + "epoch": 0.78, + "grad_norm": 1.233985557375335, + "learning_rate": 2.5379565669970864e-06, + "loss": 0.1873, + "step": 15242 + }, + { + "epoch": 0.78, + "grad_norm": 0.9574844819931793, + "learning_rate": 2.5368602532505637e-06, + "loss": 0.1647, + "step": 15243 + }, + { + "epoch": 0.78, + "grad_norm": 1.1470495999146328, + "learning_rate": 2.535764141942124e-06, + "loss": 0.1885, + "step": 15244 + }, + { + "epoch": 0.78, + "grad_norm": 0.7780938539703632, + "learning_rate": 2.534668233101505e-06, + "loss": 0.1521, + "step": 15245 + }, + { + "epoch": 0.78, + "grad_norm": 1.4385410709228255, + "learning_rate": 2.533572526758431e-06, + "loss": 0.1688, + "step": 15246 + }, + { + "epoch": 0.78, + "grad_norm": 1.0239198319044815, + "learning_rate": 2.5324770229426276e-06, + "loss": 0.1568, + "step": 15247 + }, + { + "epoch": 0.78, + "grad_norm": 2.158525030798928, + "learning_rate": 2.5313817216838034e-06, + "loss": 0.1787, + "step": 15248 + }, + { + "epoch": 0.78, + "grad_norm": 1.4164698569917662, + "learning_rate": 2.530286623011675e-06, + "loss": 0.1805, + "step": 15249 + }, + { + "epoch": 0.78, + "grad_norm": 1.924334069260275, + "learning_rate": 2.5291917269559408e-06, + "loss": 0.175, + "step": 15250 + }, + { + "epoch": 0.78, + "grad_norm": 0.894267709513381, + "learning_rate": 2.528097033546305e-06, + "loss": 0.1582, + "step": 15251 + }, + { + "epoch": 0.78, + "grad_norm": 0.8781609591285007, + "learning_rate": 2.527002542812457e-06, + "loss": 0.1582, + "step": 15252 + }, + { + "epoch": 0.78, + "grad_norm": 1.542183566342674, + "learning_rate": 2.5259082547840907e-06, + "loss": 0.1541, + "step": 15253 + }, + { + "epoch": 0.78, + "grad_norm": 5.65101598504308, + "learning_rate": 2.524814169490881e-06, + "loss": 0.1781, + "step": 15254 + }, + { + "epoch": 0.78, + "grad_norm": 0.8947350917702382, + "learning_rate": 2.52372028696251e-06, + "loss": 0.1769, + "step": 15255 + }, + { + "epoch": 0.78, + "grad_norm": 0.9696609784115476, + "learning_rate": 2.5226266072286475e-06, + "loss": 0.1721, + "step": 15256 + }, + { + "epoch": 0.78, + "grad_norm": 1.4945032623101626, + "learning_rate": 2.521533130318965e-06, + "loss": 0.1764, + "step": 15257 + }, + { + "epoch": 0.78, + "grad_norm": 1.0840645808247387, + "learning_rate": 2.520439856263115e-06, + "loss": 0.1657, + "step": 15258 + }, + { + "epoch": 0.78, + "grad_norm": 4.211028713539488, + "learning_rate": 2.5193467850907583e-06, + "loss": 0.1807, + "step": 15259 + }, + { + "epoch": 0.78, + "grad_norm": 0.8345107762432861, + "learning_rate": 2.5182539168315435e-06, + "loss": 0.1776, + "step": 15260 + }, + { + "epoch": 0.78, + "grad_norm": 1.1090353856444384, + "learning_rate": 2.517161251515111e-06, + "loss": 0.1765, + "step": 15261 + }, + { + "epoch": 0.78, + "grad_norm": 0.9632084661239485, + "learning_rate": 2.516068789171102e-06, + "loss": 0.1592, + "step": 15262 + }, + { + "epoch": 0.78, + "grad_norm": 2.075643005645192, + "learning_rate": 2.5149765298291508e-06, + "loss": 0.1472, + "step": 15263 + }, + { + "epoch": 0.78, + "grad_norm": 0.9688207506122695, + "learning_rate": 2.513884473518885e-06, + "loss": 0.1789, + "step": 15264 + }, + { + "epoch": 0.78, + "grad_norm": 0.9284611694669284, + "learning_rate": 2.512792620269924e-06, + "loss": 0.153, + "step": 15265 + }, + { + "epoch": 0.78, + "grad_norm": 1.2757413064160035, + "learning_rate": 2.5117009701118888e-06, + "loss": 0.1467, + "step": 15266 + }, + { + "epoch": 0.78, + "grad_norm": 1.9680738544022398, + "learning_rate": 2.5106095230743844e-06, + "loss": 0.1595, + "step": 15267 + }, + { + "epoch": 0.78, + "grad_norm": 1.0877239157747474, + "learning_rate": 2.5095182791870234e-06, + "loss": 0.1698, + "step": 15268 + }, + { + "epoch": 0.78, + "grad_norm": 1.2268524665114693, + "learning_rate": 2.5084272384793985e-06, + "loss": 0.1561, + "step": 15269 + }, + { + "epoch": 0.78, + "grad_norm": 1.36790180905005, + "learning_rate": 2.5073364009811107e-06, + "loss": 0.1757, + "step": 15270 + }, + { + "epoch": 0.78, + "grad_norm": 1.0109270739079395, + "learning_rate": 2.5062457667217433e-06, + "loss": 0.161, + "step": 15271 + }, + { + "epoch": 0.78, + "grad_norm": 1.7892760497472147, + "learning_rate": 2.505155335730883e-06, + "loss": 0.1766, + "step": 15272 + }, + { + "epoch": 0.78, + "grad_norm": 1.6048470029419701, + "learning_rate": 2.5040651080381084e-06, + "loss": 0.1753, + "step": 15273 + }, + { + "epoch": 0.78, + "grad_norm": 1.0007696327568976, + "learning_rate": 2.5029750836729926e-06, + "loss": 0.1457, + "step": 15274 + }, + { + "epoch": 0.78, + "grad_norm": 2.0214733625354664, + "learning_rate": 2.501885262665099e-06, + "loss": 0.1636, + "step": 15275 + }, + { + "epoch": 0.78, + "grad_norm": 1.5284937432774757, + "learning_rate": 2.500795645043994e-06, + "loss": 0.1556, + "step": 15276 + }, + { + "epoch": 0.78, + "grad_norm": 1.1343197681393309, + "learning_rate": 2.4997062308392304e-06, + "loss": 0.1648, + "step": 15277 + }, + { + "epoch": 0.78, + "grad_norm": 1.261112057405962, + "learning_rate": 2.498617020080356e-06, + "loss": 0.174, + "step": 15278 + }, + { + "epoch": 0.78, + "grad_norm": 1.3837587883415603, + "learning_rate": 2.4975280127969214e-06, + "loss": 0.1876, + "step": 15279 + }, + { + "epoch": 0.78, + "grad_norm": 0.9992224350374982, + "learning_rate": 2.496439209018461e-06, + "loss": 0.1656, + "step": 15280 + }, + { + "epoch": 0.78, + "grad_norm": 1.1779324775338382, + "learning_rate": 2.4953506087745107e-06, + "loss": 0.1844, + "step": 15281 + }, + { + "epoch": 0.78, + "grad_norm": 0.9112312928556008, + "learning_rate": 2.494262212094598e-06, + "loss": 0.1557, + "step": 15282 + }, + { + "epoch": 0.78, + "grad_norm": 0.9344392435604768, + "learning_rate": 2.4931740190082497e-06, + "loss": 0.1636, + "step": 15283 + }, + { + "epoch": 0.78, + "grad_norm": 0.8794078901771586, + "learning_rate": 2.4920860295449787e-06, + "loss": 0.1581, + "step": 15284 + }, + { + "epoch": 0.78, + "grad_norm": 1.134697289302735, + "learning_rate": 2.4909982437342993e-06, + "loss": 0.181, + "step": 15285 + }, + { + "epoch": 0.78, + "grad_norm": 1.0548278226712828, + "learning_rate": 2.4899106616057155e-06, + "loss": 0.1757, + "step": 15286 + }, + { + "epoch": 0.78, + "grad_norm": 9.676475877009159, + "learning_rate": 2.4888232831887304e-06, + "loss": 0.1624, + "step": 15287 + }, + { + "epoch": 0.78, + "grad_norm": 0.7828993584768532, + "learning_rate": 2.487736108512836e-06, + "loss": 0.1836, + "step": 15288 + }, + { + "epoch": 0.78, + "grad_norm": 1.020287114138541, + "learning_rate": 2.486649137607524e-06, + "loss": 0.1586, + "step": 15289 + }, + { + "epoch": 0.78, + "grad_norm": 1.2777417930524249, + "learning_rate": 2.485562370502279e-06, + "loss": 0.1821, + "step": 15290 + }, + { + "epoch": 0.78, + "grad_norm": 1.0818007196529444, + "learning_rate": 2.4844758072265806e-06, + "loss": 0.1737, + "step": 15291 + }, + { + "epoch": 0.78, + "grad_norm": 1.4445018677484833, + "learning_rate": 2.4833894478098983e-06, + "loss": 0.1685, + "step": 15292 + }, + { + "epoch": 0.78, + "grad_norm": 0.980705724826226, + "learning_rate": 2.4823032922817045e-06, + "loss": 0.1708, + "step": 15293 + }, + { + "epoch": 0.78, + "grad_norm": 0.9967433356885528, + "learning_rate": 2.481217340671457e-06, + "loss": 0.1614, + "step": 15294 + }, + { + "epoch": 0.78, + "grad_norm": 0.9265582135084066, + "learning_rate": 2.4801315930086147e-06, + "loss": 0.1866, + "step": 15295 + }, + { + "epoch": 0.78, + "grad_norm": 1.4209191915362267, + "learning_rate": 2.479046049322629e-06, + "loss": 0.1861, + "step": 15296 + }, + { + "epoch": 0.78, + "grad_norm": 0.8959157659091018, + "learning_rate": 2.4779607096429403e-06, + "loss": 0.1587, + "step": 15297 + }, + { + "epoch": 0.78, + "grad_norm": 1.0865709667746095, + "learning_rate": 2.4768755739989925e-06, + "loss": 0.1525, + "step": 15298 + }, + { + "epoch": 0.78, + "grad_norm": 0.992508724360173, + "learning_rate": 2.475790642420219e-06, + "loss": 0.134, + "step": 15299 + }, + { + "epoch": 0.78, + "grad_norm": 1.3318596943250995, + "learning_rate": 2.474705914936053e-06, + "loss": 0.1858, + "step": 15300 + }, + { + "epoch": 0.78, + "grad_norm": 0.9430394381293802, + "learning_rate": 2.473621391575911e-06, + "loss": 0.1697, + "step": 15301 + }, + { + "epoch": 0.78, + "grad_norm": 0.8667635085832349, + "learning_rate": 2.4725370723692164e-06, + "loss": 0.1514, + "step": 15302 + }, + { + "epoch": 0.78, + "grad_norm": 1.5783046632381894, + "learning_rate": 2.471452957345376e-06, + "loss": 0.166, + "step": 15303 + }, + { + "epoch": 0.78, + "grad_norm": 0.949053055901867, + "learning_rate": 2.4703690465338025e-06, + "loss": 0.1693, + "step": 15304 + }, + { + "epoch": 0.78, + "grad_norm": 1.099586983786691, + "learning_rate": 2.469285339963892e-06, + "loss": 0.1668, + "step": 15305 + }, + { + "epoch": 0.78, + "grad_norm": 1.146907604484705, + "learning_rate": 2.468201837665043e-06, + "loss": 0.1638, + "step": 15306 + }, + { + "epoch": 0.78, + "grad_norm": 1.6703895998487162, + "learning_rate": 2.467118539666643e-06, + "loss": 0.1559, + "step": 15307 + }, + { + "epoch": 0.78, + "grad_norm": 1.0693170607150535, + "learning_rate": 2.4660354459980775e-06, + "loss": 0.1422, + "step": 15308 + }, + { + "epoch": 0.78, + "grad_norm": 0.870622931041626, + "learning_rate": 2.4649525566887267e-06, + "loss": 0.1573, + "step": 15309 + }, + { + "epoch": 0.78, + "grad_norm": 1.496630248263848, + "learning_rate": 2.4638698717679653e-06, + "loss": 0.1695, + "step": 15310 + }, + { + "epoch": 0.78, + "grad_norm": 1.133075124349076, + "learning_rate": 2.462787391265157e-06, + "loss": 0.1695, + "step": 15311 + }, + { + "epoch": 0.78, + "grad_norm": 1.0358289180935836, + "learning_rate": 2.4617051152096696e-06, + "loss": 0.1568, + "step": 15312 + }, + { + "epoch": 0.78, + "grad_norm": 1.1225526395435053, + "learning_rate": 2.4606230436308554e-06, + "loss": 0.1659, + "step": 15313 + }, + { + "epoch": 0.78, + "grad_norm": 1.0468975380099528, + "learning_rate": 2.4595411765580645e-06, + "loss": 0.1858, + "step": 15314 + }, + { + "epoch": 0.78, + "grad_norm": 0.9816839310890371, + "learning_rate": 2.4584595140206457e-06, + "loss": 0.1598, + "step": 15315 + }, + { + "epoch": 0.78, + "grad_norm": 1.085821885409933, + "learning_rate": 2.4573780560479387e-06, + "loss": 0.1631, + "step": 15316 + }, + { + "epoch": 0.78, + "grad_norm": 1.0087642673161445, + "learning_rate": 2.4562968026692803e-06, + "loss": 0.1661, + "step": 15317 + }, + { + "epoch": 0.78, + "grad_norm": 1.0371578322321766, + "learning_rate": 2.4552157539139944e-06, + "loss": 0.1793, + "step": 15318 + }, + { + "epoch": 0.78, + "grad_norm": 0.9130220149814539, + "learning_rate": 2.45413490981141e-06, + "loss": 0.171, + "step": 15319 + }, + { + "epoch": 0.78, + "grad_norm": 1.290014807266559, + "learning_rate": 2.45305427039084e-06, + "loss": 0.1791, + "step": 15320 + }, + { + "epoch": 0.78, + "grad_norm": 0.9635165312055891, + "learning_rate": 2.4519738356816015e-06, + "loss": 0.1666, + "step": 15321 + }, + { + "epoch": 0.78, + "grad_norm": 1.2303516824006075, + "learning_rate": 2.450893605712996e-06, + "loss": 0.1817, + "step": 15322 + }, + { + "epoch": 0.78, + "grad_norm": 1.3058539158515092, + "learning_rate": 2.449813580514332e-06, + "loss": 0.1724, + "step": 15323 + }, + { + "epoch": 0.78, + "grad_norm": 0.9524281611944755, + "learning_rate": 2.4487337601148975e-06, + "loss": 0.1874, + "step": 15324 + }, + { + "epoch": 0.78, + "grad_norm": 1.073158181893844, + "learning_rate": 2.447654144543986e-06, + "loss": 0.1632, + "step": 15325 + }, + { + "epoch": 0.78, + "grad_norm": 0.9513488743035756, + "learning_rate": 2.446574733830882e-06, + "loss": 0.1662, + "step": 15326 + }, + { + "epoch": 0.78, + "grad_norm": 0.8947551127078637, + "learning_rate": 2.4454955280048688e-06, + "loss": 0.1641, + "step": 15327 + }, + { + "epoch": 0.78, + "grad_norm": 0.8500710551120902, + "learning_rate": 2.4444165270952126e-06, + "loss": 0.1786, + "step": 15328 + }, + { + "epoch": 0.78, + "grad_norm": 1.6713358847496242, + "learning_rate": 2.4433377311311878e-06, + "loss": 0.1814, + "step": 15329 + }, + { + "epoch": 0.78, + "grad_norm": 1.3139186587490201, + "learning_rate": 2.4422591401420537e-06, + "loss": 0.1675, + "step": 15330 + }, + { + "epoch": 0.78, + "grad_norm": 1.3539006655141135, + "learning_rate": 2.4411807541570643e-06, + "loss": 0.198, + "step": 15331 + }, + { + "epoch": 0.78, + "grad_norm": 0.8974215267445775, + "learning_rate": 2.440102573205477e-06, + "loss": 0.1586, + "step": 15332 + }, + { + "epoch": 0.78, + "grad_norm": 1.1506955324100798, + "learning_rate": 2.4390245973165316e-06, + "loss": 0.1703, + "step": 15333 + }, + { + "epoch": 0.78, + "grad_norm": 1.2501650244300122, + "learning_rate": 2.4379468265194707e-06, + "loss": 0.1611, + "step": 15334 + }, + { + "epoch": 0.78, + "grad_norm": 1.033876510720954, + "learning_rate": 2.4368692608435294e-06, + "loss": 0.1792, + "step": 15335 + }, + { + "epoch": 0.78, + "grad_norm": 0.7951661193570492, + "learning_rate": 2.4357919003179396e-06, + "loss": 0.1471, + "step": 15336 + }, + { + "epoch": 0.78, + "grad_norm": 0.9520345764506051, + "learning_rate": 2.434714744971919e-06, + "loss": 0.1776, + "step": 15337 + }, + { + "epoch": 0.78, + "grad_norm": 1.0833822675819067, + "learning_rate": 2.4336377948346912e-06, + "loss": 0.1885, + "step": 15338 + }, + { + "epoch": 0.78, + "grad_norm": 2.1029213485645695, + "learning_rate": 2.432561049935462e-06, + "loss": 0.1834, + "step": 15339 + }, + { + "epoch": 0.78, + "grad_norm": 1.4361044589948972, + "learning_rate": 2.4314845103034456e-06, + "loss": 0.176, + "step": 15340 + }, + { + "epoch": 0.78, + "grad_norm": 0.8442285067210624, + "learning_rate": 2.4304081759678357e-06, + "loss": 0.1595, + "step": 15341 + }, + { + "epoch": 0.78, + "grad_norm": 0.9579393533005156, + "learning_rate": 2.429332046957832e-06, + "loss": 0.1709, + "step": 15342 + }, + { + "epoch": 0.78, + "grad_norm": 1.9511031711141813, + "learning_rate": 2.4282561233026236e-06, + "loss": 0.2082, + "step": 15343 + }, + { + "epoch": 0.78, + "grad_norm": 1.189125025522632, + "learning_rate": 2.4271804050313984e-06, + "loss": 0.1759, + "step": 15344 + }, + { + "epoch": 0.78, + "grad_norm": 0.8696093166445392, + "learning_rate": 2.42610489217333e-06, + "loss": 0.153, + "step": 15345 + }, + { + "epoch": 0.78, + "grad_norm": 0.7661160288872305, + "learning_rate": 2.4250295847575967e-06, + "loss": 0.1701, + "step": 15346 + }, + { + "epoch": 0.78, + "grad_norm": 1.0094608338108795, + "learning_rate": 2.4239544828133632e-06, + "loss": 0.1876, + "step": 15347 + }, + { + "epoch": 0.78, + "grad_norm": 1.1947365085635253, + "learning_rate": 2.422879586369791e-06, + "loss": 0.172, + "step": 15348 + }, + { + "epoch": 0.78, + "grad_norm": 1.02607706559597, + "learning_rate": 2.421804895456039e-06, + "loss": 0.1691, + "step": 15349 + }, + { + "epoch": 0.78, + "grad_norm": 1.1105200321284465, + "learning_rate": 2.420730410101255e-06, + "loss": 0.1751, + "step": 15350 + }, + { + "epoch": 0.78, + "grad_norm": 2.1219152032536965, + "learning_rate": 2.419656130334588e-06, + "loss": 0.1913, + "step": 15351 + }, + { + "epoch": 0.78, + "grad_norm": 1.4192179540898853, + "learning_rate": 2.4185820561851747e-06, + "loss": 0.1603, + "step": 15352 + }, + { + "epoch": 0.78, + "grad_norm": 1.6876340282880435, + "learning_rate": 2.417508187682156e-06, + "loss": 0.1619, + "step": 15353 + }, + { + "epoch": 0.78, + "grad_norm": 0.9298730731346014, + "learning_rate": 2.4164345248546517e-06, + "loss": 0.1627, + "step": 15354 + }, + { + "epoch": 0.78, + "grad_norm": 1.0761477246483009, + "learning_rate": 2.415361067731793e-06, + "loss": 0.1488, + "step": 15355 + }, + { + "epoch": 0.78, + "grad_norm": 0.9804762037101074, + "learning_rate": 2.41428781634269e-06, + "loss": 0.1548, + "step": 15356 + }, + { + "epoch": 0.78, + "grad_norm": 1.0872119575860812, + "learning_rate": 2.413214770716462e-06, + "loss": 0.1627, + "step": 15357 + }, + { + "epoch": 0.78, + "grad_norm": 0.9356716591209069, + "learning_rate": 2.412141930882208e-06, + "loss": 0.1662, + "step": 15358 + }, + { + "epoch": 0.78, + "grad_norm": 1.5485828606207372, + "learning_rate": 2.4110692968690364e-06, + "loss": 0.1539, + "step": 15359 + }, + { + "epoch": 0.78, + "grad_norm": 1.2746909556741446, + "learning_rate": 2.409996868706036e-06, + "loss": 0.1875, + "step": 15360 + }, + { + "epoch": 0.78, + "grad_norm": 1.020526020568585, + "learning_rate": 2.4089246464222995e-06, + "loss": 0.1603, + "step": 15361 + }, + { + "epoch": 0.78, + "grad_norm": 1.1005090547533583, + "learning_rate": 2.4078526300469097e-06, + "loss": 0.1756, + "step": 15362 + }, + { + "epoch": 0.78, + "grad_norm": 1.0681842201569682, + "learning_rate": 2.4067808196089493e-06, + "loss": 0.1727, + "step": 15363 + }, + { + "epoch": 0.78, + "grad_norm": 1.3422597996378893, + "learning_rate": 2.4057092151374885e-06, + "loss": 0.1712, + "step": 15364 + }, + { + "epoch": 0.78, + "grad_norm": 0.9132433065779902, + "learning_rate": 2.40463781666159e-06, + "loss": 0.1484, + "step": 15365 + }, + { + "epoch": 0.78, + "grad_norm": 1.0703530978743458, + "learning_rate": 2.403566624210324e-06, + "loss": 0.1645, + "step": 15366 + }, + { + "epoch": 0.78, + "grad_norm": 1.0893874090214328, + "learning_rate": 2.4024956378127396e-06, + "loss": 0.1551, + "step": 15367 + }, + { + "epoch": 0.78, + "grad_norm": 0.896536719043762, + "learning_rate": 2.401424857497889e-06, + "loss": 0.1692, + "step": 15368 + }, + { + "epoch": 0.78, + "grad_norm": 1.1534646592762958, + "learning_rate": 2.400354283294819e-06, + "loss": 0.1708, + "step": 15369 + }, + { + "epoch": 0.78, + "grad_norm": 1.20144166772448, + "learning_rate": 2.399283915232571e-06, + "loss": 0.1728, + "step": 15370 + }, + { + "epoch": 0.78, + "grad_norm": 0.8001018929783066, + "learning_rate": 2.398213753340174e-06, + "loss": 0.159, + "step": 15371 + }, + { + "epoch": 0.78, + "grad_norm": 1.1613880996230987, + "learning_rate": 2.3971437976466604e-06, + "loss": 0.1494, + "step": 15372 + }, + { + "epoch": 0.78, + "grad_norm": 1.0741129775209934, + "learning_rate": 2.3960740481810475e-06, + "loss": 0.1668, + "step": 15373 + }, + { + "epoch": 0.78, + "grad_norm": 0.998500376985148, + "learning_rate": 2.3950045049723593e-06, + "loss": 0.1767, + "step": 15374 + }, + { + "epoch": 0.78, + "grad_norm": 4.446899401008942, + "learning_rate": 2.3939351680495994e-06, + "loss": 0.1734, + "step": 15375 + }, + { + "epoch": 0.78, + "grad_norm": 0.9632523661077028, + "learning_rate": 2.392866037441781e-06, + "loss": 0.1463, + "step": 15376 + }, + { + "epoch": 0.78, + "grad_norm": 1.0273977255365927, + "learning_rate": 2.3917971131778982e-06, + "loss": 0.1737, + "step": 15377 + }, + { + "epoch": 0.78, + "grad_norm": 1.2251126007220232, + "learning_rate": 2.3907283952869485e-06, + "loss": 0.1656, + "step": 15378 + }, + { + "epoch": 0.78, + "grad_norm": 1.41623658769232, + "learning_rate": 2.389659883797921e-06, + "loss": 0.1625, + "step": 15379 + }, + { + "epoch": 0.78, + "grad_norm": 1.2970703730778923, + "learning_rate": 2.3885915787398016e-06, + "loss": 0.1631, + "step": 15380 + }, + { + "epoch": 0.78, + "grad_norm": 1.010114637458024, + "learning_rate": 2.3875234801415626e-06, + "loss": 0.1665, + "step": 15381 + }, + { + "epoch": 0.78, + "grad_norm": 1.2164200120028743, + "learning_rate": 2.3864555880321828e-06, + "loss": 0.1599, + "step": 15382 + }, + { + "epoch": 0.78, + "grad_norm": 1.4672333643271673, + "learning_rate": 2.3853879024406244e-06, + "loss": 0.1615, + "step": 15383 + }, + { + "epoch": 0.78, + "grad_norm": 1.120252041381067, + "learning_rate": 2.3843204233958463e-06, + "loss": 0.1483, + "step": 15384 + }, + { + "epoch": 0.78, + "grad_norm": 1.0045893183847037, + "learning_rate": 2.3832531509268076e-06, + "loss": 0.1928, + "step": 15385 + }, + { + "epoch": 0.78, + "grad_norm": 1.1425096056549606, + "learning_rate": 2.382186085062457e-06, + "loss": 0.1675, + "step": 15386 + }, + { + "epoch": 0.78, + "grad_norm": 1.3341631701663308, + "learning_rate": 2.3811192258317416e-06, + "loss": 0.1693, + "step": 15387 + }, + { + "epoch": 0.78, + "grad_norm": 1.257601081796507, + "learning_rate": 2.3800525732635946e-06, + "loss": 0.1686, + "step": 15388 + }, + { + "epoch": 0.78, + "grad_norm": 1.000073423100724, + "learning_rate": 2.3789861273869553e-06, + "loss": 0.1693, + "step": 15389 + }, + { + "epoch": 0.78, + "grad_norm": 1.0257732649799796, + "learning_rate": 2.3779198882307443e-06, + "loss": 0.1654, + "step": 15390 + }, + { + "epoch": 0.78, + "grad_norm": 1.5535326206303897, + "learning_rate": 2.3768538558238895e-06, + "loss": 0.1524, + "step": 15391 + }, + { + "epoch": 0.78, + "grad_norm": 1.2975656393098935, + "learning_rate": 2.375788030195303e-06, + "loss": 0.1834, + "step": 15392 + }, + { + "epoch": 0.78, + "grad_norm": 2.3739833482123904, + "learning_rate": 2.3747224113738985e-06, + "loss": 0.146, + "step": 15393 + }, + { + "epoch": 0.78, + "grad_norm": 0.9476331047559899, + "learning_rate": 2.373656999388576e-06, + "loss": 0.1535, + "step": 15394 + }, + { + "epoch": 0.78, + "grad_norm": 1.4355877710988458, + "learning_rate": 2.3725917942682397e-06, + "loss": 0.1889, + "step": 15395 + }, + { + "epoch": 0.78, + "grad_norm": 0.9183881432091104, + "learning_rate": 2.3715267960417798e-06, + "loss": 0.1549, + "step": 15396 + }, + { + "epoch": 0.78, + "grad_norm": 1.3000475050786378, + "learning_rate": 2.370462004738091e-06, + "loss": 0.1591, + "step": 15397 + }, + { + "epoch": 0.78, + "grad_norm": 0.8743533785508302, + "learning_rate": 2.3693974203860472e-06, + "loss": 0.1637, + "step": 15398 + }, + { + "epoch": 0.78, + "grad_norm": 1.416431709572786, + "learning_rate": 2.3683330430145333e-06, + "loss": 0.1899, + "step": 15399 + }, + { + "epoch": 0.78, + "grad_norm": 1.5467261188083323, + "learning_rate": 2.367268872652416e-06, + "loss": 0.156, + "step": 15400 + }, + { + "epoch": 0.78, + "grad_norm": 0.9294656354998612, + "learning_rate": 2.366204909328559e-06, + "loss": 0.1539, + "step": 15401 + }, + { + "epoch": 0.78, + "grad_norm": 0.8225038847893258, + "learning_rate": 2.3651411530718272e-06, + "loss": 0.1698, + "step": 15402 + }, + { + "epoch": 0.78, + "grad_norm": 1.262942076782041, + "learning_rate": 2.36407760391107e-06, + "loss": 0.1768, + "step": 15403 + }, + { + "epoch": 0.78, + "grad_norm": 1.0086611381985109, + "learning_rate": 2.3630142618751405e-06, + "loss": 0.1811, + "step": 15404 + }, + { + "epoch": 0.78, + "grad_norm": 0.9710681600261067, + "learning_rate": 2.3619511269928784e-06, + "loss": 0.1837, + "step": 15405 + }, + { + "epoch": 0.78, + "grad_norm": 1.3854709074678964, + "learning_rate": 2.360888199293128e-06, + "loss": 0.1789, + "step": 15406 + }, + { + "epoch": 0.78, + "grad_norm": 0.9299119097883016, + "learning_rate": 2.3598254788047136e-06, + "loss": 0.1539, + "step": 15407 + }, + { + "epoch": 0.78, + "grad_norm": 2.011282534568099, + "learning_rate": 2.358762965556467e-06, + "loss": 0.1656, + "step": 15408 + }, + { + "epoch": 0.78, + "grad_norm": 1.2551793735265222, + "learning_rate": 2.3577006595772032e-06, + "loss": 0.1673, + "step": 15409 + }, + { + "epoch": 0.78, + "grad_norm": 1.0943491917388624, + "learning_rate": 2.3566385608957443e-06, + "loss": 0.1731, + "step": 15410 + }, + { + "epoch": 0.78, + "grad_norm": 1.3107629112380317, + "learning_rate": 2.355576669540893e-06, + "loss": 0.1575, + "step": 15411 + }, + { + "epoch": 0.78, + "grad_norm": 0.8515648899316381, + "learning_rate": 2.354514985541456e-06, + "loss": 0.1599, + "step": 15412 + }, + { + "epoch": 0.78, + "grad_norm": 1.293967723582203, + "learning_rate": 2.353453508926232e-06, + "loss": 0.1566, + "step": 15413 + }, + { + "epoch": 0.78, + "grad_norm": 0.9095448026517428, + "learning_rate": 2.3523922397240163e-06, + "loss": 0.1534, + "step": 15414 + }, + { + "epoch": 0.78, + "grad_norm": 0.8032411398733617, + "learning_rate": 2.3513311779635904e-06, + "loss": 0.1514, + "step": 15415 + }, + { + "epoch": 0.78, + "grad_norm": 1.1203727977935276, + "learning_rate": 2.3502703236737412e-06, + "loss": 0.1597, + "step": 15416 + }, + { + "epoch": 0.78, + "grad_norm": 1.3115090337752493, + "learning_rate": 2.3492096768832417e-06, + "loss": 0.1593, + "step": 15417 + }, + { + "epoch": 0.78, + "grad_norm": 1.154189297651346, + "learning_rate": 2.348149237620858e-06, + "loss": 0.1696, + "step": 15418 + }, + { + "epoch": 0.78, + "grad_norm": 1.738661265681747, + "learning_rate": 2.3470890059153616e-06, + "loss": 0.16, + "step": 15419 + }, + { + "epoch": 0.78, + "grad_norm": 0.8918477045125891, + "learning_rate": 2.3460289817955063e-06, + "loss": 0.154, + "step": 15420 + }, + { + "epoch": 0.78, + "grad_norm": 1.7360933710102129, + "learning_rate": 2.3449691652900464e-06, + "loss": 0.1658, + "step": 15421 + }, + { + "epoch": 0.78, + "grad_norm": 1.0560603543571885, + "learning_rate": 2.3439095564277305e-06, + "loss": 0.1629, + "step": 15422 + }, + { + "epoch": 0.78, + "grad_norm": 1.9505114766577443, + "learning_rate": 2.342850155237303e-06, + "loss": 0.1818, + "step": 15423 + }, + { + "epoch": 0.78, + "grad_norm": 1.0034429408534218, + "learning_rate": 2.341790961747494e-06, + "loss": 0.1752, + "step": 15424 + }, + { + "epoch": 0.78, + "grad_norm": 2.088320359735791, + "learning_rate": 2.340731975987042e-06, + "loss": 0.1705, + "step": 15425 + }, + { + "epoch": 0.78, + "grad_norm": 0.9125680830719255, + "learning_rate": 2.3396731979846634e-06, + "loss": 0.1506, + "step": 15426 + }, + { + "epoch": 0.78, + "grad_norm": 0.830417739612693, + "learning_rate": 2.3386146277690858e-06, + "loss": 0.1519, + "step": 15427 + }, + { + "epoch": 0.78, + "grad_norm": 0.849114943252066, + "learning_rate": 2.3375562653690166e-06, + "loss": 0.1603, + "step": 15428 + }, + { + "epoch": 0.78, + "grad_norm": 1.3629818306238608, + "learning_rate": 2.336498110813168e-06, + "loss": 0.1476, + "step": 15429 + }, + { + "epoch": 0.78, + "grad_norm": 0.823546883376828, + "learning_rate": 2.3354401641302395e-06, + "loss": 0.1539, + "step": 15430 + }, + { + "epoch": 0.78, + "grad_norm": 0.978023459235194, + "learning_rate": 2.3343824253489277e-06, + "loss": 0.154, + "step": 15431 + }, + { + "epoch": 0.78, + "grad_norm": 1.0018849611636187, + "learning_rate": 2.333324894497927e-06, + "loss": 0.1513, + "step": 15432 + }, + { + "epoch": 0.78, + "grad_norm": 0.9875086947887675, + "learning_rate": 2.332267571605924e-06, + "loss": 0.1709, + "step": 15433 + }, + { + "epoch": 0.78, + "grad_norm": 1.5938673758138624, + "learning_rate": 2.331210456701597e-06, + "loss": 0.1562, + "step": 15434 + }, + { + "epoch": 0.78, + "grad_norm": 1.4053685858854916, + "learning_rate": 2.330153549813615e-06, + "loss": 0.1774, + "step": 15435 + }, + { + "epoch": 0.78, + "grad_norm": 4.934543246171554, + "learning_rate": 2.329096850970656e-06, + "loss": 0.1549, + "step": 15436 + }, + { + "epoch": 0.78, + "grad_norm": 1.3750873777918644, + "learning_rate": 2.3280403602013735e-06, + "loss": 0.1707, + "step": 15437 + }, + { + "epoch": 0.79, + "grad_norm": 0.974035012887203, + "learning_rate": 2.326984077534431e-06, + "loss": 0.1733, + "step": 15438 + }, + { + "epoch": 0.79, + "grad_norm": 1.003644320415384, + "learning_rate": 2.3259280029984775e-06, + "loss": 0.1602, + "step": 15439 + }, + { + "epoch": 0.79, + "grad_norm": 3.3928202333175244, + "learning_rate": 2.324872136622164e-06, + "loss": 0.1667, + "step": 15440 + }, + { + "epoch": 0.79, + "grad_norm": 1.158804952368038, + "learning_rate": 2.3238164784341242e-06, + "loss": 0.1555, + "step": 15441 + }, + { + "epoch": 0.79, + "grad_norm": 0.9716905368598698, + "learning_rate": 2.3227610284629985e-06, + "loss": 0.1649, + "step": 15442 + }, + { + "epoch": 0.79, + "grad_norm": 1.4020372115262854, + "learning_rate": 2.3217057867374114e-06, + "loss": 0.1764, + "step": 15443 + }, + { + "epoch": 0.79, + "grad_norm": 1.0717520983846267, + "learning_rate": 2.32065075328599e-06, + "loss": 0.1576, + "step": 15444 + }, + { + "epoch": 0.79, + "grad_norm": 1.1663936158767128, + "learning_rate": 2.319595928137349e-06, + "loss": 0.1636, + "step": 15445 + }, + { + "epoch": 0.79, + "grad_norm": 1.1732575843595243, + "learning_rate": 2.318541311320105e-06, + "loss": 0.1934, + "step": 15446 + }, + { + "epoch": 0.79, + "grad_norm": 0.9486028250086229, + "learning_rate": 2.317486902862859e-06, + "loss": 0.1571, + "step": 15447 + }, + { + "epoch": 0.79, + "grad_norm": 1.1498421923735032, + "learning_rate": 2.3164327027942147e-06, + "loss": 0.1772, + "step": 15448 + }, + { + "epoch": 0.79, + "grad_norm": 0.9350401407713013, + "learning_rate": 2.3153787111427673e-06, + "loss": 0.1614, + "step": 15449 + }, + { + "epoch": 0.79, + "grad_norm": 1.5593140843812545, + "learning_rate": 2.3143249279371085e-06, + "loss": 0.1494, + "step": 15450 + }, + { + "epoch": 0.79, + "grad_norm": 1.3508993582556552, + "learning_rate": 2.313271353205818e-06, + "loss": 0.1521, + "step": 15451 + }, + { + "epoch": 0.79, + "grad_norm": 0.787726912758565, + "learning_rate": 2.3122179869774784e-06, + "loss": 0.1542, + "step": 15452 + }, + { + "epoch": 0.79, + "grad_norm": 0.8663480465259822, + "learning_rate": 2.311164829280661e-06, + "loss": 0.1509, + "step": 15453 + }, + { + "epoch": 0.79, + "grad_norm": 1.2976235796846125, + "learning_rate": 2.3101118801439283e-06, + "loss": 0.1611, + "step": 15454 + }, + { + "epoch": 0.79, + "grad_norm": 0.6973010981091752, + "learning_rate": 2.3090591395958485e-06, + "loss": 0.1373, + "step": 15455 + }, + { + "epoch": 0.79, + "grad_norm": 1.1117049975822124, + "learning_rate": 2.3080066076649697e-06, + "loss": 0.1694, + "step": 15456 + }, + { + "epoch": 0.79, + "grad_norm": 1.1801660374439589, + "learning_rate": 2.3069542843798476e-06, + "loss": 0.1734, + "step": 15457 + }, + { + "epoch": 0.79, + "grad_norm": 0.9972892773542359, + "learning_rate": 2.3059021697690254e-06, + "loss": 0.1578, + "step": 15458 + }, + { + "epoch": 0.79, + "grad_norm": 0.9699548310098502, + "learning_rate": 2.3048502638610427e-06, + "loss": 0.164, + "step": 15459 + }, + { + "epoch": 0.79, + "grad_norm": 0.9651201995600788, + "learning_rate": 2.3037985666844297e-06, + "loss": 0.1616, + "step": 15460 + }, + { + "epoch": 0.79, + "grad_norm": 0.9133523025137816, + "learning_rate": 2.3027470782677173e-06, + "loss": 0.1678, + "step": 15461 + }, + { + "epoch": 0.79, + "grad_norm": 1.0927839593337292, + "learning_rate": 2.3016957986394228e-06, + "loss": 0.17, + "step": 15462 + }, + { + "epoch": 0.79, + "grad_norm": 0.8773806402542051, + "learning_rate": 2.3006447278280676e-06, + "loss": 0.1647, + "step": 15463 + }, + { + "epoch": 0.79, + "grad_norm": 1.3811845661656832, + "learning_rate": 2.299593865862155e-06, + "loss": 0.1737, + "step": 15464 + }, + { + "epoch": 0.79, + "grad_norm": 0.9285844656456148, + "learning_rate": 2.2985432127701945e-06, + "loss": 0.1498, + "step": 15465 + }, + { + "epoch": 0.79, + "grad_norm": 0.8726490367071329, + "learning_rate": 2.2974927685806848e-06, + "loss": 0.1588, + "step": 15466 + }, + { + "epoch": 0.79, + "grad_norm": 0.927809791960326, + "learning_rate": 2.296442533322121e-06, + "loss": 0.1699, + "step": 15467 + }, + { + "epoch": 0.79, + "grad_norm": 0.9565631359875368, + "learning_rate": 2.2953925070229865e-06, + "loss": 0.1768, + "step": 15468 + }, + { + "epoch": 0.79, + "grad_norm": 1.552914352956403, + "learning_rate": 2.2943426897117672e-06, + "loss": 0.1757, + "step": 15469 + }, + { + "epoch": 0.79, + "grad_norm": 1.2338495932171418, + "learning_rate": 2.2932930814169383e-06, + "loss": 0.1785, + "step": 15470 + }, + { + "epoch": 0.79, + "grad_norm": 1.3102501076279853, + "learning_rate": 2.292243682166967e-06, + "loss": 0.16, + "step": 15471 + }, + { + "epoch": 0.79, + "grad_norm": 1.131636846718695, + "learning_rate": 2.291194491990324e-06, + "loss": 0.1681, + "step": 15472 + }, + { + "epoch": 0.79, + "grad_norm": 1.0837521584107503, + "learning_rate": 2.2901455109154626e-06, + "loss": 0.1696, + "step": 15473 + }, + { + "epoch": 0.79, + "grad_norm": 1.1508754181295, + "learning_rate": 2.2890967389708396e-06, + "loss": 0.1715, + "step": 15474 + }, + { + "epoch": 0.79, + "grad_norm": 0.9192566541574264, + "learning_rate": 2.2880481761849037e-06, + "loss": 0.1358, + "step": 15475 + }, + { + "epoch": 0.79, + "grad_norm": 1.054010923229116, + "learning_rate": 2.286999822586099e-06, + "loss": 0.1643, + "step": 15476 + }, + { + "epoch": 0.79, + "grad_norm": 0.9484038897437859, + "learning_rate": 2.285951678202857e-06, + "loss": 0.1605, + "step": 15477 + }, + { + "epoch": 0.79, + "grad_norm": 0.8489375942887908, + "learning_rate": 2.2849037430636135e-06, + "loss": 0.1565, + "step": 15478 + }, + { + "epoch": 0.79, + "grad_norm": 1.4204227390337094, + "learning_rate": 2.2838560171967906e-06, + "loss": 0.1644, + "step": 15479 + }, + { + "epoch": 0.79, + "grad_norm": 0.9216879840577646, + "learning_rate": 2.28280850063081e-06, + "loss": 0.1564, + "step": 15480 + }, + { + "epoch": 0.79, + "grad_norm": 0.9092868562804686, + "learning_rate": 2.281761193394083e-06, + "loss": 0.1472, + "step": 15481 + }, + { + "epoch": 0.79, + "grad_norm": 1.920977308430054, + "learning_rate": 2.2807140955150198e-06, + "loss": 0.1597, + "step": 15482 + }, + { + "epoch": 0.79, + "grad_norm": 0.9738559801843604, + "learning_rate": 2.2796672070220217e-06, + "loss": 0.1674, + "step": 15483 + }, + { + "epoch": 0.79, + "grad_norm": 1.3205100337184514, + "learning_rate": 2.27862052794349e-06, + "loss": 0.1505, + "step": 15484 + }, + { + "epoch": 0.79, + "grad_norm": 1.0861368779297695, + "learning_rate": 2.27757405830781e-06, + "loss": 0.1811, + "step": 15485 + }, + { + "epoch": 0.79, + "grad_norm": 1.8969086125838965, + "learning_rate": 2.276527798143372e-06, + "loss": 0.1669, + "step": 15486 + }, + { + "epoch": 0.79, + "grad_norm": 0.9619151407615854, + "learning_rate": 2.275481747478554e-06, + "loss": 0.1779, + "step": 15487 + }, + { + "epoch": 0.79, + "grad_norm": 1.2675080890008574, + "learning_rate": 2.2744359063417276e-06, + "loss": 0.1436, + "step": 15488 + }, + { + "epoch": 0.79, + "grad_norm": 0.945079506316101, + "learning_rate": 2.2733902747612656e-06, + "loss": 0.1646, + "step": 15489 + }, + { + "epoch": 0.79, + "grad_norm": 2.6382623359251274, + "learning_rate": 2.2723448527655267e-06, + "loss": 0.1597, + "step": 15490 + }, + { + "epoch": 0.79, + "grad_norm": 2.530052373076095, + "learning_rate": 2.27129964038287e-06, + "loss": 0.1808, + "step": 15491 + }, + { + "epoch": 0.79, + "grad_norm": 3.0969313221348065, + "learning_rate": 2.2702546376416467e-06, + "loss": 0.1987, + "step": 15492 + }, + { + "epoch": 0.79, + "grad_norm": 0.909649025671053, + "learning_rate": 2.269209844570206e-06, + "loss": 0.1699, + "step": 15493 + }, + { + "epoch": 0.79, + "grad_norm": 1.3532387603007856, + "learning_rate": 2.268165261196882e-06, + "loss": 0.1581, + "step": 15494 + }, + { + "epoch": 0.79, + "grad_norm": 0.7315958100266694, + "learning_rate": 2.267120887550015e-06, + "loss": 0.16, + "step": 15495 + }, + { + "epoch": 0.79, + "grad_norm": 0.9825168423437397, + "learning_rate": 2.2660767236579275e-06, + "loss": 0.1693, + "step": 15496 + }, + { + "epoch": 0.79, + "grad_norm": 0.8804275441419434, + "learning_rate": 2.265032769548948e-06, + "loss": 0.1865, + "step": 15497 + }, + { + "epoch": 0.79, + "grad_norm": 1.3730261395856402, + "learning_rate": 2.26398902525139e-06, + "loss": 0.1503, + "step": 15498 + }, + { + "epoch": 0.79, + "grad_norm": 1.0981105723529896, + "learning_rate": 2.2629454907935687e-06, + "loss": 0.1803, + "step": 15499 + }, + { + "epoch": 0.79, + "grad_norm": 1.1567464441134347, + "learning_rate": 2.2619021662037855e-06, + "loss": 0.1713, + "step": 15500 + }, + { + "epoch": 0.79, + "grad_norm": 1.029235331388223, + "learning_rate": 2.2608590515103425e-06, + "loss": 0.1663, + "step": 15501 + }, + { + "epoch": 0.79, + "grad_norm": 2.2843708418108912, + "learning_rate": 2.2598161467415357e-06, + "loss": 0.1569, + "step": 15502 + }, + { + "epoch": 0.79, + "grad_norm": 1.1181951782321329, + "learning_rate": 2.2587734519256556e-06, + "loss": 0.1742, + "step": 15503 + }, + { + "epoch": 0.79, + "grad_norm": 1.0213400485525141, + "learning_rate": 2.257730967090982e-06, + "loss": 0.1643, + "step": 15504 + }, + { + "epoch": 0.79, + "grad_norm": 1.5573104906756543, + "learning_rate": 2.2566886922657917e-06, + "loss": 0.1647, + "step": 15505 + }, + { + "epoch": 0.79, + "grad_norm": 0.8940242870876924, + "learning_rate": 2.2556466274783596e-06, + "loss": 0.1733, + "step": 15506 + }, + { + "epoch": 0.79, + "grad_norm": 1.2743541925464599, + "learning_rate": 2.2546047727569475e-06, + "loss": 0.1742, + "step": 15507 + }, + { + "epoch": 0.79, + "grad_norm": 0.939656053009, + "learning_rate": 2.253563128129819e-06, + "loss": 0.1604, + "step": 15508 + }, + { + "epoch": 0.79, + "grad_norm": 1.5790346633369725, + "learning_rate": 2.252521693625228e-06, + "loss": 0.1426, + "step": 15509 + }, + { + "epoch": 0.79, + "grad_norm": 1.7542193003523812, + "learning_rate": 2.2514804692714264e-06, + "loss": 0.1828, + "step": 15510 + }, + { + "epoch": 0.79, + "grad_norm": 1.0661220233266782, + "learning_rate": 2.2504394550966513e-06, + "loss": 0.1616, + "step": 15511 + }, + { + "epoch": 0.79, + "grad_norm": 1.2734750540730406, + "learning_rate": 2.249398651129148e-06, + "loss": 0.1979, + "step": 15512 + }, + { + "epoch": 0.79, + "grad_norm": 1.3328836438750213, + "learning_rate": 2.2483580573971396e-06, + "loss": 0.1572, + "step": 15513 + }, + { + "epoch": 0.79, + "grad_norm": 1.0606100818038617, + "learning_rate": 2.2473176739288603e-06, + "loss": 0.1738, + "step": 15514 + }, + { + "epoch": 0.79, + "grad_norm": 1.0724834883126513, + "learning_rate": 2.246277500752524e-06, + "loss": 0.1526, + "step": 15515 + }, + { + "epoch": 0.79, + "grad_norm": 1.0180285929417798, + "learning_rate": 2.245237537896351e-06, + "loss": 0.1372, + "step": 15516 + }, + { + "epoch": 0.79, + "grad_norm": 1.3299070440761946, + "learning_rate": 2.2441977853885454e-06, + "loss": 0.1568, + "step": 15517 + }, + { + "epoch": 0.79, + "grad_norm": 1.8112916247301079, + "learning_rate": 2.2431582432573127e-06, + "loss": 0.1701, + "step": 15518 + }, + { + "epoch": 0.79, + "grad_norm": 1.9341991066548048, + "learning_rate": 2.2421189115308506e-06, + "loss": 0.1682, + "step": 15519 + }, + { + "epoch": 0.79, + "grad_norm": 0.9881390734593312, + "learning_rate": 2.241079790237355e-06, + "loss": 0.1502, + "step": 15520 + }, + { + "epoch": 0.79, + "grad_norm": 1.0181825157115165, + "learning_rate": 2.2400408794050045e-06, + "loss": 0.1626, + "step": 15521 + }, + { + "epoch": 0.79, + "grad_norm": 1.4345164270616522, + "learning_rate": 2.2390021790619863e-06, + "loss": 0.1739, + "step": 15522 + }, + { + "epoch": 0.79, + "grad_norm": 0.9341993239393256, + "learning_rate": 2.237963689236472e-06, + "loss": 0.1921, + "step": 15523 + }, + { + "epoch": 0.79, + "grad_norm": 0.9828588397695067, + "learning_rate": 2.2369254099566283e-06, + "loss": 0.1603, + "step": 15524 + }, + { + "epoch": 0.79, + "grad_norm": 1.2853311671086198, + "learning_rate": 2.2358873412506254e-06, + "loss": 0.1998, + "step": 15525 + }, + { + "epoch": 0.79, + "grad_norm": 0.9950469959633863, + "learning_rate": 2.2348494831466127e-06, + "loss": 0.1758, + "step": 15526 + }, + { + "epoch": 0.79, + "grad_norm": 1.3029094611737277, + "learning_rate": 2.2338118356727466e-06, + "loss": 0.1748, + "step": 15527 + }, + { + "epoch": 0.79, + "grad_norm": 0.810663812794333, + "learning_rate": 2.2327743988571738e-06, + "loss": 0.1567, + "step": 15528 + }, + { + "epoch": 0.79, + "grad_norm": 0.9306164030658635, + "learning_rate": 2.231737172728037e-06, + "loss": 0.1562, + "step": 15529 + }, + { + "epoch": 0.79, + "grad_norm": 1.5012657439900254, + "learning_rate": 2.2307001573134646e-06, + "loss": 0.1914, + "step": 15530 + }, + { + "epoch": 0.79, + "grad_norm": 1.0339311943769462, + "learning_rate": 2.2296633526415924e-06, + "loss": 0.1805, + "step": 15531 + }, + { + "epoch": 0.79, + "grad_norm": 0.8261741916565691, + "learning_rate": 2.2286267587405376e-06, + "loss": 0.182, + "step": 15532 + }, + { + "epoch": 0.79, + "grad_norm": 0.9142063944559767, + "learning_rate": 2.227590375638423e-06, + "loss": 0.1617, + "step": 15533 + }, + { + "epoch": 0.79, + "grad_norm": 1.3244697895988153, + "learning_rate": 2.226554203363357e-06, + "loss": 0.154, + "step": 15534 + }, + { + "epoch": 0.79, + "grad_norm": 1.3274544121566074, + "learning_rate": 2.225518241943446e-06, + "loss": 0.1635, + "step": 15535 + }, + { + "epoch": 0.79, + "grad_norm": 0.9677118097166073, + "learning_rate": 2.2244824914067932e-06, + "loss": 0.1533, + "step": 15536 + }, + { + "epoch": 0.79, + "grad_norm": 1.3979123867254049, + "learning_rate": 2.2234469517814937e-06, + "loss": 0.1597, + "step": 15537 + }, + { + "epoch": 0.79, + "grad_norm": 0.757339040284125, + "learning_rate": 2.2224116230956326e-06, + "loss": 0.1425, + "step": 15538 + }, + { + "epoch": 0.79, + "grad_norm": 1.0385628676593655, + "learning_rate": 2.2213765053772984e-06, + "loss": 0.1956, + "step": 15539 + }, + { + "epoch": 0.79, + "grad_norm": 1.1618767087057638, + "learning_rate": 2.220341598654565e-06, + "loss": 0.1637, + "step": 15540 + }, + { + "epoch": 0.79, + "grad_norm": 1.0568778997025638, + "learning_rate": 2.2193069029555035e-06, + "loss": 0.1813, + "step": 15541 + }, + { + "epoch": 0.79, + "grad_norm": 0.8628483075554302, + "learning_rate": 2.2182724183081837e-06, + "loss": 0.1805, + "step": 15542 + }, + { + "epoch": 0.79, + "grad_norm": 1.6281039310208947, + "learning_rate": 2.217238144740662e-06, + "loss": 0.1737, + "step": 15543 + }, + { + "epoch": 0.79, + "grad_norm": 1.0431107402561968, + "learning_rate": 2.216204082280995e-06, + "loss": 0.1733, + "step": 15544 + }, + { + "epoch": 0.79, + "grad_norm": 0.8442010781512328, + "learning_rate": 2.215170230957231e-06, + "loss": 0.1601, + "step": 15545 + }, + { + "epoch": 0.79, + "grad_norm": 0.8923269661973011, + "learning_rate": 2.2141365907974176e-06, + "loss": 0.1776, + "step": 15546 + }, + { + "epoch": 0.79, + "grad_norm": 1.2454688797031142, + "learning_rate": 2.213103161829586e-06, + "loss": 0.1488, + "step": 15547 + }, + { + "epoch": 0.79, + "grad_norm": 1.0118447767791443, + "learning_rate": 2.212069944081774e-06, + "loss": 0.1654, + "step": 15548 + }, + { + "epoch": 0.79, + "grad_norm": 1.570171035100308, + "learning_rate": 2.2110369375820016e-06, + "loss": 0.1827, + "step": 15549 + }, + { + "epoch": 0.79, + "grad_norm": 5.126571649081451, + "learning_rate": 2.2100041423582954e-06, + "loss": 0.1923, + "step": 15550 + }, + { + "epoch": 0.79, + "grad_norm": 0.8934819284786086, + "learning_rate": 2.208971558438664e-06, + "loss": 0.1508, + "step": 15551 + }, + { + "epoch": 0.79, + "grad_norm": 0.9617945397982444, + "learning_rate": 2.2079391858511214e-06, + "loss": 0.174, + "step": 15552 + }, + { + "epoch": 0.79, + "grad_norm": 1.322771889638723, + "learning_rate": 2.2069070246236658e-06, + "loss": 0.1791, + "step": 15553 + }, + { + "epoch": 0.79, + "grad_norm": 0.9423969982106858, + "learning_rate": 2.2058750747842974e-06, + "loss": 0.1592, + "step": 15554 + }, + { + "epoch": 0.79, + "grad_norm": 1.1030627892681681, + "learning_rate": 2.2048433363610077e-06, + "loss": 0.1518, + "step": 15555 + }, + { + "epoch": 0.79, + "grad_norm": 1.1999783035530458, + "learning_rate": 2.203811809381785e-06, + "loss": 0.1547, + "step": 15556 + }, + { + "epoch": 0.79, + "grad_norm": 1.0678737575178097, + "learning_rate": 2.2027804938746087e-06, + "loss": 0.154, + "step": 15557 + }, + { + "epoch": 0.79, + "grad_norm": 1.2888427274702388, + "learning_rate": 2.201749389867448e-06, + "loss": 0.1794, + "step": 15558 + }, + { + "epoch": 0.79, + "grad_norm": 0.8089560589472962, + "learning_rate": 2.200718497388279e-06, + "loss": 0.1611, + "step": 15559 + }, + { + "epoch": 0.79, + "grad_norm": 1.0591013646114105, + "learning_rate": 2.199687816465058e-06, + "loss": 0.1696, + "step": 15560 + }, + { + "epoch": 0.79, + "grad_norm": 1.0265503614760638, + "learning_rate": 2.1986573471257456e-06, + "loss": 0.1726, + "step": 15561 + }, + { + "epoch": 0.79, + "grad_norm": 1.0804107453865288, + "learning_rate": 2.1976270893982934e-06, + "loss": 0.1647, + "step": 15562 + }, + { + "epoch": 0.79, + "grad_norm": 0.9829531661161499, + "learning_rate": 2.19659704331065e-06, + "loss": 0.1648, + "step": 15563 + }, + { + "epoch": 0.79, + "grad_norm": 1.1559582416470937, + "learning_rate": 2.19556720889075e-06, + "loss": 0.1708, + "step": 15564 + }, + { + "epoch": 0.79, + "grad_norm": 1.511957859694274, + "learning_rate": 2.194537586166532e-06, + "loss": 0.1703, + "step": 15565 + }, + { + "epoch": 0.79, + "grad_norm": 0.9399930014089914, + "learning_rate": 2.1935081751659214e-06, + "loss": 0.1495, + "step": 15566 + }, + { + "epoch": 0.79, + "grad_norm": 1.1744683179840691, + "learning_rate": 2.192478975916844e-06, + "loss": 0.1646, + "step": 15567 + }, + { + "epoch": 0.79, + "grad_norm": 1.052596780486219, + "learning_rate": 2.191449988447213e-06, + "loss": 0.1721, + "step": 15568 + }, + { + "epoch": 0.79, + "grad_norm": 0.9301386245281534, + "learning_rate": 2.1904212127849455e-06, + "loss": 0.1756, + "step": 15569 + }, + { + "epoch": 0.79, + "grad_norm": 1.2471355427079205, + "learning_rate": 2.18939264895794e-06, + "loss": 0.146, + "step": 15570 + }, + { + "epoch": 0.79, + "grad_norm": 0.8480913113661026, + "learning_rate": 2.188364296994101e-06, + "loss": 0.1825, + "step": 15571 + }, + { + "epoch": 0.79, + "grad_norm": 1.8866841429424297, + "learning_rate": 2.1873361569213204e-06, + "loss": 0.1692, + "step": 15572 + }, + { + "epoch": 0.79, + "grad_norm": 0.9416588430775136, + "learning_rate": 2.186308228767492e-06, + "loss": 0.1639, + "step": 15573 + }, + { + "epoch": 0.79, + "grad_norm": 0.925607189488996, + "learning_rate": 2.1852805125604924e-06, + "loss": 0.1766, + "step": 15574 + }, + { + "epoch": 0.79, + "grad_norm": 1.0407514437395915, + "learning_rate": 2.184253008328199e-06, + "loss": 0.1669, + "step": 15575 + }, + { + "epoch": 0.79, + "grad_norm": 0.9420561629019543, + "learning_rate": 2.1832257160984873e-06, + "loss": 0.1662, + "step": 15576 + }, + { + "epoch": 0.79, + "grad_norm": 1.22011044368606, + "learning_rate": 2.182198635899215e-06, + "loss": 0.166, + "step": 15577 + }, + { + "epoch": 0.79, + "grad_norm": 1.6176134255385397, + "learning_rate": 2.18117176775825e-06, + "loss": 0.1818, + "step": 15578 + }, + { + "epoch": 0.79, + "grad_norm": 0.8881455447897848, + "learning_rate": 2.18014511170344e-06, + "loss": 0.1482, + "step": 15579 + }, + { + "epoch": 0.79, + "grad_norm": 1.196847111229772, + "learning_rate": 2.179118667762635e-06, + "loss": 0.1732, + "step": 15580 + }, + { + "epoch": 0.79, + "grad_norm": 0.7944044035893635, + "learning_rate": 2.178092435963678e-06, + "loss": 0.1418, + "step": 15581 + }, + { + "epoch": 0.79, + "grad_norm": 1.813756883174713, + "learning_rate": 2.177066416334409e-06, + "loss": 0.163, + "step": 15582 + }, + { + "epoch": 0.79, + "grad_norm": 0.7979800318997707, + "learning_rate": 2.176040608902652e-06, + "loss": 0.1461, + "step": 15583 + }, + { + "epoch": 0.79, + "grad_norm": 1.1091856405851366, + "learning_rate": 2.175015013696238e-06, + "loss": 0.155, + "step": 15584 + }, + { + "epoch": 0.79, + "grad_norm": 1.0806486704666542, + "learning_rate": 2.173989630742981e-06, + "loss": 0.1778, + "step": 15585 + }, + { + "epoch": 0.79, + "grad_norm": 1.2480394070848813, + "learning_rate": 2.172964460070699e-06, + "loss": 0.1795, + "step": 15586 + }, + { + "epoch": 0.79, + "grad_norm": 1.0284664104629266, + "learning_rate": 2.1719395017071966e-06, + "loss": 0.1626, + "step": 15587 + }, + { + "epoch": 0.79, + "grad_norm": 1.7285129265356551, + "learning_rate": 2.170914755680277e-06, + "loss": 0.1747, + "step": 15588 + }, + { + "epoch": 0.79, + "grad_norm": 1.0249807644458409, + "learning_rate": 2.1698902220177365e-06, + "loss": 0.16, + "step": 15589 + }, + { + "epoch": 0.79, + "grad_norm": 1.4069729783476428, + "learning_rate": 2.16886590074737e-06, + "loss": 0.1767, + "step": 15590 + }, + { + "epoch": 0.79, + "grad_norm": 1.1186356411264524, + "learning_rate": 2.167841791896954e-06, + "loss": 0.1698, + "step": 15591 + }, + { + "epoch": 0.79, + "grad_norm": 1.4428332295754163, + "learning_rate": 2.1668178954942754e-06, + "loss": 0.1612, + "step": 15592 + }, + { + "epoch": 0.79, + "grad_norm": 0.8692424415989399, + "learning_rate": 2.1657942115671037e-06, + "loss": 0.1542, + "step": 15593 + }, + { + "epoch": 0.79, + "grad_norm": 0.892577770113712, + "learning_rate": 2.164770740143203e-06, + "loss": 0.1696, + "step": 15594 + }, + { + "epoch": 0.79, + "grad_norm": 1.1374430428559237, + "learning_rate": 2.163747481250342e-06, + "loss": 0.1705, + "step": 15595 + }, + { + "epoch": 0.79, + "grad_norm": 1.096025517966074, + "learning_rate": 2.1627244349162702e-06, + "loss": 0.1597, + "step": 15596 + }, + { + "epoch": 0.79, + "grad_norm": 1.4958794914876075, + "learning_rate": 2.161701601168741e-06, + "loss": 0.1549, + "step": 15597 + }, + { + "epoch": 0.79, + "grad_norm": 1.4401233627884298, + "learning_rate": 2.1606789800354978e-06, + "loss": 0.156, + "step": 15598 + }, + { + "epoch": 0.79, + "grad_norm": 1.243981167898369, + "learning_rate": 2.1596565715442843e-06, + "loss": 0.1524, + "step": 15599 + }, + { + "epoch": 0.79, + "grad_norm": 1.12636656464376, + "learning_rate": 2.1586343757228247e-06, + "loss": 0.1605, + "step": 15600 + }, + { + "epoch": 0.79, + "grad_norm": 1.5292957878929483, + "learning_rate": 2.1576123925988548e-06, + "loss": 0.1592, + "step": 15601 + }, + { + "epoch": 0.79, + "grad_norm": 0.8964219413195446, + "learning_rate": 2.1565906222000877e-06, + "loss": 0.1647, + "step": 15602 + }, + { + "epoch": 0.79, + "grad_norm": 1.2008320785613633, + "learning_rate": 2.155569064554246e-06, + "loss": 0.1705, + "step": 15603 + }, + { + "epoch": 0.79, + "grad_norm": 1.5424164419925088, + "learning_rate": 2.154547719689034e-06, + "loss": 0.1898, + "step": 15604 + }, + { + "epoch": 0.79, + "grad_norm": 0.9392973979831717, + "learning_rate": 2.1535265876321574e-06, + "loss": 0.1622, + "step": 15605 + }, + { + "epoch": 0.79, + "grad_norm": 0.9378226880420547, + "learning_rate": 2.1525056684113166e-06, + "loss": 0.164, + "step": 15606 + }, + { + "epoch": 0.79, + "grad_norm": 1.1582115722775614, + "learning_rate": 2.1514849620542045e-06, + "loss": 0.1742, + "step": 15607 + }, + { + "epoch": 0.79, + "grad_norm": 0.9833527156186123, + "learning_rate": 2.1504644685885044e-06, + "loss": 0.1541, + "step": 15608 + }, + { + "epoch": 0.79, + "grad_norm": 1.1202416352338174, + "learning_rate": 2.1494441880419005e-06, + "loss": 0.1652, + "step": 15609 + }, + { + "epoch": 0.79, + "grad_norm": 1.2100725249448117, + "learning_rate": 2.1484241204420676e-06, + "loss": 0.1691, + "step": 15610 + }, + { + "epoch": 0.79, + "grad_norm": 1.2245008690966699, + "learning_rate": 2.1474042658166703e-06, + "loss": 0.1777, + "step": 15611 + }, + { + "epoch": 0.79, + "grad_norm": 0.8136968098830883, + "learning_rate": 2.1463846241933785e-06, + "loss": 0.1627, + "step": 15612 + }, + { + "epoch": 0.79, + "grad_norm": 1.0309363959270292, + "learning_rate": 2.1453651955998445e-06, + "loss": 0.1667, + "step": 15613 + }, + { + "epoch": 0.79, + "grad_norm": 1.2038948066066886, + "learning_rate": 2.1443459800637234e-06, + "loss": 0.1653, + "step": 15614 + }, + { + "epoch": 0.79, + "grad_norm": 0.8752988126685001, + "learning_rate": 2.143326977612662e-06, + "loss": 0.1613, + "step": 15615 + }, + { + "epoch": 0.79, + "grad_norm": 1.81493830010666, + "learning_rate": 2.1423081882743026e-06, + "loss": 0.1677, + "step": 15616 + }, + { + "epoch": 0.79, + "grad_norm": 0.8819559453162542, + "learning_rate": 2.1412896120762738e-06, + "loss": 0.1824, + "step": 15617 + }, + { + "epoch": 0.79, + "grad_norm": 0.8360825523913156, + "learning_rate": 2.1402712490462106e-06, + "loss": 0.1689, + "step": 15618 + }, + { + "epoch": 0.79, + "grad_norm": 0.8747309523646191, + "learning_rate": 2.139253099211732e-06, + "loss": 0.1606, + "step": 15619 + }, + { + "epoch": 0.79, + "grad_norm": 1.0068570576272733, + "learning_rate": 2.1382351626004595e-06, + "loss": 0.1643, + "step": 15620 + }, + { + "epoch": 0.79, + "grad_norm": 1.0247923129436445, + "learning_rate": 2.1372174392400003e-06, + "loss": 0.1668, + "step": 15621 + }, + { + "epoch": 0.79, + "grad_norm": 1.0541891741832206, + "learning_rate": 2.1361999291579636e-06, + "loss": 0.167, + "step": 15622 + }, + { + "epoch": 0.79, + "grad_norm": 1.006406001594581, + "learning_rate": 2.135182632381946e-06, + "loss": 0.1552, + "step": 15623 + }, + { + "epoch": 0.79, + "grad_norm": 1.0631465203649262, + "learning_rate": 2.134165548939543e-06, + "loss": 0.1555, + "step": 15624 + }, + { + "epoch": 0.79, + "grad_norm": 1.2124724084787541, + "learning_rate": 2.1331486788583444e-06, + "loss": 0.1926, + "step": 15625 + }, + { + "epoch": 0.79, + "grad_norm": 1.1730304656747461, + "learning_rate": 2.132132022165935e-06, + "loss": 0.1615, + "step": 15626 + }, + { + "epoch": 0.79, + "grad_norm": 0.9980917211125898, + "learning_rate": 2.1311155788898884e-06, + "loss": 0.1673, + "step": 15627 + }, + { + "epoch": 0.79, + "grad_norm": 1.3509935622005758, + "learning_rate": 2.130099349057774e-06, + "loss": 0.1931, + "step": 15628 + }, + { + "epoch": 0.79, + "grad_norm": 1.2673298856578072, + "learning_rate": 2.1290833326971617e-06, + "loss": 0.1688, + "step": 15629 + }, + { + "epoch": 0.79, + "grad_norm": 0.95908081207994, + "learning_rate": 2.128067529835606e-06, + "loss": 0.155, + "step": 15630 + }, + { + "epoch": 0.79, + "grad_norm": 1.0957765186131665, + "learning_rate": 2.1270519405006618e-06, + "loss": 0.1418, + "step": 15631 + }, + { + "epoch": 0.79, + "grad_norm": 0.9286845139617024, + "learning_rate": 2.12603656471988e-06, + "loss": 0.154, + "step": 15632 + }, + { + "epoch": 0.79, + "grad_norm": 0.8510502467796793, + "learning_rate": 2.1250214025208028e-06, + "loss": 0.1514, + "step": 15633 + }, + { + "epoch": 0.8, + "grad_norm": 1.1681591596259013, + "learning_rate": 2.1240064539309637e-06, + "loss": 0.1829, + "step": 15634 + }, + { + "epoch": 0.8, + "grad_norm": 1.076521627852123, + "learning_rate": 2.122991718977896e-06, + "loss": 0.1688, + "step": 15635 + }, + { + "epoch": 0.8, + "grad_norm": 1.7132103153256815, + "learning_rate": 2.121977197689119e-06, + "loss": 0.1528, + "step": 15636 + }, + { + "epoch": 0.8, + "grad_norm": 0.9492644200091807, + "learning_rate": 2.1209628900921597e-06, + "loss": 0.1833, + "step": 15637 + }, + { + "epoch": 0.8, + "grad_norm": 0.8755613060414379, + "learning_rate": 2.1199487962145236e-06, + "loss": 0.1654, + "step": 15638 + }, + { + "epoch": 0.8, + "grad_norm": 1.0228870344740009, + "learning_rate": 2.1189349160837247e-06, + "loss": 0.17, + "step": 15639 + }, + { + "epoch": 0.8, + "grad_norm": 0.9319147718614745, + "learning_rate": 2.1179212497272582e-06, + "loss": 0.1625, + "step": 15640 + }, + { + "epoch": 0.8, + "grad_norm": 1.2706661375506798, + "learning_rate": 2.116907797172624e-06, + "loss": 0.1656, + "step": 15641 + }, + { + "epoch": 0.8, + "grad_norm": 1.0564370189070695, + "learning_rate": 2.11589455844731e-06, + "loss": 0.1666, + "step": 15642 + }, + { + "epoch": 0.8, + "grad_norm": 0.8366015802686906, + "learning_rate": 2.1148815335788044e-06, + "loss": 0.1643, + "step": 15643 + }, + { + "epoch": 0.8, + "grad_norm": 1.4744670244805906, + "learning_rate": 2.113868722594582e-06, + "loss": 0.1693, + "step": 15644 + }, + { + "epoch": 0.8, + "grad_norm": 1.3168260345926481, + "learning_rate": 2.1128561255221138e-06, + "loss": 0.1713, + "step": 15645 + }, + { + "epoch": 0.8, + "grad_norm": 0.9985770754055425, + "learning_rate": 2.1118437423888715e-06, + "loss": 0.1794, + "step": 15646 + }, + { + "epoch": 0.8, + "grad_norm": 1.281413149778363, + "learning_rate": 2.1108315732223104e-06, + "loss": 0.1713, + "step": 15647 + }, + { + "epoch": 0.8, + "grad_norm": 0.9125070660623015, + "learning_rate": 2.109819618049891e-06, + "loss": 0.1653, + "step": 15648 + }, + { + "epoch": 0.8, + "grad_norm": 1.8057245462136853, + "learning_rate": 2.108807876899057e-06, + "loss": 0.1752, + "step": 15649 + }, + { + "epoch": 0.8, + "grad_norm": 0.8487713219507667, + "learning_rate": 2.1077963497972555e-06, + "loss": 0.1809, + "step": 15650 + }, + { + "epoch": 0.8, + "grad_norm": 0.9196659372204345, + "learning_rate": 2.1067850367719247e-06, + "loss": 0.1755, + "step": 15651 + }, + { + "epoch": 0.8, + "grad_norm": 0.99716980725985, + "learning_rate": 2.105773937850497e-06, + "loss": 0.175, + "step": 15652 + }, + { + "epoch": 0.8, + "grad_norm": 0.8829025384108097, + "learning_rate": 2.1047630530603946e-06, + "loss": 0.1684, + "step": 15653 + }, + { + "epoch": 0.8, + "grad_norm": 6.110652541716581, + "learning_rate": 2.103752382429043e-06, + "loss": 0.1611, + "step": 15654 + }, + { + "epoch": 0.8, + "grad_norm": 0.9720341752038881, + "learning_rate": 2.102741925983851e-06, + "loss": 0.1536, + "step": 15655 + }, + { + "epoch": 0.8, + "grad_norm": 0.9920772371908424, + "learning_rate": 2.101731683752234e-06, + "loss": 0.1868, + "step": 15656 + }, + { + "epoch": 0.8, + "grad_norm": 1.7231021520074439, + "learning_rate": 2.100721655761587e-06, + "loss": 0.182, + "step": 15657 + }, + { + "epoch": 0.8, + "grad_norm": 1.021432371471618, + "learning_rate": 2.0997118420393125e-06, + "loss": 0.171, + "step": 15658 + }, + { + "epoch": 0.8, + "grad_norm": 0.9649623834705489, + "learning_rate": 2.0987022426128e-06, + "loss": 0.1707, + "step": 15659 + }, + { + "epoch": 0.8, + "grad_norm": 1.258458441032225, + "learning_rate": 2.097692857509439e-06, + "loss": 0.1984, + "step": 15660 + }, + { + "epoch": 0.8, + "grad_norm": 0.9717680207982854, + "learning_rate": 2.096683686756602e-06, + "loss": 0.1566, + "step": 15661 + }, + { + "epoch": 0.8, + "grad_norm": 0.7280588099862542, + "learning_rate": 2.0956747303816694e-06, + "loss": 0.1463, + "step": 15662 + }, + { + "epoch": 0.8, + "grad_norm": 0.9012205407601845, + "learning_rate": 2.0946659884120056e-06, + "loss": 0.1622, + "step": 15663 + }, + { + "epoch": 0.8, + "grad_norm": 1.584582682612466, + "learning_rate": 2.0936574608749717e-06, + "loss": 0.164, + "step": 15664 + }, + { + "epoch": 0.8, + "grad_norm": 0.8224010780566432, + "learning_rate": 2.0926491477979272e-06, + "loss": 0.1481, + "step": 15665 + }, + { + "epoch": 0.8, + "grad_norm": 1.2102346578962973, + "learning_rate": 2.0916410492082195e-06, + "loss": 0.1545, + "step": 15666 + }, + { + "epoch": 0.8, + "grad_norm": 1.4038159209736356, + "learning_rate": 2.0906331651331945e-06, + "loss": 0.1589, + "step": 15667 + }, + { + "epoch": 0.8, + "grad_norm": 0.8479165441994736, + "learning_rate": 2.0896254956001916e-06, + "loss": 0.1943, + "step": 15668 + }, + { + "epoch": 0.8, + "grad_norm": 0.9723482193483798, + "learning_rate": 2.0886180406365465e-06, + "loss": 0.2076, + "step": 15669 + }, + { + "epoch": 0.8, + "grad_norm": 1.3228687636011576, + "learning_rate": 2.0876108002695817e-06, + "loss": 0.1931, + "step": 15670 + }, + { + "epoch": 0.8, + "grad_norm": 1.0600971579261, + "learning_rate": 2.0866037745266232e-06, + "loss": 0.1512, + "step": 15671 + }, + { + "epoch": 0.8, + "grad_norm": 1.2101345515538404, + "learning_rate": 2.0855969634349816e-06, + "loss": 0.1677, + "step": 15672 + }, + { + "epoch": 0.8, + "grad_norm": 1.2727403325036004, + "learning_rate": 2.0845903670219714e-06, + "loss": 0.1886, + "step": 15673 + }, + { + "epoch": 0.8, + "grad_norm": 1.1157062298942677, + "learning_rate": 2.083583985314893e-06, + "loss": 0.1569, + "step": 15674 + }, + { + "epoch": 0.8, + "grad_norm": 2.1086577491811576, + "learning_rate": 2.0825778183410485e-06, + "loss": 0.1888, + "step": 15675 + }, + { + "epoch": 0.8, + "grad_norm": 1.043001791637282, + "learning_rate": 2.0815718661277253e-06, + "loss": 0.1628, + "step": 15676 + }, + { + "epoch": 0.8, + "grad_norm": 0.8982340742383359, + "learning_rate": 2.0805661287022115e-06, + "loss": 0.1587, + "step": 15677 + }, + { + "epoch": 0.8, + "grad_norm": 0.7720902538988256, + "learning_rate": 2.0795606060917896e-06, + "loss": 0.1491, + "step": 15678 + }, + { + "epoch": 0.8, + "grad_norm": 1.4247042004798163, + "learning_rate": 2.0785552983237366e-06, + "loss": 0.1671, + "step": 15679 + }, + { + "epoch": 0.8, + "grad_norm": 1.8891588378758972, + "learning_rate": 2.077550205425317e-06, + "loss": 0.1702, + "step": 15680 + }, + { + "epoch": 0.8, + "grad_norm": 1.2919121709259531, + "learning_rate": 2.0765453274237944e-06, + "loss": 0.1652, + "step": 15681 + }, + { + "epoch": 0.8, + "grad_norm": 0.960395602437291, + "learning_rate": 2.0755406643464293e-06, + "loss": 0.1513, + "step": 15682 + }, + { + "epoch": 0.8, + "grad_norm": 1.094433177420648, + "learning_rate": 2.07453621622047e-06, + "loss": 0.1857, + "step": 15683 + }, + { + "epoch": 0.8, + "grad_norm": 1.192579997836186, + "learning_rate": 2.0735319830731614e-06, + "loss": 0.1708, + "step": 15684 + }, + { + "epoch": 0.8, + "grad_norm": 1.2940041677953567, + "learning_rate": 2.0725279649317463e-06, + "loss": 0.1768, + "step": 15685 + }, + { + "epoch": 0.8, + "grad_norm": 1.0753642049005883, + "learning_rate": 2.0715241618234603e-06, + "loss": 0.1733, + "step": 15686 + }, + { + "epoch": 0.8, + "grad_norm": 0.8004753323947372, + "learning_rate": 2.0705205737755276e-06, + "loss": 0.1839, + "step": 15687 + }, + { + "epoch": 0.8, + "grad_norm": 1.0962965286334978, + "learning_rate": 2.069517200815173e-06, + "loss": 0.1646, + "step": 15688 + }, + { + "epoch": 0.8, + "grad_norm": 1.1776350758720255, + "learning_rate": 2.0685140429696114e-06, + "loss": 0.1776, + "step": 15689 + }, + { + "epoch": 0.8, + "grad_norm": 1.1486790236667523, + "learning_rate": 2.0675111002660566e-06, + "loss": 0.1659, + "step": 15690 + }, + { + "epoch": 0.8, + "grad_norm": 1.0319557213802002, + "learning_rate": 2.066508372731708e-06, + "loss": 0.1483, + "step": 15691 + }, + { + "epoch": 0.8, + "grad_norm": 0.9726333520511419, + "learning_rate": 2.0655058603937704e-06, + "loss": 0.1565, + "step": 15692 + }, + { + "epoch": 0.8, + "grad_norm": 1.0562199934217975, + "learning_rate": 2.0645035632794317e-06, + "loss": 0.1657, + "step": 15693 + }, + { + "epoch": 0.8, + "grad_norm": 0.9933074570339296, + "learning_rate": 2.0635014814158826e-06, + "loss": 0.1601, + "step": 15694 + }, + { + "epoch": 0.8, + "grad_norm": 1.1359255460270459, + "learning_rate": 2.0624996148303043e-06, + "loss": 0.19, + "step": 15695 + }, + { + "epoch": 0.8, + "grad_norm": 0.909456397171364, + "learning_rate": 2.0614979635498743e-06, + "loss": 0.1661, + "step": 15696 + }, + { + "epoch": 0.8, + "grad_norm": 0.8667047580297196, + "learning_rate": 2.060496527601761e-06, + "loss": 0.1555, + "step": 15697 + }, + { + "epoch": 0.8, + "grad_norm": 1.7733186418000007, + "learning_rate": 2.059495307013125e-06, + "loss": 0.1288, + "step": 15698 + }, + { + "epoch": 0.8, + "grad_norm": 1.2790866004491492, + "learning_rate": 2.0584943018111304e-06, + "loss": 0.1741, + "step": 15699 + }, + { + "epoch": 0.8, + "grad_norm": 1.0679954503666529, + "learning_rate": 2.0574935120229224e-06, + "loss": 0.1689, + "step": 15700 + }, + { + "epoch": 0.8, + "grad_norm": 0.9779793058532171, + "learning_rate": 2.0564929376756526e-06, + "loss": 0.1613, + "step": 15701 + }, + { + "epoch": 0.8, + "grad_norm": 0.9095580126414525, + "learning_rate": 2.0554925787964596e-06, + "loss": 0.1584, + "step": 15702 + }, + { + "epoch": 0.8, + "grad_norm": 0.7880611948799627, + "learning_rate": 2.0544924354124828e-06, + "loss": 0.1481, + "step": 15703 + }, + { + "epoch": 0.8, + "grad_norm": 0.94216252208585, + "learning_rate": 2.053492507550845e-06, + "loss": 0.1826, + "step": 15704 + }, + { + "epoch": 0.8, + "grad_norm": 0.9309447877817286, + "learning_rate": 2.052492795238673e-06, + "loss": 0.1599, + "step": 15705 + }, + { + "epoch": 0.8, + "grad_norm": 1.2542500450284721, + "learning_rate": 2.051493298503081e-06, + "loss": 0.179, + "step": 15706 + }, + { + "epoch": 0.8, + "grad_norm": 1.0676784313529195, + "learning_rate": 2.050494017371185e-06, + "loss": 0.1487, + "step": 15707 + }, + { + "epoch": 0.8, + "grad_norm": 1.2173517816166688, + "learning_rate": 2.0494949518700846e-06, + "loss": 0.1743, + "step": 15708 + }, + { + "epoch": 0.8, + "grad_norm": 1.1361457952523206, + "learning_rate": 2.0484961020268857e-06, + "loss": 0.1756, + "step": 15709 + }, + { + "epoch": 0.8, + "grad_norm": 0.8451480774651444, + "learning_rate": 2.0474974678686764e-06, + "loss": 0.1673, + "step": 15710 + }, + { + "epoch": 0.8, + "grad_norm": 0.9615371002031146, + "learning_rate": 2.046499049422548e-06, + "loss": 0.1523, + "step": 15711 + }, + { + "epoch": 0.8, + "grad_norm": 0.8893607417171246, + "learning_rate": 2.045500846715581e-06, + "loss": 0.1546, + "step": 15712 + }, + { + "epoch": 0.8, + "grad_norm": 0.9308432802784401, + "learning_rate": 2.0445028597748564e-06, + "loss": 0.1748, + "step": 15713 + }, + { + "epoch": 0.8, + "grad_norm": 1.1209141683077861, + "learning_rate": 2.04350508862744e-06, + "loss": 0.1768, + "step": 15714 + }, + { + "epoch": 0.8, + "grad_norm": 0.9681851429622521, + "learning_rate": 2.042507533300395e-06, + "loss": 0.1701, + "step": 15715 + }, + { + "epoch": 0.8, + "grad_norm": 1.0106823838571959, + "learning_rate": 2.041510193820786e-06, + "loss": 0.1589, + "step": 15716 + }, + { + "epoch": 0.8, + "grad_norm": 1.1698606792797457, + "learning_rate": 2.0405130702156596e-06, + "loss": 0.163, + "step": 15717 + }, + { + "epoch": 0.8, + "grad_norm": 1.3727999721815387, + "learning_rate": 2.0395161625120684e-06, + "loss": 0.1619, + "step": 15718 + }, + { + "epoch": 0.8, + "grad_norm": 1.2642484204300222, + "learning_rate": 2.038519470737048e-06, + "loss": 0.1729, + "step": 15719 + }, + { + "epoch": 0.8, + "grad_norm": 1.390230059267973, + "learning_rate": 2.0375229949176367e-06, + "loss": 0.1794, + "step": 15720 + }, + { + "epoch": 0.8, + "grad_norm": 0.9163587083719319, + "learning_rate": 2.036526735080865e-06, + "loss": 0.1682, + "step": 15721 + }, + { + "epoch": 0.8, + "grad_norm": 1.9447935253499224, + "learning_rate": 2.0355306912537565e-06, + "loss": 0.1703, + "step": 15722 + }, + { + "epoch": 0.8, + "grad_norm": 1.3922281759361557, + "learning_rate": 2.0345348634633254e-06, + "loss": 0.1785, + "step": 15723 + }, + { + "epoch": 0.8, + "grad_norm": 1.0678235780247454, + "learning_rate": 2.0335392517365895e-06, + "loss": 0.1396, + "step": 15724 + }, + { + "epoch": 0.8, + "grad_norm": 1.2132485465777194, + "learning_rate": 2.032543856100548e-06, + "loss": 0.1861, + "step": 15725 + }, + { + "epoch": 0.8, + "grad_norm": 1.0154765236215866, + "learning_rate": 2.0315486765822067e-06, + "loss": 0.1645, + "step": 15726 + }, + { + "epoch": 0.8, + "grad_norm": 1.070807295078891, + "learning_rate": 2.0305537132085562e-06, + "loss": 0.1616, + "step": 15727 + }, + { + "epoch": 0.8, + "grad_norm": 1.8395267663742878, + "learning_rate": 2.0295589660065853e-06, + "loss": 0.1606, + "step": 15728 + }, + { + "epoch": 0.8, + "grad_norm": 1.0895675484709622, + "learning_rate": 2.0285644350032786e-06, + "loss": 0.1621, + "step": 15729 + }, + { + "epoch": 0.8, + "grad_norm": 4.234470650985866, + "learning_rate": 2.027570120225614e-06, + "loss": 0.1496, + "step": 15730 + }, + { + "epoch": 0.8, + "grad_norm": 0.9022044363184193, + "learning_rate": 2.026576021700557e-06, + "loss": 0.158, + "step": 15731 + }, + { + "epoch": 0.8, + "grad_norm": 1.0723980217946327, + "learning_rate": 2.025582139455078e-06, + "loss": 0.1712, + "step": 15732 + }, + { + "epoch": 0.8, + "grad_norm": 0.9710404594344169, + "learning_rate": 2.024588473516135e-06, + "loss": 0.1716, + "step": 15733 + }, + { + "epoch": 0.8, + "grad_norm": 1.0359442328774089, + "learning_rate": 2.023595023910677e-06, + "loss": 0.1672, + "step": 15734 + }, + { + "epoch": 0.8, + "grad_norm": 1.0489456256074758, + "learning_rate": 2.0226017906656568e-06, + "loss": 0.1577, + "step": 15735 + }, + { + "epoch": 0.8, + "grad_norm": 1.0072651874558733, + "learning_rate": 2.0216087738080116e-06, + "loss": 0.1693, + "step": 15736 + }, + { + "epoch": 0.8, + "grad_norm": 1.1457513882633208, + "learning_rate": 2.020615973364679e-06, + "loss": 0.1623, + "step": 15737 + }, + { + "epoch": 0.8, + "grad_norm": 1.297135541941941, + "learning_rate": 2.0196233893625883e-06, + "loss": 0.1638, + "step": 15738 + }, + { + "epoch": 0.8, + "grad_norm": 0.971115124295936, + "learning_rate": 2.0186310218286674e-06, + "loss": 0.1912, + "step": 15739 + }, + { + "epoch": 0.8, + "grad_norm": 1.2845202111648972, + "learning_rate": 2.0176388707898274e-06, + "loss": 0.1634, + "step": 15740 + }, + { + "epoch": 0.8, + "grad_norm": 0.8963316976105339, + "learning_rate": 2.0166469362729868e-06, + "loss": 0.147, + "step": 15741 + }, + { + "epoch": 0.8, + "grad_norm": 0.9617179150805765, + "learning_rate": 2.015655218305046e-06, + "loss": 0.1708, + "step": 15742 + }, + { + "epoch": 0.8, + "grad_norm": 1.0783035229158529, + "learning_rate": 2.0146637169129114e-06, + "loss": 0.1705, + "step": 15743 + }, + { + "epoch": 0.8, + "grad_norm": 0.989808277406126, + "learning_rate": 2.0136724321234714e-06, + "loss": 0.1549, + "step": 15744 + }, + { + "epoch": 0.8, + "grad_norm": 1.1378458311050972, + "learning_rate": 2.012681363963621e-06, + "loss": 0.1559, + "step": 15745 + }, + { + "epoch": 0.8, + "grad_norm": 1.1096384684415859, + "learning_rate": 2.011690512460237e-06, + "loss": 0.1661, + "step": 15746 + }, + { + "epoch": 0.8, + "grad_norm": 1.1269780749629759, + "learning_rate": 2.010699877640199e-06, + "loss": 0.146, + "step": 15747 + }, + { + "epoch": 0.8, + "grad_norm": 1.1432157339065054, + "learning_rate": 2.0097094595303766e-06, + "loss": 0.1725, + "step": 15748 + }, + { + "epoch": 0.8, + "grad_norm": 0.6841558682447949, + "learning_rate": 2.008719258157641e-06, + "loss": 0.1405, + "step": 15749 + }, + { + "epoch": 0.8, + "grad_norm": 1.4822552614993239, + "learning_rate": 2.0077292735488465e-06, + "loss": 0.1702, + "step": 15750 + }, + { + "epoch": 0.8, + "grad_norm": 0.872921064671971, + "learning_rate": 2.0067395057308436e-06, + "loss": 0.1593, + "step": 15751 + }, + { + "epoch": 0.8, + "grad_norm": 1.4210836201174981, + "learning_rate": 2.005749954730486e-06, + "loss": 0.1776, + "step": 15752 + }, + { + "epoch": 0.8, + "grad_norm": 1.0048219241176233, + "learning_rate": 2.0047606205746095e-06, + "loss": 0.1663, + "step": 15753 + }, + { + "epoch": 0.8, + "grad_norm": 2.1412716222867774, + "learning_rate": 2.0037715032900527e-06, + "loss": 0.1612, + "step": 15754 + }, + { + "epoch": 0.8, + "grad_norm": 1.0694424370520152, + "learning_rate": 2.002782602903647e-06, + "loss": 0.1815, + "step": 15755 + }, + { + "epoch": 0.8, + "grad_norm": 1.0157618690035999, + "learning_rate": 2.0017939194422153e-06, + "loss": 0.1572, + "step": 15756 + }, + { + "epoch": 0.8, + "grad_norm": 1.0320628363686533, + "learning_rate": 2.000805452932574e-06, + "loss": 0.16, + "step": 15757 + }, + { + "epoch": 0.8, + "grad_norm": 0.9266500851117392, + "learning_rate": 1.9998172034015382e-06, + "loss": 0.1642, + "step": 15758 + }, + { + "epoch": 0.8, + "grad_norm": 1.2331866837650636, + "learning_rate": 1.9988291708759112e-06, + "loss": 0.1876, + "step": 15759 + }, + { + "epoch": 0.8, + "grad_norm": 1.195498850218461, + "learning_rate": 1.997841355382497e-06, + "loss": 0.1913, + "step": 15760 + }, + { + "epoch": 0.8, + "grad_norm": 1.292371384434633, + "learning_rate": 1.996853756948085e-06, + "loss": 0.1433, + "step": 15761 + }, + { + "epoch": 0.8, + "grad_norm": 0.8263446800583641, + "learning_rate": 1.99586637559947e-06, + "loss": 0.1749, + "step": 15762 + }, + { + "epoch": 0.8, + "grad_norm": 0.9366195331034552, + "learning_rate": 1.9948792113634286e-06, + "loss": 0.1765, + "step": 15763 + }, + { + "epoch": 0.8, + "grad_norm": 0.9960600752360786, + "learning_rate": 1.9938922642667412e-06, + "loss": 0.1725, + "step": 15764 + }, + { + "epoch": 0.8, + "grad_norm": 0.813794636115855, + "learning_rate": 1.992905534336178e-06, + "loss": 0.1622, + "step": 15765 + }, + { + "epoch": 0.8, + "grad_norm": 1.0309233878437345, + "learning_rate": 1.9919190215985073e-06, + "loss": 0.1848, + "step": 15766 + }, + { + "epoch": 0.8, + "grad_norm": 1.122958722281624, + "learning_rate": 1.9909327260804857e-06, + "loss": 0.1691, + "step": 15767 + }, + { + "epoch": 0.8, + "grad_norm": 0.999950954499327, + "learning_rate": 1.9899466478088624e-06, + "loss": 0.172, + "step": 15768 + }, + { + "epoch": 0.8, + "grad_norm": 2.0435904214356926, + "learning_rate": 1.988960786810392e-06, + "loss": 0.1712, + "step": 15769 + }, + { + "epoch": 0.8, + "grad_norm": 1.0331564521315622, + "learning_rate": 1.98797514311181e-06, + "loss": 0.1449, + "step": 15770 + }, + { + "epoch": 0.8, + "grad_norm": 1.1759836053908204, + "learning_rate": 1.9869897167398576e-06, + "loss": 0.1635, + "step": 15771 + }, + { + "epoch": 0.8, + "grad_norm": 0.959731234772625, + "learning_rate": 1.9860045077212596e-06, + "loss": 0.1487, + "step": 15772 + }, + { + "epoch": 0.8, + "grad_norm": 1.0413370355997298, + "learning_rate": 1.9850195160827413e-06, + "loss": 0.1479, + "step": 15773 + }, + { + "epoch": 0.8, + "grad_norm": 1.0646452471085828, + "learning_rate": 1.984034741851022e-06, + "loss": 0.1884, + "step": 15774 + }, + { + "epoch": 0.8, + "grad_norm": 0.9575156754587291, + "learning_rate": 1.983050185052815e-06, + "loss": 0.1631, + "step": 15775 + }, + { + "epoch": 0.8, + "grad_norm": 1.0866603608279728, + "learning_rate": 1.982065845714821e-06, + "loss": 0.1566, + "step": 15776 + }, + { + "epoch": 0.8, + "grad_norm": 0.7649475499030911, + "learning_rate": 1.981081723863748e-06, + "loss": 0.1656, + "step": 15777 + }, + { + "epoch": 0.8, + "grad_norm": 1.4407044728748781, + "learning_rate": 1.9800978195262833e-06, + "loss": 0.1635, + "step": 15778 + }, + { + "epoch": 0.8, + "grad_norm": 1.2012243933420368, + "learning_rate": 1.9791141327291207e-06, + "loss": 0.1667, + "step": 15779 + }, + { + "epoch": 0.8, + "grad_norm": 0.7866904569914929, + "learning_rate": 1.9781306634989373e-06, + "loss": 0.1476, + "step": 15780 + }, + { + "epoch": 0.8, + "grad_norm": 1.4004951623252793, + "learning_rate": 1.9771474118624123e-06, + "loss": 0.1752, + "step": 15781 + }, + { + "epoch": 0.8, + "grad_norm": 1.1472073072957027, + "learning_rate": 1.976164377846218e-06, + "loss": 0.166, + "step": 15782 + }, + { + "epoch": 0.8, + "grad_norm": 0.9439095519046051, + "learning_rate": 1.97518156147702e-06, + "loss": 0.1813, + "step": 15783 + }, + { + "epoch": 0.8, + "grad_norm": 1.2137528617256548, + "learning_rate": 1.974198962781475e-06, + "loss": 0.1753, + "step": 15784 + }, + { + "epoch": 0.8, + "grad_norm": 0.939358251629492, + "learning_rate": 1.973216581786235e-06, + "loss": 0.1601, + "step": 15785 + }, + { + "epoch": 0.8, + "grad_norm": 1.0757297630588072, + "learning_rate": 1.972234418517951e-06, + "loss": 0.1591, + "step": 15786 + }, + { + "epoch": 0.8, + "grad_norm": 1.4805360150023117, + "learning_rate": 1.971252473003259e-06, + "loss": 0.1767, + "step": 15787 + }, + { + "epoch": 0.8, + "grad_norm": 1.0135733620995198, + "learning_rate": 1.9702707452687988e-06, + "loss": 0.14, + "step": 15788 + }, + { + "epoch": 0.8, + "grad_norm": 1.087953767363629, + "learning_rate": 1.9692892353411963e-06, + "loss": 0.1531, + "step": 15789 + }, + { + "epoch": 0.8, + "grad_norm": 1.67015170683124, + "learning_rate": 1.9683079432470774e-06, + "loss": 0.1513, + "step": 15790 + }, + { + "epoch": 0.8, + "grad_norm": 0.9810071214994488, + "learning_rate": 1.967326869013059e-06, + "loss": 0.1476, + "step": 15791 + }, + { + "epoch": 0.8, + "grad_norm": 1.0675283797681412, + "learning_rate": 1.966346012665754e-06, + "loss": 0.1575, + "step": 15792 + }, + { + "epoch": 0.8, + "grad_norm": 1.121737893971538, + "learning_rate": 1.965365374231766e-06, + "loss": 0.1748, + "step": 15793 + }, + { + "epoch": 0.8, + "grad_norm": 1.2965253386253484, + "learning_rate": 1.964384953737699e-06, + "loss": 0.1592, + "step": 15794 + }, + { + "epoch": 0.8, + "grad_norm": 0.8148539120516977, + "learning_rate": 1.9634047512101405e-06, + "loss": 0.182, + "step": 15795 + }, + { + "epoch": 0.8, + "grad_norm": 1.1682681957562062, + "learning_rate": 1.9624247666756844e-06, + "loss": 0.1777, + "step": 15796 + }, + { + "epoch": 0.8, + "grad_norm": 0.8664849282242465, + "learning_rate": 1.9614450001609085e-06, + "loss": 0.1577, + "step": 15797 + }, + { + "epoch": 0.8, + "grad_norm": 1.082203974334217, + "learning_rate": 1.960465451692394e-06, + "loss": 0.1579, + "step": 15798 + }, + { + "epoch": 0.8, + "grad_norm": 0.9962982984962556, + "learning_rate": 1.9594861212967055e-06, + "loss": 0.1612, + "step": 15799 + }, + { + "epoch": 0.8, + "grad_norm": 1.0910307551966145, + "learning_rate": 1.958507009000409e-06, + "loss": 0.1689, + "step": 15800 + }, + { + "epoch": 0.8, + "grad_norm": 0.846953543876322, + "learning_rate": 1.957528114830065e-06, + "loss": 0.1661, + "step": 15801 + }, + { + "epoch": 0.8, + "grad_norm": 1.0591308349877249, + "learning_rate": 1.9565494388122273e-06, + "loss": 0.1529, + "step": 15802 + }, + { + "epoch": 0.8, + "grad_norm": 1.0679178149141837, + "learning_rate": 1.95557098097344e-06, + "loss": 0.1597, + "step": 15803 + }, + { + "epoch": 0.8, + "grad_norm": 1.315835514316853, + "learning_rate": 1.9545927413402423e-06, + "loss": 0.1676, + "step": 15804 + }, + { + "epoch": 0.8, + "grad_norm": 0.9562754514141059, + "learning_rate": 1.953614719939173e-06, + "loss": 0.1579, + "step": 15805 + }, + { + "epoch": 0.8, + "grad_norm": 1.0302795637065507, + "learning_rate": 1.9526369167967575e-06, + "loss": 0.1626, + "step": 15806 + }, + { + "epoch": 0.8, + "grad_norm": 0.9573846588142638, + "learning_rate": 1.951659331939518e-06, + "loss": 0.1702, + "step": 15807 + }, + { + "epoch": 0.8, + "grad_norm": 1.5062813400680592, + "learning_rate": 1.9506819653939757e-06, + "loss": 0.1699, + "step": 15808 + }, + { + "epoch": 0.8, + "grad_norm": 1.898213287994842, + "learning_rate": 1.9497048171866417e-06, + "loss": 0.1678, + "step": 15809 + }, + { + "epoch": 0.8, + "grad_norm": 0.8511959910939554, + "learning_rate": 1.948727887344016e-06, + "loss": 0.155, + "step": 15810 + }, + { + "epoch": 0.8, + "grad_norm": 0.9784540549217013, + "learning_rate": 1.947751175892605e-06, + "loss": 0.1587, + "step": 15811 + }, + { + "epoch": 0.8, + "grad_norm": 0.9690023808596446, + "learning_rate": 1.946774682858894e-06, + "loss": 0.1456, + "step": 15812 + }, + { + "epoch": 0.8, + "grad_norm": 1.155590744200502, + "learning_rate": 1.9457984082693783e-06, + "loss": 0.1587, + "step": 15813 + }, + { + "epoch": 0.8, + "grad_norm": 1.1814968608821823, + "learning_rate": 1.9448223521505328e-06, + "loss": 0.1735, + "step": 15814 + }, + { + "epoch": 0.8, + "grad_norm": 1.0409770722575613, + "learning_rate": 1.9438465145288377e-06, + "loss": 0.1674, + "step": 15815 + }, + { + "epoch": 0.8, + "grad_norm": 1.612792088595669, + "learning_rate": 1.9428708954307595e-06, + "loss": 0.1713, + "step": 15816 + }, + { + "epoch": 0.8, + "grad_norm": 0.9555743960260334, + "learning_rate": 1.941895494882763e-06, + "loss": 0.1508, + "step": 15817 + }, + { + "epoch": 0.8, + "grad_norm": 1.4384926799369615, + "learning_rate": 1.940920312911306e-06, + "loss": 0.1838, + "step": 15818 + }, + { + "epoch": 0.8, + "grad_norm": 1.0606797339626892, + "learning_rate": 1.9399453495428434e-06, + "loss": 0.1627, + "step": 15819 + }, + { + "epoch": 0.8, + "grad_norm": 0.8377247178765734, + "learning_rate": 1.9389706048038183e-06, + "loss": 0.1799, + "step": 15820 + }, + { + "epoch": 0.8, + "grad_norm": 1.329021242357886, + "learning_rate": 1.9379960787206686e-06, + "loss": 0.1749, + "step": 15821 + }, + { + "epoch": 0.8, + "grad_norm": 1.2807704941709444, + "learning_rate": 1.9370217713198326e-06, + "loss": 0.1577, + "step": 15822 + }, + { + "epoch": 0.8, + "grad_norm": 1.2696927151281323, + "learning_rate": 1.936047682627734e-06, + "loss": 0.1498, + "step": 15823 + }, + { + "epoch": 0.8, + "grad_norm": 2.34286567311086, + "learning_rate": 1.9350738126707978e-06, + "loss": 0.1778, + "step": 15824 + }, + { + "epoch": 0.8, + "grad_norm": 1.102952604803538, + "learning_rate": 1.9341001614754407e-06, + "loss": 0.1737, + "step": 15825 + }, + { + "epoch": 0.8, + "grad_norm": 1.2905620541568963, + "learning_rate": 1.9331267290680744e-06, + "loss": 0.1787, + "step": 15826 + }, + { + "epoch": 0.8, + "grad_norm": 1.7752783882287622, + "learning_rate": 1.932153515475098e-06, + "loss": 0.1553, + "step": 15827 + }, + { + "epoch": 0.8, + "grad_norm": 1.389049775025545, + "learning_rate": 1.9311805207229162e-06, + "loss": 0.1576, + "step": 15828 + }, + { + "epoch": 0.8, + "grad_norm": 1.203796518234086, + "learning_rate": 1.9302077448379154e-06, + "loss": 0.1906, + "step": 15829 + }, + { + "epoch": 0.8, + "grad_norm": 0.7810701067378404, + "learning_rate": 1.929235187846488e-06, + "loss": 0.1539, + "step": 15830 + }, + { + "epoch": 0.81, + "grad_norm": 1.1441633726858527, + "learning_rate": 1.9282628497750098e-06, + "loss": 0.1673, + "step": 15831 + }, + { + "epoch": 0.81, + "grad_norm": 1.3399102124466988, + "learning_rate": 1.92729073064986e-06, + "loss": 0.194, + "step": 15832 + }, + { + "epoch": 0.81, + "grad_norm": 0.9996400920561656, + "learning_rate": 1.9263188304974023e-06, + "loss": 0.1648, + "step": 15833 + }, + { + "epoch": 0.81, + "grad_norm": 0.816567835157871, + "learning_rate": 1.9253471493440036e-06, + "loss": 0.1708, + "step": 15834 + }, + { + "epoch": 0.81, + "grad_norm": 0.8033366766000336, + "learning_rate": 1.924375687216018e-06, + "loss": 0.1448, + "step": 15835 + }, + { + "epoch": 0.81, + "grad_norm": 1.0534861733157275, + "learning_rate": 1.9234044441398016e-06, + "loss": 0.169, + "step": 15836 + }, + { + "epoch": 0.81, + "grad_norm": 0.8614388275474186, + "learning_rate": 1.922433420141695e-06, + "loss": 0.1566, + "step": 15837 + }, + { + "epoch": 0.81, + "grad_norm": 0.9897933960131479, + "learning_rate": 1.921462615248035e-06, + "loss": 0.1361, + "step": 15838 + }, + { + "epoch": 0.81, + "grad_norm": 1.0033910146002347, + "learning_rate": 1.9204920294851613e-06, + "loss": 0.168, + "step": 15839 + }, + { + "epoch": 0.81, + "grad_norm": 3.3643061597353467, + "learning_rate": 1.9195216628793956e-06, + "loss": 0.162, + "step": 15840 + }, + { + "epoch": 0.81, + "grad_norm": 1.3976734137083604, + "learning_rate": 1.9185515154570633e-06, + "loss": 0.18, + "step": 15841 + }, + { + "epoch": 0.81, + "grad_norm": 1.1903452435943236, + "learning_rate": 1.9175815872444748e-06, + "loss": 0.1673, + "step": 15842 + }, + { + "epoch": 0.81, + "grad_norm": 1.8597857795206585, + "learning_rate": 1.916611878267942e-06, + "loss": 0.1857, + "step": 15843 + }, + { + "epoch": 0.81, + "grad_norm": 1.2959628492227406, + "learning_rate": 1.915642388553769e-06, + "loss": 0.1528, + "step": 15844 + }, + { + "epoch": 0.81, + "grad_norm": 1.3997789619905479, + "learning_rate": 1.9146731181282554e-06, + "loss": 0.1762, + "step": 15845 + }, + { + "epoch": 0.81, + "grad_norm": 1.0091633763978645, + "learning_rate": 1.9137040670176878e-06, + "loss": 0.1719, + "step": 15846 + }, + { + "epoch": 0.81, + "grad_norm": 1.299180758820901, + "learning_rate": 1.9127352352483563e-06, + "loss": 0.1726, + "step": 15847 + }, + { + "epoch": 0.81, + "grad_norm": 0.9054257427731401, + "learning_rate": 1.911766622846536e-06, + "loss": 0.1776, + "step": 15848 + }, + { + "epoch": 0.81, + "grad_norm": 1.592958273371159, + "learning_rate": 1.9107982298385052e-06, + "loss": 0.1783, + "step": 15849 + }, + { + "epoch": 0.81, + "grad_norm": 1.15568910717821, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.164, + "step": 15850 + }, + { + "epoch": 0.81, + "grad_norm": 1.2317553042477192, + "learning_rate": 1.908862102108865e-06, + "loss": 0.1848, + "step": 15851 + }, + { + "epoch": 0.81, + "grad_norm": 1.065679268129052, + "learning_rate": 1.9078943674397753e-06, + "loss": 0.1531, + "step": 15852 + }, + { + "epoch": 0.81, + "grad_norm": 0.8677385153275745, + "learning_rate": 1.9069268522695105e-06, + "loss": 0.1704, + "step": 15853 + }, + { + "epoch": 0.81, + "grad_norm": 1.015003102554297, + "learning_rate": 1.9059595566243127e-06, + "loss": 0.1577, + "step": 15854 + }, + { + "epoch": 0.81, + "grad_norm": 1.5993881937232388, + "learning_rate": 1.9049924805304165e-06, + "loss": 0.1687, + "step": 15855 + }, + { + "epoch": 0.81, + "grad_norm": 0.9648294924172559, + "learning_rate": 1.9040256240140587e-06, + "loss": 0.1761, + "step": 15856 + }, + { + "epoch": 0.81, + "grad_norm": 1.129006993288202, + "learning_rate": 1.9030589871014604e-06, + "loss": 0.1539, + "step": 15857 + }, + { + "epoch": 0.81, + "grad_norm": 1.0888111506199036, + "learning_rate": 1.9020925698188465e-06, + "loss": 0.1651, + "step": 15858 + }, + { + "epoch": 0.81, + "grad_norm": 1.05744231234916, + "learning_rate": 1.9011263721924277e-06, + "loss": 0.1602, + "step": 15859 + }, + { + "epoch": 0.81, + "grad_norm": 2.8526338413193164, + "learning_rate": 1.9001603942484127e-06, + "loss": 0.172, + "step": 15860 + }, + { + "epoch": 0.81, + "grad_norm": 1.3523800109758868, + "learning_rate": 1.8991946360130043e-06, + "loss": 0.1401, + "step": 15861 + }, + { + "epoch": 0.81, + "grad_norm": 1.515723016981549, + "learning_rate": 1.8982290975124019e-06, + "loss": 0.1701, + "step": 15862 + }, + { + "epoch": 0.81, + "grad_norm": 1.7267760543292379, + "learning_rate": 1.8972637787727898e-06, + "loss": 0.1779, + "step": 15863 + }, + { + "epoch": 0.81, + "grad_norm": 1.773154795440011, + "learning_rate": 1.8962986798203587e-06, + "loss": 0.1682, + "step": 15864 + }, + { + "epoch": 0.81, + "grad_norm": 1.0148911097830382, + "learning_rate": 1.8953338006812805e-06, + "loss": 0.1721, + "step": 15865 + }, + { + "epoch": 0.81, + "grad_norm": 1.122399862038846, + "learning_rate": 1.8943691413817334e-06, + "loss": 0.1736, + "step": 15866 + }, + { + "epoch": 0.81, + "grad_norm": 1.1388130015125641, + "learning_rate": 1.8934047019478785e-06, + "loss": 0.149, + "step": 15867 + }, + { + "epoch": 0.81, + "grad_norm": 1.1411525588589864, + "learning_rate": 1.8924404824058816e-06, + "loss": 0.1763, + "step": 15868 + }, + { + "epoch": 0.81, + "grad_norm": 2.338725253391659, + "learning_rate": 1.8914764827818921e-06, + "loss": 0.1514, + "step": 15869 + }, + { + "epoch": 0.81, + "grad_norm": 1.542120667256205, + "learning_rate": 1.8905127031020598e-06, + "loss": 0.1648, + "step": 15870 + }, + { + "epoch": 0.81, + "grad_norm": 0.957055240444867, + "learning_rate": 1.8895491433925328e-06, + "loss": 0.1638, + "step": 15871 + }, + { + "epoch": 0.81, + "grad_norm": 0.8326227194676635, + "learning_rate": 1.8885858036794401e-06, + "loss": 0.1607, + "step": 15872 + }, + { + "epoch": 0.81, + "grad_norm": 1.0202307760657778, + "learning_rate": 1.8876226839889177e-06, + "loss": 0.1709, + "step": 15873 + }, + { + "epoch": 0.81, + "grad_norm": 1.1402323985518499, + "learning_rate": 1.8866597843470858e-06, + "loss": 0.1642, + "step": 15874 + }, + { + "epoch": 0.81, + "grad_norm": 1.010208304292357, + "learning_rate": 1.8856971047800687e-06, + "loss": 0.1636, + "step": 15875 + }, + { + "epoch": 0.81, + "grad_norm": 26.547268409354285, + "learning_rate": 1.8847346453139726e-06, + "loss": 0.1706, + "step": 15876 + }, + { + "epoch": 0.81, + "grad_norm": 2.9852515941330995, + "learning_rate": 1.8837724059749074e-06, + "loss": 0.159, + "step": 15877 + }, + { + "epoch": 0.81, + "grad_norm": 1.0533560381335083, + "learning_rate": 1.8828103867889747e-06, + "loss": 0.1958, + "step": 15878 + }, + { + "epoch": 0.81, + "grad_norm": 1.3430404155874531, + "learning_rate": 1.8818485877822712e-06, + "loss": 0.1859, + "step": 15879 + }, + { + "epoch": 0.81, + "grad_norm": 0.8794215939911043, + "learning_rate": 1.8808870089808806e-06, + "loss": 0.1685, + "step": 15880 + }, + { + "epoch": 0.81, + "grad_norm": 1.037638249002718, + "learning_rate": 1.879925650410892e-06, + "loss": 0.1607, + "step": 15881 + }, + { + "epoch": 0.81, + "grad_norm": 0.8697582929524619, + "learning_rate": 1.8789645120983746e-06, + "loss": 0.1792, + "step": 15882 + }, + { + "epoch": 0.81, + "grad_norm": 0.8982466851373708, + "learning_rate": 1.8780035940694075e-06, + "loss": 0.1577, + "step": 15883 + }, + { + "epoch": 0.81, + "grad_norm": 1.061375290924336, + "learning_rate": 1.8770428963500475e-06, + "loss": 0.1484, + "step": 15884 + }, + { + "epoch": 0.81, + "grad_norm": 1.4971804168207266, + "learning_rate": 1.8760824189663618e-06, + "loss": 0.1775, + "step": 15885 + }, + { + "epoch": 0.81, + "grad_norm": 0.9099341373713296, + "learning_rate": 1.875122161944396e-06, + "loss": 0.1559, + "step": 15886 + }, + { + "epoch": 0.81, + "grad_norm": 1.0462905697176088, + "learning_rate": 1.8741621253102005e-06, + "loss": 0.1675, + "step": 15887 + }, + { + "epoch": 0.81, + "grad_norm": 0.7935935303208741, + "learning_rate": 1.8732023090898165e-06, + "loss": 0.1369, + "step": 15888 + }, + { + "epoch": 0.81, + "grad_norm": 0.9045384382704341, + "learning_rate": 1.8722427133092813e-06, + "loss": 0.173, + "step": 15889 + }, + { + "epoch": 0.81, + "grad_norm": 1.1845902438296145, + "learning_rate": 1.8712833379946217e-06, + "loss": 0.1821, + "step": 15890 + }, + { + "epoch": 0.81, + "grad_norm": 0.7466157451415563, + "learning_rate": 1.8703241831718578e-06, + "loss": 0.1655, + "step": 15891 + }, + { + "epoch": 0.81, + "grad_norm": 0.8891584234780563, + "learning_rate": 1.8693652488670121e-06, + "loss": 0.1685, + "step": 15892 + }, + { + "epoch": 0.81, + "grad_norm": 0.973723244657113, + "learning_rate": 1.868406535106091e-06, + "loss": 0.1593, + "step": 15893 + }, + { + "epoch": 0.81, + "grad_norm": 1.1918050991675817, + "learning_rate": 1.8674480419151041e-06, + "loss": 0.1669, + "step": 15894 + }, + { + "epoch": 0.81, + "grad_norm": 0.8854809913998016, + "learning_rate": 1.8664897693200456e-06, + "loss": 0.1583, + "step": 15895 + }, + { + "epoch": 0.81, + "grad_norm": 1.0964839487023017, + "learning_rate": 1.8655317173469122e-06, + "loss": 0.1854, + "step": 15896 + }, + { + "epoch": 0.81, + "grad_norm": 0.8412304856386966, + "learning_rate": 1.86457388602169e-06, + "loss": 0.1756, + "step": 15897 + }, + { + "epoch": 0.81, + "grad_norm": 3.5080930598938767, + "learning_rate": 1.8636162753703636e-06, + "loss": 0.1666, + "step": 15898 + }, + { + "epoch": 0.81, + "grad_norm": 1.05984641476992, + "learning_rate": 1.862658885418902e-06, + "loss": 0.1487, + "step": 15899 + }, + { + "epoch": 0.81, + "grad_norm": 1.0047321045727262, + "learning_rate": 1.8617017161932815e-06, + "loss": 0.1705, + "step": 15900 + }, + { + "epoch": 0.81, + "grad_norm": 0.9856203454509121, + "learning_rate": 1.8607447677194578e-06, + "loss": 0.1615, + "step": 15901 + }, + { + "epoch": 0.81, + "grad_norm": 1.7206410988649752, + "learning_rate": 1.8597880400233959e-06, + "loss": 0.1529, + "step": 15902 + }, + { + "epoch": 0.81, + "grad_norm": 1.1406816086087803, + "learning_rate": 1.8588315331310392e-06, + "loss": 0.1806, + "step": 15903 + }, + { + "epoch": 0.81, + "grad_norm": 1.4328388461061061, + "learning_rate": 1.8578752470683381e-06, + "loss": 0.1686, + "step": 15904 + }, + { + "epoch": 0.81, + "grad_norm": 1.15321917515873, + "learning_rate": 1.8569191818612298e-06, + "loss": 0.1626, + "step": 15905 + }, + { + "epoch": 0.81, + "grad_norm": 1.175305560323787, + "learning_rate": 1.8559633375356511e-06, + "loss": 0.1803, + "step": 15906 + }, + { + "epoch": 0.81, + "grad_norm": 1.026447711263733, + "learning_rate": 1.8550077141175282e-06, + "loss": 0.1706, + "step": 15907 + }, + { + "epoch": 0.81, + "grad_norm": 0.8598763378612653, + "learning_rate": 1.8540523116327769e-06, + "loss": 0.1587, + "step": 15908 + }, + { + "epoch": 0.81, + "grad_norm": 0.9602544825284762, + "learning_rate": 1.8530971301073208e-06, + "loss": 0.1769, + "step": 15909 + }, + { + "epoch": 0.81, + "grad_norm": 1.1276646643628279, + "learning_rate": 1.8521421695670617e-06, + "loss": 0.1819, + "step": 15910 + }, + { + "epoch": 0.81, + "grad_norm": 1.3784525456178116, + "learning_rate": 1.8511874300379095e-06, + "loss": 0.1564, + "step": 15911 + }, + { + "epoch": 0.81, + "grad_norm": 0.9188987056733893, + "learning_rate": 1.8502329115457551e-06, + "loss": 0.185, + "step": 15912 + }, + { + "epoch": 0.81, + "grad_norm": 0.8884175776293851, + "learning_rate": 1.8492786141164943e-06, + "loss": 0.1616, + "step": 15913 + }, + { + "epoch": 0.81, + "grad_norm": 0.864036007568502, + "learning_rate": 1.8483245377760106e-06, + "loss": 0.163, + "step": 15914 + }, + { + "epoch": 0.81, + "grad_norm": 1.175478431383708, + "learning_rate": 1.847370682550187e-06, + "loss": 0.1706, + "step": 15915 + }, + { + "epoch": 0.81, + "grad_norm": 0.8877435476808375, + "learning_rate": 1.8464170484648924e-06, + "loss": 0.155, + "step": 15916 + }, + { + "epoch": 0.81, + "grad_norm": 1.2440300952858039, + "learning_rate": 1.8454636355459977e-06, + "loss": 0.152, + "step": 15917 + }, + { + "epoch": 0.81, + "grad_norm": 1.0778220695867444, + "learning_rate": 1.8445104438193595e-06, + "loss": 0.1672, + "step": 15918 + }, + { + "epoch": 0.81, + "grad_norm": 1.217954161449602, + "learning_rate": 1.8435574733108397e-06, + "loss": 0.1745, + "step": 15919 + }, + { + "epoch": 0.81, + "grad_norm": 0.905783275261828, + "learning_rate": 1.8426047240462807e-06, + "loss": 0.157, + "step": 15920 + }, + { + "epoch": 0.81, + "grad_norm": 1.0235188483867, + "learning_rate": 1.84165219605153e-06, + "loss": 0.1628, + "step": 15921 + }, + { + "epoch": 0.81, + "grad_norm": 0.7664994271883586, + "learning_rate": 1.840699889352423e-06, + "loss": 0.165, + "step": 15922 + }, + { + "epoch": 0.81, + "grad_norm": 1.55969940166033, + "learning_rate": 1.8397478039747962e-06, + "loss": 0.1829, + "step": 15923 + }, + { + "epoch": 0.81, + "grad_norm": 0.9627591745734583, + "learning_rate": 1.83879593994447e-06, + "loss": 0.1737, + "step": 15924 + }, + { + "epoch": 0.81, + "grad_norm": 1.1817190829880238, + "learning_rate": 1.8378442972872635e-06, + "loss": 0.164, + "step": 15925 + }, + { + "epoch": 0.81, + "grad_norm": 1.2039858241344097, + "learning_rate": 1.8368928760289928e-06, + "loss": 0.1579, + "step": 15926 + }, + { + "epoch": 0.81, + "grad_norm": 0.9183717007030037, + "learning_rate": 1.8359416761954629e-06, + "loss": 0.1791, + "step": 15927 + }, + { + "epoch": 0.81, + "grad_norm": 1.0886857154910026, + "learning_rate": 1.8349906978124776e-06, + "loss": 0.1548, + "step": 15928 + }, + { + "epoch": 0.81, + "grad_norm": 1.0074054922942874, + "learning_rate": 1.8340399409058284e-06, + "loss": 0.1753, + "step": 15929 + }, + { + "epoch": 0.81, + "grad_norm": 1.2258662949395527, + "learning_rate": 1.833089405501307e-06, + "loss": 0.1464, + "step": 15930 + }, + { + "epoch": 0.81, + "grad_norm": 0.9872561371738571, + "learning_rate": 1.8321390916246961e-06, + "loss": 0.1626, + "step": 15931 + }, + { + "epoch": 0.81, + "grad_norm": 1.3703066745943606, + "learning_rate": 1.8311889993017772e-06, + "loss": 0.1834, + "step": 15932 + }, + { + "epoch": 0.81, + "grad_norm": 1.1405886391351094, + "learning_rate": 1.8302391285583153e-06, + "loss": 0.1906, + "step": 15933 + }, + { + "epoch": 0.81, + "grad_norm": 1.6044683790640693, + "learning_rate": 1.829289479420081e-06, + "loss": 0.152, + "step": 15934 + }, + { + "epoch": 0.81, + "grad_norm": 1.029769477626035, + "learning_rate": 1.828340051912828e-06, + "loss": 0.1515, + "step": 15935 + }, + { + "epoch": 0.81, + "grad_norm": 1.3723365923561242, + "learning_rate": 1.8273908460623157e-06, + "loss": 0.1474, + "step": 15936 + }, + { + "epoch": 0.81, + "grad_norm": 1.248137118999691, + "learning_rate": 1.8264418618942859e-06, + "loss": 0.1438, + "step": 15937 + }, + { + "epoch": 0.81, + "grad_norm": 1.280726460462142, + "learning_rate": 1.8254930994344845e-06, + "loss": 0.1709, + "step": 15938 + }, + { + "epoch": 0.81, + "grad_norm": 1.1191582109101215, + "learning_rate": 1.8245445587086419e-06, + "loss": 0.1723, + "step": 15939 + }, + { + "epoch": 0.81, + "grad_norm": 1.2899830299526782, + "learning_rate": 1.82359623974249e-06, + "loss": 0.193, + "step": 15940 + }, + { + "epoch": 0.81, + "grad_norm": 1.0323262769938613, + "learning_rate": 1.8226481425617549e-06, + "loss": 0.1539, + "step": 15941 + }, + { + "epoch": 0.81, + "grad_norm": 1.5728960046365268, + "learning_rate": 1.821700267192148e-06, + "loss": 0.1543, + "step": 15942 + }, + { + "epoch": 0.81, + "grad_norm": 0.888435356901117, + "learning_rate": 1.8207526136593857e-06, + "loss": 0.1849, + "step": 15943 + }, + { + "epoch": 0.81, + "grad_norm": 1.2068742130827914, + "learning_rate": 1.8198051819891672e-06, + "loss": 0.1574, + "step": 15944 + }, + { + "epoch": 0.81, + "grad_norm": 0.8943393072237987, + "learning_rate": 1.8188579722071985e-06, + "loss": 0.1575, + "step": 15945 + }, + { + "epoch": 0.81, + "grad_norm": 1.218001554855009, + "learning_rate": 1.8179109843391673e-06, + "loss": 0.1686, + "step": 15946 + }, + { + "epoch": 0.81, + "grad_norm": 1.0580225505444398, + "learning_rate": 1.8169642184107628e-06, + "loss": 0.1771, + "step": 15947 + }, + { + "epoch": 0.81, + "grad_norm": 0.9884038518855041, + "learning_rate": 1.8160176744476654e-06, + "loss": 0.1588, + "step": 15948 + }, + { + "epoch": 0.81, + "grad_norm": 1.0281074804685377, + "learning_rate": 1.8150713524755536e-06, + "loss": 0.163, + "step": 15949 + }, + { + "epoch": 0.81, + "grad_norm": 0.8649855464767741, + "learning_rate": 1.8141252525200914e-06, + "loss": 0.1706, + "step": 15950 + }, + { + "epoch": 0.81, + "grad_norm": 1.5950583604159652, + "learning_rate": 1.813179374606946e-06, + "loss": 0.1543, + "step": 15951 + }, + { + "epoch": 0.81, + "grad_norm": 1.2086613904361754, + "learning_rate": 1.81223371876177e-06, + "loss": 0.1489, + "step": 15952 + }, + { + "epoch": 0.81, + "grad_norm": 2.7095853537929204, + "learning_rate": 1.8112882850102198e-06, + "loss": 0.1523, + "step": 15953 + }, + { + "epoch": 0.81, + "grad_norm": 0.9331269166408543, + "learning_rate": 1.8103430733779348e-06, + "loss": 0.1722, + "step": 15954 + }, + { + "epoch": 0.81, + "grad_norm": 0.8453545466230363, + "learning_rate": 1.809398083890559e-06, + "loss": 0.1549, + "step": 15955 + }, + { + "epoch": 0.81, + "grad_norm": 0.7681451063904245, + "learning_rate": 1.8084533165737195e-06, + "loss": 0.1457, + "step": 15956 + }, + { + "epoch": 0.81, + "grad_norm": 0.9178846288166791, + "learning_rate": 1.807508771453047e-06, + "loss": 0.1501, + "step": 15957 + }, + { + "epoch": 0.81, + "grad_norm": 1.0385827144853221, + "learning_rate": 1.8065644485541622e-06, + "loss": 0.1582, + "step": 15958 + }, + { + "epoch": 0.81, + "grad_norm": 0.8697295581069294, + "learning_rate": 1.8056203479026812e-06, + "loss": 0.1473, + "step": 15959 + }, + { + "epoch": 0.81, + "grad_norm": 1.2051188420799237, + "learning_rate": 1.8046764695242115e-06, + "loss": 0.1587, + "step": 15960 + }, + { + "epoch": 0.81, + "grad_norm": 1.0327908075621928, + "learning_rate": 1.8037328134443532e-06, + "loss": 0.175, + "step": 15961 + }, + { + "epoch": 0.81, + "grad_norm": 0.9823285672773258, + "learning_rate": 1.8027893796887075e-06, + "loss": 0.1754, + "step": 15962 + }, + { + "epoch": 0.81, + "grad_norm": 0.7991140537880537, + "learning_rate": 1.8018461682828603e-06, + "loss": 0.1396, + "step": 15963 + }, + { + "epoch": 0.81, + "grad_norm": 1.347432924466073, + "learning_rate": 1.8009031792524012e-06, + "loss": 0.1891, + "step": 15964 + }, + { + "epoch": 0.81, + "grad_norm": 1.6531415450426012, + "learning_rate": 1.7999604126229043e-06, + "loss": 0.1518, + "step": 15965 + }, + { + "epoch": 0.81, + "grad_norm": 1.0830412428308247, + "learning_rate": 1.7990178684199444e-06, + "loss": 0.1662, + "step": 15966 + }, + { + "epoch": 0.81, + "grad_norm": 0.9304754757162109, + "learning_rate": 1.798075546669088e-06, + "loss": 0.1515, + "step": 15967 + }, + { + "epoch": 0.81, + "grad_norm": 1.7648335000274984, + "learning_rate": 1.797133447395898e-06, + "loss": 0.1643, + "step": 15968 + }, + { + "epoch": 0.81, + "grad_norm": 1.5204642842384637, + "learning_rate": 1.7961915706259236e-06, + "loss": 0.1595, + "step": 15969 + }, + { + "epoch": 0.81, + "grad_norm": 1.230914612772511, + "learning_rate": 1.7952499163847192e-06, + "loss": 0.1545, + "step": 15970 + }, + { + "epoch": 0.81, + "grad_norm": 1.048495018382882, + "learning_rate": 1.794308484697822e-06, + "loss": 0.1423, + "step": 15971 + }, + { + "epoch": 0.81, + "grad_norm": 1.19952605245712, + "learning_rate": 1.7933672755907727e-06, + "loss": 0.1515, + "step": 15972 + }, + { + "epoch": 0.81, + "grad_norm": 1.2280860890863525, + "learning_rate": 1.7924262890890964e-06, + "loss": 0.1407, + "step": 15973 + }, + { + "epoch": 0.81, + "grad_norm": 1.0566678871977504, + "learning_rate": 1.7914855252183217e-06, + "loss": 0.193, + "step": 15974 + }, + { + "epoch": 0.81, + "grad_norm": 1.2322299380808484, + "learning_rate": 1.7905449840039645e-06, + "loss": 0.1444, + "step": 15975 + }, + { + "epoch": 0.81, + "grad_norm": 1.1100249588562576, + "learning_rate": 1.7896046654715427e-06, + "loss": 0.1737, + "step": 15976 + }, + { + "epoch": 0.81, + "grad_norm": 1.1424919104979556, + "learning_rate": 1.7886645696465566e-06, + "loss": 0.1688, + "step": 15977 + }, + { + "epoch": 0.81, + "grad_norm": 0.9677874819880176, + "learning_rate": 1.787724696554506e-06, + "loss": 0.1354, + "step": 15978 + }, + { + "epoch": 0.81, + "grad_norm": 1.060213074988549, + "learning_rate": 1.7867850462208892e-06, + "loss": 0.1783, + "step": 15979 + }, + { + "epoch": 0.81, + "grad_norm": 0.8748895283896404, + "learning_rate": 1.785845618671188e-06, + "loss": 0.1462, + "step": 15980 + }, + { + "epoch": 0.81, + "grad_norm": 1.25618502996236, + "learning_rate": 1.7849064139308925e-06, + "loss": 0.1594, + "step": 15981 + }, + { + "epoch": 0.81, + "grad_norm": 1.127720954067132, + "learning_rate": 1.7839674320254718e-06, + "loss": 0.1536, + "step": 15982 + }, + { + "epoch": 0.81, + "grad_norm": 1.0095621148613478, + "learning_rate": 1.783028672980398e-06, + "loss": 0.1686, + "step": 15983 + }, + { + "epoch": 0.81, + "grad_norm": 1.0674805735190684, + "learning_rate": 1.7820901368211362e-06, + "loss": 0.1575, + "step": 15984 + }, + { + "epoch": 0.81, + "grad_norm": 0.9566168211011928, + "learning_rate": 1.7811518235731461e-06, + "loss": 0.1596, + "step": 15985 + }, + { + "epoch": 0.81, + "grad_norm": 1.3033046485908206, + "learning_rate": 1.780213733261874e-06, + "loss": 0.1595, + "step": 15986 + }, + { + "epoch": 0.81, + "grad_norm": 0.9332843058050407, + "learning_rate": 1.7792758659127706e-06, + "loss": 0.172, + "step": 15987 + }, + { + "epoch": 0.81, + "grad_norm": 1.0017301526569287, + "learning_rate": 1.7783382215512724e-06, + "loss": 0.1769, + "step": 15988 + }, + { + "epoch": 0.81, + "grad_norm": 1.2771970862900306, + "learning_rate": 1.7774008002028164e-06, + "loss": 0.1475, + "step": 15989 + }, + { + "epoch": 0.81, + "grad_norm": 0.9754875288852934, + "learning_rate": 1.7764636018928249e-06, + "loss": 0.1619, + "step": 15990 + }, + { + "epoch": 0.81, + "grad_norm": 2.4876323017582416, + "learning_rate": 1.7755266266467264e-06, + "loss": 0.1752, + "step": 15991 + }, + { + "epoch": 0.81, + "grad_norm": 1.1379979426252034, + "learning_rate": 1.7745898744899292e-06, + "loss": 0.162, + "step": 15992 + }, + { + "epoch": 0.81, + "grad_norm": 0.8945533397316004, + "learning_rate": 1.7736533454478466e-06, + "loss": 0.166, + "step": 15993 + }, + { + "epoch": 0.81, + "grad_norm": 2.474577884249597, + "learning_rate": 1.7727170395458838e-06, + "loss": 0.1563, + "step": 15994 + }, + { + "epoch": 0.81, + "grad_norm": 1.1805580502190653, + "learning_rate": 1.7717809568094334e-06, + "loss": 0.1552, + "step": 15995 + }, + { + "epoch": 0.81, + "grad_norm": 0.8905053860600621, + "learning_rate": 1.7708450972638923e-06, + "loss": 0.1592, + "step": 15996 + }, + { + "epoch": 0.81, + "grad_norm": 1.1618938816030313, + "learning_rate": 1.76990946093464e-06, + "loss": 0.1844, + "step": 15997 + }, + { + "epoch": 0.81, + "grad_norm": 2.2644217815426884, + "learning_rate": 1.7689740478470608e-06, + "loss": 0.1661, + "step": 15998 + }, + { + "epoch": 0.81, + "grad_norm": 0.9554434364928839, + "learning_rate": 1.768038858026523e-06, + "loss": 0.163, + "step": 15999 + }, + { + "epoch": 0.81, + "grad_norm": 1.1010955555027215, + "learning_rate": 1.7671038914983963e-06, + "loss": 0.1675, + "step": 16000 + }, + { + "epoch": 0.81, + "grad_norm": 0.8927567601335374, + "learning_rate": 1.7661691482880416e-06, + "loss": 0.1633, + "step": 16001 + }, + { + "epoch": 0.81, + "grad_norm": 1.202918742731831, + "learning_rate": 1.7652346284208167e-06, + "loss": 0.1921, + "step": 16002 + }, + { + "epoch": 0.81, + "grad_norm": 1.2242544638925155, + "learning_rate": 1.7643003319220642e-06, + "loss": 0.1741, + "step": 16003 + }, + { + "epoch": 0.81, + "grad_norm": 0.8326302787576414, + "learning_rate": 1.763366258817133e-06, + "loss": 0.1455, + "step": 16004 + }, + { + "epoch": 0.81, + "grad_norm": 1.3261346371041038, + "learning_rate": 1.762432409131355e-06, + "loss": 0.1838, + "step": 16005 + }, + { + "epoch": 0.81, + "grad_norm": 1.3388318854708003, + "learning_rate": 1.7614987828900654e-06, + "loss": 0.1504, + "step": 16006 + }, + { + "epoch": 0.81, + "grad_norm": 1.1517921068209622, + "learning_rate": 1.760565380118584e-06, + "loss": 0.1813, + "step": 16007 + }, + { + "epoch": 0.81, + "grad_norm": 0.9223647551231623, + "learning_rate": 1.7596322008422351e-06, + "loss": 0.1501, + "step": 16008 + }, + { + "epoch": 0.81, + "grad_norm": 1.0249862314487033, + "learning_rate": 1.7586992450863261e-06, + "loss": 0.1602, + "step": 16009 + }, + { + "epoch": 0.81, + "grad_norm": 1.3558351653151626, + "learning_rate": 1.7577665128761645e-06, + "loss": 0.1563, + "step": 16010 + }, + { + "epoch": 0.81, + "grad_norm": 1.0748815541391956, + "learning_rate": 1.7568340042370546e-06, + "loss": 0.1664, + "step": 16011 + }, + { + "epoch": 0.81, + "grad_norm": 1.0504259349670237, + "learning_rate": 1.755901719194285e-06, + "loss": 0.1619, + "step": 16012 + }, + { + "epoch": 0.81, + "grad_norm": 1.0688802098767785, + "learning_rate": 1.7549696577731502e-06, + "loss": 0.1584, + "step": 16013 + }, + { + "epoch": 0.81, + "grad_norm": 1.414884983836496, + "learning_rate": 1.754037819998926e-06, + "loss": 0.1652, + "step": 16014 + }, + { + "epoch": 0.81, + "grad_norm": 1.3920470329979084, + "learning_rate": 1.753106205896895e-06, + "loss": 0.1533, + "step": 16015 + }, + { + "epoch": 0.81, + "grad_norm": 1.166883450478275, + "learning_rate": 1.75217481549232e-06, + "loss": 0.1838, + "step": 16016 + }, + { + "epoch": 0.81, + "grad_norm": 1.0275778044647248, + "learning_rate": 1.7512436488104723e-06, + "loss": 0.1607, + "step": 16017 + }, + { + "epoch": 0.81, + "grad_norm": 1.5038447361122222, + "learning_rate": 1.7503127058766046e-06, + "loss": 0.1896, + "step": 16018 + }, + { + "epoch": 0.81, + "grad_norm": 0.9306661523910239, + "learning_rate": 1.74938198671597e-06, + "loss": 0.1754, + "step": 16019 + }, + { + "epoch": 0.81, + "grad_norm": 1.4884420143779518, + "learning_rate": 1.7484514913538154e-06, + "loss": 0.1673, + "step": 16020 + }, + { + "epoch": 0.81, + "grad_norm": 0.9758314584022224, + "learning_rate": 1.7475212198153823e-06, + "loss": 0.1535, + "step": 16021 + }, + { + "epoch": 0.81, + "grad_norm": 1.2001369707687648, + "learning_rate": 1.7465911721259e-06, + "loss": 0.178, + "step": 16022 + }, + { + "epoch": 0.81, + "grad_norm": 1.2680078302782236, + "learning_rate": 1.7456613483106e-06, + "loss": 0.1624, + "step": 16023 + }, + { + "epoch": 0.81, + "grad_norm": 0.9690828344942196, + "learning_rate": 1.7447317483947002e-06, + "loss": 0.1319, + "step": 16024 + }, + { + "epoch": 0.81, + "grad_norm": 1.2521543439308118, + "learning_rate": 1.7438023724034215e-06, + "loss": 0.1739, + "step": 16025 + }, + { + "epoch": 0.81, + "grad_norm": 0.8606284544479783, + "learning_rate": 1.7428732203619659e-06, + "loss": 0.1684, + "step": 16026 + }, + { + "epoch": 0.82, + "grad_norm": 1.2639439074812617, + "learning_rate": 1.741944292295541e-06, + "loss": 0.2058, + "step": 16027 + }, + { + "epoch": 0.82, + "grad_norm": 0.9448353201257036, + "learning_rate": 1.7410155882293434e-06, + "loss": 0.148, + "step": 16028 + }, + { + "epoch": 0.82, + "grad_norm": 1.6066023830778167, + "learning_rate": 1.7400871081885672e-06, + "loss": 0.1708, + "step": 16029 + }, + { + "epoch": 0.82, + "grad_norm": 2.043947156009396, + "learning_rate": 1.7391588521983948e-06, + "loss": 0.162, + "step": 16030 + }, + { + "epoch": 0.82, + "grad_norm": 1.0654314480734575, + "learning_rate": 1.7382308202840027e-06, + "loss": 0.1717, + "step": 16031 + }, + { + "epoch": 0.82, + "grad_norm": 0.9800410320580712, + "learning_rate": 1.737303012470568e-06, + "loss": 0.1872, + "step": 16032 + }, + { + "epoch": 0.82, + "grad_norm": 1.0186507793410637, + "learning_rate": 1.7363754287832535e-06, + "loss": 0.174, + "step": 16033 + }, + { + "epoch": 0.82, + "grad_norm": 1.2723809059140396, + "learning_rate": 1.7354480692472253e-06, + "loss": 0.1666, + "step": 16034 + }, + { + "epoch": 0.82, + "grad_norm": 0.9192403831092457, + "learning_rate": 1.7345209338876324e-06, + "loss": 0.1606, + "step": 16035 + }, + { + "epoch": 0.82, + "grad_norm": 0.8880642758230413, + "learning_rate": 1.7335940227296254e-06, + "loss": 0.1649, + "step": 16036 + }, + { + "epoch": 0.82, + "grad_norm": 1.191038489713937, + "learning_rate": 1.7326673357983482e-06, + "loss": 0.1748, + "step": 16037 + }, + { + "epoch": 0.82, + "grad_norm": 1.0796781843628873, + "learning_rate": 1.7317408731189378e-06, + "loss": 0.1598, + "step": 16038 + }, + { + "epoch": 0.82, + "grad_norm": 1.8984590128115462, + "learning_rate": 1.7308146347165212e-06, + "loss": 0.1815, + "step": 16039 + }, + { + "epoch": 0.82, + "grad_norm": 0.9255649584515351, + "learning_rate": 1.729888620616228e-06, + "loss": 0.1813, + "step": 16040 + }, + { + "epoch": 0.82, + "grad_norm": 1.1961173565730827, + "learning_rate": 1.7289628308431694e-06, + "loss": 0.1753, + "step": 16041 + }, + { + "epoch": 0.82, + "grad_norm": 1.193927705094369, + "learning_rate": 1.7280372654224642e-06, + "loss": 0.1873, + "step": 16042 + }, + { + "epoch": 0.82, + "grad_norm": 0.8192238004138188, + "learning_rate": 1.7271119243792135e-06, + "loss": 0.1591, + "step": 16043 + }, + { + "epoch": 0.82, + "grad_norm": 0.964331006140522, + "learning_rate": 1.726186807738518e-06, + "loss": 0.1526, + "step": 16044 + }, + { + "epoch": 0.82, + "grad_norm": 1.8146540744504316, + "learning_rate": 1.7252619155254734e-06, + "loss": 0.1507, + "step": 16045 + }, + { + "epoch": 0.82, + "grad_norm": 0.8120205092019961, + "learning_rate": 1.7243372477651688e-06, + "loss": 0.1432, + "step": 16046 + }, + { + "epoch": 0.82, + "grad_norm": 0.9761030170080665, + "learning_rate": 1.7234128044826836e-06, + "loss": 0.1664, + "step": 16047 + }, + { + "epoch": 0.82, + "grad_norm": 1.1021142028744921, + "learning_rate": 1.7224885857030916e-06, + "loss": 0.1598, + "step": 16048 + }, + { + "epoch": 0.82, + "grad_norm": 1.8608872145849567, + "learning_rate": 1.7215645914514668e-06, + "loss": 0.1622, + "step": 16049 + }, + { + "epoch": 0.82, + "grad_norm": 1.107725886336896, + "learning_rate": 1.7206408217528669e-06, + "loss": 0.1642, + "step": 16050 + }, + { + "epoch": 0.82, + "grad_norm": 1.0829423681796864, + "learning_rate": 1.7197172766323556e-06, + "loss": 0.1549, + "step": 16051 + }, + { + "epoch": 0.82, + "grad_norm": 1.2654030905626539, + "learning_rate": 1.718793956114978e-06, + "loss": 0.1551, + "step": 16052 + }, + { + "epoch": 0.82, + "grad_norm": 1.7045139028307472, + "learning_rate": 1.717870860225782e-06, + "loss": 0.1527, + "step": 16053 + }, + { + "epoch": 0.82, + "grad_norm": 1.109320034642993, + "learning_rate": 1.7169479889898065e-06, + "loss": 0.175, + "step": 16054 + }, + { + "epoch": 0.82, + "grad_norm": 1.471551674901941, + "learning_rate": 1.7160253424320872e-06, + "loss": 0.1926, + "step": 16055 + }, + { + "epoch": 0.82, + "grad_norm": 1.8560448881427307, + "learning_rate": 1.7151029205776459e-06, + "loss": 0.181, + "step": 16056 + }, + { + "epoch": 0.82, + "grad_norm": 1.2365487452671833, + "learning_rate": 1.7141807234515085e-06, + "loss": 0.1671, + "step": 16057 + }, + { + "epoch": 0.82, + "grad_norm": 0.8256655601366389, + "learning_rate": 1.7132587510786846e-06, + "loss": 0.1469, + "step": 16058 + }, + { + "epoch": 0.82, + "grad_norm": 1.1887788220025923, + "learning_rate": 1.7123370034841869e-06, + "loss": 0.1688, + "step": 16059 + }, + { + "epoch": 0.82, + "grad_norm": 1.3846537570490054, + "learning_rate": 1.7114154806930138e-06, + "loss": 0.1707, + "step": 16060 + }, + { + "epoch": 0.82, + "grad_norm": 1.0174836563541019, + "learning_rate": 1.7104941827301668e-06, + "loss": 0.1723, + "step": 16061 + }, + { + "epoch": 0.82, + "grad_norm": 1.233727975926785, + "learning_rate": 1.7095731096206313e-06, + "loss": 0.182, + "step": 16062 + }, + { + "epoch": 0.82, + "grad_norm": 0.9338202871197535, + "learning_rate": 1.7086522613893918e-06, + "loss": 0.1656, + "step": 16063 + }, + { + "epoch": 0.82, + "grad_norm": 1.5330592853721912, + "learning_rate": 1.7077316380614317e-06, + "loss": 0.1568, + "step": 16064 + }, + { + "epoch": 0.82, + "grad_norm": 1.663600549880783, + "learning_rate": 1.7068112396617164e-06, + "loss": 0.1489, + "step": 16065 + }, + { + "epoch": 0.82, + "grad_norm": 1.2309622613220172, + "learning_rate": 1.7058910662152173e-06, + "loss": 0.1612, + "step": 16066 + }, + { + "epoch": 0.82, + "grad_norm": 1.2666130678712084, + "learning_rate": 1.7049711177468896e-06, + "loss": 0.1541, + "step": 16067 + }, + { + "epoch": 0.82, + "grad_norm": 0.9627847158187706, + "learning_rate": 1.7040513942816905e-06, + "loss": 0.1571, + "step": 16068 + }, + { + "epoch": 0.82, + "grad_norm": 2.041748171121705, + "learning_rate": 1.703131895844563e-06, + "loss": 0.1844, + "step": 16069 + }, + { + "epoch": 0.82, + "grad_norm": 1.0538437416835476, + "learning_rate": 1.7022126224604529e-06, + "loss": 0.1931, + "step": 16070 + }, + { + "epoch": 0.82, + "grad_norm": 0.8196593035973092, + "learning_rate": 1.7012935741542925e-06, + "loss": 0.1474, + "step": 16071 + }, + { + "epoch": 0.82, + "grad_norm": 1.5544017080113712, + "learning_rate": 1.700374750951016e-06, + "loss": 0.1739, + "step": 16072 + }, + { + "epoch": 0.82, + "grad_norm": 1.6594564457538314, + "learning_rate": 1.6994561528755404e-06, + "loss": 0.1629, + "step": 16073 + }, + { + "epoch": 0.82, + "grad_norm": 1.716203363591545, + "learning_rate": 1.698537779952788e-06, + "loss": 0.1574, + "step": 16074 + }, + { + "epoch": 0.82, + "grad_norm": 2.4843481543008163, + "learning_rate": 1.6976196322076655e-06, + "loss": 0.1672, + "step": 16075 + }, + { + "epoch": 0.82, + "grad_norm": 1.2056535835474138, + "learning_rate": 1.6967017096650807e-06, + "loss": 0.1832, + "step": 16076 + }, + { + "epoch": 0.82, + "grad_norm": 0.9487721823636202, + "learning_rate": 1.6957840123499292e-06, + "loss": 0.1626, + "step": 16077 + }, + { + "epoch": 0.82, + "grad_norm": 0.8886952662815739, + "learning_rate": 1.6948665402871067e-06, + "loss": 0.1518, + "step": 16078 + }, + { + "epoch": 0.82, + "grad_norm": 1.4301649119446862, + "learning_rate": 1.6939492935014966e-06, + "loss": 0.1574, + "step": 16079 + }, + { + "epoch": 0.82, + "grad_norm": 1.0618141135360368, + "learning_rate": 1.6930322720179816e-06, + "loss": 0.1476, + "step": 16080 + }, + { + "epoch": 0.82, + "grad_norm": 1.076064486726175, + "learning_rate": 1.6921154758614378e-06, + "loss": 0.164, + "step": 16081 + }, + { + "epoch": 0.82, + "grad_norm": 1.3984045430593135, + "learning_rate": 1.6911989050567279e-06, + "loss": 0.1682, + "step": 16082 + }, + { + "epoch": 0.82, + "grad_norm": 0.9973740840239897, + "learning_rate": 1.6902825596287198e-06, + "loss": 0.1669, + "step": 16083 + }, + { + "epoch": 0.82, + "grad_norm": 1.190269199690928, + "learning_rate": 1.6893664396022646e-06, + "loss": 0.1423, + "step": 16084 + }, + { + "epoch": 0.82, + "grad_norm": 1.473604726310278, + "learning_rate": 1.688450545002216e-06, + "loss": 0.1795, + "step": 16085 + }, + { + "epoch": 0.82, + "grad_norm": 1.0436960291378419, + "learning_rate": 1.6875348758534127e-06, + "loss": 0.1685, + "step": 16086 + }, + { + "epoch": 0.82, + "grad_norm": 1.3857159745313614, + "learning_rate": 1.6866194321806984e-06, + "loss": 0.1651, + "step": 16087 + }, + { + "epoch": 0.82, + "grad_norm": 0.8033380877779788, + "learning_rate": 1.6857042140088996e-06, + "loss": 0.1655, + "step": 16088 + }, + { + "epoch": 0.82, + "grad_norm": 0.9083806621123651, + "learning_rate": 1.6847892213628436e-06, + "loss": 0.1639, + "step": 16089 + }, + { + "epoch": 0.82, + "grad_norm": 1.0781586943612742, + "learning_rate": 1.6838744542673492e-06, + "loss": 0.1603, + "step": 16090 + }, + { + "epoch": 0.82, + "grad_norm": 0.9771201956290885, + "learning_rate": 1.6829599127472318e-06, + "loss": 0.1743, + "step": 16091 + }, + { + "epoch": 0.82, + "grad_norm": 1.060528233189165, + "learning_rate": 1.6820455968272953e-06, + "loss": 0.1684, + "step": 16092 + }, + { + "epoch": 0.82, + "grad_norm": 0.8894300227873709, + "learning_rate": 1.681131506532343e-06, + "loss": 0.1536, + "step": 16093 + }, + { + "epoch": 0.82, + "grad_norm": 1.3933629571467805, + "learning_rate": 1.6802176418871664e-06, + "loss": 0.1716, + "step": 16094 + }, + { + "epoch": 0.82, + "grad_norm": 0.9816041566511617, + "learning_rate": 1.6793040029165596e-06, + "loss": 0.1823, + "step": 16095 + }, + { + "epoch": 0.82, + "grad_norm": 1.2379254151826307, + "learning_rate": 1.6783905896452978e-06, + "loss": 0.1749, + "step": 16096 + }, + { + "epoch": 0.82, + "grad_norm": 1.0753028944850174, + "learning_rate": 1.677477402098162e-06, + "loss": 0.1633, + "step": 16097 + }, + { + "epoch": 0.82, + "grad_norm": 0.9176788598790777, + "learning_rate": 1.6765644402999216e-06, + "loss": 0.1544, + "step": 16098 + }, + { + "epoch": 0.82, + "grad_norm": 1.2709621204145147, + "learning_rate": 1.675651704275344e-06, + "loss": 0.1679, + "step": 16099 + }, + { + "epoch": 0.82, + "grad_norm": 1.1446384656419066, + "learning_rate": 1.674739194049183e-06, + "loss": 0.1589, + "step": 16100 + }, + { + "epoch": 0.82, + "grad_norm": 1.02180679059322, + "learning_rate": 1.673826909646189e-06, + "loss": 0.1534, + "step": 16101 + }, + { + "epoch": 0.82, + "grad_norm": 1.2679799246233314, + "learning_rate": 1.6729148510911142e-06, + "loss": 0.1636, + "step": 16102 + }, + { + "epoch": 0.82, + "grad_norm": 0.9948990950616705, + "learning_rate": 1.672003018408691e-06, + "loss": 0.1496, + "step": 16103 + }, + { + "epoch": 0.82, + "grad_norm": 0.900105132623221, + "learning_rate": 1.6710914116236588e-06, + "loss": 0.1585, + "step": 16104 + }, + { + "epoch": 0.82, + "grad_norm": 0.9949865804409496, + "learning_rate": 1.6701800307607397e-06, + "loss": 0.1584, + "step": 16105 + }, + { + "epoch": 0.82, + "grad_norm": 1.156298996343876, + "learning_rate": 1.6692688758446574e-06, + "loss": 0.174, + "step": 16106 + }, + { + "epoch": 0.82, + "grad_norm": 1.9389002868080762, + "learning_rate": 1.6683579469001287e-06, + "loss": 0.1793, + "step": 16107 + }, + { + "epoch": 0.82, + "grad_norm": 0.9776814651404512, + "learning_rate": 1.6674472439518629e-06, + "loss": 0.1497, + "step": 16108 + }, + { + "epoch": 0.82, + "grad_norm": 1.0709974373621516, + "learning_rate": 1.6665367670245592e-06, + "loss": 0.1841, + "step": 16109 + }, + { + "epoch": 0.82, + "grad_norm": 0.8930890109533807, + "learning_rate": 1.6656265161429186e-06, + "loss": 0.1682, + "step": 16110 + }, + { + "epoch": 0.82, + "grad_norm": 0.8296255055125633, + "learning_rate": 1.664716491331626e-06, + "loss": 0.1513, + "step": 16111 + }, + { + "epoch": 0.82, + "grad_norm": 0.9478955423947181, + "learning_rate": 1.6638066926153728e-06, + "loss": 0.1401, + "step": 16112 + }, + { + "epoch": 0.82, + "grad_norm": 0.9566375845034522, + "learning_rate": 1.6628971200188316e-06, + "loss": 0.1619, + "step": 16113 + }, + { + "epoch": 0.82, + "grad_norm": 0.7919743312019453, + "learning_rate": 1.66198777356668e-06, + "loss": 0.1513, + "step": 16114 + }, + { + "epoch": 0.82, + "grad_norm": 1.0115909183001128, + "learning_rate": 1.6610786532835776e-06, + "loss": 0.1706, + "step": 16115 + }, + { + "epoch": 0.82, + "grad_norm": 0.92772352473913, + "learning_rate": 1.660169759194188e-06, + "loss": 0.1635, + "step": 16116 + }, + { + "epoch": 0.82, + "grad_norm": 1.1785822029161883, + "learning_rate": 1.6592610913231665e-06, + "loss": 0.1683, + "step": 16117 + }, + { + "epoch": 0.82, + "grad_norm": 0.9009097664645848, + "learning_rate": 1.6583526496951573e-06, + "loss": 0.1604, + "step": 16118 + }, + { + "epoch": 0.82, + "grad_norm": 1.8481746888363388, + "learning_rate": 1.657444434334805e-06, + "loss": 0.1726, + "step": 16119 + }, + { + "epoch": 0.82, + "grad_norm": 1.0707301830080818, + "learning_rate": 1.656536445266742e-06, + "loss": 0.185, + "step": 16120 + }, + { + "epoch": 0.82, + "grad_norm": 1.1814006659495366, + "learning_rate": 1.655628682515602e-06, + "loss": 0.1763, + "step": 16121 + }, + { + "epoch": 0.82, + "grad_norm": 1.2437639723178673, + "learning_rate": 1.654721146106002e-06, + "loss": 0.1796, + "step": 16122 + }, + { + "epoch": 0.82, + "grad_norm": 1.7788299770768288, + "learning_rate": 1.6538138360625633e-06, + "loss": 0.1792, + "step": 16123 + }, + { + "epoch": 0.82, + "grad_norm": 1.0035397369570327, + "learning_rate": 1.652906752409894e-06, + "loss": 0.1646, + "step": 16124 + }, + { + "epoch": 0.82, + "grad_norm": 1.0281512783310551, + "learning_rate": 1.6519998951726045e-06, + "loss": 0.1858, + "step": 16125 + }, + { + "epoch": 0.82, + "grad_norm": 0.8408852550415522, + "learning_rate": 1.6510932643752863e-06, + "loss": 0.1609, + "step": 16126 + }, + { + "epoch": 0.82, + "grad_norm": 1.0078754787295523, + "learning_rate": 1.6501868600425374e-06, + "loss": 0.1689, + "step": 16127 + }, + { + "epoch": 0.82, + "grad_norm": 0.8816314300651446, + "learning_rate": 1.6492806821989393e-06, + "loss": 0.141, + "step": 16128 + }, + { + "epoch": 0.82, + "grad_norm": 1.0799580233562474, + "learning_rate": 1.6483747308690768e-06, + "loss": 0.1734, + "step": 16129 + }, + { + "epoch": 0.82, + "grad_norm": 1.128032995538658, + "learning_rate": 1.6474690060775178e-06, + "loss": 0.1487, + "step": 16130 + }, + { + "epoch": 0.82, + "grad_norm": 1.2278270712864054, + "learning_rate": 1.6465635078488372e-06, + "loss": 0.1667, + "step": 16131 + }, + { + "epoch": 0.82, + "grad_norm": 0.894278260754333, + "learning_rate": 1.6456582362075911e-06, + "loss": 0.1604, + "step": 16132 + }, + { + "epoch": 0.82, + "grad_norm": 0.8471791910866014, + "learning_rate": 1.6447531911783365e-06, + "loss": 0.1685, + "step": 16133 + }, + { + "epoch": 0.82, + "grad_norm": 1.4038276898923099, + "learning_rate": 1.6438483727856268e-06, + "loss": 0.166, + "step": 16134 + }, + { + "epoch": 0.82, + "grad_norm": 0.8586239073799874, + "learning_rate": 1.6429437810539982e-06, + "loss": 0.1607, + "step": 16135 + }, + { + "epoch": 0.82, + "grad_norm": 3.769070945114241, + "learning_rate": 1.6420394160079955e-06, + "loss": 0.1649, + "step": 16136 + }, + { + "epoch": 0.82, + "grad_norm": 1.3100985933899716, + "learning_rate": 1.6411352776721423e-06, + "loss": 0.2015, + "step": 16137 + }, + { + "epoch": 0.82, + "grad_norm": 1.2512570673071481, + "learning_rate": 1.640231366070969e-06, + "loss": 0.1608, + "step": 16138 + }, + { + "epoch": 0.82, + "grad_norm": 0.8822105121244441, + "learning_rate": 1.6393276812289905e-06, + "loss": 0.1631, + "step": 16139 + }, + { + "epoch": 0.82, + "grad_norm": 0.8806674958039984, + "learning_rate": 1.6384242231707203e-06, + "loss": 0.1791, + "step": 16140 + }, + { + "epoch": 0.82, + "grad_norm": 0.8521437055834901, + "learning_rate": 1.6375209919206657e-06, + "loss": 0.156, + "step": 16141 + }, + { + "epoch": 0.82, + "grad_norm": 0.863487539470905, + "learning_rate": 1.6366179875033284e-06, + "loss": 0.1528, + "step": 16142 + }, + { + "epoch": 0.82, + "grad_norm": 1.3976787853176753, + "learning_rate": 1.6357152099431984e-06, + "loss": 0.169, + "step": 16143 + }, + { + "epoch": 0.82, + "grad_norm": 1.139814567658281, + "learning_rate": 1.6348126592647684e-06, + "loss": 0.192, + "step": 16144 + }, + { + "epoch": 0.82, + "grad_norm": 1.1335595840636001, + "learning_rate": 1.6339103354925146e-06, + "loss": 0.1688, + "step": 16145 + }, + { + "epoch": 0.82, + "grad_norm": 0.8440126829948659, + "learning_rate": 1.6330082386509182e-06, + "loss": 0.1534, + "step": 16146 + }, + { + "epoch": 0.82, + "grad_norm": 0.8508049215827809, + "learning_rate": 1.6321063687644435e-06, + "loss": 0.1768, + "step": 16147 + }, + { + "epoch": 0.82, + "grad_norm": 1.282254408840473, + "learning_rate": 1.631204725857558e-06, + "loss": 0.1668, + "step": 16148 + }, + { + "epoch": 0.82, + "grad_norm": 1.0052396010512652, + "learning_rate": 1.6303033099547149e-06, + "loss": 0.1659, + "step": 16149 + }, + { + "epoch": 0.82, + "grad_norm": 0.9286821198774534, + "learning_rate": 1.629402121080368e-06, + "loss": 0.1649, + "step": 16150 + }, + { + "epoch": 0.82, + "grad_norm": 1.7312340130080666, + "learning_rate": 1.6285011592589628e-06, + "loss": 0.153, + "step": 16151 + }, + { + "epoch": 0.82, + "grad_norm": 1.0912525370551018, + "learning_rate": 1.6276004245149346e-06, + "loss": 0.1601, + "step": 16152 + }, + { + "epoch": 0.82, + "grad_norm": 1.2664795393962827, + "learning_rate": 1.6266999168727204e-06, + "loss": 0.1802, + "step": 16153 + }, + { + "epoch": 0.82, + "grad_norm": 1.0799495507810202, + "learning_rate": 1.6257996363567408e-06, + "loss": 0.1593, + "step": 16154 + }, + { + "epoch": 0.82, + "grad_norm": 0.9822469573275092, + "learning_rate": 1.6248995829914216e-06, + "loss": 0.1661, + "step": 16155 + }, + { + "epoch": 0.82, + "grad_norm": 0.8909355640364796, + "learning_rate": 1.6239997568011723e-06, + "loss": 0.1796, + "step": 16156 + }, + { + "epoch": 0.82, + "grad_norm": 0.9762448493305782, + "learning_rate": 1.6231001578104045e-06, + "loss": 0.1688, + "step": 16157 + }, + { + "epoch": 0.82, + "grad_norm": 0.8456224303249009, + "learning_rate": 1.6222007860435153e-06, + "loss": 0.1545, + "step": 16158 + }, + { + "epoch": 0.82, + "grad_norm": 1.097065634578124, + "learning_rate": 1.621301641524904e-06, + "loss": 0.1919, + "step": 16159 + }, + { + "epoch": 0.82, + "grad_norm": 1.2272594527185567, + "learning_rate": 1.6204027242789577e-06, + "loss": 0.1747, + "step": 16160 + }, + { + "epoch": 0.82, + "grad_norm": 1.4027279958659618, + "learning_rate": 1.619504034330064e-06, + "loss": 0.1336, + "step": 16161 + }, + { + "epoch": 0.82, + "grad_norm": 1.18531326701348, + "learning_rate": 1.618605571702595e-06, + "loss": 0.1574, + "step": 16162 + }, + { + "epoch": 0.82, + "grad_norm": 1.2249771958511868, + "learning_rate": 1.617707336420925e-06, + "loss": 0.1596, + "step": 16163 + }, + { + "epoch": 0.82, + "grad_norm": 1.0243959933280622, + "learning_rate": 1.6168093285094144e-06, + "loss": 0.1816, + "step": 16164 + }, + { + "epoch": 0.82, + "grad_norm": 1.1661606544666774, + "learning_rate": 1.6159115479924259e-06, + "loss": 0.1564, + "step": 16165 + }, + { + "epoch": 0.82, + "grad_norm": 1.535577913582053, + "learning_rate": 1.615013994894309e-06, + "loss": 0.1746, + "step": 16166 + }, + { + "epoch": 0.82, + "grad_norm": 1.085710298402728, + "learning_rate": 1.6141166692394106e-06, + "loss": 0.134, + "step": 16167 + }, + { + "epoch": 0.82, + "grad_norm": 1.0799809442745136, + "learning_rate": 1.6132195710520716e-06, + "loss": 0.1563, + "step": 16168 + }, + { + "epoch": 0.82, + "grad_norm": 1.2622437343108193, + "learning_rate": 1.6123227003566267e-06, + "loss": 0.1582, + "step": 16169 + }, + { + "epoch": 0.82, + "grad_norm": 1.0677073332003861, + "learning_rate": 1.6114260571774031e-06, + "loss": 0.1644, + "step": 16170 + }, + { + "epoch": 0.82, + "grad_norm": 1.2140411169479177, + "learning_rate": 1.6105296415387194e-06, + "loss": 0.1616, + "step": 16171 + }, + { + "epoch": 0.82, + "grad_norm": 0.9562483027738354, + "learning_rate": 1.609633453464895e-06, + "loss": 0.1614, + "step": 16172 + }, + { + "epoch": 0.82, + "grad_norm": 1.2325748339085705, + "learning_rate": 1.6087374929802346e-06, + "loss": 0.1424, + "step": 16173 + }, + { + "epoch": 0.82, + "grad_norm": 1.0057048813635803, + "learning_rate": 1.6078417601090457e-06, + "loss": 0.1744, + "step": 16174 + }, + { + "epoch": 0.82, + "grad_norm": 1.0863081131932661, + "learning_rate": 1.606946254875621e-06, + "loss": 0.1825, + "step": 16175 + }, + { + "epoch": 0.82, + "grad_norm": 1.0069948602074392, + "learning_rate": 1.6060509773042533e-06, + "loss": 0.1572, + "step": 16176 + }, + { + "epoch": 0.82, + "grad_norm": 0.8300856535641151, + "learning_rate": 1.6051559274192275e-06, + "loss": 0.183, + "step": 16177 + }, + { + "epoch": 0.82, + "grad_norm": 0.8124015039311869, + "learning_rate": 1.604261105244823e-06, + "loss": 0.1743, + "step": 16178 + }, + { + "epoch": 0.82, + "grad_norm": 1.0108753370218027, + "learning_rate": 1.6033665108053075e-06, + "loss": 0.1524, + "step": 16179 + }, + { + "epoch": 0.82, + "grad_norm": 1.1061628747426537, + "learning_rate": 1.6024721441249525e-06, + "loss": 0.1644, + "step": 16180 + }, + { + "epoch": 0.82, + "grad_norm": 1.137413127806049, + "learning_rate": 1.6015780052280128e-06, + "loss": 0.1657, + "step": 16181 + }, + { + "epoch": 0.82, + "grad_norm": 1.2868473563497702, + "learning_rate": 1.6006840941387458e-06, + "loss": 0.1818, + "step": 16182 + }, + { + "epoch": 0.82, + "grad_norm": 1.2457052080057864, + "learning_rate": 1.5997904108813944e-06, + "loss": 0.1723, + "step": 16183 + }, + { + "epoch": 0.82, + "grad_norm": 1.2843137230704258, + "learning_rate": 1.5988969554802058e-06, + "loss": 0.1634, + "step": 16184 + }, + { + "epoch": 0.82, + "grad_norm": 0.8464560307722944, + "learning_rate": 1.5980037279594097e-06, + "loss": 0.1469, + "step": 16185 + }, + { + "epoch": 0.82, + "grad_norm": 1.0749915701214785, + "learning_rate": 1.5971107283432363e-06, + "loss": 0.1841, + "step": 16186 + }, + { + "epoch": 0.82, + "grad_norm": 1.1146209065453505, + "learning_rate": 1.5962179566559112e-06, + "loss": 0.1895, + "step": 16187 + }, + { + "epoch": 0.82, + "grad_norm": 0.9494386154442932, + "learning_rate": 1.5953254129216467e-06, + "loss": 0.1494, + "step": 16188 + }, + { + "epoch": 0.82, + "grad_norm": 1.082046499912269, + "learning_rate": 1.594433097164657e-06, + "loss": 0.1761, + "step": 16189 + }, + { + "epoch": 0.82, + "grad_norm": 2.0094160078152505, + "learning_rate": 1.593541009409143e-06, + "loss": 0.1646, + "step": 16190 + }, + { + "epoch": 0.82, + "grad_norm": 1.3228427682959791, + "learning_rate": 1.592649149679305e-06, + "loss": 0.1611, + "step": 16191 + }, + { + "epoch": 0.82, + "grad_norm": 1.2135503694029435, + "learning_rate": 1.5917575179993328e-06, + "loss": 0.1604, + "step": 16192 + }, + { + "epoch": 0.82, + "grad_norm": 1.4004802440515558, + "learning_rate": 1.5908661143934112e-06, + "loss": 0.1517, + "step": 16193 + }, + { + "epoch": 0.82, + "grad_norm": 0.8070297986487432, + "learning_rate": 1.5899749388857222e-06, + "loss": 0.1672, + "step": 16194 + }, + { + "epoch": 0.82, + "grad_norm": 0.9741897108941053, + "learning_rate": 1.5890839915004398e-06, + "loss": 0.1729, + "step": 16195 + }, + { + "epoch": 0.82, + "grad_norm": 0.948612681485556, + "learning_rate": 1.5881932722617277e-06, + "loss": 0.1833, + "step": 16196 + }, + { + "epoch": 0.82, + "grad_norm": 2.751015180435459, + "learning_rate": 1.5873027811937491e-06, + "loss": 0.1689, + "step": 16197 + }, + { + "epoch": 0.82, + "grad_norm": 0.8035718788803883, + "learning_rate": 1.5864125183206569e-06, + "loss": 0.1714, + "step": 16198 + }, + { + "epoch": 0.82, + "grad_norm": 0.8762858502283523, + "learning_rate": 1.5855224836666016e-06, + "loss": 0.1465, + "step": 16199 + }, + { + "epoch": 0.82, + "grad_norm": 1.2879099555445566, + "learning_rate": 1.584632677255723e-06, + "loss": 0.1748, + "step": 16200 + }, + { + "epoch": 0.82, + "grad_norm": 1.0847853329598949, + "learning_rate": 1.5837430991121594e-06, + "loss": 0.1581, + "step": 16201 + }, + { + "epoch": 0.82, + "grad_norm": 1.1504659150239045, + "learning_rate": 1.5828537492600382e-06, + "loss": 0.1577, + "step": 16202 + }, + { + "epoch": 0.82, + "grad_norm": 2.439135514835605, + "learning_rate": 1.5819646277234834e-06, + "loss": 0.166, + "step": 16203 + }, + { + "epoch": 0.82, + "grad_norm": 1.593461322594299, + "learning_rate": 1.581075734526617e-06, + "loss": 0.1686, + "step": 16204 + }, + { + "epoch": 0.82, + "grad_norm": 1.0218168743453442, + "learning_rate": 1.580187069693544e-06, + "loss": 0.1435, + "step": 16205 + }, + { + "epoch": 0.82, + "grad_norm": 1.13250068726926, + "learning_rate": 1.5792986332483739e-06, + "loss": 0.17, + "step": 16206 + }, + { + "epoch": 0.82, + "grad_norm": 1.1497848504486927, + "learning_rate": 1.578410425215202e-06, + "loss": 0.1874, + "step": 16207 + }, + { + "epoch": 0.82, + "grad_norm": 0.9860777098312422, + "learning_rate": 1.577522445618126e-06, + "loss": 0.1784, + "step": 16208 + }, + { + "epoch": 0.82, + "grad_norm": 1.0653169406941323, + "learning_rate": 1.576634694481227e-06, + "loss": 0.1772, + "step": 16209 + }, + { + "epoch": 0.82, + "grad_norm": 1.3828187860236651, + "learning_rate": 1.575747171828589e-06, + "loss": 0.1707, + "step": 16210 + }, + { + "epoch": 0.82, + "grad_norm": 1.0105506162693179, + "learning_rate": 1.5748598776842838e-06, + "loss": 0.175, + "step": 16211 + }, + { + "epoch": 0.82, + "grad_norm": 1.0796724874430454, + "learning_rate": 1.5739728120723795e-06, + "loss": 0.1764, + "step": 16212 + }, + { + "epoch": 0.82, + "grad_norm": 0.8717548247982144, + "learning_rate": 1.573085975016938e-06, + "loss": 0.1559, + "step": 16213 + }, + { + "epoch": 0.82, + "grad_norm": 0.9807085943349985, + "learning_rate": 1.5721993665420187e-06, + "loss": 0.1687, + "step": 16214 + }, + { + "epoch": 0.82, + "grad_norm": 1.243518809958635, + "learning_rate": 1.5713129866716647e-06, + "loss": 0.1695, + "step": 16215 + }, + { + "epoch": 0.82, + "grad_norm": 1.0002427361868018, + "learning_rate": 1.5704268354299246e-06, + "loss": 0.1571, + "step": 16216 + }, + { + "epoch": 0.82, + "grad_norm": 1.0513130593254636, + "learning_rate": 1.56954091284083e-06, + "loss": 0.1683, + "step": 16217 + }, + { + "epoch": 0.82, + "grad_norm": 1.1471167807414897, + "learning_rate": 1.5686552189284177e-06, + "loss": 0.172, + "step": 16218 + }, + { + "epoch": 0.82, + "grad_norm": 0.8267405172081425, + "learning_rate": 1.5677697537167048e-06, + "loss": 0.1513, + "step": 16219 + }, + { + "epoch": 0.82, + "grad_norm": 0.7890938430215125, + "learning_rate": 1.5668845172297143e-06, + "loss": 0.1588, + "step": 16220 + }, + { + "epoch": 0.82, + "grad_norm": 0.8541591590053499, + "learning_rate": 1.5659995094914603e-06, + "loss": 0.1799, + "step": 16221 + }, + { + "epoch": 0.82, + "grad_norm": 1.0495622903964184, + "learning_rate": 1.565114730525944e-06, + "loss": 0.1556, + "step": 16222 + }, + { + "epoch": 0.82, + "grad_norm": 1.0599533152387104, + "learning_rate": 1.564230180357168e-06, + "loss": 0.169, + "step": 16223 + }, + { + "epoch": 0.83, + "grad_norm": 1.1758205426339616, + "learning_rate": 1.5633458590091233e-06, + "loss": 0.1818, + "step": 16224 + }, + { + "epoch": 0.83, + "grad_norm": 1.3566302422746803, + "learning_rate": 1.5624617665058005e-06, + "loss": 0.1603, + "step": 16225 + }, + { + "epoch": 0.83, + "grad_norm": 1.0060617851367233, + "learning_rate": 1.5615779028711775e-06, + "loss": 0.1576, + "step": 16226 + }, + { + "epoch": 0.83, + "grad_norm": 1.6034621808942282, + "learning_rate": 1.5606942681292326e-06, + "loss": 0.1638, + "step": 16227 + }, + { + "epoch": 0.83, + "grad_norm": 1.0066324518206973, + "learning_rate": 1.55981086230393e-06, + "loss": 0.1641, + "step": 16228 + }, + { + "epoch": 0.83, + "grad_norm": 1.0712368190292123, + "learning_rate": 1.5589276854192336e-06, + "loss": 0.1839, + "step": 16229 + }, + { + "epoch": 0.83, + "grad_norm": 0.9567927807699808, + "learning_rate": 1.5580447374991003e-06, + "loss": 0.1688, + "step": 16230 + }, + { + "epoch": 0.83, + "grad_norm": 1.3476202419921466, + "learning_rate": 1.557162018567484e-06, + "loss": 0.1629, + "step": 16231 + }, + { + "epoch": 0.83, + "grad_norm": 0.8950686877683366, + "learning_rate": 1.5562795286483212e-06, + "loss": 0.1457, + "step": 16232 + }, + { + "epoch": 0.83, + "grad_norm": 0.9267746323940856, + "learning_rate": 1.555397267765556e-06, + "loss": 0.1918, + "step": 16233 + }, + { + "epoch": 0.83, + "grad_norm": 1.0000652875899563, + "learning_rate": 1.5545152359431149e-06, + "loss": 0.1721, + "step": 16234 + }, + { + "epoch": 0.83, + "grad_norm": 0.9567611863206363, + "learning_rate": 1.5536334332049274e-06, + "loss": 0.1734, + "step": 16235 + }, + { + "epoch": 0.83, + "grad_norm": 1.0972798771214651, + "learning_rate": 1.5527518595749068e-06, + "loss": 0.1634, + "step": 16236 + }, + { + "epoch": 0.83, + "grad_norm": 1.130250553330175, + "learning_rate": 1.551870515076972e-06, + "loss": 0.172, + "step": 16237 + }, + { + "epoch": 0.83, + "grad_norm": 1.0199759185793813, + "learning_rate": 1.5509893997350245e-06, + "loss": 0.1778, + "step": 16238 + }, + { + "epoch": 0.83, + "grad_norm": 1.033979382796841, + "learning_rate": 1.5501085135729666e-06, + "loss": 0.1737, + "step": 16239 + }, + { + "epoch": 0.83, + "grad_norm": 1.348123338689501, + "learning_rate": 1.5492278566146945e-06, + "loss": 0.1746, + "step": 16240 + }, + { + "epoch": 0.83, + "grad_norm": 1.066130979899349, + "learning_rate": 1.5483474288840927e-06, + "loss": 0.196, + "step": 16241 + }, + { + "epoch": 0.83, + "grad_norm": 0.9158187549060963, + "learning_rate": 1.5474672304050454e-06, + "loss": 0.1575, + "step": 16242 + }, + { + "epoch": 0.83, + "grad_norm": 1.1187810995278062, + "learning_rate": 1.5465872612014255e-06, + "loss": 0.172, + "step": 16243 + }, + { + "epoch": 0.83, + "grad_norm": 2.019799657851233, + "learning_rate": 1.545707521297105e-06, + "loss": 0.1769, + "step": 16244 + }, + { + "epoch": 0.83, + "grad_norm": 1.0459380016808018, + "learning_rate": 1.5448280107159442e-06, + "loss": 0.1888, + "step": 16245 + }, + { + "epoch": 0.83, + "grad_norm": 1.187399423034828, + "learning_rate": 1.5439487294818002e-06, + "loss": 0.1361, + "step": 16246 + }, + { + "epoch": 0.83, + "grad_norm": 1.0301682592357948, + "learning_rate": 1.543069677618525e-06, + "loss": 0.1436, + "step": 16247 + }, + { + "epoch": 0.83, + "grad_norm": 1.1966512262807896, + "learning_rate": 1.5421908551499653e-06, + "loss": 0.1637, + "step": 16248 + }, + { + "epoch": 0.83, + "grad_norm": 1.800092552120167, + "learning_rate": 1.5413122620999533e-06, + "loss": 0.161, + "step": 16249 + }, + { + "epoch": 0.83, + "grad_norm": 0.8528766108510961, + "learning_rate": 1.540433898492326e-06, + "loss": 0.1634, + "step": 16250 + }, + { + "epoch": 0.83, + "grad_norm": 0.9868816636905955, + "learning_rate": 1.539555764350905e-06, + "loss": 0.167, + "step": 16251 + }, + { + "epoch": 0.83, + "grad_norm": 0.856926643725097, + "learning_rate": 1.5386778596995144e-06, + "loss": 0.1434, + "step": 16252 + }, + { + "epoch": 0.83, + "grad_norm": 1.1659977976533071, + "learning_rate": 1.5378001845619616e-06, + "loss": 0.1716, + "step": 16253 + }, + { + "epoch": 0.83, + "grad_norm": 1.4201134946411582, + "learning_rate": 1.53692273896206e-06, + "loss": 0.1706, + "step": 16254 + }, + { + "epoch": 0.83, + "grad_norm": 1.1415900722887473, + "learning_rate": 1.5360455229236049e-06, + "loss": 0.1813, + "step": 16255 + }, + { + "epoch": 0.83, + "grad_norm": 1.1716332672969776, + "learning_rate": 1.5351685364703916e-06, + "loss": 0.1597, + "step": 16256 + }, + { + "epoch": 0.83, + "grad_norm": 0.9427550839380094, + "learning_rate": 1.5342917796262136e-06, + "loss": 0.1502, + "step": 16257 + }, + { + "epoch": 0.83, + "grad_norm": 0.8506175397006018, + "learning_rate": 1.533415252414846e-06, + "loss": 0.1593, + "step": 16258 + }, + { + "epoch": 0.83, + "grad_norm": 0.7706683930731257, + "learning_rate": 1.5325389548600711e-06, + "loss": 0.1607, + "step": 16259 + }, + { + "epoch": 0.83, + "grad_norm": 1.1888829820220763, + "learning_rate": 1.531662886985652e-06, + "loss": 0.1752, + "step": 16260 + }, + { + "epoch": 0.83, + "grad_norm": 1.6787971741140937, + "learning_rate": 1.5307870488153586e-06, + "loss": 0.1642, + "step": 16261 + }, + { + "epoch": 0.83, + "grad_norm": 0.936195924156837, + "learning_rate": 1.529911440372942e-06, + "loss": 0.1665, + "step": 16262 + }, + { + "epoch": 0.83, + "grad_norm": 4.5813626304960975, + "learning_rate": 1.5290360616821564e-06, + "loss": 0.167, + "step": 16263 + }, + { + "epoch": 0.83, + "grad_norm": 0.9985641711965435, + "learning_rate": 1.5281609127667451e-06, + "loss": 0.1679, + "step": 16264 + }, + { + "epoch": 0.83, + "grad_norm": 2.9953769021880143, + "learning_rate": 1.5272859936504513e-06, + "loss": 0.1587, + "step": 16265 + }, + { + "epoch": 0.83, + "grad_norm": 0.899874692531944, + "learning_rate": 1.5264113043569994e-06, + "loss": 0.1555, + "step": 16266 + }, + { + "epoch": 0.83, + "grad_norm": 1.1861633024929679, + "learning_rate": 1.5255368449101226e-06, + "loss": 0.18, + "step": 16267 + }, + { + "epoch": 0.83, + "grad_norm": 1.033925166327829, + "learning_rate": 1.5246626153335364e-06, + "loss": 0.1426, + "step": 16268 + }, + { + "epoch": 0.83, + "grad_norm": 2.049961418295712, + "learning_rate": 1.5237886156509563e-06, + "loss": 0.1673, + "step": 16269 + }, + { + "epoch": 0.83, + "grad_norm": 1.0985176283197147, + "learning_rate": 1.5229148458860865e-06, + "loss": 0.1601, + "step": 16270 + }, + { + "epoch": 0.83, + "grad_norm": 0.8934168171267731, + "learning_rate": 1.5220413060626327e-06, + "loss": 0.1644, + "step": 16271 + }, + { + "epoch": 0.83, + "grad_norm": 0.9981608465218564, + "learning_rate": 1.5211679962042858e-06, + "loss": 0.1549, + "step": 16272 + }, + { + "epoch": 0.83, + "grad_norm": 0.8914455120721887, + "learning_rate": 1.5202949163347348e-06, + "loss": 0.1378, + "step": 16273 + }, + { + "epoch": 0.83, + "grad_norm": 1.6112780017756578, + "learning_rate": 1.519422066477666e-06, + "loss": 0.1838, + "step": 16274 + }, + { + "epoch": 0.83, + "grad_norm": 0.8425598172314763, + "learning_rate": 1.5185494466567508e-06, + "loss": 0.1734, + "step": 16275 + }, + { + "epoch": 0.83, + "grad_norm": 1.0261225190109788, + "learning_rate": 1.5176770568956623e-06, + "loss": 0.1726, + "step": 16276 + }, + { + "epoch": 0.83, + "grad_norm": 1.1873182046443194, + "learning_rate": 1.5168048972180605e-06, + "loss": 0.1688, + "step": 16277 + }, + { + "epoch": 0.83, + "grad_norm": 1.3521035987276042, + "learning_rate": 1.5159329676476075e-06, + "loss": 0.1623, + "step": 16278 + }, + { + "epoch": 0.83, + "grad_norm": 1.2750024973858918, + "learning_rate": 1.5150612682079502e-06, + "loss": 0.1844, + "step": 16279 + }, + { + "epoch": 0.83, + "grad_norm": 1.3258702215505207, + "learning_rate": 1.5141897989227372e-06, + "loss": 0.1674, + "step": 16280 + }, + { + "epoch": 0.83, + "grad_norm": 0.9289221783042039, + "learning_rate": 1.513318559815603e-06, + "loss": 0.1771, + "step": 16281 + }, + { + "epoch": 0.83, + "grad_norm": 1.074167772829532, + "learning_rate": 1.512447550910181e-06, + "loss": 0.1811, + "step": 16282 + }, + { + "epoch": 0.83, + "grad_norm": 0.9460771136548177, + "learning_rate": 1.5115767722301e-06, + "loss": 0.1523, + "step": 16283 + }, + { + "epoch": 0.83, + "grad_norm": 0.925950211310057, + "learning_rate": 1.51070622379898e-06, + "loss": 0.1527, + "step": 16284 + }, + { + "epoch": 0.83, + "grad_norm": 1.0862374172733744, + "learning_rate": 1.509835905640431e-06, + "loss": 0.1697, + "step": 16285 + }, + { + "epoch": 0.83, + "grad_norm": 1.2381593824256332, + "learning_rate": 1.5089658177780653e-06, + "loss": 0.1697, + "step": 16286 + }, + { + "epoch": 0.83, + "grad_norm": 1.3238266766665978, + "learning_rate": 1.5080959602354783e-06, + "loss": 0.1588, + "step": 16287 + }, + { + "epoch": 0.83, + "grad_norm": 3.170935256463546, + "learning_rate": 1.5072263330362713e-06, + "loss": 0.1631, + "step": 16288 + }, + { + "epoch": 0.83, + "grad_norm": 1.085728386526513, + "learning_rate": 1.5063569362040265e-06, + "loss": 0.1783, + "step": 16289 + }, + { + "epoch": 0.83, + "grad_norm": 0.8514572170467741, + "learning_rate": 1.5054877697623305e-06, + "loss": 0.1727, + "step": 16290 + }, + { + "epoch": 0.83, + "grad_norm": 1.031428838828624, + "learning_rate": 1.50461883373476e-06, + "loss": 0.1678, + "step": 16291 + }, + { + "epoch": 0.83, + "grad_norm": 1.1681429510390187, + "learning_rate": 1.5037501281448819e-06, + "loss": 0.1705, + "step": 16292 + }, + { + "epoch": 0.83, + "grad_norm": 1.383254386252376, + "learning_rate": 1.5028816530162627e-06, + "loss": 0.1611, + "step": 16293 + }, + { + "epoch": 0.83, + "grad_norm": 1.0569013766570596, + "learning_rate": 1.5020134083724568e-06, + "loss": 0.1687, + "step": 16294 + }, + { + "epoch": 0.83, + "grad_norm": 0.900329262047711, + "learning_rate": 1.5011453942370191e-06, + "loss": 0.1653, + "step": 16295 + }, + { + "epoch": 0.83, + "grad_norm": 1.2811474221674615, + "learning_rate": 1.5002776106334904e-06, + "loss": 0.1677, + "step": 16296 + }, + { + "epoch": 0.83, + "grad_norm": 1.1071821925985452, + "learning_rate": 1.4994100575854143e-06, + "loss": 0.1496, + "step": 16297 + }, + { + "epoch": 0.83, + "grad_norm": 0.7783188030566939, + "learning_rate": 1.4985427351163184e-06, + "loss": 0.1746, + "step": 16298 + }, + { + "epoch": 0.83, + "grad_norm": 1.3811377289457276, + "learning_rate": 1.4976756432497309e-06, + "loss": 0.1598, + "step": 16299 + }, + { + "epoch": 0.83, + "grad_norm": 0.7740242905028717, + "learning_rate": 1.4968087820091714e-06, + "loss": 0.1797, + "step": 16300 + }, + { + "epoch": 0.83, + "grad_norm": 1.2751691210341451, + "learning_rate": 1.495942151418156e-06, + "loss": 0.176, + "step": 16301 + }, + { + "epoch": 0.83, + "grad_norm": 0.8551080958388871, + "learning_rate": 1.495075751500188e-06, + "loss": 0.1475, + "step": 16302 + }, + { + "epoch": 0.83, + "grad_norm": 1.1155180459959604, + "learning_rate": 1.4942095822787738e-06, + "loss": 0.1723, + "step": 16303 + }, + { + "epoch": 0.83, + "grad_norm": 0.9582474774941909, + "learning_rate": 1.4933436437774017e-06, + "loss": 0.1623, + "step": 16304 + }, + { + "epoch": 0.83, + "grad_norm": 1.1581963477224246, + "learning_rate": 1.4924779360195662e-06, + "loss": 0.18, + "step": 16305 + }, + { + "epoch": 0.83, + "grad_norm": 1.1922788818545937, + "learning_rate": 1.4916124590287451e-06, + "loss": 0.1693, + "step": 16306 + }, + { + "epoch": 0.83, + "grad_norm": 1.7897651228922262, + "learning_rate": 1.4907472128284185e-06, + "loss": 0.1646, + "step": 16307 + }, + { + "epoch": 0.83, + "grad_norm": 1.2555341889630993, + "learning_rate": 1.489882197442053e-06, + "loss": 0.187, + "step": 16308 + }, + { + "epoch": 0.83, + "grad_norm": 0.7526073707918681, + "learning_rate": 1.4890174128931123e-06, + "loss": 0.1391, + "step": 16309 + }, + { + "epoch": 0.83, + "grad_norm": 1.70102190020427, + "learning_rate": 1.4881528592050576e-06, + "loss": 0.164, + "step": 16310 + }, + { + "epoch": 0.83, + "grad_norm": 1.1421019175027005, + "learning_rate": 1.4872885364013357e-06, + "loss": 0.1652, + "step": 16311 + }, + { + "epoch": 0.83, + "grad_norm": 1.3468381523610138, + "learning_rate": 1.4864244445053956e-06, + "loss": 0.1671, + "step": 16312 + }, + { + "epoch": 0.83, + "grad_norm": 0.8954079863094079, + "learning_rate": 1.4855605835406695e-06, + "loss": 0.1435, + "step": 16313 + }, + { + "epoch": 0.83, + "grad_norm": 1.6833123666950103, + "learning_rate": 1.4846969535305967e-06, + "loss": 0.179, + "step": 16314 + }, + { + "epoch": 0.83, + "grad_norm": 1.1499521503608294, + "learning_rate": 1.4838335544985982e-06, + "loss": 0.1595, + "step": 16315 + }, + { + "epoch": 0.83, + "grad_norm": 1.255100924441286, + "learning_rate": 1.4829703864680945e-06, + "loss": 0.1686, + "step": 16316 + }, + { + "epoch": 0.83, + "grad_norm": 1.269949061927699, + "learning_rate": 1.482107449462501e-06, + "loss": 0.174, + "step": 16317 + }, + { + "epoch": 0.83, + "grad_norm": 1.0630711296146427, + "learning_rate": 1.4812447435052258e-06, + "loss": 0.1379, + "step": 16318 + }, + { + "epoch": 0.83, + "grad_norm": 1.1507195675909738, + "learning_rate": 1.4803822686196657e-06, + "loss": 0.1676, + "step": 16319 + }, + { + "epoch": 0.83, + "grad_norm": 1.0780534179912902, + "learning_rate": 1.4795200248292207e-06, + "loss": 0.1674, + "step": 16320 + }, + { + "epoch": 0.83, + "grad_norm": 1.1352401978328097, + "learning_rate": 1.4786580121572736e-06, + "loss": 0.1763, + "step": 16321 + }, + { + "epoch": 0.83, + "grad_norm": 1.2704926559864493, + "learning_rate": 1.477796230627211e-06, + "loss": 0.1644, + "step": 16322 + }, + { + "epoch": 0.83, + "grad_norm": 1.3269094679941507, + "learning_rate": 1.476934680262405e-06, + "loss": 0.1408, + "step": 16323 + }, + { + "epoch": 0.83, + "grad_norm": 0.9223432723723342, + "learning_rate": 1.4760733610862298e-06, + "loss": 0.1999, + "step": 16324 + }, + { + "epoch": 0.83, + "grad_norm": 1.0167153221285, + "learning_rate": 1.475212273122043e-06, + "loss": 0.154, + "step": 16325 + }, + { + "epoch": 0.83, + "grad_norm": 1.1643049382379205, + "learning_rate": 1.474351416393206e-06, + "loss": 0.1609, + "step": 16326 + }, + { + "epoch": 0.83, + "grad_norm": 1.3266734784664354, + "learning_rate": 1.47349079092307e-06, + "loss": 0.1696, + "step": 16327 + }, + { + "epoch": 0.83, + "grad_norm": 0.965325287003813, + "learning_rate": 1.4726303967349754e-06, + "loss": 0.1585, + "step": 16328 + }, + { + "epoch": 0.83, + "grad_norm": 1.8263306753742028, + "learning_rate": 1.4717702338522654e-06, + "loss": 0.2012, + "step": 16329 + }, + { + "epoch": 0.83, + "grad_norm": 1.655930629079804, + "learning_rate": 1.4709103022982673e-06, + "loss": 0.2196, + "step": 16330 + }, + { + "epoch": 0.83, + "grad_norm": 0.8648245666271341, + "learning_rate": 1.4700506020963113e-06, + "loss": 0.1682, + "step": 16331 + }, + { + "epoch": 0.83, + "grad_norm": 1.1845845429985948, + "learning_rate": 1.4691911332697118e-06, + "loss": 0.1737, + "step": 16332 + }, + { + "epoch": 0.83, + "grad_norm": 1.0154675997457938, + "learning_rate": 1.468331895841787e-06, + "loss": 0.1662, + "step": 16333 + }, + { + "epoch": 0.83, + "grad_norm": 1.0646982504956448, + "learning_rate": 1.4674728898358391e-06, + "loss": 0.1612, + "step": 16334 + }, + { + "epoch": 0.83, + "grad_norm": 0.929233799062197, + "learning_rate": 1.466614115275171e-06, + "loss": 0.1785, + "step": 16335 + }, + { + "epoch": 0.83, + "grad_norm": 0.8890200594074377, + "learning_rate": 1.4657555721830775e-06, + "loss": 0.1637, + "step": 16336 + }, + { + "epoch": 0.83, + "grad_norm": 1.407647389509816, + "learning_rate": 1.4648972605828482e-06, + "loss": 0.1699, + "step": 16337 + }, + { + "epoch": 0.83, + "grad_norm": 0.9429287949318823, + "learning_rate": 1.4640391804977605e-06, + "loss": 0.1524, + "step": 16338 + }, + { + "epoch": 0.83, + "grad_norm": 1.389627543861334, + "learning_rate": 1.4631813319510945e-06, + "loss": 0.1587, + "step": 16339 + }, + { + "epoch": 0.83, + "grad_norm": 1.8792147979013831, + "learning_rate": 1.462323714966114e-06, + "loss": 0.1748, + "step": 16340 + }, + { + "epoch": 0.83, + "grad_norm": 1.1173411674101408, + "learning_rate": 1.461466329566088e-06, + "loss": 0.1689, + "step": 16341 + }, + { + "epoch": 0.83, + "grad_norm": 1.1589594154477718, + "learning_rate": 1.460609175774268e-06, + "loss": 0.1778, + "step": 16342 + }, + { + "epoch": 0.83, + "grad_norm": 3.0335774583431516, + "learning_rate": 1.4597522536139052e-06, + "loss": 0.143, + "step": 16343 + }, + { + "epoch": 0.83, + "grad_norm": 1.2014210515526043, + "learning_rate": 1.458895563108248e-06, + "loss": 0.1636, + "step": 16344 + }, + { + "epoch": 0.83, + "grad_norm": 1.2414386216197213, + "learning_rate": 1.4580391042805287e-06, + "loss": 0.1932, + "step": 16345 + }, + { + "epoch": 0.83, + "grad_norm": 1.2599250563452244, + "learning_rate": 1.4571828771539843e-06, + "loss": 0.1674, + "step": 16346 + }, + { + "epoch": 0.83, + "grad_norm": 1.1791440023288944, + "learning_rate": 1.4563268817518327e-06, + "loss": 0.1776, + "step": 16347 + }, + { + "epoch": 0.83, + "grad_norm": 1.0322052855841803, + "learning_rate": 1.4554711180973003e-06, + "loss": 0.1613, + "step": 16348 + }, + { + "epoch": 0.83, + "grad_norm": 2.2190450616956596, + "learning_rate": 1.4546155862135946e-06, + "loss": 0.1624, + "step": 16349 + }, + { + "epoch": 0.83, + "grad_norm": 1.610583375351424, + "learning_rate": 1.4537602861239253e-06, + "loss": 0.1605, + "step": 16350 + }, + { + "epoch": 0.83, + "grad_norm": 1.11298369910105, + "learning_rate": 1.452905217851489e-06, + "loss": 0.1929, + "step": 16351 + }, + { + "epoch": 0.83, + "grad_norm": 0.9145769935297287, + "learning_rate": 1.452050381419481e-06, + "loss": 0.1699, + "step": 16352 + }, + { + "epoch": 0.83, + "grad_norm": 1.458729223003803, + "learning_rate": 1.4511957768510897e-06, + "loss": 0.1594, + "step": 16353 + }, + { + "epoch": 0.83, + "grad_norm": 1.1644354573343156, + "learning_rate": 1.4503414041694985e-06, + "loss": 0.1862, + "step": 16354 + }, + { + "epoch": 0.83, + "grad_norm": 0.9109531000011147, + "learning_rate": 1.4494872633978763e-06, + "loss": 0.1455, + "step": 16355 + }, + { + "epoch": 0.83, + "grad_norm": 1.853462469807387, + "learning_rate": 1.4486333545593978e-06, + "loss": 0.1534, + "step": 16356 + }, + { + "epoch": 0.83, + "grad_norm": 1.6192208694248555, + "learning_rate": 1.4477796776772202e-06, + "loss": 0.1783, + "step": 16357 + }, + { + "epoch": 0.83, + "grad_norm": 1.1859687710840425, + "learning_rate": 1.4469262327745038e-06, + "loss": 0.1566, + "step": 16358 + }, + { + "epoch": 0.83, + "grad_norm": 1.1934652652801543, + "learning_rate": 1.4460730198743945e-06, + "loss": 0.1648, + "step": 16359 + }, + { + "epoch": 0.83, + "grad_norm": 0.8857142797389227, + "learning_rate": 1.445220039000037e-06, + "loss": 0.1575, + "step": 16360 + }, + { + "epoch": 0.83, + "grad_norm": 0.8879561711602736, + "learning_rate": 1.444367290174573e-06, + "loss": 0.1495, + "step": 16361 + }, + { + "epoch": 0.83, + "grad_norm": 0.9377892019061446, + "learning_rate": 1.4435147734211252e-06, + "loss": 0.1924, + "step": 16362 + }, + { + "epoch": 0.83, + "grad_norm": 1.100278086878258, + "learning_rate": 1.442662488762826e-06, + "loss": 0.1658, + "step": 16363 + }, + { + "epoch": 0.83, + "grad_norm": 0.9895730310143408, + "learning_rate": 1.441810436222788e-06, + "loss": 0.1758, + "step": 16364 + }, + { + "epoch": 0.83, + "grad_norm": 1.2782203378181645, + "learning_rate": 1.4409586158241272e-06, + "loss": 0.1713, + "step": 16365 + }, + { + "epoch": 0.83, + "grad_norm": 1.0976176848106227, + "learning_rate": 1.4401070275899442e-06, + "loss": 0.1675, + "step": 16366 + }, + { + "epoch": 0.83, + "grad_norm": 0.9710678328379625, + "learning_rate": 1.4392556715433447e-06, + "loss": 0.1641, + "step": 16367 + }, + { + "epoch": 0.83, + "grad_norm": 1.3605252867567799, + "learning_rate": 1.438404547707417e-06, + "loss": 0.1721, + "step": 16368 + }, + { + "epoch": 0.83, + "grad_norm": 1.1191812979827758, + "learning_rate": 1.4375536561052483e-06, + "loss": 0.167, + "step": 16369 + }, + { + "epoch": 0.83, + "grad_norm": 0.9856198196977122, + "learning_rate": 1.4367029967599211e-06, + "loss": 0.1886, + "step": 16370 + }, + { + "epoch": 0.83, + "grad_norm": 1.1024483254210755, + "learning_rate": 1.4358525696945104e-06, + "loss": 0.1536, + "step": 16371 + }, + { + "epoch": 0.83, + "grad_norm": 0.8868452274281557, + "learning_rate": 1.4350023749320807e-06, + "loss": 0.1692, + "step": 16372 + }, + { + "epoch": 0.83, + "grad_norm": 1.0929098488958067, + "learning_rate": 1.4341524124956974e-06, + "loss": 0.1664, + "step": 16373 + }, + { + "epoch": 0.83, + "grad_norm": 1.1365025851174004, + "learning_rate": 1.4333026824084116e-06, + "loss": 0.1412, + "step": 16374 + }, + { + "epoch": 0.83, + "grad_norm": 1.0641312250521482, + "learning_rate": 1.4324531846932766e-06, + "loss": 0.1577, + "step": 16375 + }, + { + "epoch": 0.83, + "grad_norm": 1.1004909593029202, + "learning_rate": 1.4316039193733299e-06, + "loss": 0.1544, + "step": 16376 + }, + { + "epoch": 0.83, + "grad_norm": 1.044385625193077, + "learning_rate": 1.4307548864716137e-06, + "loss": 0.1674, + "step": 16377 + }, + { + "epoch": 0.83, + "grad_norm": 2.4788242272864243, + "learning_rate": 1.4299060860111536e-06, + "loss": 0.1409, + "step": 16378 + }, + { + "epoch": 0.83, + "grad_norm": 0.976932708881975, + "learning_rate": 1.4290575180149735e-06, + "loss": 0.155, + "step": 16379 + }, + { + "epoch": 0.83, + "grad_norm": 1.4220348323583278, + "learning_rate": 1.4282091825060963e-06, + "loss": 0.1681, + "step": 16380 + }, + { + "epoch": 0.83, + "grad_norm": 1.172753518856644, + "learning_rate": 1.4273610795075255e-06, + "loss": 0.168, + "step": 16381 + }, + { + "epoch": 0.83, + "grad_norm": 1.0594369986964254, + "learning_rate": 1.4265132090422718e-06, + "loss": 0.1767, + "step": 16382 + }, + { + "epoch": 0.83, + "grad_norm": 0.9537846912091068, + "learning_rate": 1.42566557113333e-06, + "loss": 0.1582, + "step": 16383 + }, + { + "epoch": 0.83, + "grad_norm": 1.1865624630619633, + "learning_rate": 1.4248181658036964e-06, + "loss": 0.1691, + "step": 16384 + }, + { + "epoch": 0.83, + "grad_norm": 1.086406895496996, + "learning_rate": 1.4239709930763513e-06, + "loss": 0.1747, + "step": 16385 + }, + { + "epoch": 0.83, + "grad_norm": 1.1570947276293209, + "learning_rate": 1.4231240529742774e-06, + "loss": 0.1715, + "step": 16386 + }, + { + "epoch": 0.83, + "grad_norm": 0.9852028749587856, + "learning_rate": 1.4222773455204486e-06, + "loss": 0.1396, + "step": 16387 + }, + { + "epoch": 0.83, + "grad_norm": 1.275628718580738, + "learning_rate": 1.4214308707378333e-06, + "loss": 0.176, + "step": 16388 + }, + { + "epoch": 0.83, + "grad_norm": 1.2817426416927773, + "learning_rate": 1.4205846286493875e-06, + "loss": 0.1856, + "step": 16389 + }, + { + "epoch": 0.83, + "grad_norm": 1.0207781425195546, + "learning_rate": 1.4197386192780715e-06, + "loss": 0.1574, + "step": 16390 + }, + { + "epoch": 0.83, + "grad_norm": 0.991995452101157, + "learning_rate": 1.4188928426468263e-06, + "loss": 0.1583, + "step": 16391 + }, + { + "epoch": 0.83, + "grad_norm": 1.4650315505964862, + "learning_rate": 1.418047298778601e-06, + "loss": 0.1735, + "step": 16392 + }, + { + "epoch": 0.83, + "grad_norm": 0.9804090300175904, + "learning_rate": 1.4172019876963249e-06, + "loss": 0.1867, + "step": 16393 + }, + { + "epoch": 0.83, + "grad_norm": 1.0081826340989326, + "learning_rate": 1.4163569094229311e-06, + "loss": 0.1488, + "step": 16394 + }, + { + "epoch": 0.83, + "grad_norm": 1.0979656503210717, + "learning_rate": 1.4155120639813392e-06, + "loss": 0.1768, + "step": 16395 + }, + { + "epoch": 0.83, + "grad_norm": 3.648498219436163, + "learning_rate": 1.414667451394468e-06, + "loss": 0.1893, + "step": 16396 + }, + { + "epoch": 0.83, + "grad_norm": 0.8907662091834141, + "learning_rate": 1.4138230716852285e-06, + "loss": 0.189, + "step": 16397 + }, + { + "epoch": 0.83, + "grad_norm": 1.169590311793009, + "learning_rate": 1.4129789248765214e-06, + "loss": 0.1481, + "step": 16398 + }, + { + "epoch": 0.83, + "grad_norm": 0.8896912266362786, + "learning_rate": 1.4121350109912479e-06, + "loss": 0.1561, + "step": 16399 + }, + { + "epoch": 0.83, + "grad_norm": 1.0289256443235757, + "learning_rate": 1.4112913300522946e-06, + "loss": 0.1538, + "step": 16400 + }, + { + "epoch": 0.83, + "grad_norm": 2.1660382935985822, + "learning_rate": 1.4104478820825518e-06, + "loss": 0.153, + "step": 16401 + }, + { + "epoch": 0.83, + "grad_norm": 1.0220030347435074, + "learning_rate": 1.4096046671048935e-06, + "loss": 0.1428, + "step": 16402 + }, + { + "epoch": 0.83, + "grad_norm": 0.7746742346123936, + "learning_rate": 1.4087616851421959e-06, + "loss": 0.1324, + "step": 16403 + }, + { + "epoch": 0.83, + "grad_norm": 1.068496271057554, + "learning_rate": 1.4079189362173196e-06, + "loss": 0.1699, + "step": 16404 + }, + { + "epoch": 0.83, + "grad_norm": 1.0600589036312207, + "learning_rate": 1.4070764203531283e-06, + "loss": 0.155, + "step": 16405 + }, + { + "epoch": 0.83, + "grad_norm": 1.0528632347878315, + "learning_rate": 1.4062341375724742e-06, + "loss": 0.1697, + "step": 16406 + }, + { + "epoch": 0.83, + "grad_norm": 1.084378023728591, + "learning_rate": 1.4053920878982074e-06, + "loss": 0.1613, + "step": 16407 + }, + { + "epoch": 0.83, + "grad_norm": 1.3434346377155673, + "learning_rate": 1.4045502713531623e-06, + "loss": 0.1797, + "step": 16408 + }, + { + "epoch": 0.83, + "grad_norm": 1.131817937825781, + "learning_rate": 1.4037086879601803e-06, + "loss": 0.162, + "step": 16409 + }, + { + "epoch": 0.83, + "grad_norm": 0.779090735975815, + "learning_rate": 1.4028673377420821e-06, + "loss": 0.1587, + "step": 16410 + }, + { + "epoch": 0.83, + "grad_norm": 1.631104786572298, + "learning_rate": 1.402026220721695e-06, + "loss": 0.1926, + "step": 16411 + }, + { + "epoch": 0.83, + "grad_norm": 1.1120761340682184, + "learning_rate": 1.4011853369218308e-06, + "loss": 0.1774, + "step": 16412 + }, + { + "epoch": 0.83, + "grad_norm": 0.8984433358696586, + "learning_rate": 1.4003446863653004e-06, + "loss": 0.1529, + "step": 16413 + }, + { + "epoch": 0.83, + "grad_norm": 0.9389321223117489, + "learning_rate": 1.3995042690749072e-06, + "loss": 0.1876, + "step": 16414 + }, + { + "epoch": 0.83, + "grad_norm": 1.002293730084705, + "learning_rate": 1.3986640850734444e-06, + "loss": 0.1472, + "step": 16415 + }, + { + "epoch": 0.83, + "grad_norm": 0.9460194544172318, + "learning_rate": 1.3978241343837073e-06, + "loss": 0.178, + "step": 16416 + }, + { + "epoch": 0.83, + "grad_norm": 0.9586613850300995, + "learning_rate": 1.396984417028473e-06, + "loss": 0.1549, + "step": 16417 + }, + { + "epoch": 0.83, + "grad_norm": 1.3849114738939412, + "learning_rate": 1.3961449330305255e-06, + "loss": 0.16, + "step": 16418 + }, + { + "epoch": 0.83, + "grad_norm": 1.1897351890626975, + "learning_rate": 1.3953056824126298e-06, + "loss": 0.1598, + "step": 16419 + }, + { + "epoch": 0.83, + "grad_norm": 1.1119253441096333, + "learning_rate": 1.3944666651975559e-06, + "loss": 0.1643, + "step": 16420 + }, + { + "epoch": 0.84, + "grad_norm": 0.9219902980961129, + "learning_rate": 1.3936278814080572e-06, + "loss": 0.1604, + "step": 16421 + }, + { + "epoch": 0.84, + "grad_norm": 1.2686997809339151, + "learning_rate": 1.3927893310668883e-06, + "loss": 0.1657, + "step": 16422 + }, + { + "epoch": 0.84, + "grad_norm": 1.1503442654242535, + "learning_rate": 1.3919510141967951e-06, + "loss": 0.1586, + "step": 16423 + }, + { + "epoch": 0.84, + "grad_norm": 1.0609452628827456, + "learning_rate": 1.3911129308205196e-06, + "loss": 0.1798, + "step": 16424 + }, + { + "epoch": 0.84, + "grad_norm": 0.9015507948091649, + "learning_rate": 1.390275080960789e-06, + "loss": 0.1503, + "step": 16425 + }, + { + "epoch": 0.84, + "grad_norm": 1.1521551730803656, + "learning_rate": 1.3894374646403363e-06, + "loss": 0.1621, + "step": 16426 + }, + { + "epoch": 0.84, + "grad_norm": 1.111762810036442, + "learning_rate": 1.3886000818818758e-06, + "loss": 0.1691, + "step": 16427 + }, + { + "epoch": 0.84, + "grad_norm": 1.0080781136444183, + "learning_rate": 1.3877629327081266e-06, + "loss": 0.1465, + "step": 16428 + }, + { + "epoch": 0.84, + "grad_norm": 1.299355270870439, + "learning_rate": 1.3869260171417919e-06, + "loss": 0.166, + "step": 16429 + }, + { + "epoch": 0.84, + "grad_norm": 1.1388745378816476, + "learning_rate": 1.3860893352055782e-06, + "loss": 0.1801, + "step": 16430 + }, + { + "epoch": 0.84, + "grad_norm": 1.1660929089576133, + "learning_rate": 1.3852528869221759e-06, + "loss": 0.153, + "step": 16431 + }, + { + "epoch": 0.84, + "grad_norm": 1.2058432239074868, + "learning_rate": 1.3844166723142748e-06, + "loss": 0.1906, + "step": 16432 + }, + { + "epoch": 0.84, + "grad_norm": 1.496859015835789, + "learning_rate": 1.3835806914045602e-06, + "loss": 0.1554, + "step": 16433 + }, + { + "epoch": 0.84, + "grad_norm": 2.102976200306153, + "learning_rate": 1.3827449442157049e-06, + "loss": 0.1776, + "step": 16434 + }, + { + "epoch": 0.84, + "grad_norm": 0.8612700307948432, + "learning_rate": 1.3819094307703807e-06, + "loss": 0.162, + "step": 16435 + }, + { + "epoch": 0.84, + "grad_norm": 1.2124658733496405, + "learning_rate": 1.3810741510912485e-06, + "loss": 0.1533, + "step": 16436 + }, + { + "epoch": 0.84, + "grad_norm": 1.2087711356790787, + "learning_rate": 1.380239105200969e-06, + "loss": 0.1724, + "step": 16437 + }, + { + "epoch": 0.84, + "grad_norm": 0.8197129714176736, + "learning_rate": 1.3794042931221873e-06, + "loss": 0.1483, + "step": 16438 + }, + { + "epoch": 0.84, + "grad_norm": 1.2090348799586166, + "learning_rate": 1.3785697148775522e-06, + "loss": 0.1589, + "step": 16439 + }, + { + "epoch": 0.84, + "grad_norm": 0.9053956920846441, + "learning_rate": 1.3777353704897002e-06, + "loss": 0.1658, + "step": 16440 + }, + { + "epoch": 0.84, + "grad_norm": 0.970638279591743, + "learning_rate": 1.376901259981266e-06, + "loss": 0.1563, + "step": 16441 + }, + { + "epoch": 0.84, + "grad_norm": 1.8254486572617799, + "learning_rate": 1.3760673833748684e-06, + "loss": 0.1497, + "step": 16442 + }, + { + "epoch": 0.84, + "grad_norm": 1.0612951183055963, + "learning_rate": 1.3752337406931338e-06, + "loss": 0.1393, + "step": 16443 + }, + { + "epoch": 0.84, + "grad_norm": 1.129746663017258, + "learning_rate": 1.3744003319586685e-06, + "loss": 0.162, + "step": 16444 + }, + { + "epoch": 0.84, + "grad_norm": 1.166170052999444, + "learning_rate": 1.3735671571940835e-06, + "loss": 0.1663, + "step": 16445 + }, + { + "epoch": 0.84, + "grad_norm": 0.9240405167721508, + "learning_rate": 1.3727342164219736e-06, + "loss": 0.1682, + "step": 16446 + }, + { + "epoch": 0.84, + "grad_norm": 1.8770970913366118, + "learning_rate": 1.371901509664939e-06, + "loss": 0.1839, + "step": 16447 + }, + { + "epoch": 0.84, + "grad_norm": 1.3908562774318896, + "learning_rate": 1.3710690369455605e-06, + "loss": 0.1514, + "step": 16448 + }, + { + "epoch": 0.84, + "grad_norm": 1.5732484290072952, + "learning_rate": 1.3702367982864218e-06, + "loss": 0.1551, + "step": 16449 + }, + { + "epoch": 0.84, + "grad_norm": 1.0855678285096475, + "learning_rate": 1.3694047937100985e-06, + "loss": 0.1643, + "step": 16450 + }, + { + "epoch": 0.84, + "grad_norm": 1.2534297475243648, + "learning_rate": 1.368573023239157e-06, + "loss": 0.1929, + "step": 16451 + }, + { + "epoch": 0.84, + "grad_norm": 0.9189340263405366, + "learning_rate": 1.3677414868961615e-06, + "loss": 0.1644, + "step": 16452 + }, + { + "epoch": 0.84, + "grad_norm": 1.0245879139658436, + "learning_rate": 1.3669101847036625e-06, + "loss": 0.1689, + "step": 16453 + }, + { + "epoch": 0.84, + "grad_norm": 1.392745604846661, + "learning_rate": 1.3660791166842158e-06, + "loss": 0.1812, + "step": 16454 + }, + { + "epoch": 0.84, + "grad_norm": 1.4550152590012073, + "learning_rate": 1.3652482828603575e-06, + "loss": 0.155, + "step": 16455 + }, + { + "epoch": 0.84, + "grad_norm": 1.0442879814309745, + "learning_rate": 1.3644176832546296e-06, + "loss": 0.184, + "step": 16456 + }, + { + "epoch": 0.84, + "grad_norm": 1.1630175865305374, + "learning_rate": 1.3635873178895587e-06, + "loss": 0.1712, + "step": 16457 + }, + { + "epoch": 0.84, + "grad_norm": 1.1786859632173177, + "learning_rate": 1.3627571867876689e-06, + "loss": 0.1613, + "step": 16458 + }, + { + "epoch": 0.84, + "grad_norm": 0.825323167134559, + "learning_rate": 1.3619272899714776e-06, + "loss": 0.1654, + "step": 16459 + }, + { + "epoch": 0.84, + "grad_norm": 1.2754532680519124, + "learning_rate": 1.3610976274634991e-06, + "loss": 0.1724, + "step": 16460 + }, + { + "epoch": 0.84, + "grad_norm": 1.1863324872811745, + "learning_rate": 1.3602681992862333e-06, + "loss": 0.1731, + "step": 16461 + }, + { + "epoch": 0.84, + "grad_norm": 0.9164563337067313, + "learning_rate": 1.359439005462183e-06, + "loss": 0.1614, + "step": 16462 + }, + { + "epoch": 0.84, + "grad_norm": 1.4849730806306138, + "learning_rate": 1.3586100460138352e-06, + "loss": 0.1425, + "step": 16463 + }, + { + "epoch": 0.84, + "grad_norm": 0.9652478128597369, + "learning_rate": 1.3577813209636803e-06, + "loss": 0.1739, + "step": 16464 + }, + { + "epoch": 0.84, + "grad_norm": 1.041259915146661, + "learning_rate": 1.3569528303341927e-06, + "loss": 0.1763, + "step": 16465 + }, + { + "epoch": 0.84, + "grad_norm": 1.0776118097768106, + "learning_rate": 1.356124574147848e-06, + "loss": 0.1636, + "step": 16466 + }, + { + "epoch": 0.84, + "grad_norm": 1.4982116007172273, + "learning_rate": 1.3552965524271144e-06, + "loss": 0.1715, + "step": 16467 + }, + { + "epoch": 0.84, + "grad_norm": 1.091139820413839, + "learning_rate": 1.3544687651944476e-06, + "loss": 0.1667, + "step": 16468 + }, + { + "epoch": 0.84, + "grad_norm": 1.0532924603589213, + "learning_rate": 1.3536412124723075e-06, + "loss": 0.1559, + "step": 16469 + }, + { + "epoch": 0.84, + "grad_norm": 1.7324418230947063, + "learning_rate": 1.3528138942831337e-06, + "loss": 0.1611, + "step": 16470 + }, + { + "epoch": 0.84, + "grad_norm": 1.2857408269894968, + "learning_rate": 1.351986810649375e-06, + "loss": 0.1643, + "step": 16471 + }, + { + "epoch": 0.84, + "grad_norm": 1.007133004425682, + "learning_rate": 1.35115996159346e-06, + "loss": 0.1704, + "step": 16472 + }, + { + "epoch": 0.84, + "grad_norm": 0.9087690813649096, + "learning_rate": 1.3503333471378211e-06, + "loss": 0.168, + "step": 16473 + }, + { + "epoch": 0.84, + "grad_norm": 1.3235653985397124, + "learning_rate": 1.3495069673048778e-06, + "loss": 0.1738, + "step": 16474 + }, + { + "epoch": 0.84, + "grad_norm": 1.0334826159258361, + "learning_rate": 1.3486808221170455e-06, + "loss": 0.1568, + "step": 16475 + }, + { + "epoch": 0.84, + "grad_norm": 1.1220531438873298, + "learning_rate": 1.3478549115967344e-06, + "loss": 0.1461, + "step": 16476 + }, + { + "epoch": 0.84, + "grad_norm": 1.7370009339817567, + "learning_rate": 1.3470292357663506e-06, + "loss": 0.1513, + "step": 16477 + }, + { + "epoch": 0.84, + "grad_norm": 1.0200812436144857, + "learning_rate": 1.3462037946482842e-06, + "loss": 0.1608, + "step": 16478 + }, + { + "epoch": 0.84, + "grad_norm": 0.8658443215049889, + "learning_rate": 1.3453785882649317e-06, + "loss": 0.1708, + "step": 16479 + }, + { + "epoch": 0.84, + "grad_norm": 0.9038392745163156, + "learning_rate": 1.3445536166386708e-06, + "loss": 0.1551, + "step": 16480 + }, + { + "epoch": 0.84, + "grad_norm": 0.8945414551418993, + "learning_rate": 1.3437288797918858e-06, + "loss": 0.1659, + "step": 16481 + }, + { + "epoch": 0.84, + "grad_norm": 1.2513579274863056, + "learning_rate": 1.3429043777469397e-06, + "loss": 0.1573, + "step": 16482 + }, + { + "epoch": 0.84, + "grad_norm": 1.032128921002778, + "learning_rate": 1.3420801105262026e-06, + "loss": 0.1572, + "step": 16483 + }, + { + "epoch": 0.84, + "grad_norm": 0.9609598056949419, + "learning_rate": 1.3412560781520334e-06, + "loss": 0.1575, + "step": 16484 + }, + { + "epoch": 0.84, + "grad_norm": 0.9334010175995588, + "learning_rate": 1.3404322806467796e-06, + "loss": 0.1288, + "step": 16485 + }, + { + "epoch": 0.84, + "grad_norm": 1.3460503358341827, + "learning_rate": 1.339608718032791e-06, + "loss": 0.1533, + "step": 16486 + }, + { + "epoch": 0.84, + "grad_norm": 1.8418329982109378, + "learning_rate": 1.3387853903324032e-06, + "loss": 0.1772, + "step": 16487 + }, + { + "epoch": 0.84, + "grad_norm": 1.667359050992339, + "learning_rate": 1.337962297567954e-06, + "loss": 0.1493, + "step": 16488 + }, + { + "epoch": 0.84, + "grad_norm": 1.0445679883208367, + "learning_rate": 1.3371394397617644e-06, + "loss": 0.1637, + "step": 16489 + }, + { + "epoch": 0.84, + "grad_norm": 0.8907911590724815, + "learning_rate": 1.3363168169361574e-06, + "loss": 0.1643, + "step": 16490 + }, + { + "epoch": 0.84, + "grad_norm": 1.0557707181235574, + "learning_rate": 1.3354944291134452e-06, + "loss": 0.1616, + "step": 16491 + }, + { + "epoch": 0.84, + "grad_norm": 0.8614216558652774, + "learning_rate": 1.3346722763159358e-06, + "loss": 0.1483, + "step": 16492 + }, + { + "epoch": 0.84, + "grad_norm": 1.469119758561671, + "learning_rate": 1.3338503585659302e-06, + "loss": 0.1802, + "step": 16493 + }, + { + "epoch": 0.84, + "grad_norm": 0.8268722157076108, + "learning_rate": 1.3330286758857258e-06, + "loss": 0.1535, + "step": 16494 + }, + { + "epoch": 0.84, + "grad_norm": 0.978780111321717, + "learning_rate": 1.3322072282976051e-06, + "loss": 0.1771, + "step": 16495 + }, + { + "epoch": 0.84, + "grad_norm": 0.7792839909962566, + "learning_rate": 1.3313860158238556e-06, + "loss": 0.156, + "step": 16496 + }, + { + "epoch": 0.84, + "grad_norm": 1.130327790411797, + "learning_rate": 1.3305650384867475e-06, + "loss": 0.1761, + "step": 16497 + }, + { + "epoch": 0.84, + "grad_norm": 0.9698512769898063, + "learning_rate": 1.329744296308555e-06, + "loss": 0.1744, + "step": 16498 + }, + { + "epoch": 0.84, + "grad_norm": 1.7540272536679848, + "learning_rate": 1.3289237893115348e-06, + "loss": 0.1715, + "step": 16499 + }, + { + "epoch": 0.84, + "grad_norm": 0.9543124083561036, + "learning_rate": 1.3281035175179503e-06, + "loss": 0.1716, + "step": 16500 + }, + { + "epoch": 0.84, + "grad_norm": 1.0794734550478802, + "learning_rate": 1.3272834809500446e-06, + "loss": 0.176, + "step": 16501 + }, + { + "epoch": 0.84, + "grad_norm": 0.986549355968139, + "learning_rate": 1.3264636796300646e-06, + "loss": 0.1717, + "step": 16502 + }, + { + "epoch": 0.84, + "grad_norm": 0.9899451300872415, + "learning_rate": 1.325644113580249e-06, + "loss": 0.1564, + "step": 16503 + }, + { + "epoch": 0.84, + "grad_norm": 1.1125736143872642, + "learning_rate": 1.3248247828228244e-06, + "loss": 0.1646, + "step": 16504 + }, + { + "epoch": 0.84, + "grad_norm": 1.1028198395225064, + "learning_rate": 1.324005687380021e-06, + "loss": 0.1556, + "step": 16505 + }, + { + "epoch": 0.84, + "grad_norm": 1.9268461161989021, + "learning_rate": 1.32318682727405e-06, + "loss": 0.1824, + "step": 16506 + }, + { + "epoch": 0.84, + "grad_norm": 1.2747726043601, + "learning_rate": 1.322368202527129e-06, + "loss": 0.1638, + "step": 16507 + }, + { + "epoch": 0.84, + "grad_norm": 9.851572183953692, + "learning_rate": 1.321549813161458e-06, + "loss": 0.1739, + "step": 16508 + }, + { + "epoch": 0.84, + "grad_norm": 1.2012203043166474, + "learning_rate": 1.3207316591992392e-06, + "loss": 0.1578, + "step": 16509 + }, + { + "epoch": 0.84, + "grad_norm": 2.631785569122105, + "learning_rate": 1.3199137406626639e-06, + "loss": 0.1582, + "step": 16510 + }, + { + "epoch": 0.84, + "grad_norm": 1.8110969696411472, + "learning_rate": 1.319096057573921e-06, + "loss": 0.1816, + "step": 16511 + }, + { + "epoch": 0.84, + "grad_norm": 2.4847912761305304, + "learning_rate": 1.3182786099551848e-06, + "loss": 0.1669, + "step": 16512 + }, + { + "epoch": 0.84, + "grad_norm": 1.0646581402910877, + "learning_rate": 1.3174613978286355e-06, + "loss": 0.1706, + "step": 16513 + }, + { + "epoch": 0.84, + "grad_norm": 1.5209956250155796, + "learning_rate": 1.3166444212164331e-06, + "loss": 0.1956, + "step": 16514 + }, + { + "epoch": 0.84, + "grad_norm": 1.0073009213634985, + "learning_rate": 1.3158276801407432e-06, + "loss": 0.1748, + "step": 16515 + }, + { + "epoch": 0.84, + "grad_norm": 0.9904598112902595, + "learning_rate": 1.3150111746237159e-06, + "loss": 0.1339, + "step": 16516 + }, + { + "epoch": 0.84, + "grad_norm": 1.4137658796525223, + "learning_rate": 1.3141949046875025e-06, + "loss": 0.1742, + "step": 16517 + }, + { + "epoch": 0.84, + "grad_norm": 1.1149818676663423, + "learning_rate": 1.3133788703542417e-06, + "loss": 0.1771, + "step": 16518 + }, + { + "epoch": 0.84, + "grad_norm": 0.8472715236112703, + "learning_rate": 1.3125630716460692e-06, + "loss": 0.1746, + "step": 16519 + }, + { + "epoch": 0.84, + "grad_norm": 1.1760004963805697, + "learning_rate": 1.3117475085851173e-06, + "loss": 0.1757, + "step": 16520 + }, + { + "epoch": 0.84, + "grad_norm": 1.0916298607265162, + "learning_rate": 1.3109321811935017e-06, + "loss": 0.1651, + "step": 16521 + }, + { + "epoch": 0.84, + "grad_norm": 1.0303127188530168, + "learning_rate": 1.3101170894933436e-06, + "loss": 0.1641, + "step": 16522 + }, + { + "epoch": 0.84, + "grad_norm": 0.9849398321369848, + "learning_rate": 1.3093022335067485e-06, + "loss": 0.1786, + "step": 16523 + }, + { + "epoch": 0.84, + "grad_norm": 0.926636407257029, + "learning_rate": 1.3084876132558233e-06, + "loss": 0.1583, + "step": 16524 + }, + { + "epoch": 0.84, + "grad_norm": 1.0249330189845316, + "learning_rate": 1.3076732287626603e-06, + "loss": 0.1666, + "step": 16525 + }, + { + "epoch": 0.84, + "grad_norm": 0.8608635334435888, + "learning_rate": 1.306859080049353e-06, + "loss": 0.163, + "step": 16526 + }, + { + "epoch": 0.84, + "grad_norm": 1.077394151696253, + "learning_rate": 1.3060451671379837e-06, + "loss": 0.1793, + "step": 16527 + }, + { + "epoch": 0.84, + "grad_norm": 0.9431214779085131, + "learning_rate": 1.3052314900506292e-06, + "loss": 0.1718, + "step": 16528 + }, + { + "epoch": 0.84, + "grad_norm": 1.0518179327905288, + "learning_rate": 1.3044180488093616e-06, + "loss": 0.1525, + "step": 16529 + }, + { + "epoch": 0.84, + "grad_norm": 1.3463257700345999, + "learning_rate": 1.303604843436248e-06, + "loss": 0.1587, + "step": 16530 + }, + { + "epoch": 0.84, + "grad_norm": 1.0130415107260289, + "learning_rate": 1.3027918739533429e-06, + "loss": 0.1584, + "step": 16531 + }, + { + "epoch": 0.84, + "grad_norm": 1.2403872414203043, + "learning_rate": 1.3019791403826998e-06, + "loss": 0.158, + "step": 16532 + }, + { + "epoch": 0.84, + "grad_norm": 0.9707677779447362, + "learning_rate": 1.3011666427463631e-06, + "loss": 0.1448, + "step": 16533 + }, + { + "epoch": 0.84, + "grad_norm": 1.4342116326299943, + "learning_rate": 1.3003543810663744e-06, + "loss": 0.1634, + "step": 16534 + }, + { + "epoch": 0.84, + "grad_norm": 0.8825279913771847, + "learning_rate": 1.2995423553647623e-06, + "loss": 0.1418, + "step": 16535 + }, + { + "epoch": 0.84, + "grad_norm": 1.8585049453063847, + "learning_rate": 1.2987305656635541e-06, + "loss": 0.1763, + "step": 16536 + }, + { + "epoch": 0.84, + "grad_norm": 0.8345181790410855, + "learning_rate": 1.297919011984774e-06, + "loss": 0.192, + "step": 16537 + }, + { + "epoch": 0.84, + "grad_norm": 1.0837015041442009, + "learning_rate": 1.2971076943504302e-06, + "loss": 0.1513, + "step": 16538 + }, + { + "epoch": 0.84, + "grad_norm": 1.3773116668194152, + "learning_rate": 1.296296612782534e-06, + "loss": 0.1754, + "step": 16539 + }, + { + "epoch": 0.84, + "grad_norm": 1.0696309455674147, + "learning_rate": 1.2954857673030807e-06, + "loss": 0.1589, + "step": 16540 + }, + { + "epoch": 0.84, + "grad_norm": 0.9033196322937592, + "learning_rate": 1.2946751579340699e-06, + "loss": 0.1849, + "step": 16541 + }, + { + "epoch": 0.84, + "grad_norm": 0.9471419142153938, + "learning_rate": 1.293864784697486e-06, + "loss": 0.1392, + "step": 16542 + }, + { + "epoch": 0.84, + "grad_norm": 0.8082640463443478, + "learning_rate": 1.2930546476153128e-06, + "loss": 0.1456, + "step": 16543 + }, + { + "epoch": 0.84, + "grad_norm": 1.1840965734810238, + "learning_rate": 1.2922447467095222e-06, + "loss": 0.1835, + "step": 16544 + }, + { + "epoch": 0.84, + "grad_norm": 1.5552907721291551, + "learning_rate": 1.2914350820020837e-06, + "loss": 0.1479, + "step": 16545 + }, + { + "epoch": 0.84, + "grad_norm": 1.2297661172025343, + "learning_rate": 1.290625653514962e-06, + "loss": 0.157, + "step": 16546 + }, + { + "epoch": 0.84, + "grad_norm": 0.8249973525995244, + "learning_rate": 1.2898164612701125e-06, + "loss": 0.1489, + "step": 16547 + }, + { + "epoch": 0.84, + "grad_norm": 1.5170565974582284, + "learning_rate": 1.2890075052894812e-06, + "loss": 0.1853, + "step": 16548 + }, + { + "epoch": 0.84, + "grad_norm": 0.9236007876873691, + "learning_rate": 1.2881987855950162e-06, + "loss": 0.1636, + "step": 16549 + }, + { + "epoch": 0.84, + "grad_norm": 0.9777282906763574, + "learning_rate": 1.2873903022086487e-06, + "loss": 0.155, + "step": 16550 + }, + { + "epoch": 0.84, + "grad_norm": 1.1129280526793448, + "learning_rate": 1.2865820551523134e-06, + "loss": 0.1848, + "step": 16551 + }, + { + "epoch": 0.84, + "grad_norm": 0.995012290601466, + "learning_rate": 1.2857740444479306e-06, + "loss": 0.1923, + "step": 16552 + }, + { + "epoch": 0.84, + "grad_norm": 1.014297480330186, + "learning_rate": 1.2849662701174204e-06, + "loss": 0.1802, + "step": 16553 + }, + { + "epoch": 0.84, + "grad_norm": 1.3572272500601252, + "learning_rate": 1.28415873218269e-06, + "loss": 0.1404, + "step": 16554 + }, + { + "epoch": 0.84, + "grad_norm": 1.5575915436378995, + "learning_rate": 1.2833514306656468e-06, + "loss": 0.1473, + "step": 16555 + }, + { + "epoch": 0.84, + "grad_norm": 1.1382318316054767, + "learning_rate": 1.2825443655881897e-06, + "loss": 0.1705, + "step": 16556 + }, + { + "epoch": 0.84, + "grad_norm": 0.9193602383948707, + "learning_rate": 1.2817375369722074e-06, + "loss": 0.1576, + "step": 16557 + }, + { + "epoch": 0.84, + "grad_norm": 0.8721215551729223, + "learning_rate": 1.2809309448395891e-06, + "loss": 0.1705, + "step": 16558 + }, + { + "epoch": 0.84, + "grad_norm": 1.222139036368183, + "learning_rate": 1.2801245892122095e-06, + "loss": 0.169, + "step": 16559 + }, + { + "epoch": 0.84, + "grad_norm": 1.0006821267173502, + "learning_rate": 1.2793184701119444e-06, + "loss": 0.1468, + "step": 16560 + }, + { + "epoch": 0.84, + "grad_norm": 0.8754388516894127, + "learning_rate": 1.2785125875606563e-06, + "loss": 0.1743, + "step": 16561 + }, + { + "epoch": 0.84, + "grad_norm": 0.8966656140349384, + "learning_rate": 1.277706941580208e-06, + "loss": 0.1775, + "step": 16562 + }, + { + "epoch": 0.84, + "grad_norm": 1.2780190755664806, + "learning_rate": 1.2769015321924506e-06, + "loss": 0.1712, + "step": 16563 + }, + { + "epoch": 0.84, + "grad_norm": 0.9360785334190713, + "learning_rate": 1.2760963594192332e-06, + "loss": 0.1488, + "step": 16564 + }, + { + "epoch": 0.84, + "grad_norm": 0.910441612685864, + "learning_rate": 1.2752914232823942e-06, + "loss": 0.1678, + "step": 16565 + }, + { + "epoch": 0.84, + "grad_norm": 1.5132061912551134, + "learning_rate": 1.2744867238037695e-06, + "loss": 0.1779, + "step": 16566 + }, + { + "epoch": 0.84, + "grad_norm": 1.0221427405596983, + "learning_rate": 1.2736822610051825e-06, + "loss": 0.1525, + "step": 16567 + }, + { + "epoch": 0.84, + "grad_norm": 0.9288590580525012, + "learning_rate": 1.2728780349084603e-06, + "loss": 0.1559, + "step": 16568 + }, + { + "epoch": 0.84, + "grad_norm": 1.1899146088344454, + "learning_rate": 1.272074045535412e-06, + "loss": 0.1549, + "step": 16569 + }, + { + "epoch": 0.84, + "grad_norm": 1.185011658713729, + "learning_rate": 1.271270292907849e-06, + "loss": 0.1754, + "step": 16570 + }, + { + "epoch": 0.84, + "grad_norm": 1.110425755207929, + "learning_rate": 1.270466777047572e-06, + "loss": 0.1713, + "step": 16571 + }, + { + "epoch": 0.84, + "grad_norm": 0.8477743760141506, + "learning_rate": 1.2696634979763757e-06, + "loss": 0.145, + "step": 16572 + }, + { + "epoch": 0.84, + "grad_norm": 0.876659589143047, + "learning_rate": 1.2688604557160523e-06, + "loss": 0.1719, + "step": 16573 + }, + { + "epoch": 0.84, + "grad_norm": 1.931464501268706, + "learning_rate": 1.26805765028838e-06, + "loss": 0.1742, + "step": 16574 + }, + { + "epoch": 0.84, + "grad_norm": 1.0268481379315793, + "learning_rate": 1.2672550817151397e-06, + "loss": 0.1778, + "step": 16575 + }, + { + "epoch": 0.84, + "grad_norm": 0.9313058948325292, + "learning_rate": 1.2664527500180956e-06, + "loss": 0.1481, + "step": 16576 + }, + { + "epoch": 0.84, + "grad_norm": 1.041165067742634, + "learning_rate": 1.2656506552190163e-06, + "loss": 0.1793, + "step": 16577 + }, + { + "epoch": 0.84, + "grad_norm": 1.3093192642434583, + "learning_rate": 1.264848797339655e-06, + "loss": 0.1745, + "step": 16578 + }, + { + "epoch": 0.84, + "grad_norm": 1.1592795347577627, + "learning_rate": 1.2640471764017625e-06, + "loss": 0.165, + "step": 16579 + }, + { + "epoch": 0.84, + "grad_norm": 1.5147033431066657, + "learning_rate": 1.2632457924270835e-06, + "loss": 0.1658, + "step": 16580 + }, + { + "epoch": 0.84, + "grad_norm": 0.9477516005638641, + "learning_rate": 1.2624446454373596e-06, + "loss": 0.162, + "step": 16581 + }, + { + "epoch": 0.84, + "grad_norm": 1.6145234740898102, + "learning_rate": 1.2616437354543142e-06, + "loss": 0.1783, + "step": 16582 + }, + { + "epoch": 0.84, + "grad_norm": 0.9709781790920385, + "learning_rate": 1.2608430624996793e-06, + "loss": 0.1613, + "step": 16583 + }, + { + "epoch": 0.84, + "grad_norm": 1.0059251190145087, + "learning_rate": 1.2600426265951671e-06, + "loss": 0.1655, + "step": 16584 + }, + { + "epoch": 0.84, + "grad_norm": 0.852242224906641, + "learning_rate": 1.2592424277624948e-06, + "loss": 0.1798, + "step": 16585 + }, + { + "epoch": 0.84, + "grad_norm": 0.9307410637537811, + "learning_rate": 1.2584424660233641e-06, + "loss": 0.1717, + "step": 16586 + }, + { + "epoch": 0.84, + "grad_norm": 2.3261276227772267, + "learning_rate": 1.2576427413994764e-06, + "loss": 0.1568, + "step": 16587 + }, + { + "epoch": 0.84, + "grad_norm": 1.1057900784109742, + "learning_rate": 1.2568432539125207e-06, + "loss": 0.1714, + "step": 16588 + }, + { + "epoch": 0.84, + "grad_norm": 1.0409802751492843, + "learning_rate": 1.256044003584186e-06, + "loss": 0.1449, + "step": 16589 + }, + { + "epoch": 0.84, + "grad_norm": 1.189379870667062, + "learning_rate": 1.255244990436153e-06, + "loss": 0.1788, + "step": 16590 + }, + { + "epoch": 0.84, + "grad_norm": 0.9420416050470736, + "learning_rate": 1.2544462144900926e-06, + "loss": 0.1716, + "step": 16591 + }, + { + "epoch": 0.84, + "grad_norm": 0.7131804242727195, + "learning_rate": 1.253647675767674e-06, + "loss": 0.1671, + "step": 16592 + }, + { + "epoch": 0.84, + "grad_norm": 0.8651428576530652, + "learning_rate": 1.2528493742905533e-06, + "loss": 0.1541, + "step": 16593 + }, + { + "epoch": 0.84, + "grad_norm": 0.9329106635611252, + "learning_rate": 1.252051310080391e-06, + "loss": 0.1493, + "step": 16594 + }, + { + "epoch": 0.84, + "grad_norm": 0.8348058446150314, + "learning_rate": 1.2512534831588285e-06, + "loss": 0.1701, + "step": 16595 + }, + { + "epoch": 0.84, + "grad_norm": 1.3710651965857372, + "learning_rate": 1.2504558935475108e-06, + "loss": 0.1533, + "step": 16596 + }, + { + "epoch": 0.84, + "grad_norm": 1.540429321961662, + "learning_rate": 1.2496585412680696e-06, + "loss": 0.1754, + "step": 16597 + }, + { + "epoch": 0.84, + "grad_norm": 0.9723673657195523, + "learning_rate": 1.2488614263421338e-06, + "loss": 0.1668, + "step": 16598 + }, + { + "epoch": 0.84, + "grad_norm": 0.8723989382271515, + "learning_rate": 1.248064548791328e-06, + "loss": 0.2018, + "step": 16599 + }, + { + "epoch": 0.84, + "grad_norm": 1.1434520514035937, + "learning_rate": 1.2472679086372662e-06, + "loss": 0.1568, + "step": 16600 + }, + { + "epoch": 0.84, + "grad_norm": 1.3246782154021164, + "learning_rate": 1.2464715059015553e-06, + "loss": 0.1793, + "step": 16601 + }, + { + "epoch": 0.84, + "grad_norm": 2.537808161865273, + "learning_rate": 1.2456753406058008e-06, + "loss": 0.1464, + "step": 16602 + }, + { + "epoch": 0.84, + "grad_norm": 1.4533241913453714, + "learning_rate": 1.2448794127715947e-06, + "loss": 0.1396, + "step": 16603 + }, + { + "epoch": 0.84, + "grad_norm": 0.9080860744875678, + "learning_rate": 1.2440837224205316e-06, + "loss": 0.1399, + "step": 16604 + }, + { + "epoch": 0.84, + "grad_norm": 1.470295143980769, + "learning_rate": 1.243288269574191e-06, + "loss": 0.1974, + "step": 16605 + }, + { + "epoch": 0.84, + "grad_norm": 2.04915953083232, + "learning_rate": 1.242493054254149e-06, + "loss": 0.1341, + "step": 16606 + }, + { + "epoch": 0.84, + "grad_norm": 1.044190308506685, + "learning_rate": 1.2416980764819807e-06, + "loss": 0.1829, + "step": 16607 + }, + { + "epoch": 0.84, + "grad_norm": 0.867774746271164, + "learning_rate": 1.2409033362792444e-06, + "loss": 0.1518, + "step": 16608 + }, + { + "epoch": 0.84, + "grad_norm": 1.054702853097598, + "learning_rate": 1.2401088336675015e-06, + "loss": 0.1585, + "step": 16609 + }, + { + "epoch": 0.84, + "grad_norm": 0.9045444257729103, + "learning_rate": 1.2393145686682995e-06, + "loss": 0.1741, + "step": 16610 + }, + { + "epoch": 0.84, + "grad_norm": 1.4045243451592733, + "learning_rate": 1.2385205413031865e-06, + "loss": 0.179, + "step": 16611 + }, + { + "epoch": 0.84, + "grad_norm": 0.8820536894164042, + "learning_rate": 1.2377267515936964e-06, + "loss": 0.1542, + "step": 16612 + }, + { + "epoch": 0.84, + "grad_norm": 1.015566468698118, + "learning_rate": 1.2369331995613664e-06, + "loss": 0.1619, + "step": 16613 + }, + { + "epoch": 0.84, + "grad_norm": 1.103506609341633, + "learning_rate": 1.2361398852277151e-06, + "loss": 0.1456, + "step": 16614 + }, + { + "epoch": 0.84, + "grad_norm": 0.9577327705290578, + "learning_rate": 1.2353468086142639e-06, + "loss": 0.1849, + "step": 16615 + }, + { + "epoch": 0.84, + "grad_norm": 1.0037833100661795, + "learning_rate": 1.2345539697425269e-06, + "loss": 0.1584, + "step": 16616 + }, + { + "epoch": 0.85, + "grad_norm": 0.8532619138403739, + "learning_rate": 1.2337613686340099e-06, + "loss": 0.16, + "step": 16617 + }, + { + "epoch": 0.85, + "grad_norm": 1.0251008349032038, + "learning_rate": 1.2329690053102085e-06, + "loss": 0.1654, + "step": 16618 + }, + { + "epoch": 0.85, + "grad_norm": 0.8976002504409775, + "learning_rate": 1.2321768797926203e-06, + "loss": 0.1532, + "step": 16619 + }, + { + "epoch": 0.85, + "grad_norm": 1.1929559472997977, + "learning_rate": 1.2313849921027277e-06, + "loss": 0.1759, + "step": 16620 + }, + { + "epoch": 0.85, + "grad_norm": 1.2062388730124642, + "learning_rate": 1.2305933422620143e-06, + "loss": 0.1706, + "step": 16621 + }, + { + "epoch": 0.85, + "grad_norm": 0.9055888863766589, + "learning_rate": 1.2298019302919505e-06, + "loss": 0.1615, + "step": 16622 + }, + { + "epoch": 0.85, + "grad_norm": 1.9225015025746164, + "learning_rate": 1.2290107562140053e-06, + "loss": 0.1593, + "step": 16623 + }, + { + "epoch": 0.85, + "grad_norm": 1.2805442002710812, + "learning_rate": 1.2282198200496377e-06, + "loss": 0.1569, + "step": 16624 + }, + { + "epoch": 0.85, + "grad_norm": 1.4657779384979428, + "learning_rate": 1.2274291218203027e-06, + "loss": 0.167, + "step": 16625 + }, + { + "epoch": 0.85, + "grad_norm": 1.0700714587634568, + "learning_rate": 1.22663866154745e-06, + "loss": 0.1695, + "step": 16626 + }, + { + "epoch": 0.85, + "grad_norm": 0.8939815710264033, + "learning_rate": 1.225848439252517e-06, + "loss": 0.1686, + "step": 16627 + }, + { + "epoch": 0.85, + "grad_norm": 1.144969292046293, + "learning_rate": 1.2250584549569433e-06, + "loss": 0.1475, + "step": 16628 + }, + { + "epoch": 0.85, + "grad_norm": 0.9233843808729649, + "learning_rate": 1.2242687086821525e-06, + "loss": 0.1464, + "step": 16629 + }, + { + "epoch": 0.85, + "grad_norm": 2.6290684852204285, + "learning_rate": 1.2234792004495699e-06, + "loss": 0.1749, + "step": 16630 + }, + { + "epoch": 0.85, + "grad_norm": 0.856185263973189, + "learning_rate": 1.2226899302806083e-06, + "loss": 0.1517, + "step": 16631 + }, + { + "epoch": 0.85, + "grad_norm": 1.0364409409561266, + "learning_rate": 1.2219008981966785e-06, + "loss": 0.1678, + "step": 16632 + }, + { + "epoch": 0.85, + "grad_norm": 1.130986837273496, + "learning_rate": 1.221112104219182e-06, + "loss": 0.1577, + "step": 16633 + }, + { + "epoch": 0.85, + "grad_norm": 0.9271609905857692, + "learning_rate": 1.2203235483695176e-06, + "loss": 0.1397, + "step": 16634 + }, + { + "epoch": 0.85, + "grad_norm": 1.3835412288620175, + "learning_rate": 1.2195352306690711e-06, + "loss": 0.1585, + "step": 16635 + }, + { + "epoch": 0.85, + "grad_norm": 2.924188954337587, + "learning_rate": 1.218747151139229e-06, + "loss": 0.1469, + "step": 16636 + }, + { + "epoch": 0.85, + "grad_norm": 1.1342662068725529, + "learning_rate": 1.2179593098013642e-06, + "loss": 0.1648, + "step": 16637 + }, + { + "epoch": 0.85, + "grad_norm": 1.391040675182654, + "learning_rate": 1.2171717066768518e-06, + "loss": 0.1583, + "step": 16638 + }, + { + "epoch": 0.85, + "grad_norm": 1.3017551674300645, + "learning_rate": 1.2163843417870503e-06, + "loss": 0.1599, + "step": 16639 + }, + { + "epoch": 0.85, + "grad_norm": 1.2893603271246814, + "learning_rate": 1.2155972151533225e-06, + "loss": 0.1635, + "step": 16640 + }, + { + "epoch": 0.85, + "grad_norm": 1.5560227204195407, + "learning_rate": 1.2148103267970135e-06, + "loss": 0.1589, + "step": 16641 + }, + { + "epoch": 0.85, + "grad_norm": 1.3221619669349556, + "learning_rate": 1.2140236767394708e-06, + "loss": 0.1865, + "step": 16642 + }, + { + "epoch": 0.85, + "grad_norm": 1.5757234214722537, + "learning_rate": 1.213237265002034e-06, + "loss": 0.1658, + "step": 16643 + }, + { + "epoch": 0.85, + "grad_norm": 1.029193912238693, + "learning_rate": 1.2124510916060307e-06, + "loss": 0.1572, + "step": 16644 + }, + { + "epoch": 0.85, + "grad_norm": 1.3440173887333735, + "learning_rate": 1.21166515657279e-06, + "loss": 0.1482, + "step": 16645 + }, + { + "epoch": 0.85, + "grad_norm": 4.427243578192571, + "learning_rate": 1.2108794599236262e-06, + "loss": 0.1679, + "step": 16646 + }, + { + "epoch": 0.85, + "grad_norm": 1.2071076339340914, + "learning_rate": 1.2100940016798558e-06, + "loss": 0.1848, + "step": 16647 + }, + { + "epoch": 0.85, + "grad_norm": 1.1556199995839302, + "learning_rate": 1.2093087818627801e-06, + "loss": 0.1781, + "step": 16648 + }, + { + "epoch": 0.85, + "grad_norm": 0.8756934847230738, + "learning_rate": 1.2085238004937017e-06, + "loss": 0.1563, + "step": 16649 + }, + { + "epoch": 0.85, + "grad_norm": 1.0378254794811703, + "learning_rate": 1.2077390575939097e-06, + "loss": 0.1684, + "step": 16650 + }, + { + "epoch": 0.85, + "grad_norm": 1.5403138077505762, + "learning_rate": 1.2069545531846926e-06, + "loss": 0.1441, + "step": 16651 + }, + { + "epoch": 0.85, + "grad_norm": 1.753894289360156, + "learning_rate": 1.2061702872873304e-06, + "loss": 0.1792, + "step": 16652 + }, + { + "epoch": 0.85, + "grad_norm": 0.9978037641503398, + "learning_rate": 1.205386259923097e-06, + "loss": 0.1488, + "step": 16653 + }, + { + "epoch": 0.85, + "grad_norm": 1.0916070622992498, + "learning_rate": 1.2046024711132564e-06, + "loss": 0.1893, + "step": 16654 + }, + { + "epoch": 0.85, + "grad_norm": 1.0336387552043056, + "learning_rate": 1.2038189208790718e-06, + "loss": 0.1595, + "step": 16655 + }, + { + "epoch": 0.85, + "grad_norm": 0.9740026133928388, + "learning_rate": 1.203035609241795e-06, + "loss": 0.1524, + "step": 16656 + }, + { + "epoch": 0.85, + "grad_norm": 1.2294148310206503, + "learning_rate": 1.2022525362226755e-06, + "loss": 0.1754, + "step": 16657 + }, + { + "epoch": 0.85, + "grad_norm": 0.9963912849910682, + "learning_rate": 1.201469701842951e-06, + "loss": 0.1686, + "step": 16658 + }, + { + "epoch": 0.85, + "grad_norm": 1.4602436474568625, + "learning_rate": 1.2006871061238578e-06, + "loss": 0.1516, + "step": 16659 + }, + { + "epoch": 0.85, + "grad_norm": 1.2083045646730413, + "learning_rate": 1.1999047490866255e-06, + "loss": 0.1754, + "step": 16660 + }, + { + "epoch": 0.85, + "grad_norm": 0.9319805806800955, + "learning_rate": 1.1991226307524727e-06, + "loss": 0.1651, + "step": 16661 + }, + { + "epoch": 0.85, + "grad_norm": 1.057164534609886, + "learning_rate": 1.198340751142617e-06, + "loss": 0.1414, + "step": 16662 + }, + { + "epoch": 0.85, + "grad_norm": 1.2254343247363502, + "learning_rate": 1.1975591102782635e-06, + "loss": 0.1509, + "step": 16663 + }, + { + "epoch": 0.85, + "grad_norm": 0.8658829631174367, + "learning_rate": 1.1967777081806187e-06, + "loss": 0.1448, + "step": 16664 + }, + { + "epoch": 0.85, + "grad_norm": 0.960613665093742, + "learning_rate": 1.1959965448708731e-06, + "loss": 0.1701, + "step": 16665 + }, + { + "epoch": 0.85, + "grad_norm": 1.748125493424765, + "learning_rate": 1.1952156203702215e-06, + "loss": 0.1563, + "step": 16666 + }, + { + "epoch": 0.85, + "grad_norm": 0.9075724124031925, + "learning_rate": 1.1944349346998407e-06, + "loss": 0.1496, + "step": 16667 + }, + { + "epoch": 0.85, + "grad_norm": 0.9928967707047157, + "learning_rate": 1.1936544878809097e-06, + "loss": 0.1748, + "step": 16668 + }, + { + "epoch": 0.85, + "grad_norm": 1.0413533784476476, + "learning_rate": 1.1928742799345982e-06, + "loss": 0.1767, + "step": 16669 + }, + { + "epoch": 0.85, + "grad_norm": 0.9206089006391661, + "learning_rate": 1.1920943108820714e-06, + "loss": 0.1706, + "step": 16670 + }, + { + "epoch": 0.85, + "grad_norm": 1.2401044893499937, + "learning_rate": 1.1913145807444815e-06, + "loss": 0.1882, + "step": 16671 + }, + { + "epoch": 0.85, + "grad_norm": 0.908453897796349, + "learning_rate": 1.1905350895429835e-06, + "loss": 0.1746, + "step": 16672 + }, + { + "epoch": 0.85, + "grad_norm": 0.8886683505554861, + "learning_rate": 1.1897558372987172e-06, + "loss": 0.1677, + "step": 16673 + }, + { + "epoch": 0.85, + "grad_norm": 1.4196567280330579, + "learning_rate": 1.1889768240328225e-06, + "loss": 0.1716, + "step": 16674 + }, + { + "epoch": 0.85, + "grad_norm": 1.0678208880582205, + "learning_rate": 1.1881980497664282e-06, + "loss": 0.1358, + "step": 16675 + }, + { + "epoch": 0.85, + "grad_norm": 1.0379333061722018, + "learning_rate": 1.1874195145206603e-06, + "loss": 0.1687, + "step": 16676 + }, + { + "epoch": 0.85, + "grad_norm": 1.126038895773892, + "learning_rate": 1.1866412183166343e-06, + "loss": 0.1762, + "step": 16677 + }, + { + "epoch": 0.85, + "grad_norm": 1.0333889541378998, + "learning_rate": 1.1858631611754623e-06, + "loss": 0.1668, + "step": 16678 + }, + { + "epoch": 0.85, + "grad_norm": 1.1298072797466057, + "learning_rate": 1.185085343118253e-06, + "loss": 0.1697, + "step": 16679 + }, + { + "epoch": 0.85, + "grad_norm": 0.7944560604697056, + "learning_rate": 1.1843077641660994e-06, + "loss": 0.1484, + "step": 16680 + }, + { + "epoch": 0.85, + "grad_norm": 1.622632460724044, + "learning_rate": 1.183530424340098e-06, + "loss": 0.1536, + "step": 16681 + }, + { + "epoch": 0.85, + "grad_norm": 1.2091299150586754, + "learning_rate": 1.1827533236613287e-06, + "loss": 0.1691, + "step": 16682 + }, + { + "epoch": 0.85, + "grad_norm": 1.0788931153139116, + "learning_rate": 1.1819764621508757e-06, + "loss": 0.1669, + "step": 16683 + }, + { + "epoch": 0.85, + "grad_norm": 1.4777706715450771, + "learning_rate": 1.1811998398298074e-06, + "loss": 0.1733, + "step": 16684 + }, + { + "epoch": 0.85, + "grad_norm": 0.9636743457847207, + "learning_rate": 1.1804234567191919e-06, + "loss": 0.1698, + "step": 16685 + }, + { + "epoch": 0.85, + "grad_norm": 0.93649863813618, + "learning_rate": 1.1796473128400888e-06, + "loss": 0.1561, + "step": 16686 + }, + { + "epoch": 0.85, + "grad_norm": 1.1509318525011094, + "learning_rate": 1.178871408213551e-06, + "loss": 0.1648, + "step": 16687 + }, + { + "epoch": 0.85, + "grad_norm": 1.1851623200092514, + "learning_rate": 1.1780957428606232e-06, + "loss": 0.16, + "step": 16688 + }, + { + "epoch": 0.85, + "grad_norm": 1.303285860586854, + "learning_rate": 1.1773203168023496e-06, + "loss": 0.1832, + "step": 16689 + }, + { + "epoch": 0.85, + "grad_norm": 1.5576203670591835, + "learning_rate": 1.1765451300597574e-06, + "loss": 0.1639, + "step": 16690 + }, + { + "epoch": 0.85, + "grad_norm": 1.3140735072944771, + "learning_rate": 1.1757701826538792e-06, + "loss": 0.1562, + "step": 16691 + }, + { + "epoch": 0.85, + "grad_norm": 1.0278507652035873, + "learning_rate": 1.1749954746057313e-06, + "loss": 0.166, + "step": 16692 + }, + { + "epoch": 0.85, + "grad_norm": 1.224398636284123, + "learning_rate": 1.1742210059363312e-06, + "loss": 0.1546, + "step": 16693 + }, + { + "epoch": 0.85, + "grad_norm": 1.26970783716945, + "learning_rate": 1.1734467766666835e-06, + "loss": 0.1856, + "step": 16694 + }, + { + "epoch": 0.85, + "grad_norm": 1.0206282492563024, + "learning_rate": 1.1726727868177902e-06, + "loss": 0.1519, + "step": 16695 + }, + { + "epoch": 0.85, + "grad_norm": 1.0267392565262938, + "learning_rate": 1.1718990364106476e-06, + "loss": 0.1599, + "step": 16696 + }, + { + "epoch": 0.85, + "grad_norm": 1.0081117309857632, + "learning_rate": 1.1711255254662413e-06, + "loss": 0.168, + "step": 16697 + }, + { + "epoch": 0.85, + "grad_norm": 1.3485046268234842, + "learning_rate": 1.1703522540055545e-06, + "loss": 0.1509, + "step": 16698 + }, + { + "epoch": 0.85, + "grad_norm": 1.1515961083542794, + "learning_rate": 1.1695792220495605e-06, + "loss": 0.1563, + "step": 16699 + }, + { + "epoch": 0.85, + "grad_norm": 1.1567499848884912, + "learning_rate": 1.1688064296192313e-06, + "loss": 0.1724, + "step": 16700 + }, + { + "epoch": 0.85, + "grad_norm": 1.7545024826691027, + "learning_rate": 1.1680338767355237e-06, + "loss": 0.1678, + "step": 16701 + }, + { + "epoch": 0.85, + "grad_norm": 0.988731933262323, + "learning_rate": 1.1672615634193961e-06, + "loss": 0.1494, + "step": 16702 + }, + { + "epoch": 0.85, + "grad_norm": 1.7795116724077813, + "learning_rate": 1.1664894896917966e-06, + "loss": 0.1538, + "step": 16703 + }, + { + "epoch": 0.85, + "grad_norm": 1.127248147568288, + "learning_rate": 1.1657176555736716e-06, + "loss": 0.182, + "step": 16704 + }, + { + "epoch": 0.85, + "grad_norm": 2.0378052019218376, + "learning_rate": 1.164946061085952e-06, + "loss": 0.1718, + "step": 16705 + }, + { + "epoch": 0.85, + "grad_norm": 1.1251788094051784, + "learning_rate": 1.1641747062495723e-06, + "loss": 0.1738, + "step": 16706 + }, + { + "epoch": 0.85, + "grad_norm": 1.0306360633053053, + "learning_rate": 1.163403591085449e-06, + "loss": 0.1581, + "step": 16707 + }, + { + "epoch": 0.85, + "grad_norm": 0.934544355171937, + "learning_rate": 1.1626327156145055e-06, + "loss": 0.1486, + "step": 16708 + }, + { + "epoch": 0.85, + "grad_norm": 1.7447446102909068, + "learning_rate": 1.1618620798576474e-06, + "loss": 0.1702, + "step": 16709 + }, + { + "epoch": 0.85, + "grad_norm": 1.399895022519527, + "learning_rate": 1.16109168383578e-06, + "loss": 0.1679, + "step": 16710 + }, + { + "epoch": 0.85, + "grad_norm": 1.2903232844612942, + "learning_rate": 1.1603215275697988e-06, + "loss": 0.1856, + "step": 16711 + }, + { + "epoch": 0.85, + "grad_norm": 1.024990843444502, + "learning_rate": 1.159551611080596e-06, + "loss": 0.1592, + "step": 16712 + }, + { + "epoch": 0.85, + "grad_norm": 0.9194915982988668, + "learning_rate": 1.1587819343890561e-06, + "loss": 0.1671, + "step": 16713 + }, + { + "epoch": 0.85, + "grad_norm": 1.1353129904878652, + "learning_rate": 1.1580124975160534e-06, + "loss": 0.1731, + "step": 16714 + }, + { + "epoch": 0.85, + "grad_norm": 1.0024954784880729, + "learning_rate": 1.1572433004824635e-06, + "loss": 0.1533, + "step": 16715 + }, + { + "epoch": 0.85, + "grad_norm": 1.0136144334892372, + "learning_rate": 1.1564743433091463e-06, + "loss": 0.1551, + "step": 16716 + }, + { + "epoch": 0.85, + "grad_norm": 1.0058020332612845, + "learning_rate": 1.1557056260169653e-06, + "loss": 0.177, + "step": 16717 + }, + { + "epoch": 0.85, + "grad_norm": 0.7741571728019853, + "learning_rate": 1.1549371486267646e-06, + "loss": 0.153, + "step": 16718 + }, + { + "epoch": 0.85, + "grad_norm": 1.121485078716398, + "learning_rate": 1.1541689111593969e-06, + "loss": 0.1623, + "step": 16719 + }, + { + "epoch": 0.85, + "grad_norm": 1.0291767208676794, + "learning_rate": 1.153400913635695e-06, + "loss": 0.1635, + "step": 16720 + }, + { + "epoch": 0.85, + "grad_norm": 1.1490953421687071, + "learning_rate": 1.1526331560764926e-06, + "loss": 0.1611, + "step": 16721 + }, + { + "epoch": 0.85, + "grad_norm": 1.15884448329287, + "learning_rate": 1.151865638502615e-06, + "loss": 0.1759, + "step": 16722 + }, + { + "epoch": 0.85, + "grad_norm": 1.0724547886666227, + "learning_rate": 1.1510983609348847e-06, + "loss": 0.1575, + "step": 16723 + }, + { + "epoch": 0.85, + "grad_norm": 0.9482165988645026, + "learning_rate": 1.1503313233941082e-06, + "loss": 0.1598, + "step": 16724 + }, + { + "epoch": 0.85, + "grad_norm": 0.9189930992394797, + "learning_rate": 1.1495645259010969e-06, + "loss": 0.16, + "step": 16725 + }, + { + "epoch": 0.85, + "grad_norm": 0.6922036695384618, + "learning_rate": 1.148797968476646e-06, + "loss": 0.1426, + "step": 16726 + }, + { + "epoch": 0.85, + "grad_norm": 1.3946996712571365, + "learning_rate": 1.1480316511415513e-06, + "loss": 0.1821, + "step": 16727 + }, + { + "epoch": 0.85, + "grad_norm": 0.9618638738746309, + "learning_rate": 1.1472655739165961e-06, + "loss": 0.1481, + "step": 16728 + }, + { + "epoch": 0.85, + "grad_norm": 1.9534958513144793, + "learning_rate": 1.1464997368225629e-06, + "loss": 0.1691, + "step": 16729 + }, + { + "epoch": 0.85, + "grad_norm": 1.0621077501576741, + "learning_rate": 1.1457341398802269e-06, + "loss": 0.167, + "step": 16730 + }, + { + "epoch": 0.85, + "grad_norm": 1.1239092189624629, + "learning_rate": 1.1449687831103495e-06, + "loss": 0.1778, + "step": 16731 + }, + { + "epoch": 0.85, + "grad_norm": 1.1674994384296928, + "learning_rate": 1.1442036665336953e-06, + "loss": 0.1806, + "step": 16732 + }, + { + "epoch": 0.85, + "grad_norm": 0.9455081777025037, + "learning_rate": 1.1434387901710164e-06, + "loss": 0.1679, + "step": 16733 + }, + { + "epoch": 0.85, + "grad_norm": 1.0255477263215511, + "learning_rate": 1.142674154043062e-06, + "loss": 0.1815, + "step": 16734 + }, + { + "epoch": 0.85, + "grad_norm": 0.9171922536179344, + "learning_rate": 1.1419097581705686e-06, + "loss": 0.1512, + "step": 16735 + }, + { + "epoch": 0.85, + "grad_norm": 1.0202324495736854, + "learning_rate": 1.1411456025742763e-06, + "loss": 0.1703, + "step": 16736 + }, + { + "epoch": 0.85, + "grad_norm": 0.8282685116494622, + "learning_rate": 1.1403816872749074e-06, + "loss": 0.1707, + "step": 16737 + }, + { + "epoch": 0.85, + "grad_norm": 1.3741495687109866, + "learning_rate": 1.1396180122931854e-06, + "loss": 0.1663, + "step": 16738 + }, + { + "epoch": 0.85, + "grad_norm": 0.944588868826035, + "learning_rate": 1.1388545776498262e-06, + "loss": 0.1568, + "step": 16739 + }, + { + "epoch": 0.85, + "grad_norm": 0.9649547498178261, + "learning_rate": 1.1380913833655383e-06, + "loss": 0.1646, + "step": 16740 + }, + { + "epoch": 0.85, + "grad_norm": 1.0557333070394006, + "learning_rate": 1.13732842946102e-06, + "loss": 0.1941, + "step": 16741 + }, + { + "epoch": 0.85, + "grad_norm": 1.2316283911922998, + "learning_rate": 1.136565715956971e-06, + "loss": 0.1602, + "step": 16742 + }, + { + "epoch": 0.85, + "grad_norm": 1.1441727807938522, + "learning_rate": 1.1358032428740763e-06, + "loss": 0.1568, + "step": 16743 + }, + { + "epoch": 0.85, + "grad_norm": 1.5479886581033788, + "learning_rate": 1.13504101023302e-06, + "loss": 0.1602, + "step": 16744 + }, + { + "epoch": 0.85, + "grad_norm": 0.934032607491036, + "learning_rate": 1.134279018054475e-06, + "loss": 0.1683, + "step": 16745 + }, + { + "epoch": 0.85, + "grad_norm": 1.0609235864393567, + "learning_rate": 1.1335172663591155e-06, + "loss": 0.161, + "step": 16746 + }, + { + "epoch": 0.85, + "grad_norm": 1.6371308461071334, + "learning_rate": 1.1327557551675983e-06, + "loss": 0.1723, + "step": 16747 + }, + { + "epoch": 0.85, + "grad_norm": 1.1378631228418552, + "learning_rate": 1.1319944845005815e-06, + "loss": 0.1641, + "step": 16748 + }, + { + "epoch": 0.85, + "grad_norm": 1.0222296313110266, + "learning_rate": 1.1312334543787185e-06, + "loss": 0.1712, + "step": 16749 + }, + { + "epoch": 0.85, + "grad_norm": 0.9525488302209721, + "learning_rate": 1.130472664822646e-06, + "loss": 0.164, + "step": 16750 + }, + { + "epoch": 0.85, + "grad_norm": 1.048597087332812, + "learning_rate": 1.1297121158530056e-06, + "loss": 0.1345, + "step": 16751 + }, + { + "epoch": 0.85, + "grad_norm": 0.9475471414124667, + "learning_rate": 1.1289518074904227e-06, + "loss": 0.1838, + "step": 16752 + }, + { + "epoch": 0.85, + "grad_norm": 0.9362631929645722, + "learning_rate": 1.1281917397555253e-06, + "loss": 0.1598, + "step": 16753 + }, + { + "epoch": 0.85, + "grad_norm": 0.9116553796982717, + "learning_rate": 1.127431912668926e-06, + "loss": 0.1609, + "step": 16754 + }, + { + "epoch": 0.85, + "grad_norm": 0.9891970094191186, + "learning_rate": 1.126672326251238e-06, + "loss": 0.1706, + "step": 16755 + }, + { + "epoch": 0.85, + "grad_norm": 1.4497468118338226, + "learning_rate": 1.125912980523064e-06, + "loss": 0.1674, + "step": 16756 + }, + { + "epoch": 0.85, + "grad_norm": 1.0089683525593092, + "learning_rate": 1.1251538755050029e-06, + "loss": 0.1587, + "step": 16757 + }, + { + "epoch": 0.85, + "grad_norm": 0.8990294325244518, + "learning_rate": 1.1243950112176428e-06, + "loss": 0.1645, + "step": 16758 + }, + { + "epoch": 0.85, + "grad_norm": 0.9831129016045187, + "learning_rate": 1.1236363876815705e-06, + "loss": 0.1588, + "step": 16759 + }, + { + "epoch": 0.85, + "grad_norm": 1.0042388056429106, + "learning_rate": 1.1228780049173616e-06, + "loss": 0.1534, + "step": 16760 + }, + { + "epoch": 0.85, + "grad_norm": 1.0436397492912155, + "learning_rate": 1.1221198629455898e-06, + "loss": 0.1647, + "step": 16761 + }, + { + "epoch": 0.85, + "grad_norm": 0.9136767477797114, + "learning_rate": 1.1213619617868154e-06, + "loss": 0.1388, + "step": 16762 + }, + { + "epoch": 0.85, + "grad_norm": 0.8794550315521854, + "learning_rate": 1.120604301461602e-06, + "loss": 0.1854, + "step": 16763 + }, + { + "epoch": 0.85, + "grad_norm": 1.1904734038906932, + "learning_rate": 1.1198468819904962e-06, + "loss": 0.1649, + "step": 16764 + }, + { + "epoch": 0.85, + "grad_norm": 1.0325953177890068, + "learning_rate": 1.1190897033940461e-06, + "loss": 0.166, + "step": 16765 + }, + { + "epoch": 0.85, + "grad_norm": 1.0705206526153064, + "learning_rate": 1.11833276569279e-06, + "loss": 0.1667, + "step": 16766 + }, + { + "epoch": 0.85, + "grad_norm": 0.949411680260893, + "learning_rate": 1.117576068907258e-06, + "loss": 0.166, + "step": 16767 + }, + { + "epoch": 0.85, + "grad_norm": 0.9224277427372204, + "learning_rate": 1.116819613057979e-06, + "loss": 0.1555, + "step": 16768 + }, + { + "epoch": 0.85, + "grad_norm": 1.0286982699631861, + "learning_rate": 1.1160633981654679e-06, + "loss": 0.163, + "step": 16769 + }, + { + "epoch": 0.85, + "grad_norm": 0.9503726540756631, + "learning_rate": 1.1153074242502404e-06, + "loss": 0.1821, + "step": 16770 + }, + { + "epoch": 0.85, + "grad_norm": 1.0626090541232485, + "learning_rate": 1.1145516913327991e-06, + "loss": 0.1817, + "step": 16771 + }, + { + "epoch": 0.85, + "grad_norm": 0.8112805845435304, + "learning_rate": 1.1137961994336467e-06, + "loss": 0.1458, + "step": 16772 + }, + { + "epoch": 0.85, + "grad_norm": 3.6616339502407302, + "learning_rate": 1.1130409485732718e-06, + "loss": 0.1719, + "step": 16773 + }, + { + "epoch": 0.85, + "grad_norm": 1.4096929572061465, + "learning_rate": 1.112285938772164e-06, + "loss": 0.1431, + "step": 16774 + }, + { + "epoch": 0.85, + "grad_norm": 1.1876330357447296, + "learning_rate": 1.1115311700508026e-06, + "loss": 0.1736, + "step": 16775 + }, + { + "epoch": 0.85, + "grad_norm": 0.8458628499040595, + "learning_rate": 1.1107766424296606e-06, + "loss": 0.1515, + "step": 16776 + }, + { + "epoch": 0.85, + "grad_norm": 0.9829250335898947, + "learning_rate": 1.1100223559292035e-06, + "loss": 0.1588, + "step": 16777 + }, + { + "epoch": 0.85, + "grad_norm": 0.9993965169500876, + "learning_rate": 1.1092683105698943e-06, + "loss": 0.1751, + "step": 16778 + }, + { + "epoch": 0.85, + "grad_norm": 1.0923849752085002, + "learning_rate": 1.1085145063721814e-06, + "loss": 0.1757, + "step": 16779 + }, + { + "epoch": 0.85, + "grad_norm": 1.5467147394843548, + "learning_rate": 1.1077609433565173e-06, + "loss": 0.1703, + "step": 16780 + }, + { + "epoch": 0.85, + "grad_norm": 1.8122282454607503, + "learning_rate": 1.1070076215433367e-06, + "loss": 0.1697, + "step": 16781 + }, + { + "epoch": 0.85, + "grad_norm": 1.49125635176288, + "learning_rate": 1.1062545409530778e-06, + "loss": 0.1824, + "step": 16782 + }, + { + "epoch": 0.85, + "grad_norm": 1.0610533686743882, + "learning_rate": 1.1055017016061687e-06, + "loss": 0.1772, + "step": 16783 + }, + { + "epoch": 0.85, + "grad_norm": 0.9301093497530798, + "learning_rate": 1.1047491035230262e-06, + "loss": 0.1504, + "step": 16784 + }, + { + "epoch": 0.85, + "grad_norm": 1.229501628580653, + "learning_rate": 1.1039967467240687e-06, + "loss": 0.1473, + "step": 16785 + }, + { + "epoch": 0.85, + "grad_norm": 1.2435714542781264, + "learning_rate": 1.1032446312296995e-06, + "loss": 0.154, + "step": 16786 + }, + { + "epoch": 0.85, + "grad_norm": 0.9557053053008547, + "learning_rate": 1.102492757060325e-06, + "loss": 0.167, + "step": 16787 + }, + { + "epoch": 0.85, + "grad_norm": 2.9880025827983654, + "learning_rate": 1.1017411242363341e-06, + "loss": 0.147, + "step": 16788 + }, + { + "epoch": 0.85, + "grad_norm": 0.9448609802832272, + "learning_rate": 1.1009897327781204e-06, + "loss": 0.1805, + "step": 16789 + }, + { + "epoch": 0.85, + "grad_norm": 1.1482386908142481, + "learning_rate": 1.1002385827060602e-06, + "loss": 0.1621, + "step": 16790 + }, + { + "epoch": 0.85, + "grad_norm": 1.6367255102843754, + "learning_rate": 1.0994876740405314e-06, + "loss": 0.1771, + "step": 16791 + }, + { + "epoch": 0.85, + "grad_norm": 0.8989530912050467, + "learning_rate": 1.0987370068019021e-06, + "loss": 0.1603, + "step": 16792 + }, + { + "epoch": 0.85, + "grad_norm": 1.1553896505928736, + "learning_rate": 1.0979865810105371e-06, + "loss": 0.1405, + "step": 16793 + }, + { + "epoch": 0.85, + "grad_norm": 1.0036497212360274, + "learning_rate": 1.0972363966867861e-06, + "loss": 0.1559, + "step": 16794 + }, + { + "epoch": 0.85, + "grad_norm": 1.6289270363632566, + "learning_rate": 1.0964864538510022e-06, + "loss": 0.1923, + "step": 16795 + }, + { + "epoch": 0.85, + "grad_norm": 0.9544763364485803, + "learning_rate": 1.095736752523525e-06, + "loss": 0.1522, + "step": 16796 + }, + { + "epoch": 0.85, + "grad_norm": 1.0078909004640204, + "learning_rate": 1.094987292724693e-06, + "loss": 0.165, + "step": 16797 + }, + { + "epoch": 0.85, + "grad_norm": 1.453663246336247, + "learning_rate": 1.0942380744748315e-06, + "loss": 0.1784, + "step": 16798 + }, + { + "epoch": 0.85, + "grad_norm": 1.6144793012492105, + "learning_rate": 1.0934890977942646e-06, + "loss": 0.1675, + "step": 16799 + }, + { + "epoch": 0.85, + "grad_norm": 0.9762171377498543, + "learning_rate": 1.0927403627033129e-06, + "loss": 0.1633, + "step": 16800 + }, + { + "epoch": 0.85, + "grad_norm": 1.2662144545477028, + "learning_rate": 1.0919918692222785e-06, + "loss": 0.1654, + "step": 16801 + }, + { + "epoch": 0.85, + "grad_norm": 1.0651887956028305, + "learning_rate": 1.091243617371469e-06, + "loss": 0.1502, + "step": 16802 + }, + { + "epoch": 0.85, + "grad_norm": 0.8561373873884675, + "learning_rate": 1.0904956071711792e-06, + "loss": 0.1681, + "step": 16803 + }, + { + "epoch": 0.85, + "grad_norm": 1.7538486873095505, + "learning_rate": 1.0897478386417003e-06, + "loss": 0.1783, + "step": 16804 + }, + { + "epoch": 0.85, + "grad_norm": 1.043431297504966, + "learning_rate": 1.0890003118033132e-06, + "loss": 0.1787, + "step": 16805 + }, + { + "epoch": 0.85, + "grad_norm": 0.8472181882913274, + "learning_rate": 1.088253026676297e-06, + "loss": 0.1628, + "step": 16806 + }, + { + "epoch": 0.85, + "grad_norm": 0.9808913031348376, + "learning_rate": 1.08750598328092e-06, + "loss": 0.1607, + "step": 16807 + }, + { + "epoch": 0.85, + "grad_norm": 1.0819924372854248, + "learning_rate": 1.0867591816374456e-06, + "loss": 0.1702, + "step": 16808 + }, + { + "epoch": 0.85, + "grad_norm": 0.9482495395134791, + "learning_rate": 1.0860126217661326e-06, + "loss": 0.1651, + "step": 16809 + }, + { + "epoch": 0.85, + "grad_norm": 1.2523002163974752, + "learning_rate": 1.0852663036872324e-06, + "loss": 0.1727, + "step": 16810 + }, + { + "epoch": 0.85, + "grad_norm": 0.8939924425722398, + "learning_rate": 1.0845202274209842e-06, + "loss": 0.1637, + "step": 16811 + }, + { + "epoch": 0.85, + "grad_norm": 0.9276742350222884, + "learning_rate": 1.0837743929876321e-06, + "loss": 0.162, + "step": 16812 + }, + { + "epoch": 0.85, + "grad_norm": 0.9346752169512902, + "learning_rate": 1.0830288004073997e-06, + "loss": 0.1653, + "step": 16813 + }, + { + "epoch": 0.86, + "grad_norm": 0.8831929316357998, + "learning_rate": 1.0822834497005174e-06, + "loss": 0.1514, + "step": 16814 + }, + { + "epoch": 0.86, + "grad_norm": 0.8418644534814382, + "learning_rate": 1.0815383408871983e-06, + "loss": 0.1696, + "step": 16815 + }, + { + "epoch": 0.86, + "grad_norm": 0.9462951060958208, + "learning_rate": 1.080793473987657e-06, + "loss": 0.1689, + "step": 16816 + }, + { + "epoch": 0.86, + "grad_norm": 1.1273647831520743, + "learning_rate": 1.080048849022095e-06, + "loss": 0.1578, + "step": 16817 + }, + { + "epoch": 0.86, + "grad_norm": 1.064506698976965, + "learning_rate": 1.079304466010712e-06, + "loss": 0.152, + "step": 16818 + }, + { + "epoch": 0.86, + "grad_norm": 0.9350482498821943, + "learning_rate": 1.0785603249737008e-06, + "loss": 0.1572, + "step": 16819 + }, + { + "epoch": 0.86, + "grad_norm": 0.8666707172937299, + "learning_rate": 1.0778164259312418e-06, + "loss": 0.1439, + "step": 16820 + }, + { + "epoch": 0.86, + "grad_norm": 1.2224856485223698, + "learning_rate": 1.0770727689035198e-06, + "loss": 0.163, + "step": 16821 + }, + { + "epoch": 0.86, + "grad_norm": 1.5428579120628447, + "learning_rate": 1.0763293539107e-06, + "loss": 0.1616, + "step": 16822 + }, + { + "epoch": 0.86, + "grad_norm": 1.1434133419606083, + "learning_rate": 1.0755861809729518e-06, + "loss": 0.1518, + "step": 16823 + }, + { + "epoch": 0.86, + "grad_norm": 0.8105189335585022, + "learning_rate": 1.0748432501104322e-06, + "loss": 0.1448, + "step": 16824 + }, + { + "epoch": 0.86, + "grad_norm": 1.9990081041082408, + "learning_rate": 1.074100561343292e-06, + "loss": 0.171, + "step": 16825 + }, + { + "epoch": 0.86, + "grad_norm": 1.9316776439039638, + "learning_rate": 1.0733581146916793e-06, + "loss": 0.1706, + "step": 16826 + }, + { + "epoch": 0.86, + "grad_norm": 0.9572566077695434, + "learning_rate": 1.0726159101757327e-06, + "loss": 0.1766, + "step": 16827 + }, + { + "epoch": 0.86, + "grad_norm": 1.2773673420161995, + "learning_rate": 1.0718739478155827e-06, + "loss": 0.1915, + "step": 16828 + }, + { + "epoch": 0.86, + "grad_norm": 0.9132114296361398, + "learning_rate": 1.0711322276313586e-06, + "loss": 0.1636, + "step": 16829 + }, + { + "epoch": 0.86, + "grad_norm": 0.8430574453562424, + "learning_rate": 1.0703907496431743e-06, + "loss": 0.1673, + "step": 16830 + }, + { + "epoch": 0.86, + "grad_norm": 1.9082397791162895, + "learning_rate": 1.0696495138711472e-06, + "loss": 0.1566, + "step": 16831 + }, + { + "epoch": 0.86, + "grad_norm": 1.021608843054946, + "learning_rate": 1.06890852033538e-06, + "loss": 0.1549, + "step": 16832 + }, + { + "epoch": 0.86, + "grad_norm": 1.528181675833744, + "learning_rate": 1.0681677690559743e-06, + "loss": 0.1617, + "step": 16833 + }, + { + "epoch": 0.86, + "grad_norm": 1.0214552074791603, + "learning_rate": 1.0674272600530223e-06, + "loss": 0.1514, + "step": 16834 + }, + { + "epoch": 0.86, + "grad_norm": 1.0212046421885956, + "learning_rate": 1.0666869933466085e-06, + "loss": 0.1613, + "step": 16835 + }, + { + "epoch": 0.86, + "grad_norm": 1.9574791998002623, + "learning_rate": 1.065946968956818e-06, + "loss": 0.1637, + "step": 16836 + }, + { + "epoch": 0.86, + "grad_norm": 1.0040720899152797, + "learning_rate": 1.0652071869037172e-06, + "loss": 0.1667, + "step": 16837 + }, + { + "epoch": 0.86, + "grad_norm": 2.9335152830552658, + "learning_rate": 1.0644676472073789e-06, + "loss": 0.1617, + "step": 16838 + }, + { + "epoch": 0.86, + "grad_norm": 1.1742512919227117, + "learning_rate": 1.0637283498878592e-06, + "loss": 0.1684, + "step": 16839 + }, + { + "epoch": 0.86, + "grad_norm": 1.1858526578379234, + "learning_rate": 1.0629892949652133e-06, + "loss": 0.1756, + "step": 16840 + }, + { + "epoch": 0.86, + "grad_norm": 1.1309226874477336, + "learning_rate": 1.0622504824594859e-06, + "loss": 0.1786, + "step": 16841 + }, + { + "epoch": 0.86, + "grad_norm": 0.9353874388661649, + "learning_rate": 1.0615119123907214e-06, + "loss": 0.1552, + "step": 16842 + }, + { + "epoch": 0.86, + "grad_norm": 1.8959094708241735, + "learning_rate": 1.060773584778949e-06, + "loss": 0.1619, + "step": 16843 + }, + { + "epoch": 0.86, + "grad_norm": 1.089295782757551, + "learning_rate": 1.0600354996441986e-06, + "loss": 0.168, + "step": 16844 + }, + { + "epoch": 0.86, + "grad_norm": 0.9320442391189215, + "learning_rate": 1.0592976570064894e-06, + "loss": 0.1626, + "step": 16845 + }, + { + "epoch": 0.86, + "grad_norm": 1.1291690300178296, + "learning_rate": 1.058560056885838e-06, + "loss": 0.1674, + "step": 16846 + }, + { + "epoch": 0.86, + "grad_norm": 1.0781302845432605, + "learning_rate": 1.0578226993022488e-06, + "loss": 0.1682, + "step": 16847 + }, + { + "epoch": 0.86, + "grad_norm": 0.8344528027416395, + "learning_rate": 1.0570855842757255e-06, + "loss": 0.1715, + "step": 16848 + }, + { + "epoch": 0.86, + "grad_norm": 0.9485026795896946, + "learning_rate": 1.0563487118262583e-06, + "loss": 0.1544, + "step": 16849 + }, + { + "epoch": 0.86, + "grad_norm": 1.2366380866579516, + "learning_rate": 1.0556120819738403e-06, + "loss": 0.1746, + "step": 16850 + }, + { + "epoch": 0.86, + "grad_norm": 0.8921220522014538, + "learning_rate": 1.0548756947384475e-06, + "loss": 0.1773, + "step": 16851 + }, + { + "epoch": 0.86, + "grad_norm": 1.2807101360981994, + "learning_rate": 1.0541395501400564e-06, + "loss": 0.1778, + "step": 16852 + }, + { + "epoch": 0.86, + "grad_norm": 0.9657410535613355, + "learning_rate": 1.0534036481986375e-06, + "loss": 0.1783, + "step": 16853 + }, + { + "epoch": 0.86, + "grad_norm": 1.1306778682370806, + "learning_rate": 1.0526679889341484e-06, + "loss": 0.176, + "step": 16854 + }, + { + "epoch": 0.86, + "grad_norm": 0.8940590385512903, + "learning_rate": 1.0519325723665463e-06, + "loss": 0.1595, + "step": 16855 + }, + { + "epoch": 0.86, + "grad_norm": 1.067914678261924, + "learning_rate": 1.0511973985157775e-06, + "loss": 0.1704, + "step": 16856 + }, + { + "epoch": 0.86, + "grad_norm": 1.20909021403613, + "learning_rate": 1.0504624674017872e-06, + "loss": 0.1612, + "step": 16857 + }, + { + "epoch": 0.86, + "grad_norm": 1.0323079356985376, + "learning_rate": 1.0497277790445048e-06, + "loss": 0.1611, + "step": 16858 + }, + { + "epoch": 0.86, + "grad_norm": 0.8631782867591103, + "learning_rate": 1.0489933334638648e-06, + "loss": 0.1794, + "step": 16859 + }, + { + "epoch": 0.86, + "grad_norm": 1.1566319344478255, + "learning_rate": 1.0482591306797829e-06, + "loss": 0.1809, + "step": 16860 + }, + { + "epoch": 0.86, + "grad_norm": 1.1401885609706566, + "learning_rate": 1.0475251707121791e-06, + "loss": 0.1381, + "step": 16861 + }, + { + "epoch": 0.86, + "grad_norm": 1.5511236998777376, + "learning_rate": 1.0467914535809599e-06, + "loss": 0.152, + "step": 16862 + }, + { + "epoch": 0.86, + "grad_norm": 1.5753143634265765, + "learning_rate": 1.04605797930603e-06, + "loss": 0.1685, + "step": 16863 + }, + { + "epoch": 0.86, + "grad_norm": 1.1623307582591522, + "learning_rate": 1.0453247479072814e-06, + "loss": 0.1507, + "step": 16864 + }, + { + "epoch": 0.86, + "grad_norm": 1.1270531847545042, + "learning_rate": 1.0445917594046073e-06, + "loss": 0.1511, + "step": 16865 + }, + { + "epoch": 0.86, + "grad_norm": 2.0339182269036757, + "learning_rate": 1.043859013817885e-06, + "loss": 0.165, + "step": 16866 + }, + { + "epoch": 0.86, + "grad_norm": 1.0908660060529582, + "learning_rate": 1.0431265111669952e-06, + "loss": 0.152, + "step": 16867 + }, + { + "epoch": 0.86, + "grad_norm": 1.0022956563358218, + "learning_rate": 1.0423942514718043e-06, + "loss": 0.1529, + "step": 16868 + }, + { + "epoch": 0.86, + "grad_norm": 0.8736893626239377, + "learning_rate": 1.0416622347521732e-06, + "loss": 0.1714, + "step": 16869 + }, + { + "epoch": 0.86, + "grad_norm": 2.6477719177196377, + "learning_rate": 1.0409304610279603e-06, + "loss": 0.174, + "step": 16870 + }, + { + "epoch": 0.86, + "grad_norm": 1.3499421597610715, + "learning_rate": 1.0401989303190141e-06, + "loss": 0.1519, + "step": 16871 + }, + { + "epoch": 0.86, + "grad_norm": 1.1109980178568921, + "learning_rate": 1.039467642645181e-06, + "loss": 0.1718, + "step": 16872 + }, + { + "epoch": 0.86, + "grad_norm": 1.226219785772608, + "learning_rate": 1.038736598026291e-06, + "loss": 0.1719, + "step": 16873 + }, + { + "epoch": 0.86, + "grad_norm": 1.439364984206557, + "learning_rate": 1.03800579648218e-06, + "loss": 0.1446, + "step": 16874 + }, + { + "epoch": 0.86, + "grad_norm": 1.1988578257912608, + "learning_rate": 1.0372752380326645e-06, + "loss": 0.1809, + "step": 16875 + }, + { + "epoch": 0.86, + "grad_norm": 1.5934155252133868, + "learning_rate": 1.0365449226975677e-06, + "loss": 0.1803, + "step": 16876 + }, + { + "epoch": 0.86, + "grad_norm": 1.1377548167853007, + "learning_rate": 1.0358148504966935e-06, + "loss": 0.1548, + "step": 16877 + }, + { + "epoch": 0.86, + "grad_norm": 1.436773878572512, + "learning_rate": 1.0350850214498486e-06, + "loss": 0.1562, + "step": 16878 + }, + { + "epoch": 0.86, + "grad_norm": 0.9801581415679828, + "learning_rate": 1.0343554355768282e-06, + "loss": 0.171, + "step": 16879 + }, + { + "epoch": 0.86, + "grad_norm": 0.7754791213662016, + "learning_rate": 1.0336260928974252e-06, + "loss": 0.1658, + "step": 16880 + }, + { + "epoch": 0.86, + "grad_norm": 1.0383938709850917, + "learning_rate": 1.0328969934314181e-06, + "loss": 0.1607, + "step": 16881 + }, + { + "epoch": 0.86, + "grad_norm": 0.8080092997766583, + "learning_rate": 1.0321681371985892e-06, + "loss": 0.1422, + "step": 16882 + }, + { + "epoch": 0.86, + "grad_norm": 1.0266233977427306, + "learning_rate": 1.0314395242187037e-06, + "loss": 0.1649, + "step": 16883 + }, + { + "epoch": 0.86, + "grad_norm": 1.4384666085729678, + "learning_rate": 1.0307111545115301e-06, + "loss": 0.1672, + "step": 16884 + }, + { + "epoch": 0.86, + "grad_norm": 1.1492817195259588, + "learning_rate": 1.0299830280968205e-06, + "loss": 0.1709, + "step": 16885 + }, + { + "epoch": 0.86, + "grad_norm": 1.0985167211243825, + "learning_rate": 1.02925514499433e-06, + "loss": 0.1668, + "step": 16886 + }, + { + "epoch": 0.86, + "grad_norm": 1.032134037777604, + "learning_rate": 1.0285275052237987e-06, + "loss": 0.1665, + "step": 16887 + }, + { + "epoch": 0.86, + "grad_norm": 1.078501377404676, + "learning_rate": 1.027800108804966e-06, + "loss": 0.1602, + "step": 16888 + }, + { + "epoch": 0.86, + "grad_norm": 0.9876329553773102, + "learning_rate": 1.027072955757563e-06, + "loss": 0.1541, + "step": 16889 + }, + { + "epoch": 0.86, + "grad_norm": 1.0754178281906779, + "learning_rate": 1.026346046101312e-06, + "loss": 0.1579, + "step": 16890 + }, + { + "epoch": 0.86, + "grad_norm": 2.2273783014245425, + "learning_rate": 1.0256193798559322e-06, + "loss": 0.169, + "step": 16891 + }, + { + "epoch": 0.86, + "grad_norm": 1.2721252163758574, + "learning_rate": 1.0248929570411327e-06, + "loss": 0.1998, + "step": 16892 + }, + { + "epoch": 0.86, + "grad_norm": 0.9013353020840622, + "learning_rate": 1.0241667776766196e-06, + "loss": 0.1588, + "step": 16893 + }, + { + "epoch": 0.86, + "grad_norm": 0.9338838445718988, + "learning_rate": 1.0234408417820884e-06, + "loss": 0.151, + "step": 16894 + }, + { + "epoch": 0.86, + "grad_norm": 1.1278581521392987, + "learning_rate": 1.0227151493772324e-06, + "loss": 0.1667, + "step": 16895 + }, + { + "epoch": 0.86, + "grad_norm": 1.0304483129985431, + "learning_rate": 1.0219897004817337e-06, + "loss": 0.1555, + "step": 16896 + }, + { + "epoch": 0.86, + "grad_norm": 1.1925611182107805, + "learning_rate": 1.0212644951152718e-06, + "loss": 0.1734, + "step": 16897 + }, + { + "epoch": 0.86, + "grad_norm": 1.005882606197199, + "learning_rate": 1.0205395332975165e-06, + "loss": 0.1622, + "step": 16898 + }, + { + "epoch": 0.86, + "grad_norm": 1.0572915878224836, + "learning_rate": 1.019814815048137e-06, + "loss": 0.1744, + "step": 16899 + }, + { + "epoch": 0.86, + "grad_norm": 0.9945705083558409, + "learning_rate": 1.0190903403867847e-06, + "loss": 0.1624, + "step": 16900 + }, + { + "epoch": 0.86, + "grad_norm": 1.3335322820199638, + "learning_rate": 1.0183661093331165e-06, + "loss": 0.1597, + "step": 16901 + }, + { + "epoch": 0.86, + "grad_norm": 0.8729913239957138, + "learning_rate": 1.0176421219067734e-06, + "loss": 0.1777, + "step": 16902 + }, + { + "epoch": 0.86, + "grad_norm": 1.4870799694049563, + "learning_rate": 1.0169183781273962e-06, + "loss": 0.1648, + "step": 16903 + }, + { + "epoch": 0.86, + "grad_norm": 1.0104638473649, + "learning_rate": 1.0161948780146136e-06, + "loss": 0.1772, + "step": 16904 + }, + { + "epoch": 0.86, + "grad_norm": 1.696486495966895, + "learning_rate": 1.0154716215880523e-06, + "loss": 0.1871, + "step": 16905 + }, + { + "epoch": 0.86, + "grad_norm": 0.9349237651600266, + "learning_rate": 1.014748608867333e-06, + "loss": 0.1545, + "step": 16906 + }, + { + "epoch": 0.86, + "grad_norm": 1.1359011223613704, + "learning_rate": 1.0140258398720625e-06, + "loss": 0.1712, + "step": 16907 + }, + { + "epoch": 0.86, + "grad_norm": 1.0986364477647335, + "learning_rate": 1.0133033146218518e-06, + "loss": 0.1805, + "step": 16908 + }, + { + "epoch": 0.86, + "grad_norm": 0.7640555565366878, + "learning_rate": 1.012581033136294e-06, + "loss": 0.1664, + "step": 16909 + }, + { + "epoch": 0.86, + "grad_norm": 1.2273472753204244, + "learning_rate": 1.0118589954349845e-06, + "loss": 0.1764, + "step": 16910 + }, + { + "epoch": 0.86, + "grad_norm": 1.0131939116632758, + "learning_rate": 1.0111372015375054e-06, + "loss": 0.1775, + "step": 16911 + }, + { + "epoch": 0.86, + "grad_norm": 0.7986491351427615, + "learning_rate": 1.01041565146344e-06, + "loss": 0.1536, + "step": 16912 + }, + { + "epoch": 0.86, + "grad_norm": 1.1023363192896058, + "learning_rate": 1.009694345232356e-06, + "loss": 0.1632, + "step": 16913 + }, + { + "epoch": 0.86, + "grad_norm": 1.0947374161202323, + "learning_rate": 1.00897328286382e-06, + "loss": 0.1689, + "step": 16914 + }, + { + "epoch": 0.86, + "grad_norm": 1.305555176945722, + "learning_rate": 1.0082524643773916e-06, + "loss": 0.1832, + "step": 16915 + }, + { + "epoch": 0.86, + "grad_norm": 0.9597581032536183, + "learning_rate": 1.0075318897926255e-06, + "loss": 0.1506, + "step": 16916 + }, + { + "epoch": 0.86, + "grad_norm": 0.8767785279661219, + "learning_rate": 1.0068115591290628e-06, + "loss": 0.1808, + "step": 16917 + }, + { + "epoch": 0.86, + "grad_norm": 0.9193785028161882, + "learning_rate": 1.0060914724062454e-06, + "loss": 0.1582, + "step": 16918 + }, + { + "epoch": 0.86, + "grad_norm": 0.8982585434629353, + "learning_rate": 1.0053716296437034e-06, + "loss": 0.1622, + "step": 16919 + }, + { + "epoch": 0.86, + "grad_norm": 0.9873719152167707, + "learning_rate": 1.0046520308609664e-06, + "loss": 0.1576, + "step": 16920 + }, + { + "epoch": 0.86, + "grad_norm": 0.9879782842011524, + "learning_rate": 1.0039326760775492e-06, + "loss": 0.1362, + "step": 16921 + }, + { + "epoch": 0.86, + "grad_norm": 0.9697752776839753, + "learning_rate": 1.003213565312966e-06, + "loss": 0.1545, + "step": 16922 + }, + { + "epoch": 0.86, + "grad_norm": 0.8439307478501283, + "learning_rate": 1.0024946985867244e-06, + "loss": 0.1654, + "step": 16923 + }, + { + "epoch": 0.86, + "grad_norm": 0.9783270450751373, + "learning_rate": 1.0017760759183203e-06, + "loss": 0.1753, + "step": 16924 + }, + { + "epoch": 0.86, + "grad_norm": 1.0366479590941498, + "learning_rate": 1.0010576973272512e-06, + "loss": 0.1519, + "step": 16925 + }, + { + "epoch": 0.86, + "grad_norm": 1.4252173197466278, + "learning_rate": 1.0003395628329982e-06, + "loss": 0.1745, + "step": 16926 + }, + { + "epoch": 0.86, + "grad_norm": 1.0046565519981416, + "learning_rate": 9.996216724550445e-07, + "loss": 0.1617, + "step": 16927 + }, + { + "epoch": 0.86, + "grad_norm": 1.1973466911484718, + "learning_rate": 9.98904026212859e-07, + "loss": 0.1726, + "step": 16928 + }, + { + "epoch": 0.86, + "grad_norm": 1.023743851749609, + "learning_rate": 9.981866241259131e-07, + "loss": 0.1789, + "step": 16929 + }, + { + "epoch": 0.86, + "grad_norm": 0.9877633466971224, + "learning_rate": 9.974694662136609e-07, + "loss": 0.1487, + "step": 16930 + }, + { + "epoch": 0.86, + "grad_norm": 1.3325695332137415, + "learning_rate": 9.967525524955579e-07, + "loss": 0.1973, + "step": 16931 + }, + { + "epoch": 0.86, + "grad_norm": 1.949333856302942, + "learning_rate": 9.9603588299105e-07, + "loss": 0.1755, + "step": 16932 + }, + { + "epoch": 0.86, + "grad_norm": 0.9485511719358928, + "learning_rate": 9.9531945771958e-07, + "loss": 0.165, + "step": 16933 + }, + { + "epoch": 0.86, + "grad_norm": 0.9261506185655848, + "learning_rate": 9.946032767005752e-07, + "loss": 0.1637, + "step": 16934 + }, + { + "epoch": 0.86, + "grad_norm": 0.9878937517566453, + "learning_rate": 9.938873399534688e-07, + "loss": 0.1424, + "step": 16935 + }, + { + "epoch": 0.86, + "grad_norm": 0.9556000137729997, + "learning_rate": 9.931716474976738e-07, + "loss": 0.1391, + "step": 16936 + }, + { + "epoch": 0.86, + "grad_norm": 0.9511096528875889, + "learning_rate": 9.924561993526082e-07, + "loss": 0.1454, + "step": 16937 + }, + { + "epoch": 0.86, + "grad_norm": 1.5058829954646482, + "learning_rate": 9.917409955376778e-07, + "loss": 0.1548, + "step": 16938 + }, + { + "epoch": 0.86, + "grad_norm": 0.9453923029256704, + "learning_rate": 9.910260360722802e-07, + "loss": 0.1553, + "step": 16939 + }, + { + "epoch": 0.86, + "grad_norm": 3.388034607806563, + "learning_rate": 9.903113209758098e-07, + "loss": 0.1567, + "step": 16940 + }, + { + "epoch": 0.86, + "grad_norm": 1.1517223486250001, + "learning_rate": 9.895968502676533e-07, + "loss": 0.1749, + "step": 16941 + }, + { + "epoch": 0.86, + "grad_norm": 0.8428053004420055, + "learning_rate": 9.888826239671934e-07, + "loss": 0.1547, + "step": 16942 + }, + { + "epoch": 0.86, + "grad_norm": 0.8433502857727055, + "learning_rate": 9.881686420937986e-07, + "loss": 0.1471, + "step": 16943 + }, + { + "epoch": 0.86, + "grad_norm": 3.2771510094579086, + "learning_rate": 9.874549046668413e-07, + "loss": 0.1807, + "step": 16944 + }, + { + "epoch": 0.86, + "grad_norm": 0.9337510319746428, + "learning_rate": 9.867414117056763e-07, + "loss": 0.1473, + "step": 16945 + }, + { + "epoch": 0.86, + "grad_norm": 1.2713121575294868, + "learning_rate": 9.860281632296609e-07, + "loss": 0.1457, + "step": 16946 + }, + { + "epoch": 0.86, + "grad_norm": 1.0183252588457072, + "learning_rate": 9.8531515925814e-07, + "loss": 0.1712, + "step": 16947 + }, + { + "epoch": 0.86, + "grad_norm": 1.4441436451206293, + "learning_rate": 9.846023998104536e-07, + "loss": 0.1787, + "step": 16948 + }, + { + "epoch": 0.86, + "grad_norm": 0.8774364051310735, + "learning_rate": 9.838898849059364e-07, + "loss": 0.148, + "step": 16949 + }, + { + "epoch": 0.86, + "grad_norm": 0.8797088309314717, + "learning_rate": 9.831776145639182e-07, + "loss": 0.1535, + "step": 16950 + }, + { + "epoch": 0.86, + "grad_norm": 0.9116377986628547, + "learning_rate": 9.824655888037138e-07, + "loss": 0.1485, + "step": 16951 + }, + { + "epoch": 0.86, + "grad_norm": 1.921263524194139, + "learning_rate": 9.817538076446409e-07, + "loss": 0.1454, + "step": 16952 + }, + { + "epoch": 0.86, + "grad_norm": 0.8002452664261146, + "learning_rate": 9.810422711060042e-07, + "loss": 0.1457, + "step": 16953 + }, + { + "epoch": 0.86, + "grad_norm": 1.0264341071482261, + "learning_rate": 9.80330979207108e-07, + "loss": 0.1914, + "step": 16954 + }, + { + "epoch": 0.86, + "grad_norm": 0.9512470084198945, + "learning_rate": 9.796199319672416e-07, + "loss": 0.1529, + "step": 16955 + }, + { + "epoch": 0.86, + "grad_norm": 0.9112980601957212, + "learning_rate": 9.78909129405694e-07, + "loss": 0.1685, + "step": 16956 + }, + { + "epoch": 0.86, + "grad_norm": 1.146254875669652, + "learning_rate": 9.78198571541744e-07, + "loss": 0.1802, + "step": 16957 + }, + { + "epoch": 0.86, + "grad_norm": 0.978711356522037, + "learning_rate": 9.774882583946688e-07, + "loss": 0.1465, + "step": 16958 + }, + { + "epoch": 0.86, + "grad_norm": 0.8759311125604019, + "learning_rate": 9.767781899837348e-07, + "loss": 0.1703, + "step": 16959 + }, + { + "epoch": 0.86, + "grad_norm": 1.8155043599843381, + "learning_rate": 9.760683663282012e-07, + "loss": 0.1608, + "step": 16960 + }, + { + "epoch": 0.86, + "grad_norm": 1.3248677378427802, + "learning_rate": 9.753587874473235e-07, + "loss": 0.176, + "step": 16961 + }, + { + "epoch": 0.86, + "grad_norm": 1.061681244714024, + "learning_rate": 9.746494533603478e-07, + "loss": 0.1809, + "step": 16962 + }, + { + "epoch": 0.86, + "grad_norm": 0.8151729487186564, + "learning_rate": 9.739403640865164e-07, + "loss": 0.1664, + "step": 16963 + }, + { + "epoch": 0.86, + "grad_norm": 1.1969609035416964, + "learning_rate": 9.732315196450615e-07, + "loss": 0.1886, + "step": 16964 + }, + { + "epoch": 0.86, + "grad_norm": 1.3341288883277864, + "learning_rate": 9.725229200552123e-07, + "loss": 0.1751, + "step": 16965 + }, + { + "epoch": 0.86, + "grad_norm": 2.9657370156268987, + "learning_rate": 9.718145653361878e-07, + "loss": 0.197, + "step": 16966 + }, + { + "epoch": 0.86, + "grad_norm": 1.0373624882601507, + "learning_rate": 9.711064555072026e-07, + "loss": 0.1779, + "step": 16967 + }, + { + "epoch": 0.86, + "grad_norm": 1.0226918276226975, + "learning_rate": 9.703985905874646e-07, + "loss": 0.1604, + "step": 16968 + }, + { + "epoch": 0.86, + "grad_norm": 1.6206548522077415, + "learning_rate": 9.696909705961776e-07, + "loss": 0.17, + "step": 16969 + }, + { + "epoch": 0.86, + "grad_norm": 0.9537709638528886, + "learning_rate": 9.689835955525307e-07, + "loss": 0.1632, + "step": 16970 + }, + { + "epoch": 0.86, + "grad_norm": 1.2901100670976893, + "learning_rate": 9.682764654757149e-07, + "loss": 0.1727, + "step": 16971 + }, + { + "epoch": 0.86, + "grad_norm": 0.843111449613659, + "learning_rate": 9.675695803849094e-07, + "loss": 0.1562, + "step": 16972 + }, + { + "epoch": 0.86, + "grad_norm": 1.0565847073406056, + "learning_rate": 9.668629402992902e-07, + "loss": 0.1769, + "step": 16973 + }, + { + "epoch": 0.86, + "grad_norm": 0.8774607355627476, + "learning_rate": 9.661565452380228e-07, + "loss": 0.1604, + "step": 16974 + }, + { + "epoch": 0.86, + "grad_norm": 0.8361969432025993, + "learning_rate": 9.654503952202687e-07, + "loss": 0.1505, + "step": 16975 + }, + { + "epoch": 0.86, + "grad_norm": 2.1873928931233717, + "learning_rate": 9.647444902651847e-07, + "loss": 0.1659, + "step": 16976 + }, + { + "epoch": 0.86, + "grad_norm": 1.6912378470493363, + "learning_rate": 9.640388303919156e-07, + "loss": 0.164, + "step": 16977 + }, + { + "epoch": 0.86, + "grad_norm": 0.9739879821560055, + "learning_rate": 9.63333415619605e-07, + "loss": 0.1601, + "step": 16978 + }, + { + "epoch": 0.86, + "grad_norm": 0.9162667771255657, + "learning_rate": 9.626282459673842e-07, + "loss": 0.169, + "step": 16979 + }, + { + "epoch": 0.86, + "grad_norm": 1.0533004190917632, + "learning_rate": 9.619233214543833e-07, + "loss": 0.1456, + "step": 16980 + }, + { + "epoch": 0.86, + "grad_norm": 1.1535147922097448, + "learning_rate": 9.612186420997227e-07, + "loss": 0.1706, + "step": 16981 + }, + { + "epoch": 0.86, + "grad_norm": 1.2602299328060675, + "learning_rate": 9.605142079225183e-07, + "loss": 0.1586, + "step": 16982 + }, + { + "epoch": 0.86, + "grad_norm": 1.674843203986836, + "learning_rate": 9.598100189418736e-07, + "loss": 0.1344, + "step": 16983 + }, + { + "epoch": 0.86, + "grad_norm": 1.0166396943321496, + "learning_rate": 9.591060751768943e-07, + "loss": 0.1781, + "step": 16984 + }, + { + "epoch": 0.86, + "grad_norm": 1.0828299161586112, + "learning_rate": 9.584023766466721e-07, + "loss": 0.1953, + "step": 16985 + }, + { + "epoch": 0.86, + "grad_norm": 1.089162076325917, + "learning_rate": 9.576989233702993e-07, + "loss": 0.1856, + "step": 16986 + }, + { + "epoch": 0.86, + "grad_norm": 1.1043827985285775, + "learning_rate": 9.569957153668507e-07, + "loss": 0.1617, + "step": 16987 + }, + { + "epoch": 0.86, + "grad_norm": 1.0517660021678024, + "learning_rate": 9.562927526554066e-07, + "loss": 0.1583, + "step": 16988 + }, + { + "epoch": 0.86, + "grad_norm": 1.5950084990409343, + "learning_rate": 9.555900352550308e-07, + "loss": 0.1689, + "step": 16989 + }, + { + "epoch": 0.86, + "grad_norm": 0.9618689152709855, + "learning_rate": 9.548875631847875e-07, + "loss": 0.1714, + "step": 16990 + }, + { + "epoch": 0.86, + "grad_norm": 0.9033812724850739, + "learning_rate": 9.541853364637299e-07, + "loss": 0.1571, + "step": 16991 + }, + { + "epoch": 0.86, + "grad_norm": 0.9739470181448753, + "learning_rate": 9.534833551109035e-07, + "loss": 0.1388, + "step": 16992 + }, + { + "epoch": 0.86, + "grad_norm": 0.9077329611081956, + "learning_rate": 9.527816191453531e-07, + "loss": 0.1683, + "step": 16993 + }, + { + "epoch": 0.86, + "grad_norm": 0.8965000939025348, + "learning_rate": 9.520801285861126e-07, + "loss": 0.1708, + "step": 16994 + }, + { + "epoch": 0.86, + "grad_norm": 0.8336839169695421, + "learning_rate": 9.513788834522108e-07, + "loss": 0.1565, + "step": 16995 + }, + { + "epoch": 0.86, + "grad_norm": 1.5352836241948775, + "learning_rate": 9.506778837626652e-07, + "loss": 0.1605, + "step": 16996 + }, + { + "epoch": 0.86, + "grad_norm": 0.9124484802811895, + "learning_rate": 9.499771295364957e-07, + "loss": 0.1536, + "step": 16997 + }, + { + "epoch": 0.86, + "grad_norm": 1.10837471926042, + "learning_rate": 9.492766207927062e-07, + "loss": 0.1658, + "step": 16998 + }, + { + "epoch": 0.86, + "grad_norm": 1.166103641007375, + "learning_rate": 9.485763575503015e-07, + "loss": 0.1366, + "step": 16999 + }, + { + "epoch": 0.86, + "grad_norm": 0.8544681293800753, + "learning_rate": 9.47876339828272e-07, + "loss": 0.1594, + "step": 17000 + }, + { + "epoch": 0.86, + "grad_norm": 1.1404024846192349, + "learning_rate": 9.471765676456079e-07, + "loss": 0.1733, + "step": 17001 + }, + { + "epoch": 0.86, + "grad_norm": 1.1349490458873757, + "learning_rate": 9.464770410212909e-07, + "loss": 0.1495, + "step": 17002 + }, + { + "epoch": 0.86, + "grad_norm": 1.2651117346806118, + "learning_rate": 9.457777599742979e-07, + "loss": 0.1682, + "step": 17003 + }, + { + "epoch": 0.86, + "grad_norm": 1.2507880576208228, + "learning_rate": 9.450787245235926e-07, + "loss": 0.1689, + "step": 17004 + }, + { + "epoch": 0.86, + "grad_norm": 0.9300768396542201, + "learning_rate": 9.443799346881388e-07, + "loss": 0.1604, + "step": 17005 + }, + { + "epoch": 0.86, + "grad_norm": 1.0231991740589423, + "learning_rate": 9.436813904868902e-07, + "loss": 0.1793, + "step": 17006 + }, + { + "epoch": 0.86, + "grad_norm": 1.0992909713211707, + "learning_rate": 9.429830919387972e-07, + "loss": 0.1623, + "step": 17007 + }, + { + "epoch": 0.86, + "grad_norm": 0.870414843937221, + "learning_rate": 9.422850390627991e-07, + "loss": 0.1626, + "step": 17008 + }, + { + "epoch": 0.86, + "grad_norm": 1.3107033059662958, + "learning_rate": 9.415872318778285e-07, + "loss": 0.1745, + "step": 17009 + }, + { + "epoch": 0.86, + "grad_norm": 1.7913604862726662, + "learning_rate": 9.40889670402817e-07, + "loss": 0.1709, + "step": 17010 + }, + { + "epoch": 0.87, + "grad_norm": 1.3926226337980574, + "learning_rate": 9.401923546566838e-07, + "loss": 0.1497, + "step": 17011 + }, + { + "epoch": 0.87, + "grad_norm": 1.255256928642862, + "learning_rate": 9.394952846583472e-07, + "loss": 0.1872, + "step": 17012 + }, + { + "epoch": 0.87, + "grad_norm": 0.8369092897709353, + "learning_rate": 9.387984604267109e-07, + "loss": 0.1717, + "step": 17013 + }, + { + "epoch": 0.87, + "grad_norm": 0.8268688499431834, + "learning_rate": 9.381018819806797e-07, + "loss": 0.152, + "step": 17014 + }, + { + "epoch": 0.87, + "grad_norm": 2.7473049573367447, + "learning_rate": 9.374055493391455e-07, + "loss": 0.1608, + "step": 17015 + }, + { + "epoch": 0.87, + "grad_norm": 0.917773236637632, + "learning_rate": 9.367094625209983e-07, + "loss": 0.1705, + "step": 17016 + }, + { + "epoch": 0.87, + "grad_norm": 0.9269986492790633, + "learning_rate": 9.360136215451177e-07, + "loss": 0.1783, + "step": 17017 + }, + { + "epoch": 0.87, + "grad_norm": 0.9102869074299589, + "learning_rate": 9.353180264303818e-07, + "loss": 0.1713, + "step": 17018 + }, + { + "epoch": 0.87, + "grad_norm": 0.847096642866557, + "learning_rate": 9.346226771956523e-07, + "loss": 0.1719, + "step": 17019 + }, + { + "epoch": 0.87, + "grad_norm": 1.115828035313165, + "learning_rate": 9.339275738597975e-07, + "loss": 0.1617, + "step": 17020 + }, + { + "epoch": 0.87, + "grad_norm": 0.9756168196493309, + "learning_rate": 9.332327164416688e-07, + "loss": 0.173, + "step": 17021 + }, + { + "epoch": 0.87, + "grad_norm": 1.120436059367365, + "learning_rate": 9.325381049601157e-07, + "loss": 0.1631, + "step": 17022 + }, + { + "epoch": 0.87, + "grad_norm": 0.9422116076804968, + "learning_rate": 9.318437394339774e-07, + "loss": 0.1624, + "step": 17023 + }, + { + "epoch": 0.87, + "grad_norm": 2.919121126930504, + "learning_rate": 9.311496198820913e-07, + "loss": 0.1802, + "step": 17024 + }, + { + "epoch": 0.87, + "grad_norm": 3.408213788699369, + "learning_rate": 9.304557463232844e-07, + "loss": 0.1517, + "step": 17025 + }, + { + "epoch": 0.87, + "grad_norm": 1.1067485928654712, + "learning_rate": 9.297621187763761e-07, + "loss": 0.1771, + "step": 17026 + }, + { + "epoch": 0.87, + "grad_norm": 1.1005835224655929, + "learning_rate": 9.290687372601814e-07, + "loss": 0.1662, + "step": 17027 + }, + { + "epoch": 0.87, + "grad_norm": 1.5768878819440173, + "learning_rate": 9.283756017935108e-07, + "loss": 0.1446, + "step": 17028 + }, + { + "epoch": 0.87, + "grad_norm": 0.8738222182636233, + "learning_rate": 9.276827123951648e-07, + "loss": 0.1458, + "step": 17029 + }, + { + "epoch": 0.87, + "grad_norm": 0.9600910172910317, + "learning_rate": 9.269900690839373e-07, + "loss": 0.1729, + "step": 17030 + }, + { + "epoch": 0.87, + "grad_norm": 0.9455883463851124, + "learning_rate": 9.262976718786176e-07, + "loss": 0.1481, + "step": 17031 + }, + { + "epoch": 0.87, + "grad_norm": 1.4184803589090373, + "learning_rate": 9.256055207979841e-07, + "loss": 0.1701, + "step": 17032 + }, + { + "epoch": 0.87, + "grad_norm": 1.0587723639969662, + "learning_rate": 9.249136158608163e-07, + "loss": 0.1896, + "step": 17033 + }, + { + "epoch": 0.87, + "grad_norm": 1.0309704538708153, + "learning_rate": 9.242219570858757e-07, + "loss": 0.1813, + "step": 17034 + }, + { + "epoch": 0.87, + "grad_norm": 1.0474496824022568, + "learning_rate": 9.235305444919307e-07, + "loss": 0.1718, + "step": 17035 + }, + { + "epoch": 0.87, + "grad_norm": 3.23008560442467, + "learning_rate": 9.228393780977296e-07, + "loss": 0.1671, + "step": 17036 + }, + { + "epoch": 0.87, + "grad_norm": 0.9214761008903439, + "learning_rate": 9.22148457922023e-07, + "loss": 0.1703, + "step": 17037 + }, + { + "epoch": 0.87, + "grad_norm": 1.447550516118549, + "learning_rate": 9.214577839835514e-07, + "loss": 0.144, + "step": 17038 + }, + { + "epoch": 0.87, + "grad_norm": 2.4592960258901386, + "learning_rate": 9.207673563010533e-07, + "loss": 0.187, + "step": 17039 + }, + { + "epoch": 0.87, + "grad_norm": 2.8769187476094875, + "learning_rate": 9.200771748932513e-07, + "loss": 0.1853, + "step": 17040 + }, + { + "epoch": 0.87, + "grad_norm": 0.9573474621913002, + "learning_rate": 9.193872397788705e-07, + "loss": 0.1656, + "step": 17041 + }, + { + "epoch": 0.87, + "grad_norm": 1.6841387334089557, + "learning_rate": 9.186975509766216e-07, + "loss": 0.1465, + "step": 17042 + }, + { + "epoch": 0.87, + "grad_norm": 1.230747654561539, + "learning_rate": 9.180081085052162e-07, + "loss": 0.1594, + "step": 17043 + }, + { + "epoch": 0.87, + "grad_norm": 1.9144311106279304, + "learning_rate": 9.173189123833526e-07, + "loss": 0.1746, + "step": 17044 + }, + { + "epoch": 0.87, + "grad_norm": 0.969731855311959, + "learning_rate": 9.166299626297271e-07, + "loss": 0.1699, + "step": 17045 + }, + { + "epoch": 0.87, + "grad_norm": 1.0739965557393911, + "learning_rate": 9.159412592630279e-07, + "loss": 0.1706, + "step": 17046 + }, + { + "epoch": 0.87, + "grad_norm": 1.5170650586505527, + "learning_rate": 9.152528023019325e-07, + "loss": 0.1634, + "step": 17047 + }, + { + "epoch": 0.87, + "grad_norm": 1.441823463208156, + "learning_rate": 9.145645917651214e-07, + "loss": 0.152, + "step": 17048 + }, + { + "epoch": 0.87, + "grad_norm": 1.0354265673263898, + "learning_rate": 9.138766276712552e-07, + "loss": 0.1649, + "step": 17049 + }, + { + "epoch": 0.87, + "grad_norm": 1.1997505349833961, + "learning_rate": 9.131889100390024e-07, + "loss": 0.1737, + "step": 17050 + }, + { + "epoch": 0.87, + "grad_norm": 6.196969721625916, + "learning_rate": 9.125014388870101e-07, + "loss": 0.1546, + "step": 17051 + }, + { + "epoch": 0.87, + "grad_norm": 0.9409125676079062, + "learning_rate": 9.118142142339326e-07, + "loss": 0.1916, + "step": 17052 + }, + { + "epoch": 0.87, + "grad_norm": 1.046336542925199, + "learning_rate": 9.111272360984058e-07, + "loss": 0.158, + "step": 17053 + }, + { + "epoch": 0.87, + "grad_norm": 0.9049607362766768, + "learning_rate": 9.104405044990661e-07, + "loss": 0.1799, + "step": 17054 + }, + { + "epoch": 0.87, + "grad_norm": 1.0200458401151316, + "learning_rate": 9.097540194545407e-07, + "loss": 0.1589, + "step": 17055 + }, + { + "epoch": 0.87, + "grad_norm": 0.8076764602199117, + "learning_rate": 9.090677809834525e-07, + "loss": 0.1961, + "step": 17056 + }, + { + "epoch": 0.87, + "grad_norm": 1.0527309977219788, + "learning_rate": 9.083817891044133e-07, + "loss": 0.1601, + "step": 17057 + }, + { + "epoch": 0.87, + "grad_norm": 1.3595022523979385, + "learning_rate": 9.076960438360327e-07, + "loss": 0.1576, + "step": 17058 + }, + { + "epoch": 0.87, + "grad_norm": 0.9395096482761963, + "learning_rate": 9.070105451969091e-07, + "loss": 0.1624, + "step": 17059 + }, + { + "epoch": 0.87, + "grad_norm": 1.3353654985682613, + "learning_rate": 9.063252932056399e-07, + "loss": 0.1646, + "step": 17060 + }, + { + "epoch": 0.87, + "grad_norm": 1.1705992125708042, + "learning_rate": 9.056402878808102e-07, + "loss": 0.1655, + "step": 17061 + }, + { + "epoch": 0.87, + "grad_norm": 1.1283675058654659, + "learning_rate": 9.049555292409995e-07, + "loss": 0.1509, + "step": 17062 + }, + { + "epoch": 0.87, + "grad_norm": 0.8843788307703369, + "learning_rate": 9.042710173047841e-07, + "loss": 0.1508, + "step": 17063 + }, + { + "epoch": 0.87, + "grad_norm": 0.9171842968857437, + "learning_rate": 9.035867520907304e-07, + "loss": 0.1591, + "step": 17064 + }, + { + "epoch": 0.87, + "grad_norm": 0.9310906259002731, + "learning_rate": 9.029027336174023e-07, + "loss": 0.1681, + "step": 17065 + }, + { + "epoch": 0.87, + "grad_norm": 3.831040289624738, + "learning_rate": 9.022189619033495e-07, + "loss": 0.1806, + "step": 17066 + }, + { + "epoch": 0.87, + "grad_norm": 1.3134835153634918, + "learning_rate": 9.015354369671237e-07, + "loss": 0.185, + "step": 17067 + }, + { + "epoch": 0.87, + "grad_norm": 1.1991897700260155, + "learning_rate": 9.0085215882726e-07, + "loss": 0.1491, + "step": 17068 + }, + { + "epoch": 0.87, + "grad_norm": 1.0090186658831273, + "learning_rate": 9.001691275022984e-07, + "loss": 0.1621, + "step": 17069 + }, + { + "epoch": 0.87, + "grad_norm": 1.8294614748653688, + "learning_rate": 8.994863430107603e-07, + "loss": 0.154, + "step": 17070 + }, + { + "epoch": 0.87, + "grad_norm": 0.917475210040729, + "learning_rate": 8.988038053711701e-07, + "loss": 0.1641, + "step": 17071 + }, + { + "epoch": 0.87, + "grad_norm": 0.9202157068800895, + "learning_rate": 8.981215146020394e-07, + "loss": 0.1689, + "step": 17072 + }, + { + "epoch": 0.87, + "grad_norm": 1.1480664342916895, + "learning_rate": 8.974394707218792e-07, + "loss": 0.1692, + "step": 17073 + }, + { + "epoch": 0.87, + "grad_norm": 1.1053708115759986, + "learning_rate": 8.967576737491856e-07, + "loss": 0.1842, + "step": 17074 + }, + { + "epoch": 0.87, + "grad_norm": 0.9201851719480871, + "learning_rate": 8.960761237024573e-07, + "loss": 0.1747, + "step": 17075 + }, + { + "epoch": 0.87, + "grad_norm": 1.0454642154287848, + "learning_rate": 8.95394820600175e-07, + "loss": 0.1831, + "step": 17076 + }, + { + "epoch": 0.87, + "grad_norm": 1.0702369137636214, + "learning_rate": 8.94713764460825e-07, + "loss": 0.1695, + "step": 17077 + }, + { + "epoch": 0.87, + "grad_norm": 1.0091538146196728, + "learning_rate": 8.940329553028782e-07, + "loss": 0.1739, + "step": 17078 + }, + { + "epoch": 0.87, + "grad_norm": 1.0326055197662956, + "learning_rate": 8.933523931447996e-07, + "loss": 0.1647, + "step": 17079 + }, + { + "epoch": 0.87, + "grad_norm": 1.0176709480126727, + "learning_rate": 8.926720780050513e-07, + "loss": 0.1689, + "step": 17080 + }, + { + "epoch": 0.87, + "grad_norm": 1.0575101352035323, + "learning_rate": 8.919920099020884e-07, + "loss": 0.1482, + "step": 17081 + }, + { + "epoch": 0.87, + "grad_norm": 0.8582034731028793, + "learning_rate": 8.913121888543575e-07, + "loss": 0.1623, + "step": 17082 + }, + { + "epoch": 0.87, + "grad_norm": 1.43270078497795, + "learning_rate": 8.906326148802968e-07, + "loss": 0.1651, + "step": 17083 + }, + { + "epoch": 0.87, + "grad_norm": 1.2905694891493917, + "learning_rate": 8.89953287998343e-07, + "loss": 0.1678, + "step": 17084 + }, + { + "epoch": 0.87, + "grad_norm": 2.019744957485963, + "learning_rate": 8.892742082269179e-07, + "loss": 0.1683, + "step": 17085 + }, + { + "epoch": 0.87, + "grad_norm": 1.270537102765076, + "learning_rate": 8.885953755844467e-07, + "loss": 0.1568, + "step": 17086 + }, + { + "epoch": 0.87, + "grad_norm": 1.0910139306492328, + "learning_rate": 8.879167900893392e-07, + "loss": 0.1664, + "step": 17087 + }, + { + "epoch": 0.87, + "grad_norm": 4.833649447834106, + "learning_rate": 8.872384517600053e-07, + "loss": 0.1686, + "step": 17088 + }, + { + "epoch": 0.87, + "grad_norm": 0.9117543020684723, + "learning_rate": 8.86560360614841e-07, + "loss": 0.1638, + "step": 17089 + }, + { + "epoch": 0.87, + "grad_norm": 1.3114404092976746, + "learning_rate": 8.858825166722418e-07, + "loss": 0.1694, + "step": 17090 + }, + { + "epoch": 0.87, + "grad_norm": 0.9301874381300534, + "learning_rate": 8.852049199505941e-07, + "loss": 0.1502, + "step": 17091 + }, + { + "epoch": 0.87, + "grad_norm": 1.6722753021063932, + "learning_rate": 8.845275704682788e-07, + "loss": 0.1689, + "step": 17092 + }, + { + "epoch": 0.87, + "grad_norm": 1.0828515003759154, + "learning_rate": 8.838504682436666e-07, + "loss": 0.1768, + "step": 17093 + }, + { + "epoch": 0.87, + "grad_norm": 0.9844296891643936, + "learning_rate": 8.831736132951274e-07, + "loss": 0.1713, + "step": 17094 + }, + { + "epoch": 0.87, + "grad_norm": 0.9202759734089795, + "learning_rate": 8.824970056410187e-07, + "loss": 0.1538, + "step": 17095 + }, + { + "epoch": 0.87, + "grad_norm": 1.4026351905757806, + "learning_rate": 8.818206452996924e-07, + "loss": 0.1789, + "step": 17096 + }, + { + "epoch": 0.87, + "grad_norm": 2.615643107469265, + "learning_rate": 8.811445322894951e-07, + "loss": 0.1541, + "step": 17097 + }, + { + "epoch": 0.87, + "grad_norm": 1.1891702976515879, + "learning_rate": 8.804686666287688e-07, + "loss": 0.1564, + "step": 17098 + }, + { + "epoch": 0.87, + "grad_norm": 1.0217262338262187, + "learning_rate": 8.797930483358452e-07, + "loss": 0.167, + "step": 17099 + }, + { + "epoch": 0.87, + "grad_norm": 1.0987630781058417, + "learning_rate": 8.7911767742905e-07, + "loss": 0.1531, + "step": 17100 + }, + { + "epoch": 0.87, + "grad_norm": 1.0437865886697673, + "learning_rate": 8.784425539267038e-07, + "loss": 0.1842, + "step": 17101 + }, + { + "epoch": 0.87, + "grad_norm": 0.7743462507231501, + "learning_rate": 8.777676778471167e-07, + "loss": 0.1587, + "step": 17102 + }, + { + "epoch": 0.87, + "grad_norm": 0.8877364528856684, + "learning_rate": 8.770930492085983e-07, + "loss": 0.1584, + "step": 17103 + }, + { + "epoch": 0.87, + "grad_norm": 1.7920300896197154, + "learning_rate": 8.764186680294451e-07, + "loss": 0.179, + "step": 17104 + }, + { + "epoch": 0.87, + "grad_norm": 1.3643034663684792, + "learning_rate": 8.757445343279514e-07, + "loss": 0.1764, + "step": 17105 + }, + { + "epoch": 0.87, + "grad_norm": 0.9771049397743856, + "learning_rate": 8.750706481224014e-07, + "loss": 0.1593, + "step": 17106 + }, + { + "epoch": 0.87, + "grad_norm": 1.1156830433209912, + "learning_rate": 8.743970094310761e-07, + "loss": 0.1466, + "step": 17107 + }, + { + "epoch": 0.87, + "grad_norm": 1.2084500367718818, + "learning_rate": 8.737236182722464e-07, + "loss": 0.1795, + "step": 17108 + }, + { + "epoch": 0.87, + "grad_norm": 1.0696994804947388, + "learning_rate": 8.730504746641811e-07, + "loss": 0.1802, + "step": 17109 + }, + { + "epoch": 0.87, + "grad_norm": 0.9620892356217626, + "learning_rate": 8.723775786251354e-07, + "loss": 0.1533, + "step": 17110 + }, + { + "epoch": 0.87, + "grad_norm": 0.9459466502861161, + "learning_rate": 8.71704930173366e-07, + "loss": 0.136, + "step": 17111 + }, + { + "epoch": 0.87, + "grad_norm": 1.4660240342842592, + "learning_rate": 8.710325293271126e-07, + "loss": 0.1786, + "step": 17112 + }, + { + "epoch": 0.87, + "grad_norm": 1.228860617049589, + "learning_rate": 8.703603761046209e-07, + "loss": 0.1744, + "step": 17113 + }, + { + "epoch": 0.87, + "grad_norm": 0.841572818613883, + "learning_rate": 8.696884705241182e-07, + "loss": 0.1544, + "step": 17114 + }, + { + "epoch": 0.87, + "grad_norm": 1.1666695309699784, + "learning_rate": 8.690168126038301e-07, + "loss": 0.1818, + "step": 17115 + }, + { + "epoch": 0.87, + "grad_norm": 1.3421305915982424, + "learning_rate": 8.683454023619775e-07, + "loss": 0.158, + "step": 17116 + }, + { + "epoch": 0.87, + "grad_norm": 0.894492935229557, + "learning_rate": 8.676742398167704e-07, + "loss": 0.1709, + "step": 17117 + }, + { + "epoch": 0.87, + "grad_norm": 1.0467244786440408, + "learning_rate": 8.670033249864174e-07, + "loss": 0.1636, + "step": 17118 + }, + { + "epoch": 0.87, + "grad_norm": 1.2673527693727311, + "learning_rate": 8.66332657889114e-07, + "loss": 0.1778, + "step": 17119 + }, + { + "epoch": 0.87, + "grad_norm": 1.0575893042859574, + "learning_rate": 8.656622385430547e-07, + "loss": 0.1773, + "step": 17120 + }, + { + "epoch": 0.87, + "grad_norm": 1.1402664070621547, + "learning_rate": 8.649920669664202e-07, + "loss": 0.1673, + "step": 17121 + }, + { + "epoch": 0.87, + "grad_norm": 1.012078317229577, + "learning_rate": 8.643221431773952e-07, + "loss": 0.1697, + "step": 17122 + }, + { + "epoch": 0.87, + "grad_norm": 0.9484744378294417, + "learning_rate": 8.636524671941449e-07, + "loss": 0.1586, + "step": 17123 + }, + { + "epoch": 0.87, + "grad_norm": 0.9925048517257558, + "learning_rate": 8.629830390348382e-07, + "loss": 0.1731, + "step": 17124 + }, + { + "epoch": 0.87, + "grad_norm": 0.9828673798522906, + "learning_rate": 8.623138587176327e-07, + "loss": 0.1712, + "step": 17125 + }, + { + "epoch": 0.87, + "grad_norm": 1.7792158808509042, + "learning_rate": 8.616449262606819e-07, + "loss": 0.1464, + "step": 17126 + }, + { + "epoch": 0.87, + "grad_norm": 1.0968797454498505, + "learning_rate": 8.609762416821255e-07, + "loss": 0.171, + "step": 17127 + }, + { + "epoch": 0.87, + "grad_norm": 0.9891852371438813, + "learning_rate": 8.603078050001079e-07, + "loss": 0.1573, + "step": 17128 + }, + { + "epoch": 0.87, + "grad_norm": 2.21659724600745, + "learning_rate": 8.596396162327547e-07, + "loss": 0.1641, + "step": 17129 + }, + { + "epoch": 0.87, + "grad_norm": 1.141819368951561, + "learning_rate": 8.589716753981958e-07, + "loss": 0.1607, + "step": 17130 + }, + { + "epoch": 0.87, + "grad_norm": 1.566854918335254, + "learning_rate": 8.583039825145456e-07, + "loss": 0.1803, + "step": 17131 + }, + { + "epoch": 0.87, + "grad_norm": 1.0486597977148742, + "learning_rate": 8.576365375999151e-07, + "loss": 0.174, + "step": 17132 + }, + { + "epoch": 0.87, + "grad_norm": 0.8382358364538444, + "learning_rate": 8.569693406724089e-07, + "loss": 0.1534, + "step": 17133 + }, + { + "epoch": 0.87, + "grad_norm": 0.9025809558840777, + "learning_rate": 8.563023917501267e-07, + "loss": 0.1739, + "step": 17134 + }, + { + "epoch": 0.87, + "grad_norm": 1.0805583000627847, + "learning_rate": 8.556356908511598e-07, + "loss": 0.173, + "step": 17135 + }, + { + "epoch": 0.87, + "grad_norm": 1.0793640614355466, + "learning_rate": 8.549692379935904e-07, + "loss": 0.1488, + "step": 17136 + }, + { + "epoch": 0.87, + "grad_norm": 0.873788082427515, + "learning_rate": 8.543030331954971e-07, + "loss": 0.1416, + "step": 17137 + }, + { + "epoch": 0.87, + "grad_norm": 1.5731957662613225, + "learning_rate": 8.536370764749502e-07, + "loss": 0.1654, + "step": 17138 + }, + { + "epoch": 0.87, + "grad_norm": 1.055450540196962, + "learning_rate": 8.529713678500151e-07, + "loss": 0.1739, + "step": 17139 + }, + { + "epoch": 0.87, + "grad_norm": 0.9008341102955426, + "learning_rate": 8.523059073387474e-07, + "loss": 0.1509, + "step": 17140 + }, + { + "epoch": 0.87, + "grad_norm": 0.8612485946862515, + "learning_rate": 8.516406949591982e-07, + "loss": 0.1641, + "step": 17141 + }, + { + "epoch": 0.87, + "grad_norm": 1.0342607155695587, + "learning_rate": 8.509757307294109e-07, + "loss": 0.173, + "step": 17142 + }, + { + "epoch": 0.87, + "grad_norm": 0.9384308186441249, + "learning_rate": 8.503110146674265e-07, + "loss": 0.1612, + "step": 17143 + }, + { + "epoch": 0.87, + "grad_norm": 1.1232769793082433, + "learning_rate": 8.496465467912707e-07, + "loss": 0.1474, + "step": 17144 + }, + { + "epoch": 0.87, + "grad_norm": 1.414174825300634, + "learning_rate": 8.489823271189712e-07, + "loss": 0.1608, + "step": 17145 + }, + { + "epoch": 0.87, + "grad_norm": 2.43889602224765, + "learning_rate": 8.483183556685404e-07, + "loss": 0.1896, + "step": 17146 + }, + { + "epoch": 0.87, + "grad_norm": 1.271727693081388, + "learning_rate": 8.476546324579937e-07, + "loss": 0.165, + "step": 17147 + }, + { + "epoch": 0.87, + "grad_norm": 1.3473452346504362, + "learning_rate": 8.469911575053314e-07, + "loss": 0.1632, + "step": 17148 + }, + { + "epoch": 0.87, + "grad_norm": 0.9080943240602966, + "learning_rate": 8.463279308285488e-07, + "loss": 0.1635, + "step": 17149 + }, + { + "epoch": 0.87, + "grad_norm": 1.002355761226064, + "learning_rate": 8.456649524456384e-07, + "loss": 0.1702, + "step": 17150 + }, + { + "epoch": 0.87, + "grad_norm": 1.1178232175726797, + "learning_rate": 8.450022223745836e-07, + "loss": 0.1625, + "step": 17151 + }, + { + "epoch": 0.87, + "grad_norm": 0.9855127077410061, + "learning_rate": 8.44339740633362e-07, + "loss": 0.1483, + "step": 17152 + }, + { + "epoch": 0.87, + "grad_norm": 1.2085619193638986, + "learning_rate": 8.436775072399406e-07, + "loss": 0.1676, + "step": 17153 + }, + { + "epoch": 0.87, + "grad_norm": 1.106924279440494, + "learning_rate": 8.43015522212286e-07, + "loss": 0.162, + "step": 17154 + }, + { + "epoch": 0.87, + "grad_norm": 0.9675200693427021, + "learning_rate": 8.423537855683494e-07, + "loss": 0.1629, + "step": 17155 + }, + { + "epoch": 0.87, + "grad_norm": 1.046538127580827, + "learning_rate": 8.416922973260865e-07, + "loss": 0.1471, + "step": 17156 + }, + { + "epoch": 0.87, + "grad_norm": 0.9846894068376528, + "learning_rate": 8.410310575034353e-07, + "loss": 0.1523, + "step": 17157 + }, + { + "epoch": 0.87, + "grad_norm": 2.4189671040115495, + "learning_rate": 8.403700661183356e-07, + "loss": 0.1633, + "step": 17158 + }, + { + "epoch": 0.87, + "grad_norm": 1.1215961139129562, + "learning_rate": 8.397093231887143e-07, + "loss": 0.1657, + "step": 17159 + }, + { + "epoch": 0.87, + "grad_norm": 0.9463614162161704, + "learning_rate": 8.390488287324938e-07, + "loss": 0.1509, + "step": 17160 + }, + { + "epoch": 0.87, + "grad_norm": 1.7667384852227885, + "learning_rate": 8.383885827675919e-07, + "loss": 0.1589, + "step": 17161 + }, + { + "epoch": 0.87, + "grad_norm": 1.957756528157922, + "learning_rate": 8.377285853119188e-07, + "loss": 0.1726, + "step": 17162 + }, + { + "epoch": 0.87, + "grad_norm": 0.8567222388031468, + "learning_rate": 8.370688363833734e-07, + "loss": 0.1536, + "step": 17163 + }, + { + "epoch": 0.87, + "grad_norm": 1.5066683002465677, + "learning_rate": 8.364093359998549e-07, + "loss": 0.1671, + "step": 17164 + }, + { + "epoch": 0.87, + "grad_norm": 1.4478565414467472, + "learning_rate": 8.35750084179251e-07, + "loss": 0.1595, + "step": 17165 + }, + { + "epoch": 0.87, + "grad_norm": 0.9872017321212999, + "learning_rate": 8.350910809394419e-07, + "loss": 0.1471, + "step": 17166 + }, + { + "epoch": 0.87, + "grad_norm": 0.831180223310793, + "learning_rate": 8.344323262983056e-07, + "loss": 0.1693, + "step": 17167 + }, + { + "epoch": 0.87, + "grad_norm": 1.1506089263597188, + "learning_rate": 8.337738202737089e-07, + "loss": 0.1629, + "step": 17168 + }, + { + "epoch": 0.87, + "grad_norm": 1.5383110955910784, + "learning_rate": 8.331155628835174e-07, + "loss": 0.1663, + "step": 17169 + }, + { + "epoch": 0.87, + "grad_norm": 1.0783555363988484, + "learning_rate": 8.324575541455815e-07, + "loss": 0.1681, + "step": 17170 + }, + { + "epoch": 0.87, + "grad_norm": 4.396062909846658, + "learning_rate": 8.317997940777555e-07, + "loss": 0.1819, + "step": 17171 + }, + { + "epoch": 0.87, + "grad_norm": 0.8100993706187744, + "learning_rate": 8.311422826978743e-07, + "loss": 0.167, + "step": 17172 + }, + { + "epoch": 0.87, + "grad_norm": 1.1519109872457536, + "learning_rate": 8.304850200237801e-07, + "loss": 0.1864, + "step": 17173 + }, + { + "epoch": 0.87, + "grad_norm": 1.1145067988423865, + "learning_rate": 8.298280060732944e-07, + "loss": 0.1596, + "step": 17174 + }, + { + "epoch": 0.87, + "grad_norm": 0.9373264532460865, + "learning_rate": 8.29171240864245e-07, + "loss": 0.1651, + "step": 17175 + }, + { + "epoch": 0.87, + "grad_norm": 3.015744660484614, + "learning_rate": 8.285147244144409e-07, + "loss": 0.1858, + "step": 17176 + }, + { + "epoch": 0.87, + "grad_norm": 1.2143262440929645, + "learning_rate": 8.278584567416936e-07, + "loss": 0.1677, + "step": 17177 + }, + { + "epoch": 0.87, + "grad_norm": 0.8214701062742716, + "learning_rate": 8.272024378638033e-07, + "loss": 0.1551, + "step": 17178 + }, + { + "epoch": 0.87, + "grad_norm": 1.089253746380905, + "learning_rate": 8.265466677985667e-07, + "loss": 0.1829, + "step": 17179 + }, + { + "epoch": 0.87, + "grad_norm": 1.4385101425570883, + "learning_rate": 8.258911465637675e-07, + "loss": 0.1705, + "step": 17180 + }, + { + "epoch": 0.87, + "grad_norm": 0.9000616820305697, + "learning_rate": 8.252358741771915e-07, + "loss": 0.1539, + "step": 17181 + }, + { + "epoch": 0.87, + "grad_norm": 1.435773434610318, + "learning_rate": 8.245808506566088e-07, + "loss": 0.1807, + "step": 17182 + }, + { + "epoch": 0.87, + "grad_norm": 1.7531987585070594, + "learning_rate": 8.239260760197909e-07, + "loss": 0.1493, + "step": 17183 + }, + { + "epoch": 0.87, + "grad_norm": 0.9821965062763874, + "learning_rate": 8.232715502844968e-07, + "loss": 0.1592, + "step": 17184 + }, + { + "epoch": 0.87, + "grad_norm": 0.918881429900906, + "learning_rate": 8.226172734684779e-07, + "loss": 0.1879, + "step": 17185 + }, + { + "epoch": 0.87, + "grad_norm": 1.2048594743894412, + "learning_rate": 8.219632455894833e-07, + "loss": 0.1753, + "step": 17186 + }, + { + "epoch": 0.87, + "grad_norm": 1.7541521591860647, + "learning_rate": 8.213094666652544e-07, + "loss": 0.1596, + "step": 17187 + }, + { + "epoch": 0.87, + "grad_norm": 1.348671888935361, + "learning_rate": 8.206559367135258e-07, + "loss": 0.1725, + "step": 17188 + }, + { + "epoch": 0.87, + "grad_norm": 1.2557319159842633, + "learning_rate": 8.200026557520224e-07, + "loss": 0.1568, + "step": 17189 + }, + { + "epoch": 0.87, + "grad_norm": 1.1154418262195018, + "learning_rate": 8.193496237984677e-07, + "loss": 0.1638, + "step": 17190 + }, + { + "epoch": 0.87, + "grad_norm": 0.9471020486573046, + "learning_rate": 8.186968408705697e-07, + "loss": 0.1498, + "step": 17191 + }, + { + "epoch": 0.87, + "grad_norm": 0.8734603646022715, + "learning_rate": 8.18044306986041e-07, + "loss": 0.1548, + "step": 17192 + }, + { + "epoch": 0.87, + "grad_norm": 1.0798723064063107, + "learning_rate": 8.173920221625776e-07, + "loss": 0.1455, + "step": 17193 + }, + { + "epoch": 0.87, + "grad_norm": 0.8520548621086595, + "learning_rate": 8.167399864178749e-07, + "loss": 0.1777, + "step": 17194 + }, + { + "epoch": 0.87, + "grad_norm": 0.8022428202826044, + "learning_rate": 8.160881997696169e-07, + "loss": 0.1615, + "step": 17195 + }, + { + "epoch": 0.87, + "grad_norm": 1.5703257428569193, + "learning_rate": 8.154366622354881e-07, + "loss": 0.1734, + "step": 17196 + }, + { + "epoch": 0.87, + "grad_norm": 0.8567723378618287, + "learning_rate": 8.147853738331569e-07, + "loss": 0.1329, + "step": 17197 + }, + { + "epoch": 0.87, + "grad_norm": 1.858800319683022, + "learning_rate": 8.141343345802933e-07, + "loss": 0.1812, + "step": 17198 + }, + { + "epoch": 0.87, + "grad_norm": 0.9505104143578171, + "learning_rate": 8.134835444945521e-07, + "loss": 0.1556, + "step": 17199 + }, + { + "epoch": 0.87, + "grad_norm": 0.8372452692250099, + "learning_rate": 8.128330035935906e-07, + "loss": 0.148, + "step": 17200 + }, + { + "epoch": 0.87, + "grad_norm": 0.9504940848887924, + "learning_rate": 8.121827118950521e-07, + "loss": 0.1598, + "step": 17201 + }, + { + "epoch": 0.87, + "grad_norm": 1.674017728736617, + "learning_rate": 8.115326694165759e-07, + "loss": 0.1491, + "step": 17202 + }, + { + "epoch": 0.87, + "grad_norm": 1.0173530063273255, + "learning_rate": 8.108828761757948e-07, + "loss": 0.1596, + "step": 17203 + }, + { + "epoch": 0.87, + "grad_norm": 1.7623952342628695, + "learning_rate": 8.102333321903344e-07, + "loss": 0.1666, + "step": 17204 + }, + { + "epoch": 0.87, + "grad_norm": 1.194863581749474, + "learning_rate": 8.095840374778153e-07, + "loss": 0.1646, + "step": 17205 + }, + { + "epoch": 0.87, + "grad_norm": 0.9392827691446882, + "learning_rate": 8.089349920558465e-07, + "loss": 0.1578, + "step": 17206 + }, + { + "epoch": 0.88, + "grad_norm": 1.0778890805293149, + "learning_rate": 8.082861959420374e-07, + "loss": 0.17, + "step": 17207 + }, + { + "epoch": 0.88, + "grad_norm": 1.6435653004467239, + "learning_rate": 8.076376491539827e-07, + "loss": 0.1663, + "step": 17208 + }, + { + "epoch": 0.88, + "grad_norm": 1.2995772904802547, + "learning_rate": 8.069893517092775e-07, + "loss": 0.1644, + "step": 17209 + }, + { + "epoch": 0.88, + "grad_norm": 1.3175793197088113, + "learning_rate": 8.063413036255041e-07, + "loss": 0.1531, + "step": 17210 + }, + { + "epoch": 0.88, + "grad_norm": 1.0686728260509748, + "learning_rate": 8.05693504920243e-07, + "loss": 0.1778, + "step": 17211 + }, + { + "epoch": 0.88, + "grad_norm": 1.7564258312190575, + "learning_rate": 8.050459556110635e-07, + "loss": 0.1698, + "step": 17212 + }, + { + "epoch": 0.88, + "grad_norm": 1.2441905694267967, + "learning_rate": 8.043986557155315e-07, + "loss": 0.1581, + "step": 17213 + }, + { + "epoch": 0.88, + "grad_norm": 1.0520955464046744, + "learning_rate": 8.037516052512062e-07, + "loss": 0.1643, + "step": 17214 + }, + { + "epoch": 0.88, + "grad_norm": 1.3677567086585491, + "learning_rate": 8.031048042356393e-07, + "loss": 0.1551, + "step": 17215 + }, + { + "epoch": 0.88, + "grad_norm": 1.2042220749762846, + "learning_rate": 8.024582526863722e-07, + "loss": 0.1711, + "step": 17216 + }, + { + "epoch": 0.88, + "grad_norm": 1.0138279779706278, + "learning_rate": 8.018119506209454e-07, + "loss": 0.1576, + "step": 17217 + }, + { + "epoch": 0.88, + "grad_norm": 1.2803233628460835, + "learning_rate": 8.011658980568903e-07, + "loss": 0.1636, + "step": 17218 + }, + { + "epoch": 0.88, + "grad_norm": 0.8006138395157962, + "learning_rate": 8.005200950117275e-07, + "loss": 0.1514, + "step": 17219 + }, + { + "epoch": 0.88, + "grad_norm": 1.0802690996728705, + "learning_rate": 7.998745415029762e-07, + "loss": 0.1676, + "step": 17220 + }, + { + "epoch": 0.88, + "grad_norm": 1.227853461420169, + "learning_rate": 7.99229237548148e-07, + "loss": 0.1644, + "step": 17221 + }, + { + "epoch": 0.88, + "grad_norm": 1.0425876802294116, + "learning_rate": 7.985841831647489e-07, + "loss": 0.168, + "step": 17222 + }, + { + "epoch": 0.88, + "grad_norm": 1.0061004327128427, + "learning_rate": 7.979393783702704e-07, + "loss": 0.1621, + "step": 17223 + }, + { + "epoch": 0.88, + "grad_norm": 1.6922528782775406, + "learning_rate": 7.972948231822087e-07, + "loss": 0.1624, + "step": 17224 + }, + { + "epoch": 0.88, + "grad_norm": 0.9172038906437257, + "learning_rate": 7.966505176180428e-07, + "loss": 0.1756, + "step": 17225 + }, + { + "epoch": 0.88, + "grad_norm": 1.3744976825971134, + "learning_rate": 7.960064616952523e-07, + "loss": 0.1641, + "step": 17226 + }, + { + "epoch": 0.88, + "grad_norm": 1.268173030375666, + "learning_rate": 7.953626554313055e-07, + "loss": 0.1457, + "step": 17227 + }, + { + "epoch": 0.88, + "grad_norm": 1.4362620959002659, + "learning_rate": 7.947190988436681e-07, + "loss": 0.1489, + "step": 17228 + }, + { + "epoch": 0.88, + "grad_norm": 1.1619879743795989, + "learning_rate": 7.940757919497944e-07, + "loss": 0.1814, + "step": 17229 + }, + { + "epoch": 0.88, + "grad_norm": 0.9927394194170999, + "learning_rate": 7.934327347671333e-07, + "loss": 0.1714, + "step": 17230 + }, + { + "epoch": 0.88, + "grad_norm": 0.9038842448831277, + "learning_rate": 7.927899273131301e-07, + "loss": 0.1454, + "step": 17231 + }, + { + "epoch": 0.88, + "grad_norm": 1.3595367981253321, + "learning_rate": 7.921473696052206e-07, + "loss": 0.1528, + "step": 17232 + }, + { + "epoch": 0.88, + "grad_norm": 1.0757902365280851, + "learning_rate": 7.915050616608333e-07, + "loss": 0.1663, + "step": 17233 + }, + { + "epoch": 0.88, + "grad_norm": 1.2885867737737462, + "learning_rate": 7.90863003497393e-07, + "loss": 0.1816, + "step": 17234 + }, + { + "epoch": 0.88, + "grad_norm": 1.374042545612596, + "learning_rate": 7.902211951323135e-07, + "loss": 0.1461, + "step": 17235 + }, + { + "epoch": 0.88, + "grad_norm": 1.2403023423821855, + "learning_rate": 7.895796365830021e-07, + "loss": 0.1737, + "step": 17236 + }, + { + "epoch": 0.88, + "grad_norm": 1.1516657453656927, + "learning_rate": 7.889383278668661e-07, + "loss": 0.1486, + "step": 17237 + }, + { + "epoch": 0.88, + "grad_norm": 1.3981759897934891, + "learning_rate": 7.882972690012957e-07, + "loss": 0.1755, + "step": 17238 + }, + { + "epoch": 0.88, + "grad_norm": 1.3367001388773048, + "learning_rate": 7.876564600036818e-07, + "loss": 0.1781, + "step": 17239 + }, + { + "epoch": 0.88, + "grad_norm": 0.9591334225113681, + "learning_rate": 7.870159008914069e-07, + "loss": 0.163, + "step": 17240 + }, + { + "epoch": 0.88, + "grad_norm": 1.1275579944257983, + "learning_rate": 7.863755916818483e-07, + "loss": 0.1838, + "step": 17241 + }, + { + "epoch": 0.88, + "grad_norm": 1.4577251812287217, + "learning_rate": 7.85735532392371e-07, + "loss": 0.164, + "step": 17242 + }, + { + "epoch": 0.88, + "grad_norm": 1.0104330875157992, + "learning_rate": 7.850957230403378e-07, + "loss": 0.1717, + "step": 17243 + }, + { + "epoch": 0.88, + "grad_norm": 1.104711385825246, + "learning_rate": 7.844561636431036e-07, + "loss": 0.1879, + "step": 17244 + }, + { + "epoch": 0.88, + "grad_norm": 1.5995755578306345, + "learning_rate": 7.838168542180169e-07, + "loss": 0.1398, + "step": 17245 + }, + { + "epoch": 0.88, + "grad_norm": 1.1064120870453258, + "learning_rate": 7.83177794782417e-07, + "loss": 0.1694, + "step": 17246 + }, + { + "epoch": 0.88, + "grad_norm": 0.9905646769100281, + "learning_rate": 7.825389853536403e-07, + "loss": 0.1724, + "step": 17247 + }, + { + "epoch": 0.88, + "grad_norm": 1.0883238839154536, + "learning_rate": 7.819004259490148e-07, + "loss": 0.1671, + "step": 17248 + }, + { + "epoch": 0.88, + "grad_norm": 1.589772887880888, + "learning_rate": 7.812621165858625e-07, + "loss": 0.1989, + "step": 17249 + }, + { + "epoch": 0.88, + "grad_norm": 0.9767483701679833, + "learning_rate": 7.806240572814927e-07, + "loss": 0.1762, + "step": 17250 + }, + { + "epoch": 0.88, + "grad_norm": 0.9107924632802049, + "learning_rate": 7.799862480532194e-07, + "loss": 0.1513, + "step": 17251 + }, + { + "epoch": 0.88, + "grad_norm": 0.9780873235784965, + "learning_rate": 7.793486889183377e-07, + "loss": 0.1605, + "step": 17252 + }, + { + "epoch": 0.88, + "grad_norm": 1.2925286403963623, + "learning_rate": 7.787113798941449e-07, + "loss": 0.1701, + "step": 17253 + }, + { + "epoch": 0.88, + "grad_norm": 1.2855320737032634, + "learning_rate": 7.780743209979269e-07, + "loss": 0.1574, + "step": 17254 + }, + { + "epoch": 0.88, + "grad_norm": 1.0986164566088186, + "learning_rate": 7.774375122469624e-07, + "loss": 0.1723, + "step": 17255 + }, + { + "epoch": 0.88, + "grad_norm": 1.3614995748491658, + "learning_rate": 7.768009536585264e-07, + "loss": 0.1501, + "step": 17256 + }, + { + "epoch": 0.88, + "grad_norm": 1.6770303975905876, + "learning_rate": 7.76164645249885e-07, + "loss": 0.1551, + "step": 17257 + }, + { + "epoch": 0.88, + "grad_norm": 1.076382257356652, + "learning_rate": 7.755285870383011e-07, + "loss": 0.165, + "step": 17258 + }, + { + "epoch": 0.88, + "grad_norm": 1.5438451565334113, + "learning_rate": 7.748927790410221e-07, + "loss": 0.1292, + "step": 17259 + }, + { + "epoch": 0.88, + "grad_norm": 1.2707148714608554, + "learning_rate": 7.742572212753008e-07, + "loss": 0.1611, + "step": 17260 + }, + { + "epoch": 0.88, + "grad_norm": 1.1250090687287368, + "learning_rate": 7.736219137583701e-07, + "loss": 0.1839, + "step": 17261 + }, + { + "epoch": 0.88, + "grad_norm": 0.9268465624245334, + "learning_rate": 7.729868565074694e-07, + "loss": 0.1683, + "step": 17262 + }, + { + "epoch": 0.88, + "grad_norm": 0.8984489920500216, + "learning_rate": 7.723520495398185e-07, + "loss": 0.1573, + "step": 17263 + }, + { + "epoch": 0.88, + "grad_norm": 1.2633143592884477, + "learning_rate": 7.717174928726401e-07, + "loss": 0.1511, + "step": 17264 + }, + { + "epoch": 0.88, + "grad_norm": 1.4414316188321858, + "learning_rate": 7.710831865231461e-07, + "loss": 0.1825, + "step": 17265 + }, + { + "epoch": 0.88, + "grad_norm": 1.0270181484599707, + "learning_rate": 7.704491305085427e-07, + "loss": 0.193, + "step": 17266 + }, + { + "epoch": 0.88, + "grad_norm": 0.7770444000137979, + "learning_rate": 7.698153248460271e-07, + "loss": 0.1773, + "step": 17267 + }, + { + "epoch": 0.88, + "grad_norm": 1.3377018839713426, + "learning_rate": 7.691817695527936e-07, + "loss": 0.1898, + "step": 17268 + }, + { + "epoch": 0.88, + "grad_norm": 1.2142810653464722, + "learning_rate": 7.68548464646024e-07, + "loss": 0.1691, + "step": 17269 + }, + { + "epoch": 0.88, + "grad_norm": 1.133137337818727, + "learning_rate": 7.679154101428998e-07, + "loss": 0.1435, + "step": 17270 + }, + { + "epoch": 0.88, + "grad_norm": 1.0938459587105016, + "learning_rate": 7.672826060605931e-07, + "loss": 0.1618, + "step": 17271 + }, + { + "epoch": 0.88, + "grad_norm": 1.30171994693312, + "learning_rate": 7.666500524162646e-07, + "loss": 0.1507, + "step": 17272 + }, + { + "epoch": 0.88, + "grad_norm": 1.2058208735977238, + "learning_rate": 7.660177492270749e-07, + "loss": 0.1453, + "step": 17273 + }, + { + "epoch": 0.88, + "grad_norm": 1.1198741455359467, + "learning_rate": 7.653856965101747e-07, + "loss": 0.1589, + "step": 17274 + }, + { + "epoch": 0.88, + "grad_norm": 0.8746745470939669, + "learning_rate": 7.647538942827115e-07, + "loss": 0.145, + "step": 17275 + }, + { + "epoch": 0.88, + "grad_norm": 1.2037543741940957, + "learning_rate": 7.641223425618193e-07, + "loss": 0.1648, + "step": 17276 + }, + { + "epoch": 0.88, + "grad_norm": 0.8927725992603462, + "learning_rate": 7.634910413646313e-07, + "loss": 0.1376, + "step": 17277 + }, + { + "epoch": 0.88, + "grad_norm": 1.3217628032750142, + "learning_rate": 7.62859990708269e-07, + "loss": 0.1622, + "step": 17278 + }, + { + "epoch": 0.88, + "grad_norm": 1.1443610497786343, + "learning_rate": 7.622291906098523e-07, + "loss": 0.156, + "step": 17279 + }, + { + "epoch": 0.88, + "grad_norm": 1.2311048614980642, + "learning_rate": 7.615986410864895e-07, + "loss": 0.1614, + "step": 17280 + }, + { + "epoch": 0.88, + "grad_norm": 1.2165201760489044, + "learning_rate": 7.609683421552861e-07, + "loss": 0.1638, + "step": 17281 + }, + { + "epoch": 0.88, + "grad_norm": 1.5208694414991424, + "learning_rate": 7.603382938333382e-07, + "loss": 0.1596, + "step": 17282 + }, + { + "epoch": 0.88, + "grad_norm": 1.2366002878916877, + "learning_rate": 7.597084961377343e-07, + "loss": 0.1453, + "step": 17283 + }, + { + "epoch": 0.88, + "grad_norm": 1.956702883190355, + "learning_rate": 7.590789490855599e-07, + "loss": 0.1609, + "step": 17284 + }, + { + "epoch": 0.88, + "grad_norm": 1.7430471552680176, + "learning_rate": 7.584496526938933e-07, + "loss": 0.1864, + "step": 17285 + }, + { + "epoch": 0.88, + "grad_norm": 1.1854800332218853, + "learning_rate": 7.578206069797989e-07, + "loss": 0.1511, + "step": 17286 + }, + { + "epoch": 0.88, + "grad_norm": 1.5933026546514464, + "learning_rate": 7.57191811960345e-07, + "loss": 0.1418, + "step": 17287 + }, + { + "epoch": 0.88, + "grad_norm": 1.140500896412719, + "learning_rate": 7.565632676525858e-07, + "loss": 0.1621, + "step": 17288 + }, + { + "epoch": 0.88, + "grad_norm": 1.2614036948724896, + "learning_rate": 7.559349740735677e-07, + "loss": 0.1618, + "step": 17289 + }, + { + "epoch": 0.88, + "grad_norm": 1.0229832009884556, + "learning_rate": 7.55306931240335e-07, + "loss": 0.1566, + "step": 17290 + }, + { + "epoch": 0.88, + "grad_norm": 1.1404776623724786, + "learning_rate": 7.546791391699248e-07, + "loss": 0.151, + "step": 17291 + }, + { + "epoch": 0.88, + "grad_norm": 9.027669539432624, + "learning_rate": 7.540515978793661e-07, + "loss": 0.154, + "step": 17292 + }, + { + "epoch": 0.88, + "grad_norm": 0.9730493748099621, + "learning_rate": 7.534243073856784e-07, + "loss": 0.1629, + "step": 17293 + }, + { + "epoch": 0.88, + "grad_norm": 0.9318541669733853, + "learning_rate": 7.527972677058814e-07, + "loss": 0.1712, + "step": 17294 + }, + { + "epoch": 0.88, + "grad_norm": 1.128684780041153, + "learning_rate": 7.521704788569783e-07, + "loss": 0.1829, + "step": 17295 + }, + { + "epoch": 0.88, + "grad_norm": 1.0252096353600924, + "learning_rate": 7.515439408559744e-07, + "loss": 0.1749, + "step": 17296 + }, + { + "epoch": 0.88, + "grad_norm": 0.995727140664652, + "learning_rate": 7.509176537198626e-07, + "loss": 0.1692, + "step": 17297 + }, + { + "epoch": 0.88, + "grad_norm": 0.969441578451583, + "learning_rate": 7.502916174656338e-07, + "loss": 0.1593, + "step": 17298 + }, + { + "epoch": 0.88, + "grad_norm": 1.2329684303060555, + "learning_rate": 7.496658321102646e-07, + "loss": 0.1613, + "step": 17299 + }, + { + "epoch": 0.88, + "grad_norm": 0.9423363240103179, + "learning_rate": 7.490402976707323e-07, + "loss": 0.1572, + "step": 17300 + }, + { + "epoch": 0.88, + "grad_norm": 1.2564010048538554, + "learning_rate": 7.484150141640056e-07, + "loss": 0.173, + "step": 17301 + }, + { + "epoch": 0.88, + "grad_norm": 1.070435552031945, + "learning_rate": 7.477899816070444e-07, + "loss": 0.1659, + "step": 17302 + }, + { + "epoch": 0.88, + "grad_norm": 1.5672121229787728, + "learning_rate": 7.471652000168017e-07, + "loss": 0.1677, + "step": 17303 + }, + { + "epoch": 0.88, + "grad_norm": 1.0885715864729035, + "learning_rate": 7.465406694102273e-07, + "loss": 0.1572, + "step": 17304 + }, + { + "epoch": 0.88, + "grad_norm": 1.436480958250531, + "learning_rate": 7.459163898042599e-07, + "loss": 0.1626, + "step": 17305 + }, + { + "epoch": 0.88, + "grad_norm": 1.405159042399723, + "learning_rate": 7.452923612158303e-07, + "loss": 0.1751, + "step": 17306 + }, + { + "epoch": 0.88, + "grad_norm": 0.9347108488017326, + "learning_rate": 7.446685836618706e-07, + "loss": 0.1812, + "step": 17307 + }, + { + "epoch": 0.88, + "grad_norm": 1.16810165610296, + "learning_rate": 7.440450571592972e-07, + "loss": 0.1463, + "step": 17308 + }, + { + "epoch": 0.88, + "grad_norm": 1.045784749400454, + "learning_rate": 7.434217817250233e-07, + "loss": 0.1556, + "step": 17309 + }, + { + "epoch": 0.88, + "grad_norm": 1.2025574622438935, + "learning_rate": 7.427987573759576e-07, + "loss": 0.1656, + "step": 17310 + }, + { + "epoch": 0.88, + "grad_norm": 1.0111063760692782, + "learning_rate": 7.421759841289989e-07, + "loss": 0.1608, + "step": 17311 + }, + { + "epoch": 0.88, + "grad_norm": 1.522813352090518, + "learning_rate": 7.41553462001039e-07, + "loss": 0.1501, + "step": 17312 + }, + { + "epoch": 0.88, + "grad_norm": 1.2803206073977835, + "learning_rate": 7.409311910089645e-07, + "loss": 0.1971, + "step": 17313 + }, + { + "epoch": 0.88, + "grad_norm": 0.8767646651286921, + "learning_rate": 7.403091711696542e-07, + "loss": 0.1629, + "step": 17314 + }, + { + "epoch": 0.88, + "grad_norm": 1.2290321284414285, + "learning_rate": 7.396874024999811e-07, + "loss": 0.1695, + "step": 17315 + }, + { + "epoch": 0.88, + "grad_norm": 1.4577615994194053, + "learning_rate": 7.390658850168098e-07, + "loss": 0.1624, + "step": 17316 + }, + { + "epoch": 0.88, + "grad_norm": 1.235987193669687, + "learning_rate": 7.384446187369987e-07, + "loss": 0.1489, + "step": 17317 + }, + { + "epoch": 0.88, + "grad_norm": 0.7957026665132977, + "learning_rate": 7.378236036774e-07, + "loss": 0.1522, + "step": 17318 + }, + { + "epoch": 0.88, + "grad_norm": 0.8319149713315096, + "learning_rate": 7.372028398548614e-07, + "loss": 0.137, + "step": 17319 + }, + { + "epoch": 0.88, + "grad_norm": 1.2876008102788261, + "learning_rate": 7.365823272862183e-07, + "loss": 0.1818, + "step": 17320 + }, + { + "epoch": 0.88, + "grad_norm": 1.1938219095258311, + "learning_rate": 7.359620659883026e-07, + "loss": 0.1785, + "step": 17321 + }, + { + "epoch": 0.88, + "grad_norm": 1.1088942410264782, + "learning_rate": 7.35342055977939e-07, + "loss": 0.1956, + "step": 17322 + }, + { + "epoch": 0.88, + "grad_norm": 1.463606545831211, + "learning_rate": 7.347222972719459e-07, + "loss": 0.1475, + "step": 17323 + }, + { + "epoch": 0.88, + "grad_norm": 0.9638203971133041, + "learning_rate": 7.341027898871345e-07, + "loss": 0.1626, + "step": 17324 + }, + { + "epoch": 0.88, + "grad_norm": 1.7616047380436473, + "learning_rate": 7.334835338403056e-07, + "loss": 0.1497, + "step": 17325 + }, + { + "epoch": 0.88, + "grad_norm": 1.1910719387044761, + "learning_rate": 7.328645291482606e-07, + "loss": 0.1651, + "step": 17326 + }, + { + "epoch": 0.88, + "grad_norm": 1.595718060050182, + "learning_rate": 7.322457758277879e-07, + "loss": 0.1504, + "step": 17327 + }, + { + "epoch": 0.88, + "grad_norm": 0.7702924587303707, + "learning_rate": 7.316272738956731e-07, + "loss": 0.1508, + "step": 17328 + }, + { + "epoch": 0.88, + "grad_norm": 1.0275799989309966, + "learning_rate": 7.310090233686917e-07, + "loss": 0.1594, + "step": 17329 + }, + { + "epoch": 0.88, + "grad_norm": 0.9451932165357124, + "learning_rate": 7.303910242636147e-07, + "loss": 0.1729, + "step": 17330 + }, + { + "epoch": 0.88, + "grad_norm": 0.8652856570187278, + "learning_rate": 7.297732765972033e-07, + "loss": 0.158, + "step": 17331 + }, + { + "epoch": 0.88, + "grad_norm": 0.7772120827819856, + "learning_rate": 7.29155780386217e-07, + "loss": 0.1576, + "step": 17332 + }, + { + "epoch": 0.88, + "grad_norm": 0.8908650335082182, + "learning_rate": 7.285385356474017e-07, + "loss": 0.1343, + "step": 17333 + }, + { + "epoch": 0.88, + "grad_norm": 1.237847844903063, + "learning_rate": 7.27921542397505e-07, + "loss": 0.1735, + "step": 17334 + }, + { + "epoch": 0.88, + "grad_norm": 1.1559617010008287, + "learning_rate": 7.273048006532569e-07, + "loss": 0.1639, + "step": 17335 + }, + { + "epoch": 0.88, + "grad_norm": 1.1807040984900672, + "learning_rate": 7.266883104313916e-07, + "loss": 0.1661, + "step": 17336 + }, + { + "epoch": 0.88, + "grad_norm": 0.9171491441040258, + "learning_rate": 7.260720717486281e-07, + "loss": 0.1743, + "step": 17337 + }, + { + "epoch": 0.88, + "grad_norm": 0.9859557166131068, + "learning_rate": 7.254560846216863e-07, + "loss": 0.1809, + "step": 17338 + }, + { + "epoch": 0.88, + "grad_norm": 1.1130369511140106, + "learning_rate": 7.248403490672695e-07, + "loss": 0.1541, + "step": 17339 + }, + { + "epoch": 0.88, + "grad_norm": 0.8683717597560087, + "learning_rate": 7.242248651020845e-07, + "loss": 0.1517, + "step": 17340 + }, + { + "epoch": 0.88, + "grad_norm": 0.8444625384194985, + "learning_rate": 7.236096327428233e-07, + "loss": 0.1519, + "step": 17341 + }, + { + "epoch": 0.88, + "grad_norm": 1.2794386141678484, + "learning_rate": 7.229946520061737e-07, + "loss": 0.1743, + "step": 17342 + }, + { + "epoch": 0.88, + "grad_norm": 1.0310179364533691, + "learning_rate": 7.223799229088179e-07, + "loss": 0.1449, + "step": 17343 + }, + { + "epoch": 0.88, + "grad_norm": 0.7725121361002629, + "learning_rate": 7.217654454674305e-07, + "loss": 0.1587, + "step": 17344 + }, + { + "epoch": 0.88, + "grad_norm": 1.5382257187815973, + "learning_rate": 7.211512196986803e-07, + "loss": 0.1663, + "step": 17345 + }, + { + "epoch": 0.88, + "grad_norm": 1.0917289681842852, + "learning_rate": 7.205372456192272e-07, + "loss": 0.164, + "step": 17346 + }, + { + "epoch": 0.88, + "grad_norm": 0.9496468908631138, + "learning_rate": 7.199235232457258e-07, + "loss": 0.1825, + "step": 17347 + }, + { + "epoch": 0.88, + "grad_norm": 0.8778324978603581, + "learning_rate": 7.193100525948227e-07, + "loss": 0.1506, + "step": 17348 + }, + { + "epoch": 0.88, + "grad_norm": 1.910535826795464, + "learning_rate": 7.18696833683159e-07, + "loss": 0.1748, + "step": 17349 + }, + { + "epoch": 0.88, + "grad_norm": 1.096729609227987, + "learning_rate": 7.18083866527367e-07, + "loss": 0.1566, + "step": 17350 + }, + { + "epoch": 0.88, + "grad_norm": 0.8855534457008581, + "learning_rate": 7.174711511440757e-07, + "loss": 0.1595, + "step": 17351 + }, + { + "epoch": 0.88, + "grad_norm": 1.1438526198987724, + "learning_rate": 7.168586875499018e-07, + "loss": 0.1628, + "step": 17352 + }, + { + "epoch": 0.88, + "grad_norm": 0.9813627770034073, + "learning_rate": 7.162464757614606e-07, + "loss": 0.1505, + "step": 17353 + }, + { + "epoch": 0.88, + "grad_norm": 0.8963296520021135, + "learning_rate": 7.156345157953581e-07, + "loss": 0.1593, + "step": 17354 + }, + { + "epoch": 0.88, + "grad_norm": 1.0822470726148679, + "learning_rate": 7.150228076681954e-07, + "loss": 0.128, + "step": 17355 + }, + { + "epoch": 0.88, + "grad_norm": 1.0598933426021497, + "learning_rate": 7.144113513965623e-07, + "loss": 0.1831, + "step": 17356 + }, + { + "epoch": 0.88, + "grad_norm": 1.100669695491159, + "learning_rate": 7.138001469970468e-07, + "loss": 0.1648, + "step": 17357 + }, + { + "epoch": 0.88, + "grad_norm": 2.753646196389935, + "learning_rate": 7.131891944862269e-07, + "loss": 0.1572, + "step": 17358 + }, + { + "epoch": 0.88, + "grad_norm": 1.060548591987011, + "learning_rate": 7.125784938806723e-07, + "loss": 0.1664, + "step": 17359 + }, + { + "epoch": 0.88, + "grad_norm": 1.2714760170007762, + "learning_rate": 7.119680451969524e-07, + "loss": 0.1563, + "step": 17360 + }, + { + "epoch": 0.88, + "grad_norm": 1.391024615858319, + "learning_rate": 7.113578484516226e-07, + "loss": 0.1485, + "step": 17361 + }, + { + "epoch": 0.88, + "grad_norm": 1.0065629449928741, + "learning_rate": 7.107479036612375e-07, + "loss": 0.1482, + "step": 17362 + }, + { + "epoch": 0.88, + "grad_norm": 0.9222886347377515, + "learning_rate": 7.101382108423383e-07, + "loss": 0.1642, + "step": 17363 + }, + { + "epoch": 0.88, + "grad_norm": 0.9073932362794302, + "learning_rate": 7.095287700114673e-07, + "loss": 0.1695, + "step": 17364 + }, + { + "epoch": 0.88, + "grad_norm": 0.9807012578354181, + "learning_rate": 7.089195811851502e-07, + "loss": 0.1545, + "step": 17365 + }, + { + "epoch": 0.88, + "grad_norm": 1.1494039005467738, + "learning_rate": 7.083106443799171e-07, + "loss": 0.1607, + "step": 17366 + }, + { + "epoch": 0.88, + "grad_norm": 0.9978285693974382, + "learning_rate": 7.077019596122802e-07, + "loss": 0.1654, + "step": 17367 + }, + { + "epoch": 0.88, + "grad_norm": 1.099167488039827, + "learning_rate": 7.070935268987545e-07, + "loss": 0.1693, + "step": 17368 + }, + { + "epoch": 0.88, + "grad_norm": 0.8713895734861333, + "learning_rate": 7.064853462558397e-07, + "loss": 0.1564, + "step": 17369 + }, + { + "epoch": 0.88, + "grad_norm": 1.1131621662933266, + "learning_rate": 7.05877417700035e-07, + "loss": 0.1686, + "step": 17370 + }, + { + "epoch": 0.88, + "grad_norm": 1.0392605432584487, + "learning_rate": 7.052697412478304e-07, + "loss": 0.161, + "step": 17371 + }, + { + "epoch": 0.88, + "grad_norm": 0.9436772944917501, + "learning_rate": 7.046623169157107e-07, + "loss": 0.1495, + "step": 17372 + }, + { + "epoch": 0.88, + "grad_norm": 0.9696917040806536, + "learning_rate": 7.040551447201494e-07, + "loss": 0.1708, + "step": 17373 + }, + { + "epoch": 0.88, + "grad_norm": 0.9909409102170923, + "learning_rate": 7.034482246776187e-07, + "loss": 0.1651, + "step": 17374 + }, + { + "epoch": 0.88, + "grad_norm": 0.8782964173375589, + "learning_rate": 7.028415568045799e-07, + "loss": 0.1585, + "step": 17375 + }, + { + "epoch": 0.88, + "grad_norm": 1.4118814762288865, + "learning_rate": 7.022351411174866e-07, + "loss": 0.1663, + "step": 17376 + }, + { + "epoch": 0.88, + "grad_norm": 1.0070090083368075, + "learning_rate": 7.016289776327922e-07, + "loss": 0.1451, + "step": 17377 + }, + { + "epoch": 0.88, + "grad_norm": 1.237220825335278, + "learning_rate": 7.010230663669359e-07, + "loss": 0.1594, + "step": 17378 + }, + { + "epoch": 0.88, + "grad_norm": 1.4767425759880979, + "learning_rate": 7.004174073363546e-07, + "loss": 0.1521, + "step": 17379 + }, + { + "epoch": 0.88, + "grad_norm": 1.376161358185498, + "learning_rate": 6.998120005574749e-07, + "loss": 0.1733, + "step": 17380 + }, + { + "epoch": 0.88, + "grad_norm": 1.2946303494952718, + "learning_rate": 6.992068460467227e-07, + "loss": 0.1671, + "step": 17381 + }, + { + "epoch": 0.88, + "grad_norm": 0.9239760508107825, + "learning_rate": 6.986019438205082e-07, + "loss": 0.1603, + "step": 17382 + }, + { + "epoch": 0.88, + "grad_norm": 1.7276860307771067, + "learning_rate": 6.979972938952428e-07, + "loss": 0.1722, + "step": 17383 + }, + { + "epoch": 0.88, + "grad_norm": 1.2215560557678822, + "learning_rate": 6.973928962873244e-07, + "loss": 0.1796, + "step": 17384 + }, + { + "epoch": 0.88, + "grad_norm": 1.2635692699347436, + "learning_rate": 6.96788751013151e-07, + "loss": 0.1584, + "step": 17385 + }, + { + "epoch": 0.88, + "grad_norm": 0.8953773561330749, + "learning_rate": 6.961848580891062e-07, + "loss": 0.1671, + "step": 17386 + }, + { + "epoch": 0.88, + "grad_norm": 0.9207265981103875, + "learning_rate": 6.955812175315735e-07, + "loss": 0.1659, + "step": 17387 + }, + { + "epoch": 0.88, + "grad_norm": 1.3557730967996362, + "learning_rate": 6.949778293569253e-07, + "loss": 0.1504, + "step": 17388 + }, + { + "epoch": 0.88, + "grad_norm": 1.4123195119500518, + "learning_rate": 6.943746935815299e-07, + "loss": 0.1495, + "step": 17389 + }, + { + "epoch": 0.88, + "grad_norm": 1.3687925436747972, + "learning_rate": 6.937718102217461e-07, + "loss": 0.1978, + "step": 17390 + }, + { + "epoch": 0.88, + "grad_norm": 1.2313131796932406, + "learning_rate": 6.931691792939288e-07, + "loss": 0.1657, + "step": 17391 + }, + { + "epoch": 0.88, + "grad_norm": 0.9197371602670851, + "learning_rate": 6.925668008144204e-07, + "loss": 0.1782, + "step": 17392 + }, + { + "epoch": 0.88, + "grad_norm": 0.8727111408694579, + "learning_rate": 6.919646747995668e-07, + "loss": 0.1529, + "step": 17393 + }, + { + "epoch": 0.88, + "grad_norm": 0.8399857072424712, + "learning_rate": 6.913628012656959e-07, + "loss": 0.1724, + "step": 17394 + }, + { + "epoch": 0.88, + "grad_norm": 1.2380908736083993, + "learning_rate": 6.907611802291325e-07, + "loss": 0.154, + "step": 17395 + }, + { + "epoch": 0.88, + "grad_norm": 1.0970936765726857, + "learning_rate": 6.901598117061992e-07, + "loss": 0.1483, + "step": 17396 + }, + { + "epoch": 0.88, + "grad_norm": 1.0378284593686915, + "learning_rate": 6.895586957132061e-07, + "loss": 0.1594, + "step": 17397 + }, + { + "epoch": 0.88, + "grad_norm": 1.0045317486233272, + "learning_rate": 6.889578322664614e-07, + "loss": 0.1703, + "step": 17398 + }, + { + "epoch": 0.88, + "grad_norm": 2.096540638874496, + "learning_rate": 6.883572213822598e-07, + "loss": 0.1555, + "step": 17399 + }, + { + "epoch": 0.88, + "grad_norm": 1.6452198223024794, + "learning_rate": 6.87756863076896e-07, + "loss": 0.1605, + "step": 17400 + }, + { + "epoch": 0.88, + "grad_norm": 1.1708595471622938, + "learning_rate": 6.871567573666516e-07, + "loss": 0.1745, + "step": 17401 + }, + { + "epoch": 0.88, + "grad_norm": 0.9364607771802455, + "learning_rate": 6.865569042678066e-07, + "loss": 0.1622, + "step": 17402 + }, + { + "epoch": 0.88, + "grad_norm": 1.069816199579359, + "learning_rate": 6.859573037966316e-07, + "loss": 0.153, + "step": 17403 + }, + { + "epoch": 0.89, + "grad_norm": 1.110928969063031, + "learning_rate": 6.853579559693913e-07, + "loss": 0.1694, + "step": 17404 + }, + { + "epoch": 0.89, + "grad_norm": 0.982718671441663, + "learning_rate": 6.847588608023414e-07, + "loss": 0.1646, + "step": 17405 + }, + { + "epoch": 0.89, + "grad_norm": 0.9125047628380294, + "learning_rate": 6.841600183117336e-07, + "loss": 0.1578, + "step": 17406 + }, + { + "epoch": 0.89, + "grad_norm": 4.941456937741606, + "learning_rate": 6.835614285138115e-07, + "loss": 0.1549, + "step": 17407 + }, + { + "epoch": 0.89, + "grad_norm": 1.5330577556007376, + "learning_rate": 6.829630914248131e-07, + "loss": 0.1576, + "step": 17408 + }, + { + "epoch": 0.89, + "grad_norm": 1.6233044430724675, + "learning_rate": 6.823650070609666e-07, + "loss": 0.1449, + "step": 17409 + }, + { + "epoch": 0.89, + "grad_norm": 1.2905297846245838, + "learning_rate": 6.817671754384958e-07, + "loss": 0.167, + "step": 17410 + }, + { + "epoch": 0.89, + "grad_norm": 0.994848823740401, + "learning_rate": 6.811695965736176e-07, + "loss": 0.1641, + "step": 17411 + }, + { + "epoch": 0.89, + "grad_norm": 1.2405990289193678, + "learning_rate": 6.805722704825379e-07, + "loss": 0.1834, + "step": 17412 + }, + { + "epoch": 0.89, + "grad_norm": 1.263778459068319, + "learning_rate": 6.799751971814628e-07, + "loss": 0.1523, + "step": 17413 + }, + { + "epoch": 0.89, + "grad_norm": 1.227892496199799, + "learning_rate": 6.793783766865858e-07, + "loss": 0.1575, + "step": 17414 + }, + { + "epoch": 0.89, + "grad_norm": 0.9766224628091549, + "learning_rate": 6.787818090140985e-07, + "loss": 0.1624, + "step": 17415 + }, + { + "epoch": 0.89, + "grad_norm": 1.0826552449351832, + "learning_rate": 6.781854941801802e-07, + "loss": 0.1557, + "step": 17416 + }, + { + "epoch": 0.89, + "grad_norm": 1.076768625593469, + "learning_rate": 6.775894322010079e-07, + "loss": 0.1573, + "step": 17417 + }, + { + "epoch": 0.89, + "grad_norm": 1.107756832754881, + "learning_rate": 6.769936230927477e-07, + "loss": 0.1561, + "step": 17418 + }, + { + "epoch": 0.89, + "grad_norm": 0.9123922774250832, + "learning_rate": 6.763980668715631e-07, + "loss": 0.1576, + "step": 17419 + }, + { + "epoch": 0.89, + "grad_norm": 1.4473130817984259, + "learning_rate": 6.758027635536057e-07, + "loss": 0.1613, + "step": 17420 + }, + { + "epoch": 0.89, + "grad_norm": 1.038560087888518, + "learning_rate": 6.752077131550272e-07, + "loss": 0.1369, + "step": 17421 + }, + { + "epoch": 0.89, + "grad_norm": 1.1176387403979628, + "learning_rate": 6.746129156919645e-07, + "loss": 0.164, + "step": 17422 + }, + { + "epoch": 0.89, + "grad_norm": 1.4993249242433018, + "learning_rate": 6.740183711805537e-07, + "loss": 0.1457, + "step": 17423 + }, + { + "epoch": 0.89, + "grad_norm": 1.496667523309315, + "learning_rate": 6.734240796369207e-07, + "loss": 0.159, + "step": 17424 + }, + { + "epoch": 0.89, + "grad_norm": 1.389756354612381, + "learning_rate": 6.728300410771871e-07, + "loss": 0.1447, + "step": 17425 + }, + { + "epoch": 0.89, + "grad_norm": 1.2795009098863095, + "learning_rate": 6.722362555174644e-07, + "loss": 0.1725, + "step": 17426 + }, + { + "epoch": 0.89, + "grad_norm": 0.9403435892659728, + "learning_rate": 6.71642722973862e-07, + "loss": 0.1661, + "step": 17427 + }, + { + "epoch": 0.89, + "grad_norm": 0.8253890419597139, + "learning_rate": 6.710494434624781e-07, + "loss": 0.1522, + "step": 17428 + }, + { + "epoch": 0.89, + "grad_norm": 0.9326051873116562, + "learning_rate": 6.704564169994022e-07, + "loss": 0.1802, + "step": 17429 + }, + { + "epoch": 0.89, + "grad_norm": 1.464770167732049, + "learning_rate": 6.698636436007256e-07, + "loss": 0.1509, + "step": 17430 + }, + { + "epoch": 0.89, + "grad_norm": 1.1107217098498596, + "learning_rate": 6.692711232825222e-07, + "loss": 0.1652, + "step": 17431 + }, + { + "epoch": 0.89, + "grad_norm": 1.305902964651348, + "learning_rate": 6.686788560608671e-07, + "loss": 0.1673, + "step": 17432 + }, + { + "epoch": 0.89, + "grad_norm": 0.8709144814823717, + "learning_rate": 6.680868419518249e-07, + "loss": 0.1666, + "step": 17433 + }, + { + "epoch": 0.89, + "grad_norm": 1.1971517608356697, + "learning_rate": 6.674950809714553e-07, + "loss": 0.1649, + "step": 17434 + }, + { + "epoch": 0.89, + "grad_norm": 0.9360655756463602, + "learning_rate": 6.669035731358075e-07, + "loss": 0.1852, + "step": 17435 + }, + { + "epoch": 0.89, + "grad_norm": 1.0354548469117784, + "learning_rate": 6.663123184609299e-07, + "loss": 0.1611, + "step": 17436 + }, + { + "epoch": 0.89, + "grad_norm": 1.266579707399008, + "learning_rate": 6.657213169628551e-07, + "loss": 0.1612, + "step": 17437 + }, + { + "epoch": 0.89, + "grad_norm": 1.0852045706792273, + "learning_rate": 6.651305686576182e-07, + "loss": 0.1614, + "step": 17438 + }, + { + "epoch": 0.89, + "grad_norm": 1.4032412323848755, + "learning_rate": 6.645400735612417e-07, + "loss": 0.1786, + "step": 17439 + }, + { + "epoch": 0.89, + "grad_norm": 1.065819611607627, + "learning_rate": 6.639498316897419e-07, + "loss": 0.165, + "step": 17440 + }, + { + "epoch": 0.89, + "grad_norm": 0.8775842417319027, + "learning_rate": 6.633598430591304e-07, + "loss": 0.1523, + "step": 17441 + }, + { + "epoch": 0.89, + "grad_norm": 0.9855080549366656, + "learning_rate": 6.627701076854121e-07, + "loss": 0.1678, + "step": 17442 + }, + { + "epoch": 0.89, + "grad_norm": 0.8520611272160736, + "learning_rate": 6.621806255845797e-07, + "loss": 0.177, + "step": 17443 + }, + { + "epoch": 0.89, + "grad_norm": 1.3422182280118413, + "learning_rate": 6.615913967726273e-07, + "loss": 0.162, + "step": 17444 + }, + { + "epoch": 0.89, + "grad_norm": 0.8702945134702448, + "learning_rate": 6.610024212655364e-07, + "loss": 0.1519, + "step": 17445 + }, + { + "epoch": 0.89, + "grad_norm": 0.8964156650542517, + "learning_rate": 6.604136990792797e-07, + "loss": 0.1481, + "step": 17446 + }, + { + "epoch": 0.89, + "grad_norm": 1.5792301251622436, + "learning_rate": 6.598252302298313e-07, + "loss": 0.1649, + "step": 17447 + }, + { + "epoch": 0.89, + "grad_norm": 1.0581326679188312, + "learning_rate": 6.592370147331495e-07, + "loss": 0.159, + "step": 17448 + }, + { + "epoch": 0.89, + "grad_norm": 1.5677395291460763, + "learning_rate": 6.586490526051903e-07, + "loss": 0.1634, + "step": 17449 + }, + { + "epoch": 0.89, + "grad_norm": 0.9677135043618276, + "learning_rate": 6.580613438619044e-07, + "loss": 0.1741, + "step": 17450 + }, + { + "epoch": 0.89, + "grad_norm": 1.1615411184151843, + "learning_rate": 6.574738885192322e-07, + "loss": 0.1679, + "step": 17451 + }, + { + "epoch": 0.89, + "grad_norm": 1.0354720596249762, + "learning_rate": 6.568866865931078e-07, + "loss": 0.1655, + "step": 17452 + }, + { + "epoch": 0.89, + "grad_norm": 0.9829772905302709, + "learning_rate": 6.562997380994618e-07, + "loss": 0.1731, + "step": 17453 + }, + { + "epoch": 0.89, + "grad_norm": 1.1348821137608869, + "learning_rate": 6.557130430542114e-07, + "loss": 0.1869, + "step": 17454 + }, + { + "epoch": 0.89, + "grad_norm": 0.9861542333307887, + "learning_rate": 6.551266014732738e-07, + "loss": 0.1699, + "step": 17455 + }, + { + "epoch": 0.89, + "grad_norm": 1.2911984007830952, + "learning_rate": 6.54540413372553e-07, + "loss": 0.1657, + "step": 17456 + }, + { + "epoch": 0.89, + "grad_norm": 1.093145298497331, + "learning_rate": 6.53954478767953e-07, + "loss": 0.1652, + "step": 17457 + }, + { + "epoch": 0.89, + "grad_norm": 0.9231078419410501, + "learning_rate": 6.533687976753644e-07, + "loss": 0.1535, + "step": 17458 + }, + { + "epoch": 0.89, + "grad_norm": 2.001758271155291, + "learning_rate": 6.527833701106745e-07, + "loss": 0.1653, + "step": 17459 + }, + { + "epoch": 0.89, + "grad_norm": 0.9807167557360661, + "learning_rate": 6.521981960897639e-07, + "loss": 0.1545, + "step": 17460 + }, + { + "epoch": 0.89, + "grad_norm": 1.0752620461582922, + "learning_rate": 6.516132756285065e-07, + "loss": 0.1577, + "step": 17461 + }, + { + "epoch": 0.89, + "grad_norm": 0.9511644088197974, + "learning_rate": 6.510286087427664e-07, + "loss": 0.1579, + "step": 17462 + }, + { + "epoch": 0.89, + "grad_norm": 1.01542497369087, + "learning_rate": 6.504441954484042e-07, + "loss": 0.1461, + "step": 17463 + }, + { + "epoch": 0.89, + "grad_norm": 1.0220895756406778, + "learning_rate": 6.498600357612717e-07, + "loss": 0.1428, + "step": 17464 + }, + { + "epoch": 0.89, + "grad_norm": 1.1266382645524304, + "learning_rate": 6.492761296972117e-07, + "loss": 0.1868, + "step": 17465 + }, + { + "epoch": 0.89, + "grad_norm": 1.2207458765720829, + "learning_rate": 6.486924772720648e-07, + "loss": 0.1826, + "step": 17466 + }, + { + "epoch": 0.89, + "grad_norm": 1.2781352593174329, + "learning_rate": 6.481090785016631e-07, + "loss": 0.1654, + "step": 17467 + }, + { + "epoch": 0.89, + "grad_norm": 1.1189948022298877, + "learning_rate": 6.475259334018314e-07, + "loss": 0.1608, + "step": 17468 + }, + { + "epoch": 0.89, + "grad_norm": 1.0855749092863607, + "learning_rate": 6.46943041988387e-07, + "loss": 0.153, + "step": 17469 + }, + { + "epoch": 0.89, + "grad_norm": 0.9274200063910341, + "learning_rate": 6.463604042771409e-07, + "loss": 0.1548, + "step": 17470 + }, + { + "epoch": 0.89, + "grad_norm": 0.9209645234721234, + "learning_rate": 6.457780202838959e-07, + "loss": 0.1528, + "step": 17471 + }, + { + "epoch": 0.89, + "grad_norm": 0.9047802876618245, + "learning_rate": 6.451958900244526e-07, + "loss": 0.1596, + "step": 17472 + }, + { + "epoch": 0.89, + "grad_norm": 1.0981685953170108, + "learning_rate": 6.446140135145973e-07, + "loss": 0.152, + "step": 17473 + }, + { + "epoch": 0.89, + "grad_norm": 0.8935378090645346, + "learning_rate": 6.440323907701173e-07, + "loss": 0.1533, + "step": 17474 + }, + { + "epoch": 0.89, + "grad_norm": 0.8424974250053628, + "learning_rate": 6.434510218067846e-07, + "loss": 0.1673, + "step": 17475 + }, + { + "epoch": 0.89, + "grad_norm": 0.8613847064248749, + "learning_rate": 6.428699066403721e-07, + "loss": 0.1462, + "step": 17476 + }, + { + "epoch": 0.89, + "grad_norm": 1.0212232620116857, + "learning_rate": 6.422890452866415e-07, + "loss": 0.1787, + "step": 17477 + }, + { + "epoch": 0.89, + "grad_norm": 3.7089778175383827, + "learning_rate": 6.417084377613514e-07, + "loss": 0.1809, + "step": 17478 + }, + { + "epoch": 0.89, + "grad_norm": 1.0972190976416634, + "learning_rate": 6.411280840802459e-07, + "loss": 0.145, + "step": 17479 + }, + { + "epoch": 0.89, + "grad_norm": 1.1042160964760777, + "learning_rate": 6.405479842590723e-07, + "loss": 0.1591, + "step": 17480 + }, + { + "epoch": 0.89, + "grad_norm": 0.9220425796674508, + "learning_rate": 6.399681383135625e-07, + "loss": 0.1582, + "step": 17481 + }, + { + "epoch": 0.89, + "grad_norm": 0.9683037480178739, + "learning_rate": 6.39388546259444e-07, + "loss": 0.1504, + "step": 17482 + }, + { + "epoch": 0.89, + "grad_norm": 1.115613663388435, + "learning_rate": 6.388092081124398e-07, + "loss": 0.1617, + "step": 17483 + }, + { + "epoch": 0.89, + "grad_norm": 1.7121992562713897, + "learning_rate": 6.382301238882649e-07, + "loss": 0.1673, + "step": 17484 + }, + { + "epoch": 0.89, + "grad_norm": 1.0209392183063348, + "learning_rate": 6.37651293602628e-07, + "loss": 0.1584, + "step": 17485 + }, + { + "epoch": 0.89, + "grad_norm": 1.334734571271829, + "learning_rate": 6.370727172712276e-07, + "loss": 0.1617, + "step": 17486 + }, + { + "epoch": 0.89, + "grad_norm": 0.9918643664287256, + "learning_rate": 6.364943949097591e-07, + "loss": 0.1551, + "step": 17487 + }, + { + "epoch": 0.89, + "grad_norm": 0.9000122737801736, + "learning_rate": 6.359163265339085e-07, + "loss": 0.1652, + "step": 17488 + }, + { + "epoch": 0.89, + "grad_norm": 1.0864971415623828, + "learning_rate": 6.353385121593569e-07, + "loss": 0.1427, + "step": 17489 + }, + { + "epoch": 0.89, + "grad_norm": 1.266860418067606, + "learning_rate": 6.347609518017761e-07, + "loss": 0.1563, + "step": 17490 + }, + { + "epoch": 0.89, + "grad_norm": 1.0140534448588183, + "learning_rate": 6.341836454768358e-07, + "loss": 0.1619, + "step": 17491 + }, + { + "epoch": 0.89, + "grad_norm": 1.3410852805223372, + "learning_rate": 6.3360659320019e-07, + "loss": 0.1645, + "step": 17492 + }, + { + "epoch": 0.89, + "grad_norm": 1.255999216791831, + "learning_rate": 6.330297949874952e-07, + "loss": 0.1625, + "step": 17493 + }, + { + "epoch": 0.89, + "grad_norm": 1.2359667320876915, + "learning_rate": 6.324532508543967e-07, + "loss": 0.1781, + "step": 17494 + }, + { + "epoch": 0.89, + "grad_norm": 1.0118891462587718, + "learning_rate": 6.318769608165332e-07, + "loss": 0.1705, + "step": 17495 + }, + { + "epoch": 0.89, + "grad_norm": 0.8681988186278751, + "learning_rate": 6.313009248895352e-07, + "loss": 0.1352, + "step": 17496 + }, + { + "epoch": 0.89, + "grad_norm": 0.8505491563844542, + "learning_rate": 6.307251430890315e-07, + "loss": 0.1637, + "step": 17497 + }, + { + "epoch": 0.89, + "grad_norm": 0.9572439560590739, + "learning_rate": 6.301496154306363e-07, + "loss": 0.1578, + "step": 17498 + }, + { + "epoch": 0.89, + "grad_norm": 1.1260369067949432, + "learning_rate": 6.295743419299605e-07, + "loss": 0.1516, + "step": 17499 + }, + { + "epoch": 0.89, + "grad_norm": 0.9257905386818395, + "learning_rate": 6.289993226026114e-07, + "loss": 0.1556, + "step": 17500 + }, + { + "epoch": 0.89, + "grad_norm": 1.5032840341708493, + "learning_rate": 6.284245574641834e-07, + "loss": 0.1533, + "step": 17501 + }, + { + "epoch": 0.89, + "grad_norm": 0.8893792018006633, + "learning_rate": 6.278500465302684e-07, + "loss": 0.1584, + "step": 17502 + }, + { + "epoch": 0.89, + "grad_norm": 1.2981337750490072, + "learning_rate": 6.272757898164506e-07, + "loss": 0.1617, + "step": 17503 + }, + { + "epoch": 0.89, + "grad_norm": 2.0296535628324643, + "learning_rate": 6.267017873383085e-07, + "loss": 0.1752, + "step": 17504 + }, + { + "epoch": 0.89, + "grad_norm": 0.9195443136945383, + "learning_rate": 6.261280391114077e-07, + "loss": 0.1646, + "step": 17505 + }, + { + "epoch": 0.89, + "grad_norm": 0.9677007307761413, + "learning_rate": 6.255545451513146e-07, + "loss": 0.1379, + "step": 17506 + }, + { + "epoch": 0.89, + "grad_norm": 1.4911280417441737, + "learning_rate": 6.24981305473582e-07, + "loss": 0.1789, + "step": 17507 + }, + { + "epoch": 0.89, + "grad_norm": 1.4267656167569154, + "learning_rate": 6.244083200937634e-07, + "loss": 0.1667, + "step": 17508 + }, + { + "epoch": 0.89, + "grad_norm": 0.8906435122285429, + "learning_rate": 6.238355890273973e-07, + "loss": 0.1511, + "step": 17509 + }, + { + "epoch": 0.89, + "grad_norm": 1.120302944538742, + "learning_rate": 6.232631122900201e-07, + "loss": 0.1746, + "step": 17510 + }, + { + "epoch": 0.89, + "grad_norm": 1.0361573831639836, + "learning_rate": 6.226908898971596e-07, + "loss": 0.1612, + "step": 17511 + }, + { + "epoch": 0.89, + "grad_norm": 0.9827383407813749, + "learning_rate": 6.221189218643409e-07, + "loss": 0.1701, + "step": 17512 + }, + { + "epoch": 0.89, + "grad_norm": 1.2483588098983063, + "learning_rate": 6.21547208207075e-07, + "loss": 0.1732, + "step": 17513 + }, + { + "epoch": 0.89, + "grad_norm": 1.4254846113606228, + "learning_rate": 6.209757489408719e-07, + "loss": 0.1491, + "step": 17514 + }, + { + "epoch": 0.89, + "grad_norm": 1.438762605815942, + "learning_rate": 6.2040454408123e-07, + "loss": 0.1507, + "step": 17515 + }, + { + "epoch": 0.89, + "grad_norm": 1.0460399471502997, + "learning_rate": 6.198335936436451e-07, + "loss": 0.1535, + "step": 17516 + }, + { + "epoch": 0.89, + "grad_norm": 1.253299340529797, + "learning_rate": 6.192628976436044e-07, + "loss": 0.1462, + "step": 17517 + }, + { + "epoch": 0.89, + "grad_norm": 15.88619169160222, + "learning_rate": 6.186924560965856e-07, + "loss": 0.1825, + "step": 17518 + }, + { + "epoch": 0.89, + "grad_norm": 2.3292036213000302, + "learning_rate": 6.181222690180644e-07, + "loss": 0.1443, + "step": 17519 + }, + { + "epoch": 0.89, + "grad_norm": 1.03504688979119, + "learning_rate": 6.175523364235059e-07, + "loss": 0.1683, + "step": 17520 + }, + { + "epoch": 0.89, + "grad_norm": 1.253321521401051, + "learning_rate": 6.169826583283722e-07, + "loss": 0.1688, + "step": 17521 + }, + { + "epoch": 0.89, + "grad_norm": 1.0547134303571801, + "learning_rate": 6.164132347481122e-07, + "loss": 0.1816, + "step": 17522 + }, + { + "epoch": 0.89, + "grad_norm": 0.9580859608411995, + "learning_rate": 6.158440656981746e-07, + "loss": 0.1651, + "step": 17523 + }, + { + "epoch": 0.89, + "grad_norm": 1.509856539412932, + "learning_rate": 6.152751511939947e-07, + "loss": 0.158, + "step": 17524 + }, + { + "epoch": 0.89, + "grad_norm": 1.3455717781618253, + "learning_rate": 6.147064912510093e-07, + "loss": 0.1746, + "step": 17525 + }, + { + "epoch": 0.89, + "grad_norm": 1.227922347930265, + "learning_rate": 6.14138085884638e-07, + "loss": 0.1398, + "step": 17526 + }, + { + "epoch": 0.89, + "grad_norm": 1.1176263793989942, + "learning_rate": 6.135699351103031e-07, + "loss": 0.1848, + "step": 17527 + }, + { + "epoch": 0.89, + "grad_norm": 1.8362091794328037, + "learning_rate": 6.130020389434121e-07, + "loss": 0.155, + "step": 17528 + }, + { + "epoch": 0.89, + "grad_norm": 0.9411800601075747, + "learning_rate": 6.124343973993707e-07, + "loss": 0.1598, + "step": 17529 + }, + { + "epoch": 0.89, + "grad_norm": 1.0472801531129503, + "learning_rate": 6.118670104935765e-07, + "loss": 0.1622, + "step": 17530 + }, + { + "epoch": 0.89, + "grad_norm": 1.2564447978249575, + "learning_rate": 6.112998782414215e-07, + "loss": 0.1668, + "step": 17531 + }, + { + "epoch": 0.89, + "grad_norm": 0.8298318531080735, + "learning_rate": 6.107330006582878e-07, + "loss": 0.1592, + "step": 17532 + }, + { + "epoch": 0.89, + "grad_norm": 1.296337598576234, + "learning_rate": 6.101663777595501e-07, + "loss": 0.1737, + "step": 17533 + }, + { + "epoch": 0.89, + "grad_norm": 1.9101423442493164, + "learning_rate": 6.096000095605814e-07, + "loss": 0.1602, + "step": 17534 + }, + { + "epoch": 0.89, + "grad_norm": 0.9812618886749092, + "learning_rate": 6.090338960767417e-07, + "loss": 0.168, + "step": 17535 + }, + { + "epoch": 0.89, + "grad_norm": 1.2866841462265377, + "learning_rate": 6.084680373233875e-07, + "loss": 0.1703, + "step": 17536 + }, + { + "epoch": 0.89, + "grad_norm": 0.9056990924642179, + "learning_rate": 6.079024333158679e-07, + "loss": 0.1768, + "step": 17537 + }, + { + "epoch": 0.89, + "grad_norm": 1.0204475080312303, + "learning_rate": 6.073370840695269e-07, + "loss": 0.176, + "step": 17538 + }, + { + "epoch": 0.89, + "grad_norm": 1.3025219249136724, + "learning_rate": 6.067719895996971e-07, + "loss": 0.1602, + "step": 17539 + }, + { + "epoch": 0.89, + "grad_norm": 1.2543799126764876, + "learning_rate": 6.062071499217081e-07, + "loss": 0.162, + "step": 17540 + }, + { + "epoch": 0.89, + "grad_norm": 1.2796647743558915, + "learning_rate": 6.056425650508801e-07, + "loss": 0.1593, + "step": 17541 + }, + { + "epoch": 0.89, + "grad_norm": 0.9443775080108795, + "learning_rate": 6.050782350025297e-07, + "loss": 0.172, + "step": 17542 + }, + { + "epoch": 0.89, + "grad_norm": 1.2029689130455121, + "learning_rate": 6.045141597919613e-07, + "loss": 0.1822, + "step": 17543 + }, + { + "epoch": 0.89, + "grad_norm": 0.939996658457354, + "learning_rate": 6.039503394344782e-07, + "loss": 0.1472, + "step": 17544 + }, + { + "epoch": 0.89, + "grad_norm": 1.1660803223243563, + "learning_rate": 6.033867739453703e-07, + "loss": 0.1518, + "step": 17545 + }, + { + "epoch": 0.89, + "grad_norm": 0.8682073832006625, + "learning_rate": 6.028234633399277e-07, + "loss": 0.1602, + "step": 17546 + }, + { + "epoch": 0.89, + "grad_norm": 1.5234733333259054, + "learning_rate": 6.022604076334304e-07, + "loss": 0.1513, + "step": 17547 + }, + { + "epoch": 0.89, + "grad_norm": 0.9627404893959367, + "learning_rate": 6.016976068411506e-07, + "loss": 0.1587, + "step": 17548 + }, + { + "epoch": 0.89, + "grad_norm": 1.0389853213896696, + "learning_rate": 6.011350609783529e-07, + "loss": 0.1841, + "step": 17549 + }, + { + "epoch": 0.89, + "grad_norm": 1.319045427716174, + "learning_rate": 6.005727700602992e-07, + "loss": 0.1784, + "step": 17550 + }, + { + "epoch": 0.89, + "grad_norm": 0.9033564901415023, + "learning_rate": 6.000107341022399e-07, + "loss": 0.1841, + "step": 17551 + }, + { + "epoch": 0.89, + "grad_norm": 1.0968342902108015, + "learning_rate": 5.994489531194192e-07, + "loss": 0.1501, + "step": 17552 + }, + { + "epoch": 0.89, + "grad_norm": 1.2034136324685407, + "learning_rate": 5.988874271270773e-07, + "loss": 0.1458, + "step": 17553 + }, + { + "epoch": 0.89, + "grad_norm": 1.7376260021799919, + "learning_rate": 5.983261561404441e-07, + "loss": 0.1727, + "step": 17554 + }, + { + "epoch": 0.89, + "grad_norm": 1.1798377881298199, + "learning_rate": 5.977651401747442e-07, + "loss": 0.1375, + "step": 17555 + }, + { + "epoch": 0.89, + "grad_norm": 1.7601953087649302, + "learning_rate": 5.972043792451964e-07, + "loss": 0.1665, + "step": 17556 + }, + { + "epoch": 0.89, + "grad_norm": 1.0292216285996567, + "learning_rate": 5.966438733670121e-07, + "loss": 0.1547, + "step": 17557 + }, + { + "epoch": 0.89, + "grad_norm": 1.0016902697828487, + "learning_rate": 5.960836225553923e-07, + "loss": 0.1586, + "step": 17558 + }, + { + "epoch": 0.89, + "grad_norm": 0.8752761730699214, + "learning_rate": 5.955236268255372e-07, + "loss": 0.1724, + "step": 17559 + }, + { + "epoch": 0.89, + "grad_norm": 1.607044011795207, + "learning_rate": 5.949638861926333e-07, + "loss": 0.1407, + "step": 17560 + }, + { + "epoch": 0.89, + "grad_norm": 1.1743967307776313, + "learning_rate": 5.944044006718674e-07, + "loss": 0.1352, + "step": 17561 + }, + { + "epoch": 0.89, + "grad_norm": 1.0984485539787459, + "learning_rate": 5.938451702784109e-07, + "loss": 0.1715, + "step": 17562 + }, + { + "epoch": 0.89, + "grad_norm": 0.7723001616527106, + "learning_rate": 5.93286195027436e-07, + "loss": 0.1623, + "step": 17563 + }, + { + "epoch": 0.89, + "grad_norm": 2.3667467559842605, + "learning_rate": 5.927274749341039e-07, + "loss": 0.1737, + "step": 17564 + }, + { + "epoch": 0.89, + "grad_norm": 1.078327802462095, + "learning_rate": 5.921690100135713e-07, + "loss": 0.1619, + "step": 17565 + }, + { + "epoch": 0.89, + "grad_norm": 1.1087345773722312, + "learning_rate": 5.916108002809851e-07, + "loss": 0.1758, + "step": 17566 + }, + { + "epoch": 0.89, + "grad_norm": 0.931356983992569, + "learning_rate": 5.910528457514886e-07, + "loss": 0.1589, + "step": 17567 + }, + { + "epoch": 0.89, + "grad_norm": 0.9761997564311885, + "learning_rate": 5.904951464402154e-07, + "loss": 0.1704, + "step": 17568 + }, + { + "epoch": 0.89, + "grad_norm": 0.8754256176667727, + "learning_rate": 5.89937702362291e-07, + "loss": 0.171, + "step": 17569 + }, + { + "epoch": 0.89, + "grad_norm": 0.9700307877911704, + "learning_rate": 5.89380513532839e-07, + "loss": 0.1542, + "step": 17570 + }, + { + "epoch": 0.89, + "grad_norm": 0.8100403713688008, + "learning_rate": 5.888235799669705e-07, + "loss": 0.1737, + "step": 17571 + }, + { + "epoch": 0.89, + "grad_norm": 0.8726291967736584, + "learning_rate": 5.882669016797948e-07, + "loss": 0.1742, + "step": 17572 + }, + { + "epoch": 0.89, + "grad_norm": 1.1240848985822962, + "learning_rate": 5.877104786864107e-07, + "loss": 0.1621, + "step": 17573 + }, + { + "epoch": 0.89, + "grad_norm": 1.0396680948721067, + "learning_rate": 5.871543110019128e-07, + "loss": 0.1665, + "step": 17574 + }, + { + "epoch": 0.89, + "grad_norm": 1.1313393019502156, + "learning_rate": 5.865983986413848e-07, + "loss": 0.1529, + "step": 17575 + }, + { + "epoch": 0.89, + "grad_norm": 1.0259427094500448, + "learning_rate": 5.860427416199077e-07, + "loss": 0.1855, + "step": 17576 + }, + { + "epoch": 0.89, + "grad_norm": 1.0316922210292752, + "learning_rate": 5.854873399525518e-07, + "loss": 0.1643, + "step": 17577 + }, + { + "epoch": 0.89, + "grad_norm": 0.8627959272048978, + "learning_rate": 5.84932193654385e-07, + "loss": 0.1958, + "step": 17578 + }, + { + "epoch": 0.89, + "grad_norm": 0.8468770809456929, + "learning_rate": 5.843773027404631e-07, + "loss": 0.1748, + "step": 17579 + }, + { + "epoch": 0.89, + "grad_norm": 0.9074617601974535, + "learning_rate": 5.838226672258374e-07, + "loss": 0.1806, + "step": 17580 + }, + { + "epoch": 0.89, + "grad_norm": 0.8759627401266152, + "learning_rate": 5.832682871255546e-07, + "loss": 0.1743, + "step": 17581 + }, + { + "epoch": 0.89, + "grad_norm": 1.1549145279627093, + "learning_rate": 5.827141624546528e-07, + "loss": 0.1754, + "step": 17582 + }, + { + "epoch": 0.89, + "grad_norm": 0.9652020221208394, + "learning_rate": 5.8216029322816e-07, + "loss": 0.1497, + "step": 17583 + }, + { + "epoch": 0.89, + "grad_norm": 1.2358436435062201, + "learning_rate": 5.816066794611019e-07, + "loss": 0.1467, + "step": 17584 + }, + { + "epoch": 0.89, + "grad_norm": 1.2438610313138372, + "learning_rate": 5.810533211684954e-07, + "loss": 0.1646, + "step": 17585 + }, + { + "epoch": 0.89, + "grad_norm": 0.8587535130383744, + "learning_rate": 5.805002183653474e-07, + "loss": 0.1407, + "step": 17586 + }, + { + "epoch": 0.89, + "grad_norm": 1.1335931999480722, + "learning_rate": 5.799473710666659e-07, + "loss": 0.1711, + "step": 17587 + }, + { + "epoch": 0.89, + "grad_norm": 1.4396105891990831, + "learning_rate": 5.793947792874411e-07, + "loss": 0.1638, + "step": 17588 + }, + { + "epoch": 0.89, + "grad_norm": 0.9790535477952904, + "learning_rate": 5.788424430426653e-07, + "loss": 0.1857, + "step": 17589 + }, + { + "epoch": 0.89, + "grad_norm": 0.8856226673905241, + "learning_rate": 5.782903623473202e-07, + "loss": 0.1668, + "step": 17590 + }, + { + "epoch": 0.89, + "grad_norm": 1.067043539477932, + "learning_rate": 5.777385372163824e-07, + "loss": 0.174, + "step": 17591 + }, + { + "epoch": 0.89, + "grad_norm": 0.8740321505398196, + "learning_rate": 5.771869676648178e-07, + "loss": 0.15, + "step": 17592 + }, + { + "epoch": 0.89, + "grad_norm": 0.8681917829340098, + "learning_rate": 5.766356537075913e-07, + "loss": 0.1538, + "step": 17593 + }, + { + "epoch": 0.89, + "grad_norm": 0.9506182517731803, + "learning_rate": 5.760845953596527e-07, + "loss": 0.1759, + "step": 17594 + }, + { + "epoch": 0.89, + "grad_norm": 1.3177971254646998, + "learning_rate": 5.755337926359528e-07, + "loss": 0.1631, + "step": 17595 + }, + { + "epoch": 0.89, + "grad_norm": 1.117532008903971, + "learning_rate": 5.749832455514292e-07, + "loss": 0.1544, + "step": 17596 + }, + { + "epoch": 0.89, + "grad_norm": 1.2500663676142996, + "learning_rate": 5.744329541210203e-07, + "loss": 0.1733, + "step": 17597 + }, + { + "epoch": 0.89, + "grad_norm": 1.1405137035890365, + "learning_rate": 5.738829183596472e-07, + "loss": 0.1763, + "step": 17598 + }, + { + "epoch": 0.89, + "grad_norm": 1.364590498600788, + "learning_rate": 5.733331382822327e-07, + "loss": 0.1627, + "step": 17599 + }, + { + "epoch": 0.89, + "grad_norm": 1.3966575165611175, + "learning_rate": 5.727836139036902e-07, + "loss": 0.1771, + "step": 17600 + }, + { + "epoch": 0.9, + "grad_norm": 1.4316548739683352, + "learning_rate": 5.722343452389246e-07, + "loss": 0.1829, + "step": 17601 + }, + { + "epoch": 0.9, + "grad_norm": 0.9225231916886446, + "learning_rate": 5.71685332302836e-07, + "loss": 0.1652, + "step": 17602 + }, + { + "epoch": 0.9, + "grad_norm": 0.9262842826877503, + "learning_rate": 5.711365751103126e-07, + "loss": 0.1615, + "step": 17603 + }, + { + "epoch": 0.9, + "grad_norm": 1.125458347496385, + "learning_rate": 5.705880736762448e-07, + "loss": 0.1555, + "step": 17604 + }, + { + "epoch": 0.9, + "grad_norm": 1.0342644278015851, + "learning_rate": 5.700398280155062e-07, + "loss": 0.1555, + "step": 17605 + }, + { + "epoch": 0.9, + "grad_norm": 1.657524375857265, + "learning_rate": 5.694918381429693e-07, + "loss": 0.1715, + "step": 17606 + }, + { + "epoch": 0.9, + "grad_norm": 1.7613783650392223, + "learning_rate": 5.689441040735e-07, + "loss": 0.1705, + "step": 17607 + }, + { + "epoch": 0.9, + "grad_norm": 1.1654114356377074, + "learning_rate": 5.683966258219553e-07, + "loss": 0.1794, + "step": 17608 + }, + { + "epoch": 0.9, + "grad_norm": 1.117362526105987, + "learning_rate": 5.678494034031834e-07, + "loss": 0.1673, + "step": 17609 + }, + { + "epoch": 0.9, + "grad_norm": 0.9317194347022243, + "learning_rate": 5.673024368320313e-07, + "loss": 0.1671, + "step": 17610 + }, + { + "epoch": 0.9, + "grad_norm": 4.772568989083743, + "learning_rate": 5.667557261233303e-07, + "loss": 0.1537, + "step": 17611 + }, + { + "epoch": 0.9, + "grad_norm": 0.8476355847340409, + "learning_rate": 5.662092712919165e-07, + "loss": 0.1822, + "step": 17612 + }, + { + "epoch": 0.9, + "grad_norm": 0.9599586056070369, + "learning_rate": 5.656630723526058e-07, + "loss": 0.1925, + "step": 17613 + }, + { + "epoch": 0.9, + "grad_norm": 1.081935044119475, + "learning_rate": 5.651171293202195e-07, + "loss": 0.1543, + "step": 17614 + }, + { + "epoch": 0.9, + "grad_norm": 1.1782749849930128, + "learning_rate": 5.645714422095627e-07, + "loss": 0.1662, + "step": 17615 + }, + { + "epoch": 0.9, + "grad_norm": 1.038120793501522, + "learning_rate": 5.640260110354379e-07, + "loss": 0.1791, + "step": 17616 + }, + { + "epoch": 0.9, + "grad_norm": 1.0742685175075855, + "learning_rate": 5.634808358126409e-07, + "loss": 0.169, + "step": 17617 + }, + { + "epoch": 0.9, + "grad_norm": 1.490681460837666, + "learning_rate": 5.6293591655596e-07, + "loss": 0.1691, + "step": 17618 + }, + { + "epoch": 0.9, + "grad_norm": 2.0405156137959275, + "learning_rate": 5.623912532801745e-07, + "loss": 0.2042, + "step": 17619 + }, + { + "epoch": 0.9, + "grad_norm": 1.4385736720037017, + "learning_rate": 5.618468460000603e-07, + "loss": 0.1543, + "step": 17620 + }, + { + "epoch": 0.9, + "grad_norm": 1.3147500473045866, + "learning_rate": 5.613026947303846e-07, + "loss": 0.1723, + "step": 17621 + }, + { + "epoch": 0.9, + "grad_norm": 1.6507261497711412, + "learning_rate": 5.607587994859043e-07, + "loss": 0.173, + "step": 17622 + }, + { + "epoch": 0.9, + "grad_norm": 1.0463045500938963, + "learning_rate": 5.602151602813754e-07, + "loss": 0.1565, + "step": 17623 + }, + { + "epoch": 0.9, + "grad_norm": 0.9064519461727782, + "learning_rate": 5.59671777131543e-07, + "loss": 0.1683, + "step": 17624 + }, + { + "epoch": 0.9, + "grad_norm": 0.9830407275206406, + "learning_rate": 5.591286500511461e-07, + "loss": 0.1525, + "step": 17625 + }, + { + "epoch": 0.9, + "grad_norm": 1.4047169508316093, + "learning_rate": 5.585857790549176e-07, + "loss": 0.1647, + "step": 17626 + }, + { + "epoch": 0.9, + "grad_norm": 0.993469300350806, + "learning_rate": 5.580431641575856e-07, + "loss": 0.1591, + "step": 17627 + }, + { + "epoch": 0.9, + "grad_norm": 1.166303889142174, + "learning_rate": 5.57500805373864e-07, + "loss": 0.1714, + "step": 17628 + }, + { + "epoch": 0.9, + "grad_norm": 0.8781195607529766, + "learning_rate": 5.569587027184676e-07, + "loss": 0.1613, + "step": 17629 + }, + { + "epoch": 0.9, + "grad_norm": 0.885899291646843, + "learning_rate": 5.564168562060989e-07, + "loss": 0.1503, + "step": 17630 + }, + { + "epoch": 0.9, + "grad_norm": 0.7808838110262488, + "learning_rate": 5.558752658514576e-07, + "loss": 0.1421, + "step": 17631 + }, + { + "epoch": 0.9, + "grad_norm": 1.129720678794883, + "learning_rate": 5.553339316692319e-07, + "loss": 0.1671, + "step": 17632 + }, + { + "epoch": 0.9, + "grad_norm": 0.9170876078394514, + "learning_rate": 5.547928536741054e-07, + "loss": 0.1515, + "step": 17633 + }, + { + "epoch": 0.9, + "grad_norm": 3.195046931226926, + "learning_rate": 5.542520318807575e-07, + "loss": 0.1425, + "step": 17634 + }, + { + "epoch": 0.9, + "grad_norm": 1.289422396897735, + "learning_rate": 5.537114663038579e-07, + "loss": 0.1497, + "step": 17635 + }, + { + "epoch": 0.9, + "grad_norm": 1.002269400661614, + "learning_rate": 5.531711569580667e-07, + "loss": 0.171, + "step": 17636 + }, + { + "epoch": 0.9, + "grad_norm": 1.1868860593512662, + "learning_rate": 5.526311038580434e-07, + "loss": 0.1779, + "step": 17637 + }, + { + "epoch": 0.9, + "grad_norm": 0.7414258773959721, + "learning_rate": 5.520913070184342e-07, + "loss": 0.1615, + "step": 17638 + }, + { + "epoch": 0.9, + "grad_norm": 1.4872577817974402, + "learning_rate": 5.515517664538816e-07, + "loss": 0.1672, + "step": 17639 + }, + { + "epoch": 0.9, + "grad_norm": 1.1392463829779043, + "learning_rate": 5.510124821790208e-07, + "loss": 0.1743, + "step": 17640 + }, + { + "epoch": 0.9, + "grad_norm": 0.9808252375289511, + "learning_rate": 5.5047345420848e-07, + "loss": 0.1713, + "step": 17641 + }, + { + "epoch": 0.9, + "grad_norm": 1.0052517052647605, + "learning_rate": 5.499346825568796e-07, + "loss": 0.1572, + "step": 17642 + }, + { + "epoch": 0.9, + "grad_norm": 0.9217200434394364, + "learning_rate": 5.493961672388359e-07, + "loss": 0.1684, + "step": 17643 + }, + { + "epoch": 0.9, + "grad_norm": 0.9372417416015802, + "learning_rate": 5.488579082689549e-07, + "loss": 0.186, + "step": 17644 + }, + { + "epoch": 0.9, + "grad_norm": 1.3092412178722737, + "learning_rate": 5.48319905661836e-07, + "loss": 0.1465, + "step": 17645 + }, + { + "epoch": 0.9, + "grad_norm": 2.002333418364241, + "learning_rate": 5.477821594320754e-07, + "loss": 0.146, + "step": 17646 + }, + { + "epoch": 0.9, + "grad_norm": 0.8780082868559859, + "learning_rate": 5.472446695942557e-07, + "loss": 0.1576, + "step": 17647 + }, + { + "epoch": 0.9, + "grad_norm": 1.1158135010797137, + "learning_rate": 5.467074361629599e-07, + "loss": 0.1707, + "step": 17648 + }, + { + "epoch": 0.9, + "grad_norm": 1.2440407124510282, + "learning_rate": 5.461704591527573e-07, + "loss": 0.1658, + "step": 17649 + }, + { + "epoch": 0.9, + "grad_norm": 1.0253371799074213, + "learning_rate": 5.456337385782173e-07, + "loss": 0.1883, + "step": 17650 + }, + { + "epoch": 0.9, + "grad_norm": 0.915436610916007, + "learning_rate": 5.450972744538929e-07, + "loss": 0.1639, + "step": 17651 + }, + { + "epoch": 0.9, + "grad_norm": 1.0029131128139648, + "learning_rate": 5.445610667943401e-07, + "loss": 0.164, + "step": 17652 + }, + { + "epoch": 0.9, + "grad_norm": 0.9462649352774675, + "learning_rate": 5.440251156141019e-07, + "loss": 0.1628, + "step": 17653 + }, + { + "epoch": 0.9, + "grad_norm": 1.3574479219373592, + "learning_rate": 5.434894209277186e-07, + "loss": 0.1578, + "step": 17654 + }, + { + "epoch": 0.9, + "grad_norm": 0.9918804611222274, + "learning_rate": 5.429539827497188e-07, + "loss": 0.1497, + "step": 17655 + }, + { + "epoch": 0.9, + "grad_norm": 1.0503724191185535, + "learning_rate": 5.424188010946241e-07, + "loss": 0.1566, + "step": 17656 + }, + { + "epoch": 0.9, + "grad_norm": 0.8451239880660211, + "learning_rate": 5.418838759769551e-07, + "loss": 0.1548, + "step": 17657 + }, + { + "epoch": 0.9, + "grad_norm": 0.893324799854365, + "learning_rate": 5.413492074112192e-07, + "loss": 0.1468, + "step": 17658 + }, + { + "epoch": 0.9, + "grad_norm": 1.0016104659910292, + "learning_rate": 5.408147954119202e-07, + "loss": 0.1582, + "step": 17659 + }, + { + "epoch": 0.9, + "grad_norm": 0.8984243797807004, + "learning_rate": 5.402806399935545e-07, + "loss": 0.176, + "step": 17660 + }, + { + "epoch": 0.9, + "grad_norm": 9.62575895769711, + "learning_rate": 5.397467411706114e-07, + "loss": 0.1525, + "step": 17661 + }, + { + "epoch": 0.9, + "grad_norm": 1.0367581809540989, + "learning_rate": 5.392130989575716e-07, + "loss": 0.1787, + "step": 17662 + }, + { + "epoch": 0.9, + "grad_norm": 1.0075030435562442, + "learning_rate": 5.386797133689125e-07, + "loss": 0.1581, + "step": 17663 + }, + { + "epoch": 0.9, + "grad_norm": 0.9308276120576201, + "learning_rate": 5.38146584419098e-07, + "loss": 0.1415, + "step": 17664 + }, + { + "epoch": 0.9, + "grad_norm": 0.9995172688457722, + "learning_rate": 5.376137121225933e-07, + "loss": 0.1664, + "step": 17665 + }, + { + "epoch": 0.9, + "grad_norm": 1.1181798407086685, + "learning_rate": 5.370810964938511e-07, + "loss": 0.1645, + "step": 17666 + }, + { + "epoch": 0.9, + "grad_norm": 0.8228683638441441, + "learning_rate": 5.365487375473189e-07, + "loss": 0.1484, + "step": 17667 + }, + { + "epoch": 0.9, + "grad_norm": 0.91897186212851, + "learning_rate": 5.36016635297435e-07, + "loss": 0.1423, + "step": 17668 + }, + { + "epoch": 0.9, + "grad_norm": 0.9404184974406693, + "learning_rate": 5.354847897586346e-07, + "loss": 0.1536, + "step": 17669 + }, + { + "epoch": 0.9, + "grad_norm": 0.9341398123651984, + "learning_rate": 5.34953200945344e-07, + "loss": 0.1656, + "step": 17670 + }, + { + "epoch": 0.9, + "grad_norm": 0.9941746446631397, + "learning_rate": 5.344218688719849e-07, + "loss": 0.1838, + "step": 17671 + }, + { + "epoch": 0.9, + "grad_norm": 1.2760959638853353, + "learning_rate": 5.338907935529658e-07, + "loss": 0.1673, + "step": 17672 + }, + { + "epoch": 0.9, + "grad_norm": 1.2152025882319166, + "learning_rate": 5.33359975002693e-07, + "loss": 0.1714, + "step": 17673 + }, + { + "epoch": 0.9, + "grad_norm": 1.3978400774163173, + "learning_rate": 5.32829413235566e-07, + "loss": 0.1566, + "step": 17674 + }, + { + "epoch": 0.9, + "grad_norm": 2.228025280490407, + "learning_rate": 5.322991082659745e-07, + "loss": 0.1587, + "step": 17675 + }, + { + "epoch": 0.9, + "grad_norm": 1.6220795682324534, + "learning_rate": 5.31769060108307e-07, + "loss": 0.1669, + "step": 17676 + }, + { + "epoch": 0.9, + "grad_norm": 0.9484679760869359, + "learning_rate": 5.312392687769363e-07, + "loss": 0.1646, + "step": 17677 + }, + { + "epoch": 0.9, + "grad_norm": 1.7112336753826645, + "learning_rate": 5.307097342862355e-07, + "loss": 0.1476, + "step": 17678 + }, + { + "epoch": 0.9, + "grad_norm": 0.9050168602368139, + "learning_rate": 5.301804566505675e-07, + "loss": 0.1739, + "step": 17679 + }, + { + "epoch": 0.9, + "grad_norm": 2.6948532263660625, + "learning_rate": 5.296514358842919e-07, + "loss": 0.1562, + "step": 17680 + }, + { + "epoch": 0.9, + "grad_norm": 0.9223356351999948, + "learning_rate": 5.291226720017539e-07, + "loss": 0.1646, + "step": 17681 + }, + { + "epoch": 0.9, + "grad_norm": 1.0710729372748458, + "learning_rate": 5.285941650172999e-07, + "loss": 0.1781, + "step": 17682 + }, + { + "epoch": 0.9, + "grad_norm": 1.0156542824190222, + "learning_rate": 5.280659149452627e-07, + "loss": 0.161, + "step": 17683 + }, + { + "epoch": 0.9, + "grad_norm": 1.0609702715785894, + "learning_rate": 5.275379217999732e-07, + "loss": 0.1584, + "step": 17684 + }, + { + "epoch": 0.9, + "grad_norm": 2.593540452877707, + "learning_rate": 5.270101855957521e-07, + "loss": 0.1498, + "step": 17685 + }, + { + "epoch": 0.9, + "grad_norm": 0.9987113372708541, + "learning_rate": 5.264827063469146e-07, + "loss": 0.1667, + "step": 17686 + }, + { + "epoch": 0.9, + "grad_norm": 0.9799744766968221, + "learning_rate": 5.259554840677683e-07, + "loss": 0.1543, + "step": 17687 + }, + { + "epoch": 0.9, + "grad_norm": 1.0793003127543954, + "learning_rate": 5.25428518772616e-07, + "loss": 0.1772, + "step": 17688 + }, + { + "epoch": 0.9, + "grad_norm": 1.1104727507892227, + "learning_rate": 5.249018104757498e-07, + "loss": 0.1648, + "step": 17689 + }, + { + "epoch": 0.9, + "grad_norm": 0.8866876360592496, + "learning_rate": 5.24375359191458e-07, + "loss": 0.1711, + "step": 17690 + }, + { + "epoch": 0.9, + "grad_norm": 0.9318746380239975, + "learning_rate": 5.238491649340194e-07, + "loss": 0.1576, + "step": 17691 + }, + { + "epoch": 0.9, + "grad_norm": 1.704580316984937, + "learning_rate": 5.233232277177058e-07, + "loss": 0.1481, + "step": 17692 + }, + { + "epoch": 0.9, + "grad_norm": 0.9707591751898252, + "learning_rate": 5.22797547556787e-07, + "loss": 0.198, + "step": 17693 + }, + { + "epoch": 0.9, + "grad_norm": 1.4708636570765141, + "learning_rate": 5.222721244655182e-07, + "loss": 0.1606, + "step": 17694 + }, + { + "epoch": 0.9, + "grad_norm": 1.0986553399787764, + "learning_rate": 5.217469584581536e-07, + "loss": 0.1719, + "step": 17695 + }, + { + "epoch": 0.9, + "grad_norm": 2.24716223435193, + "learning_rate": 5.212220495489384e-07, + "loss": 0.1562, + "step": 17696 + }, + { + "epoch": 0.9, + "grad_norm": 0.8958578576870243, + "learning_rate": 5.206973977521113e-07, + "loss": 0.1731, + "step": 17697 + }, + { + "epoch": 0.9, + "grad_norm": 1.1951818186829957, + "learning_rate": 5.201730030819019e-07, + "loss": 0.1544, + "step": 17698 + }, + { + "epoch": 0.9, + "grad_norm": 1.4365885572432668, + "learning_rate": 5.196488655525356e-07, + "loss": 0.1787, + "step": 17699 + }, + { + "epoch": 0.9, + "grad_norm": 1.2545363657750488, + "learning_rate": 5.191249851782287e-07, + "loss": 0.145, + "step": 17700 + }, + { + "epoch": 0.9, + "grad_norm": 2.2246894585487254, + "learning_rate": 5.186013619731934e-07, + "loss": 0.1638, + "step": 17701 + }, + { + "epoch": 0.9, + "grad_norm": 0.9044530743019654, + "learning_rate": 5.180779959516303e-07, + "loss": 0.171, + "step": 17702 + }, + { + "epoch": 0.9, + "grad_norm": 0.8542074221845332, + "learning_rate": 5.175548871277358e-07, + "loss": 0.1606, + "step": 17703 + }, + { + "epoch": 0.9, + "grad_norm": 0.9570768342942108, + "learning_rate": 5.170320355157022e-07, + "loss": 0.1412, + "step": 17704 + }, + { + "epoch": 0.9, + "grad_norm": 1.377473388699456, + "learning_rate": 5.165094411297111e-07, + "loss": 0.1878, + "step": 17705 + }, + { + "epoch": 0.9, + "grad_norm": 0.9740939249314572, + "learning_rate": 5.15987103983936e-07, + "loss": 0.1739, + "step": 17706 + }, + { + "epoch": 0.9, + "grad_norm": 1.1030183081306857, + "learning_rate": 5.154650240925474e-07, + "loss": 0.1718, + "step": 17707 + }, + { + "epoch": 0.9, + "grad_norm": 0.9345049545849029, + "learning_rate": 5.149432014697053e-07, + "loss": 0.1798, + "step": 17708 + }, + { + "epoch": 0.9, + "grad_norm": 1.1709174668507403, + "learning_rate": 5.14421636129564e-07, + "loss": 0.1574, + "step": 17709 + }, + { + "epoch": 0.9, + "grad_norm": 0.8308813827049973, + "learning_rate": 5.13900328086272e-07, + "loss": 0.1487, + "step": 17710 + }, + { + "epoch": 0.9, + "grad_norm": 1.869450981379436, + "learning_rate": 5.133792773539681e-07, + "loss": 0.1568, + "step": 17711 + }, + { + "epoch": 0.9, + "grad_norm": 0.9869137581204871, + "learning_rate": 5.128584839467877e-07, + "loss": 0.1441, + "step": 17712 + }, + { + "epoch": 0.9, + "grad_norm": 0.7785372146076209, + "learning_rate": 5.12337947878857e-07, + "loss": 0.1614, + "step": 17713 + }, + { + "epoch": 0.9, + "grad_norm": 0.9084779713098403, + "learning_rate": 5.118176691642962e-07, + "loss": 0.144, + "step": 17714 + }, + { + "epoch": 0.9, + "grad_norm": 0.7622362638835993, + "learning_rate": 5.11297647817216e-07, + "loss": 0.1559, + "step": 17715 + }, + { + "epoch": 0.9, + "grad_norm": 1.6181503739545215, + "learning_rate": 5.107778838517241e-07, + "loss": 0.1525, + "step": 17716 + }, + { + "epoch": 0.9, + "grad_norm": 0.9288820385651829, + "learning_rate": 5.102583772819159e-07, + "loss": 0.1488, + "step": 17717 + }, + { + "epoch": 0.9, + "grad_norm": 1.0344836702661204, + "learning_rate": 5.097391281218877e-07, + "loss": 0.1446, + "step": 17718 + }, + { + "epoch": 0.9, + "grad_norm": 1.756541256971086, + "learning_rate": 5.092201363857197e-07, + "loss": 0.1528, + "step": 17719 + }, + { + "epoch": 0.9, + "grad_norm": 1.3498640132390163, + "learning_rate": 5.087014020874936e-07, + "loss": 0.1704, + "step": 17720 + }, + { + "epoch": 0.9, + "grad_norm": 0.9049343493676423, + "learning_rate": 5.081829252412762e-07, + "loss": 0.1487, + "step": 17721 + }, + { + "epoch": 0.9, + "grad_norm": 1.017207296182938, + "learning_rate": 5.076647058611328e-07, + "loss": 0.1758, + "step": 17722 + }, + { + "epoch": 0.9, + "grad_norm": 1.0428643056624292, + "learning_rate": 5.071467439611211e-07, + "loss": 0.1455, + "step": 17723 + }, + { + "epoch": 0.9, + "grad_norm": 1.1132281043897432, + "learning_rate": 5.066290395552909e-07, + "loss": 0.1574, + "step": 17724 + }, + { + "epoch": 0.9, + "grad_norm": 1.6376443306870083, + "learning_rate": 5.061115926576843e-07, + "loss": 0.1612, + "step": 17725 + }, + { + "epoch": 0.9, + "grad_norm": 1.537279060722791, + "learning_rate": 5.055944032823357e-07, + "loss": 0.1628, + "step": 17726 + }, + { + "epoch": 0.9, + "grad_norm": 1.133942195763958, + "learning_rate": 5.050774714432772e-07, + "loss": 0.1587, + "step": 17727 + }, + { + "epoch": 0.9, + "grad_norm": 1.6315795559668866, + "learning_rate": 5.045607971545263e-07, + "loss": 0.1767, + "step": 17728 + }, + { + "epoch": 0.9, + "grad_norm": 1.406991068781469, + "learning_rate": 5.040443804300998e-07, + "loss": 0.14, + "step": 17729 + }, + { + "epoch": 0.9, + "grad_norm": 1.3011977502033985, + "learning_rate": 5.035282212840065e-07, + "loss": 0.182, + "step": 17730 + }, + { + "epoch": 0.9, + "grad_norm": 1.3039874258239579, + "learning_rate": 5.030123197302472e-07, + "loss": 0.1708, + "step": 17731 + }, + { + "epoch": 0.9, + "grad_norm": 6.74425755208782, + "learning_rate": 5.024966757828143e-07, + "loss": 0.1766, + "step": 17732 + }, + { + "epoch": 0.9, + "grad_norm": 0.9541717050863457, + "learning_rate": 5.019812894556975e-07, + "loss": 0.1522, + "step": 17733 + }, + { + "epoch": 0.9, + "grad_norm": 0.9449006111982539, + "learning_rate": 5.014661607628723e-07, + "loss": 0.1705, + "step": 17734 + }, + { + "epoch": 0.9, + "grad_norm": 2.291618929572805, + "learning_rate": 5.009512897183156e-07, + "loss": 0.1501, + "step": 17735 + }, + { + "epoch": 0.9, + "grad_norm": 2.6137589502704652, + "learning_rate": 5.004366763359903e-07, + "loss": 0.1692, + "step": 17736 + }, + { + "epoch": 0.9, + "grad_norm": 2.088489181727494, + "learning_rate": 4.999223206298587e-07, + "loss": 0.1463, + "step": 17737 + }, + { + "epoch": 0.9, + "grad_norm": 0.7985636651852127, + "learning_rate": 4.994082226138686e-07, + "loss": 0.116, + "step": 17738 + }, + { + "epoch": 0.9, + "grad_norm": 1.2956073162739354, + "learning_rate": 4.988943823019676e-07, + "loss": 0.1524, + "step": 17739 + }, + { + "epoch": 0.9, + "grad_norm": 1.4870177517974048, + "learning_rate": 4.983807997080925e-07, + "loss": 0.1485, + "step": 17740 + }, + { + "epoch": 0.9, + "grad_norm": 1.1141072045469964, + "learning_rate": 4.978674748461765e-07, + "loss": 0.1965, + "step": 17741 + }, + { + "epoch": 0.9, + "grad_norm": 1.7596014826608797, + "learning_rate": 4.973544077301418e-07, + "loss": 0.1702, + "step": 17742 + }, + { + "epoch": 0.9, + "grad_norm": 0.8503845471946775, + "learning_rate": 4.968415983739039e-07, + "loss": 0.1554, + "step": 17743 + }, + { + "epoch": 0.9, + "grad_norm": 1.225632954529606, + "learning_rate": 4.963290467913761e-07, + "loss": 0.1623, + "step": 17744 + }, + { + "epoch": 0.9, + "grad_norm": 1.1538746263261441, + "learning_rate": 4.958167529964586e-07, + "loss": 0.1948, + "step": 17745 + }, + { + "epoch": 0.9, + "grad_norm": 1.0146882938179274, + "learning_rate": 4.953047170030489e-07, + "loss": 0.155, + "step": 17746 + }, + { + "epoch": 0.9, + "grad_norm": 1.0771355542823016, + "learning_rate": 4.94792938825035e-07, + "loss": 0.1491, + "step": 17747 + }, + { + "epoch": 0.9, + "grad_norm": 1.018788055431089, + "learning_rate": 4.942814184763001e-07, + "loss": 0.1723, + "step": 17748 + }, + { + "epoch": 0.9, + "grad_norm": 1.1316146560079632, + "learning_rate": 4.937701559707175e-07, + "loss": 0.1602, + "step": 17749 + }, + { + "epoch": 0.9, + "grad_norm": 0.9280565298619311, + "learning_rate": 4.932591513221586e-07, + "loss": 0.148, + "step": 17750 + }, + { + "epoch": 0.9, + "grad_norm": 1.0565016656740143, + "learning_rate": 4.927484045444797e-07, + "loss": 0.1574, + "step": 17751 + }, + { + "epoch": 0.9, + "grad_norm": 1.1490442384990298, + "learning_rate": 4.922379156515389e-07, + "loss": 0.1484, + "step": 17752 + }, + { + "epoch": 0.9, + "grad_norm": 1.0055808135829174, + "learning_rate": 4.917276846571806e-07, + "loss": 0.1499, + "step": 17753 + }, + { + "epoch": 0.9, + "grad_norm": 1.137822230302273, + "learning_rate": 4.912177115752481e-07, + "loss": 0.1698, + "step": 17754 + }, + { + "epoch": 0.9, + "grad_norm": 1.4222448581168465, + "learning_rate": 4.907079964195693e-07, + "loss": 0.1674, + "step": 17755 + }, + { + "epoch": 0.9, + "grad_norm": 1.0891384710066645, + "learning_rate": 4.901985392039743e-07, + "loss": 0.1892, + "step": 17756 + }, + { + "epoch": 0.9, + "grad_norm": 1.239926969876004, + "learning_rate": 4.896893399422809e-07, + "loss": 0.1529, + "step": 17757 + }, + { + "epoch": 0.9, + "grad_norm": 1.1012776690468857, + "learning_rate": 4.891803986483023e-07, + "loss": 0.1937, + "step": 17758 + }, + { + "epoch": 0.9, + "grad_norm": 1.0518012215731845, + "learning_rate": 4.886717153358411e-07, + "loss": 0.199, + "step": 17759 + }, + { + "epoch": 0.9, + "grad_norm": 1.0574357379309232, + "learning_rate": 4.881632900186983e-07, + "loss": 0.154, + "step": 17760 + }, + { + "epoch": 0.9, + "grad_norm": 1.175176478735189, + "learning_rate": 4.876551227106629e-07, + "loss": 0.1801, + "step": 17761 + }, + { + "epoch": 0.9, + "grad_norm": 0.8789036612254449, + "learning_rate": 4.871472134255195e-07, + "loss": 0.1609, + "step": 17762 + }, + { + "epoch": 0.9, + "grad_norm": 1.680680067709628, + "learning_rate": 4.866395621770458e-07, + "loss": 0.1423, + "step": 17763 + }, + { + "epoch": 0.9, + "grad_norm": 1.1085585855081044, + "learning_rate": 4.861321689790099e-07, + "loss": 0.1458, + "step": 17764 + }, + { + "epoch": 0.9, + "grad_norm": 2.6969531275425647, + "learning_rate": 4.856250338451763e-07, + "loss": 0.1805, + "step": 17765 + }, + { + "epoch": 0.9, + "grad_norm": 0.9945335175170363, + "learning_rate": 4.851181567893015e-07, + "loss": 0.1476, + "step": 17766 + }, + { + "epoch": 0.9, + "grad_norm": 1.1633841659011586, + "learning_rate": 4.846115378251348e-07, + "loss": 0.1401, + "step": 17767 + }, + { + "epoch": 0.9, + "grad_norm": 1.1946895954446297, + "learning_rate": 4.841051769664174e-07, + "loss": 0.1782, + "step": 17768 + }, + { + "epoch": 0.9, + "grad_norm": 1.0272983655384607, + "learning_rate": 4.835990742268848e-07, + "loss": 0.1578, + "step": 17769 + }, + { + "epoch": 0.9, + "grad_norm": 0.881203217537691, + "learning_rate": 4.83093229620264e-07, + "loss": 0.1575, + "step": 17770 + }, + { + "epoch": 0.9, + "grad_norm": 1.35515392014797, + "learning_rate": 4.825876431602772e-07, + "loss": 0.1692, + "step": 17771 + }, + { + "epoch": 0.9, + "grad_norm": 1.2188750181917818, + "learning_rate": 4.820823148606379e-07, + "loss": 0.183, + "step": 17772 + }, + { + "epoch": 0.9, + "grad_norm": 1.0844096893909176, + "learning_rate": 4.815772447350541e-07, + "loss": 0.1687, + "step": 17773 + }, + { + "epoch": 0.9, + "grad_norm": 1.051618523028314, + "learning_rate": 4.810724327972238e-07, + "loss": 0.1578, + "step": 17774 + }, + { + "epoch": 0.9, + "grad_norm": 1.7931119674665117, + "learning_rate": 4.805678790608415e-07, + "loss": 0.1636, + "step": 17775 + }, + { + "epoch": 0.9, + "grad_norm": 0.8151408484958761, + "learning_rate": 4.80063583539594e-07, + "loss": 0.1484, + "step": 17776 + }, + { + "epoch": 0.9, + "grad_norm": 1.0937623325399302, + "learning_rate": 4.795595462471592e-07, + "loss": 0.1779, + "step": 17777 + }, + { + "epoch": 0.9, + "grad_norm": 1.0631272079242073, + "learning_rate": 4.790557671972107e-07, + "loss": 0.1397, + "step": 17778 + }, + { + "epoch": 0.9, + "grad_norm": 1.0143499979429285, + "learning_rate": 4.785522464034109e-07, + "loss": 0.1782, + "step": 17779 + }, + { + "epoch": 0.9, + "grad_norm": 1.063689312103594, + "learning_rate": 4.7804898387942e-07, + "loss": 0.1645, + "step": 17780 + }, + { + "epoch": 0.9, + "grad_norm": 1.0920688128771128, + "learning_rate": 4.77545979638887e-07, + "loss": 0.1579, + "step": 17781 + }, + { + "epoch": 0.9, + "grad_norm": 1.1352514848095434, + "learning_rate": 4.770432336954567e-07, + "loss": 0.1594, + "step": 17782 + }, + { + "epoch": 0.9, + "grad_norm": 0.8693421343476601, + "learning_rate": 4.765407460627669e-07, + "loss": 0.1666, + "step": 17783 + }, + { + "epoch": 0.9, + "grad_norm": 1.512091212186362, + "learning_rate": 4.76038516754449e-07, + "loss": 0.1636, + "step": 17784 + }, + { + "epoch": 0.9, + "grad_norm": 0.9559664234061493, + "learning_rate": 4.755365457841221e-07, + "loss": 0.1621, + "step": 17785 + }, + { + "epoch": 0.9, + "grad_norm": 2.40871843285452, + "learning_rate": 4.750348331654064e-07, + "loss": 0.1668, + "step": 17786 + }, + { + "epoch": 0.9, + "grad_norm": 1.2538414667969688, + "learning_rate": 4.7453337891190776e-07, + "loss": 0.1684, + "step": 17787 + }, + { + "epoch": 0.9, + "grad_norm": 1.2285387934323766, + "learning_rate": 4.7403218303722963e-07, + "loss": 0.1673, + "step": 17788 + }, + { + "epoch": 0.9, + "grad_norm": 1.2252516668439057, + "learning_rate": 4.7353124555496566e-07, + "loss": 0.1568, + "step": 17789 + }, + { + "epoch": 0.9, + "grad_norm": 1.7644849909791742, + "learning_rate": 4.7303056647870605e-07, + "loss": 0.1833, + "step": 17790 + }, + { + "epoch": 0.9, + "grad_norm": 0.9280686009723658, + "learning_rate": 4.725301458220288e-07, + "loss": 0.1575, + "step": 17791 + }, + { + "epoch": 0.9, + "grad_norm": 0.9485796496560795, + "learning_rate": 4.7202998359850984e-07, + "loss": 0.1594, + "step": 17792 + }, + { + "epoch": 0.9, + "grad_norm": 1.2267938129539502, + "learning_rate": 4.7153007982171594e-07, + "loss": 0.1909, + "step": 17793 + }, + { + "epoch": 0.9, + "grad_norm": 0.7754106303778403, + "learning_rate": 4.7103043450520744e-07, + "loss": 0.1457, + "step": 17794 + }, + { + "epoch": 0.9, + "grad_norm": 1.026104662099303, + "learning_rate": 4.705310476625369e-07, + "loss": 0.1678, + "step": 17795 + }, + { + "epoch": 0.9, + "grad_norm": 1.1562353558815135, + "learning_rate": 4.700319193072489e-07, + "loss": 0.1869, + "step": 17796 + }, + { + "epoch": 0.91, + "grad_norm": 1.6137894136029478, + "learning_rate": 4.695330494528838e-07, + "loss": 0.1491, + "step": 17797 + }, + { + "epoch": 0.91, + "grad_norm": 1.09089695525694, + "learning_rate": 4.69034438112973e-07, + "loss": 0.1839, + "step": 17798 + }, + { + "epoch": 0.91, + "grad_norm": 1.333152341251913, + "learning_rate": 4.685360853010401e-07, + "loss": 0.1637, + "step": 17799 + }, + { + "epoch": 0.91, + "grad_norm": 0.9602187524299798, + "learning_rate": 4.6803799103060544e-07, + "loss": 0.1653, + "step": 17800 + }, + { + "epoch": 0.91, + "grad_norm": 1.1799898341032788, + "learning_rate": 4.6754015531517926e-07, + "loss": 0.1626, + "step": 17801 + }, + { + "epoch": 0.91, + "grad_norm": 3.665753566346427, + "learning_rate": 4.6704257816826306e-07, + "loss": 0.1961, + "step": 17802 + }, + { + "epoch": 0.91, + "grad_norm": 1.640526741340988, + "learning_rate": 4.6654525960335704e-07, + "loss": 0.1682, + "step": 17803 + }, + { + "epoch": 0.91, + "grad_norm": 0.8810679261644867, + "learning_rate": 4.660481996339483e-07, + "loss": 0.1621, + "step": 17804 + }, + { + "epoch": 0.91, + "grad_norm": 0.9417092579675105, + "learning_rate": 4.655513982735216e-07, + "loss": 0.1822, + "step": 17805 + }, + { + "epoch": 0.91, + "grad_norm": 1.1450114665041524, + "learning_rate": 4.6505485553555054e-07, + "loss": 0.1495, + "step": 17806 + }, + { + "epoch": 0.91, + "grad_norm": 1.2342421265088601, + "learning_rate": 4.6455857143350657e-07, + "loss": 0.1718, + "step": 17807 + }, + { + "epoch": 0.91, + "grad_norm": 1.4242395673614698, + "learning_rate": 4.6406254598084786e-07, + "loss": 0.1682, + "step": 17808 + }, + { + "epoch": 0.91, + "grad_norm": 0.9017379280780949, + "learning_rate": 4.635667791910314e-07, + "loss": 0.1617, + "step": 17809 + }, + { + "epoch": 0.91, + "grad_norm": 2.0869143627570828, + "learning_rate": 4.630712710775054e-07, + "loss": 0.1639, + "step": 17810 + }, + { + "epoch": 0.91, + "grad_norm": 0.9145479259162557, + "learning_rate": 4.625760216537112e-07, + "loss": 0.1559, + "step": 17811 + }, + { + "epoch": 0.91, + "grad_norm": 1.0659886892373898, + "learning_rate": 4.620810309330803e-07, + "loss": 0.1474, + "step": 17812 + }, + { + "epoch": 0.91, + "grad_norm": 1.0288166878075367, + "learning_rate": 4.615862989290387e-07, + "loss": 0.1814, + "step": 17813 + }, + { + "epoch": 0.91, + "grad_norm": 1.1905630758669472, + "learning_rate": 4.61091825655009e-07, + "loss": 0.1932, + "step": 17814 + }, + { + "epoch": 0.91, + "grad_norm": 1.0430728818674375, + "learning_rate": 4.605976111244015e-07, + "loss": 0.1629, + "step": 17815 + }, + { + "epoch": 0.91, + "grad_norm": 1.0384609491251084, + "learning_rate": 4.601036553506233e-07, + "loss": 0.174, + "step": 17816 + }, + { + "epoch": 0.91, + "grad_norm": 1.4392457512023191, + "learning_rate": 4.5960995834707146e-07, + "loss": 0.1346, + "step": 17817 + }, + { + "epoch": 0.91, + "grad_norm": 0.9923559260449925, + "learning_rate": 4.591165201271386e-07, + "loss": 0.1774, + "step": 17818 + }, + { + "epoch": 0.91, + "grad_norm": 1.0656259011880576, + "learning_rate": 4.5862334070420843e-07, + "loss": 0.1652, + "step": 17819 + }, + { + "epoch": 0.91, + "grad_norm": 1.4443650791473384, + "learning_rate": 4.581304200916603e-07, + "loss": 0.1698, + "step": 17820 + }, + { + "epoch": 0.91, + "grad_norm": 1.9518129480656117, + "learning_rate": 4.576377583028624e-07, + "loss": 0.1504, + "step": 17821 + }, + { + "epoch": 0.91, + "grad_norm": 1.1957606269589063, + "learning_rate": 4.571453553511807e-07, + "loss": 0.176, + "step": 17822 + }, + { + "epoch": 0.91, + "grad_norm": 1.0892795482220845, + "learning_rate": 4.5665321124996774e-07, + "loss": 0.1595, + "step": 17823 + }, + { + "epoch": 0.91, + "grad_norm": 1.1566948468637954, + "learning_rate": 4.5616132601257857e-07, + "loss": 0.1695, + "step": 17824 + }, + { + "epoch": 0.91, + "grad_norm": 0.9480355518150048, + "learning_rate": 4.556696996523502e-07, + "loss": 0.1724, + "step": 17825 + }, + { + "epoch": 0.91, + "grad_norm": 1.0548757699756477, + "learning_rate": 4.5517833218261974e-07, + "loss": 0.1673, + "step": 17826 + }, + { + "epoch": 0.91, + "grad_norm": 0.8952574575856083, + "learning_rate": 4.546872236167166e-07, + "loss": 0.1576, + "step": 17827 + }, + { + "epoch": 0.91, + "grad_norm": 1.0190701158443207, + "learning_rate": 4.5419637396796337e-07, + "loss": 0.1807, + "step": 17828 + }, + { + "epoch": 0.91, + "grad_norm": 0.8335552710274327, + "learning_rate": 4.5370578324967054e-07, + "loss": 0.1783, + "step": 17829 + }, + { + "epoch": 0.91, + "grad_norm": 1.3764248852007366, + "learning_rate": 4.532154514751497e-07, + "loss": 0.1893, + "step": 17830 + }, + { + "epoch": 0.91, + "grad_norm": 1.3915203240070135, + "learning_rate": 4.527253786576991e-07, + "loss": 0.1592, + "step": 17831 + }, + { + "epoch": 0.91, + "grad_norm": 1.2861389689670186, + "learning_rate": 4.5223556481060913e-07, + "loss": 0.1528, + "step": 17832 + }, + { + "epoch": 0.91, + "grad_norm": 1.4373456377797067, + "learning_rate": 4.5174600994717154e-07, + "loss": 0.1645, + "step": 17833 + }, + { + "epoch": 0.91, + "grad_norm": 1.348075943625048, + "learning_rate": 4.5125671408066006e-07, + "loss": 0.1546, + "step": 17834 + }, + { + "epoch": 0.91, + "grad_norm": 1.1271648016172928, + "learning_rate": 4.5076767722435075e-07, + "loss": 0.1644, + "step": 17835 + }, + { + "epoch": 0.91, + "grad_norm": 0.8177597987428121, + "learning_rate": 4.502788993915075e-07, + "loss": 0.1643, + "step": 17836 + }, + { + "epoch": 0.91, + "grad_norm": 1.4899303323960316, + "learning_rate": 4.4979038059538847e-07, + "loss": 0.1568, + "step": 17837 + }, + { + "epoch": 0.91, + "grad_norm": 0.8795186133211397, + "learning_rate": 4.493021208492443e-07, + "loss": 0.1421, + "step": 17838 + }, + { + "epoch": 0.91, + "grad_norm": 1.1307292120596275, + "learning_rate": 4.48814120166321e-07, + "loss": 0.1725, + "step": 17839 + }, + { + "epoch": 0.91, + "grad_norm": 1.0942327975567745, + "learning_rate": 4.483263785598524e-07, + "loss": 0.1607, + "step": 17840 + }, + { + "epoch": 0.91, + "grad_norm": 1.310134964018146, + "learning_rate": 4.478388960430724e-07, + "loss": 0.1533, + "step": 17841 + }, + { + "epoch": 0.91, + "grad_norm": 1.0896854956976845, + "learning_rate": 4.4735167262919934e-07, + "loss": 0.1672, + "step": 17842 + }, + { + "epoch": 0.91, + "grad_norm": 1.4009160493602761, + "learning_rate": 4.468647083314537e-07, + "loss": 0.172, + "step": 17843 + }, + { + "epoch": 0.91, + "grad_norm": 1.0883627609353188, + "learning_rate": 4.4637800316304157e-07, + "loss": 0.1673, + "step": 17844 + }, + { + "epoch": 0.91, + "grad_norm": 0.9322656297713793, + "learning_rate": 4.458915571371647e-07, + "loss": 0.1345, + "step": 17845 + }, + { + "epoch": 0.91, + "grad_norm": 1.3953361415792829, + "learning_rate": 4.4540537026702026e-07, + "loss": 0.1563, + "step": 17846 + }, + { + "epoch": 0.91, + "grad_norm": 2.582791955807601, + "learning_rate": 4.449194425657943e-07, + "loss": 0.1769, + "step": 17847 + }, + { + "epoch": 0.91, + "grad_norm": 1.2389198531379395, + "learning_rate": 4.4443377404666976e-07, + "loss": 0.1726, + "step": 17848 + }, + { + "epoch": 0.91, + "grad_norm": 1.8344787438413215, + "learning_rate": 4.439483647228171e-07, + "loss": 0.1823, + "step": 17849 + }, + { + "epoch": 0.91, + "grad_norm": 0.9170059286638855, + "learning_rate": 4.4346321460740583e-07, + "loss": 0.1698, + "step": 17850 + }, + { + "epoch": 0.91, + "grad_norm": 1.0316912778937046, + "learning_rate": 4.429783237135932e-07, + "loss": 0.17, + "step": 17851 + }, + { + "epoch": 0.91, + "grad_norm": 1.1489235166833727, + "learning_rate": 4.424936920545331e-07, + "loss": 0.1702, + "step": 17852 + }, + { + "epoch": 0.91, + "grad_norm": 1.0784830220040096, + "learning_rate": 4.420093196433717e-07, + "loss": 0.1726, + "step": 17853 + }, + { + "epoch": 0.91, + "grad_norm": 1.780789153470712, + "learning_rate": 4.415252064932485e-07, + "loss": 0.1517, + "step": 17854 + }, + { + "epoch": 0.91, + "grad_norm": 1.112357375081568, + "learning_rate": 4.4104135261729296e-07, + "loss": 0.1584, + "step": 17855 + }, + { + "epoch": 0.91, + "grad_norm": 1.1552800042958693, + "learning_rate": 4.4055775802863246e-07, + "loss": 0.1663, + "step": 17856 + }, + { + "epoch": 0.91, + "grad_norm": 1.408008062223532, + "learning_rate": 4.400744227403797e-07, + "loss": 0.1646, + "step": 17857 + }, + { + "epoch": 0.91, + "grad_norm": 0.9212932843285123, + "learning_rate": 4.3959134676565097e-07, + "loss": 0.1524, + "step": 17858 + }, + { + "epoch": 0.91, + "grad_norm": 0.9661145345045649, + "learning_rate": 4.391085301175457e-07, + "loss": 0.1622, + "step": 17859 + }, + { + "epoch": 0.91, + "grad_norm": 1.189610441208609, + "learning_rate": 4.386259728091613e-07, + "loss": 0.166, + "step": 17860 + }, + { + "epoch": 0.91, + "grad_norm": 1.2524582875011316, + "learning_rate": 4.381436748535872e-07, + "loss": 0.1598, + "step": 17861 + }, + { + "epoch": 0.91, + "grad_norm": 1.1669624890146575, + "learning_rate": 4.376616362639063e-07, + "loss": 0.1482, + "step": 17862 + }, + { + "epoch": 0.91, + "grad_norm": 1.2651938387594743, + "learning_rate": 4.371798570531927e-07, + "loss": 0.1598, + "step": 17863 + }, + { + "epoch": 0.91, + "grad_norm": 1.159737621227063, + "learning_rate": 4.3669833723451795e-07, + "loss": 0.1696, + "step": 17864 + }, + { + "epoch": 0.91, + "grad_norm": 1.210751056835837, + "learning_rate": 4.3621707682094063e-07, + "loss": 0.1556, + "step": 17865 + }, + { + "epoch": 0.91, + "grad_norm": 1.0647749145288499, + "learning_rate": 4.3573607582551356e-07, + "loss": 0.1692, + "step": 17866 + }, + { + "epoch": 0.91, + "grad_norm": 0.9487333310478399, + "learning_rate": 4.3525533426128643e-07, + "loss": 0.1684, + "step": 17867 + }, + { + "epoch": 0.91, + "grad_norm": 1.0380106070047885, + "learning_rate": 4.3477485214129864e-07, + "loss": 0.1641, + "step": 17868 + }, + { + "epoch": 0.91, + "grad_norm": 1.052899447067935, + "learning_rate": 4.3429462947858327e-07, + "loss": 0.181, + "step": 17869 + }, + { + "epoch": 0.91, + "grad_norm": 1.4736590717566007, + "learning_rate": 4.338146662861664e-07, + "loss": 0.1609, + "step": 17870 + }, + { + "epoch": 0.91, + "grad_norm": 1.797747112413201, + "learning_rate": 4.333349625770655e-07, + "loss": 0.1661, + "step": 17871 + }, + { + "epoch": 0.91, + "grad_norm": 1.2137042272213838, + "learning_rate": 4.3285551836429465e-07, + "loss": 0.1608, + "step": 17872 + }, + { + "epoch": 0.91, + "grad_norm": 0.8488474336115467, + "learning_rate": 4.3237633366085997e-07, + "loss": 0.1448, + "step": 17873 + }, + { + "epoch": 0.91, + "grad_norm": 1.0186497884210581, + "learning_rate": 4.3189740847975556e-07, + "loss": 0.1562, + "step": 17874 + }, + { + "epoch": 0.91, + "grad_norm": 1.0794700833681627, + "learning_rate": 4.3141874283397665e-07, + "loss": 0.1541, + "step": 17875 + }, + { + "epoch": 0.91, + "grad_norm": 1.3353539984921128, + "learning_rate": 4.309403367365028e-07, + "loss": 0.1698, + "step": 17876 + }, + { + "epoch": 0.91, + "grad_norm": 0.9808455563771287, + "learning_rate": 4.3046219020031366e-07, + "loss": 0.1791, + "step": 17877 + }, + { + "epoch": 0.91, + "grad_norm": 1.460488370062648, + "learning_rate": 4.299843032383777e-07, + "loss": 0.1983, + "step": 17878 + }, + { + "epoch": 0.91, + "grad_norm": 1.190477051437423, + "learning_rate": 4.295066758636579e-07, + "loss": 0.1534, + "step": 17879 + }, + { + "epoch": 0.91, + "grad_norm": 1.117376348606542, + "learning_rate": 4.2902930808910946e-07, + "loss": 0.1607, + "step": 17880 + }, + { + "epoch": 0.91, + "grad_norm": 1.1286525714094169, + "learning_rate": 4.2855219992768313e-07, + "loss": 0.1752, + "step": 17881 + }, + { + "epoch": 0.91, + "grad_norm": 1.0822666747841385, + "learning_rate": 4.280753513923197e-07, + "loss": 0.1518, + "step": 17882 + }, + { + "epoch": 0.91, + "grad_norm": 1.1124604964562284, + "learning_rate": 4.27598762495951e-07, + "loss": 0.1501, + "step": 17883 + }, + { + "epoch": 0.91, + "grad_norm": 1.2331043057271551, + "learning_rate": 4.271224332515078e-07, + "loss": 0.1651, + "step": 17884 + }, + { + "epoch": 0.91, + "grad_norm": 1.1070061068054977, + "learning_rate": 4.266463636719087e-07, + "loss": 0.1477, + "step": 17885 + }, + { + "epoch": 0.91, + "grad_norm": 1.9162346294809607, + "learning_rate": 4.261705537700678e-07, + "loss": 0.1644, + "step": 17886 + }, + { + "epoch": 0.91, + "grad_norm": 1.7168818418985248, + "learning_rate": 4.2569500355889027e-07, + "loss": 0.1631, + "step": 17887 + }, + { + "epoch": 0.91, + "grad_norm": 0.8780179759569882, + "learning_rate": 4.2521971305127695e-07, + "loss": 0.1571, + "step": 17888 + }, + { + "epoch": 0.91, + "grad_norm": 1.0836510792985983, + "learning_rate": 4.2474468226011976e-07, + "loss": 0.1682, + "step": 17889 + }, + { + "epoch": 0.91, + "grad_norm": 1.2082434934187478, + "learning_rate": 4.242699111983051e-07, + "loss": 0.1571, + "step": 17890 + }, + { + "epoch": 0.91, + "grad_norm": 1.8432251155722694, + "learning_rate": 4.2379539987870924e-07, + "loss": 0.1522, + "step": 17891 + }, + { + "epoch": 0.91, + "grad_norm": 0.800714309069143, + "learning_rate": 4.233211483142041e-07, + "loss": 0.155, + "step": 17892 + }, + { + "epoch": 0.91, + "grad_norm": 0.9767747713654685, + "learning_rate": 4.2284715651765287e-07, + "loss": 0.1674, + "step": 17893 + }, + { + "epoch": 0.91, + "grad_norm": 1.6629670704546073, + "learning_rate": 4.223734245019151e-07, + "loss": 0.1639, + "step": 17894 + }, + { + "epoch": 0.91, + "grad_norm": 1.2069269896015864, + "learning_rate": 4.2189995227983726e-07, + "loss": 0.1765, + "step": 17895 + }, + { + "epoch": 0.91, + "grad_norm": 1.466101959194786, + "learning_rate": 4.2142673986426685e-07, + "loss": 0.1842, + "step": 17896 + }, + { + "epoch": 0.91, + "grad_norm": 1.0350455168520254, + "learning_rate": 4.2095378726803473e-07, + "loss": 0.1613, + "step": 17897 + }, + { + "epoch": 0.91, + "grad_norm": 0.7932209596104125, + "learning_rate": 4.204810945039717e-07, + "loss": 0.1574, + "step": 17898 + }, + { + "epoch": 0.91, + "grad_norm": 0.9474340681428581, + "learning_rate": 4.2000866158490084e-07, + "loss": 0.16, + "step": 17899 + }, + { + "epoch": 0.91, + "grad_norm": 1.1472259303799852, + "learning_rate": 4.195364885236375e-07, + "loss": 0.1704, + "step": 17900 + }, + { + "epoch": 0.91, + "grad_norm": 1.0099861991323156, + "learning_rate": 4.1906457533298694e-07, + "loss": 0.171, + "step": 17901 + }, + { + "epoch": 0.91, + "grad_norm": 1.1539727871270642, + "learning_rate": 4.1859292202575007e-07, + "loss": 0.1546, + "step": 17902 + }, + { + "epoch": 0.91, + "grad_norm": 0.8311713315990183, + "learning_rate": 4.1812152861472333e-07, + "loss": 0.1852, + "step": 17903 + }, + { + "epoch": 0.91, + "grad_norm": 1.0243626136619337, + "learning_rate": 4.176503951126898e-07, + "loss": 0.156, + "step": 17904 + }, + { + "epoch": 0.91, + "grad_norm": 0.9057591631418205, + "learning_rate": 4.1717952153243034e-07, + "loss": 0.1389, + "step": 17905 + }, + { + "epoch": 0.91, + "grad_norm": 1.426122726890483, + "learning_rate": 4.16708907886717e-07, + "loss": 0.1613, + "step": 17906 + }, + { + "epoch": 0.91, + "grad_norm": 1.0026262405109105, + "learning_rate": 4.162385541883185e-07, + "loss": 0.1697, + "step": 17907 + }, + { + "epoch": 0.91, + "grad_norm": 1.1279028816464756, + "learning_rate": 4.157684604499879e-07, + "loss": 0.1595, + "step": 17908 + }, + { + "epoch": 0.91, + "grad_norm": 1.137347084964881, + "learning_rate": 4.152986266844805e-07, + "loss": 0.1796, + "step": 17909 + }, + { + "epoch": 0.91, + "grad_norm": 1.3469612177043435, + "learning_rate": 4.1482905290453846e-07, + "loss": 0.1645, + "step": 17910 + }, + { + "epoch": 0.91, + "grad_norm": 1.1427791293368184, + "learning_rate": 4.143597391229015e-07, + "loss": 0.1642, + "step": 17911 + }, + { + "epoch": 0.91, + "grad_norm": 0.9551977976133195, + "learning_rate": 4.1389068535229615e-07, + "loss": 0.1625, + "step": 17912 + }, + { + "epoch": 0.91, + "grad_norm": 2.279745687276524, + "learning_rate": 4.1342189160544775e-07, + "loss": 0.165, + "step": 17913 + }, + { + "epoch": 0.91, + "grad_norm": 0.9968432466039817, + "learning_rate": 4.1295335789507174e-07, + "loss": 0.1557, + "step": 17914 + }, + { + "epoch": 0.91, + "grad_norm": 0.8494539809967789, + "learning_rate": 4.124850842338779e-07, + "loss": 0.1633, + "step": 17915 + }, + { + "epoch": 0.91, + "grad_norm": 0.8921772831807426, + "learning_rate": 4.120170706345661e-07, + "loss": 0.1419, + "step": 17916 + }, + { + "epoch": 0.91, + "grad_norm": 1.2943009106600474, + "learning_rate": 4.1154931710983504e-07, + "loss": 0.1617, + "step": 17917 + }, + { + "epoch": 0.91, + "grad_norm": 1.2395827561193538, + "learning_rate": 4.1108182367237014e-07, + "loss": 0.1823, + "step": 17918 + }, + { + "epoch": 0.91, + "grad_norm": 0.8016794634208525, + "learning_rate": 4.106145903348513e-07, + "loss": 0.1587, + "step": 17919 + }, + { + "epoch": 0.91, + "grad_norm": 1.1507975908971602, + "learning_rate": 4.10147617109955e-07, + "loss": 0.1681, + "step": 17920 + }, + { + "epoch": 0.91, + "grad_norm": 0.8995866701271464, + "learning_rate": 4.0968090401034444e-07, + "loss": 0.1742, + "step": 17921 + }, + { + "epoch": 0.91, + "grad_norm": 0.8769457890812751, + "learning_rate": 4.092144510486806e-07, + "loss": 0.1565, + "step": 17922 + }, + { + "epoch": 0.91, + "grad_norm": 1.1400777198132501, + "learning_rate": 4.0874825823761676e-07, + "loss": 0.1481, + "step": 17923 + }, + { + "epoch": 0.91, + "grad_norm": 0.7596705469198951, + "learning_rate": 4.0828232558979943e-07, + "loss": 0.1577, + "step": 17924 + }, + { + "epoch": 0.91, + "grad_norm": 1.412245385430175, + "learning_rate": 4.078166531178651e-07, + "loss": 0.1475, + "step": 17925 + }, + { + "epoch": 0.91, + "grad_norm": 0.9016443987867834, + "learning_rate": 4.07351240834446e-07, + "loss": 0.1589, + "step": 17926 + }, + { + "epoch": 0.91, + "grad_norm": 1.2437259871228254, + "learning_rate": 4.0688608875216527e-07, + "loss": 0.1618, + "step": 17927 + }, + { + "epoch": 0.91, + "grad_norm": 1.763305394131877, + "learning_rate": 4.064211968836429e-07, + "loss": 0.1706, + "step": 17928 + }, + { + "epoch": 0.91, + "grad_norm": 0.776341037606745, + "learning_rate": 4.059565652414865e-07, + "loss": 0.1604, + "step": 17929 + }, + { + "epoch": 0.91, + "grad_norm": 1.1781010449575935, + "learning_rate": 4.0549219383830054e-07, + "loss": 0.1407, + "step": 17930 + }, + { + "epoch": 0.91, + "grad_norm": 0.9927757811539168, + "learning_rate": 4.0502808268668034e-07, + "loss": 0.1776, + "step": 17931 + }, + { + "epoch": 0.91, + "grad_norm": 0.9940650804664498, + "learning_rate": 4.045642317992149e-07, + "loss": 0.1609, + "step": 17932 + }, + { + "epoch": 0.91, + "grad_norm": 0.9952558127873156, + "learning_rate": 4.0410064118848624e-07, + "loss": 0.1515, + "step": 17933 + }, + { + "epoch": 0.91, + "grad_norm": 1.3742803805870323, + "learning_rate": 4.03637310867071e-07, + "loss": 0.1851, + "step": 17934 + }, + { + "epoch": 0.91, + "grad_norm": 1.1261120911519835, + "learning_rate": 4.031742408475359e-07, + "loss": 0.1669, + "step": 17935 + }, + { + "epoch": 0.91, + "grad_norm": 1.581412240413239, + "learning_rate": 4.027114311424407e-07, + "loss": 0.1574, + "step": 17936 + }, + { + "epoch": 0.91, + "grad_norm": 1.590825805117261, + "learning_rate": 4.0224888176434105e-07, + "loss": 0.1638, + "step": 17937 + }, + { + "epoch": 0.91, + "grad_norm": 0.9072850744524407, + "learning_rate": 4.0178659272578026e-07, + "loss": 0.1396, + "step": 17938 + }, + { + "epoch": 0.91, + "grad_norm": 44.79995566266486, + "learning_rate": 4.0132456403930263e-07, + "loss": 0.1802, + "step": 17939 + }, + { + "epoch": 0.91, + "grad_norm": 1.7677360420573207, + "learning_rate": 4.0086279571743715e-07, + "loss": 0.1641, + "step": 17940 + }, + { + "epoch": 0.91, + "grad_norm": 0.8974806317168946, + "learning_rate": 4.004012877727104e-07, + "loss": 0.1546, + "step": 17941 + }, + { + "epoch": 0.91, + "grad_norm": 0.9247563788673555, + "learning_rate": 3.999400402176401e-07, + "loss": 0.1758, + "step": 17942 + }, + { + "epoch": 0.91, + "grad_norm": 1.2695249530965618, + "learning_rate": 3.9947905306474077e-07, + "loss": 0.1596, + "step": 17943 + }, + { + "epoch": 0.91, + "grad_norm": 0.874647864267738, + "learning_rate": 3.990183263265124e-07, + "loss": 0.15, + "step": 17944 + }, + { + "epoch": 0.91, + "grad_norm": 1.1813860921210668, + "learning_rate": 3.985578600154549e-07, + "loss": 0.1676, + "step": 17945 + }, + { + "epoch": 0.91, + "grad_norm": 0.9040988900659936, + "learning_rate": 3.9809765414405734e-07, + "loss": 0.1454, + "step": 17946 + }, + { + "epoch": 0.91, + "grad_norm": 1.0329931692731547, + "learning_rate": 3.976377087248051e-07, + "loss": 0.1761, + "step": 17947 + }, + { + "epoch": 0.91, + "grad_norm": 0.883762203784139, + "learning_rate": 3.9717802377017057e-07, + "loss": 0.1717, + "step": 17948 + }, + { + "epoch": 0.91, + "grad_norm": 1.64805341126127, + "learning_rate": 3.967185992926237e-07, + "loss": 0.1613, + "step": 17949 + }, + { + "epoch": 0.91, + "grad_norm": 2.778869015406839, + "learning_rate": 3.9625943530462787e-07, + "loss": 0.1616, + "step": 17950 + }, + { + "epoch": 0.91, + "grad_norm": 0.8044296441418508, + "learning_rate": 3.9580053181863866e-07, + "loss": 0.1718, + "step": 17951 + }, + { + "epoch": 0.91, + "grad_norm": 1.1637532712791299, + "learning_rate": 3.953418888471017e-07, + "loss": 0.1708, + "step": 17952 + }, + { + "epoch": 0.91, + "grad_norm": 2.2436944226653273, + "learning_rate": 3.948835064024581e-07, + "loss": 0.1576, + "step": 17953 + }, + { + "epoch": 0.91, + "grad_norm": 2.16526928391713, + "learning_rate": 3.944253844971435e-07, + "loss": 0.1609, + "step": 17954 + }, + { + "epoch": 0.91, + "grad_norm": 1.1371984003091298, + "learning_rate": 3.939675231435802e-07, + "loss": 0.1489, + "step": 17955 + }, + { + "epoch": 0.91, + "grad_norm": 1.0432445408536668, + "learning_rate": 3.935099223541927e-07, + "loss": 0.1471, + "step": 17956 + }, + { + "epoch": 0.91, + "grad_norm": 0.939698839652885, + "learning_rate": 3.9305258214138995e-07, + "loss": 0.1406, + "step": 17957 + }, + { + "epoch": 0.91, + "grad_norm": 1.0156235305520485, + "learning_rate": 3.9259550251757763e-07, + "loss": 0.1752, + "step": 17958 + }, + { + "epoch": 0.91, + "grad_norm": 1.56236736930776, + "learning_rate": 3.921386834951557e-07, + "loss": 0.1594, + "step": 17959 + }, + { + "epoch": 0.91, + "grad_norm": 0.9256258931900568, + "learning_rate": 3.9168212508651547e-07, + "loss": 0.158, + "step": 17960 + }, + { + "epoch": 0.91, + "grad_norm": 1.2545803271356366, + "learning_rate": 3.9122582730403924e-07, + "loss": 0.1604, + "step": 17961 + }, + { + "epoch": 0.91, + "grad_norm": 1.2502358925172403, + "learning_rate": 3.907697901601071e-07, + "loss": 0.1283, + "step": 17962 + }, + { + "epoch": 0.91, + "grad_norm": 1.0882395503948148, + "learning_rate": 3.9031401366708467e-07, + "loss": 0.1499, + "step": 17963 + }, + { + "epoch": 0.91, + "grad_norm": 1.4270571392860416, + "learning_rate": 3.8985849783733873e-07, + "loss": 0.1528, + "step": 17964 + }, + { + "epoch": 0.91, + "grad_norm": 0.9151584518670104, + "learning_rate": 3.8940324268322285e-07, + "loss": 0.1507, + "step": 17965 + }, + { + "epoch": 0.91, + "grad_norm": 0.9494614142440932, + "learning_rate": 3.889482482170881e-07, + "loss": 0.1779, + "step": 17966 + }, + { + "epoch": 0.91, + "grad_norm": 1.311422455794517, + "learning_rate": 3.884935144512747e-07, + "loss": 0.1668, + "step": 17967 + }, + { + "epoch": 0.91, + "grad_norm": 0.9151959255779871, + "learning_rate": 3.880390413981161e-07, + "loss": 0.1457, + "step": 17968 + }, + { + "epoch": 0.91, + "grad_norm": 1.1434615925984102, + "learning_rate": 3.8758482906994245e-07, + "loss": 0.1798, + "step": 17969 + }, + { + "epoch": 0.91, + "grad_norm": 0.8407924412287494, + "learning_rate": 3.8713087747907385e-07, + "loss": 0.1365, + "step": 17970 + }, + { + "epoch": 0.91, + "grad_norm": 1.0285526948328574, + "learning_rate": 3.866771866378227e-07, + "loss": 0.156, + "step": 17971 + }, + { + "epoch": 0.91, + "grad_norm": 1.141798461154904, + "learning_rate": 3.862237565584959e-07, + "loss": 0.182, + "step": 17972 + }, + { + "epoch": 0.91, + "grad_norm": 0.9341811901107416, + "learning_rate": 3.8577058725339235e-07, + "loss": 0.157, + "step": 17973 + }, + { + "epoch": 0.91, + "grad_norm": 1.227656033497009, + "learning_rate": 3.8531767873480453e-07, + "loss": 0.162, + "step": 17974 + }, + { + "epoch": 0.91, + "grad_norm": 7.395502251060028, + "learning_rate": 3.8486503101501705e-07, + "loss": 0.1852, + "step": 17975 + }, + { + "epoch": 0.91, + "grad_norm": 1.1583305556455232, + "learning_rate": 3.84412644106309e-07, + "loss": 0.1536, + "step": 17976 + }, + { + "epoch": 0.91, + "grad_norm": 1.3079551443868314, + "learning_rate": 3.839605180209527e-07, + "loss": 0.1573, + "step": 17977 + }, + { + "epoch": 0.91, + "grad_norm": 0.8749974615321174, + "learning_rate": 3.835086527712084e-07, + "loss": 0.1618, + "step": 17978 + }, + { + "epoch": 0.91, + "grad_norm": 1.412753912406829, + "learning_rate": 3.830570483693374e-07, + "loss": 0.1508, + "step": 17979 + }, + { + "epoch": 0.91, + "grad_norm": 0.9967886468592525, + "learning_rate": 3.8260570482758554e-07, + "loss": 0.1565, + "step": 17980 + }, + { + "epoch": 0.91, + "grad_norm": 1.3170670252562915, + "learning_rate": 3.8215462215819733e-07, + "loss": 0.1727, + "step": 17981 + }, + { + "epoch": 0.91, + "grad_norm": 0.8589125439803201, + "learning_rate": 3.817038003734075e-07, + "loss": 0.1716, + "step": 17982 + }, + { + "epoch": 0.91, + "grad_norm": 2.77991289215004, + "learning_rate": 3.8125323948544734e-07, + "loss": 0.1717, + "step": 17983 + }, + { + "epoch": 0.91, + "grad_norm": 1.4852015113231571, + "learning_rate": 3.808029395065349e-07, + "loss": 0.1668, + "step": 17984 + }, + { + "epoch": 0.91, + "grad_norm": 1.3957447592536913, + "learning_rate": 3.803529004488848e-07, + "loss": 0.1621, + "step": 17985 + }, + { + "epoch": 0.91, + "grad_norm": 1.0856403131783523, + "learning_rate": 3.7990312232470627e-07, + "loss": 0.1625, + "step": 17986 + }, + { + "epoch": 0.91, + "grad_norm": 0.8855645448579809, + "learning_rate": 3.7945360514620056e-07, + "loss": 0.1342, + "step": 17987 + }, + { + "epoch": 0.91, + "grad_norm": 1.0073013102385229, + "learning_rate": 3.7900434892555903e-07, + "loss": 0.1579, + "step": 17988 + }, + { + "epoch": 0.91, + "grad_norm": 2.0871019309171697, + "learning_rate": 3.785553536749664e-07, + "loss": 0.1583, + "step": 17989 + }, + { + "epoch": 0.91, + "grad_norm": 1.0891511728916528, + "learning_rate": 3.781066194066052e-07, + "loss": 0.1642, + "step": 17990 + }, + { + "epoch": 0.91, + "grad_norm": 3.8153204240961576, + "learning_rate": 3.776581461326434e-07, + "loss": 0.1855, + "step": 17991 + }, + { + "epoch": 0.91, + "grad_norm": 1.0753731363296326, + "learning_rate": 3.772099338652491e-07, + "loss": 0.1686, + "step": 17992 + }, + { + "epoch": 0.91, + "grad_norm": 1.1277177371834488, + "learning_rate": 3.7676198261657803e-07, + "loss": 0.1571, + "step": 17993 + }, + { + "epoch": 0.92, + "grad_norm": 0.8328991730043827, + "learning_rate": 3.763142923987817e-07, + "loss": 0.1635, + "step": 17994 + }, + { + "epoch": 0.92, + "grad_norm": 1.1623322953496837, + "learning_rate": 3.7586686322400257e-07, + "loss": 0.1753, + "step": 17995 + }, + { + "epoch": 0.92, + "grad_norm": 1.2256700386655874, + "learning_rate": 3.7541969510438094e-07, + "loss": 0.1708, + "step": 17996 + }, + { + "epoch": 0.92, + "grad_norm": 0.9689492752414466, + "learning_rate": 3.749727880520415e-07, + "loss": 0.1555, + "step": 17997 + }, + { + "epoch": 0.92, + "grad_norm": 1.2295248073265919, + "learning_rate": 3.7452614207911133e-07, + "loss": 0.1701, + "step": 17998 + }, + { + "epoch": 0.92, + "grad_norm": 1.5945159084207308, + "learning_rate": 3.740797571977006e-07, + "loss": 0.1777, + "step": 17999 + }, + { + "epoch": 0.92, + "grad_norm": 1.097649874716848, + "learning_rate": 3.7363363341992197e-07, + "loss": 0.1689, + "step": 18000 + }, + { + "epoch": 0.92, + "grad_norm": 1.8499978036235123, + "learning_rate": 3.731877707578735e-07, + "loss": 0.1546, + "step": 18001 + }, + { + "epoch": 0.92, + "grad_norm": 1.8260381407155, + "learning_rate": 3.72742169223651e-07, + "loss": 0.1513, + "step": 18002 + }, + { + "epoch": 0.92, + "grad_norm": 0.7594243057445846, + "learning_rate": 3.722968288293405e-07, + "loss": 0.137, + "step": 18003 + }, + { + "epoch": 0.92, + "grad_norm": 0.9524979700874181, + "learning_rate": 3.718517495870233e-07, + "loss": 0.173, + "step": 18004 + }, + { + "epoch": 0.92, + "grad_norm": 2.0215817900375197, + "learning_rate": 3.714069315087709e-07, + "loss": 0.1522, + "step": 18005 + }, + { + "epoch": 0.92, + "grad_norm": 1.2227474046025721, + "learning_rate": 3.709623746066482e-07, + "loss": 0.1764, + "step": 18006 + }, + { + "epoch": 0.92, + "grad_norm": 1.4786570951548887, + "learning_rate": 3.7051807889271653e-07, + "loss": 0.1805, + "step": 18007 + }, + { + "epoch": 0.92, + "grad_norm": 0.9627071613835644, + "learning_rate": 3.7007404437902515e-07, + "loss": 0.1833, + "step": 18008 + }, + { + "epoch": 0.92, + "grad_norm": 0.9180815857025116, + "learning_rate": 3.6963027107761896e-07, + "loss": 0.1715, + "step": 18009 + }, + { + "epoch": 0.92, + "grad_norm": 1.144345240438523, + "learning_rate": 3.6918675900053605e-07, + "loss": 0.1531, + "step": 18010 + }, + { + "epoch": 0.92, + "grad_norm": 2.4240528022562584, + "learning_rate": 3.6874350815980565e-07, + "loss": 0.1887, + "step": 18011 + }, + { + "epoch": 0.92, + "grad_norm": 1.9857446024845689, + "learning_rate": 3.683005185674504e-07, + "loss": 0.1835, + "step": 18012 + }, + { + "epoch": 0.92, + "grad_norm": 0.8481856015382779, + "learning_rate": 3.678577902354907e-07, + "loss": 0.1537, + "step": 18013 + }, + { + "epoch": 0.92, + "grad_norm": 0.9481275747482545, + "learning_rate": 3.674153231759303e-07, + "loss": 0.159, + "step": 18014 + }, + { + "epoch": 0.92, + "grad_norm": 1.0549232250539309, + "learning_rate": 3.66973117400774e-07, + "loss": 0.1911, + "step": 18015 + }, + { + "epoch": 0.92, + "grad_norm": 1.1714198995947855, + "learning_rate": 3.665311729220156e-07, + "loss": 0.1681, + "step": 18016 + }, + { + "epoch": 0.92, + "grad_norm": 1.204461169805127, + "learning_rate": 3.6608948975164424e-07, + "loss": 0.1846, + "step": 18017 + }, + { + "epoch": 0.92, + "grad_norm": 1.0571081817276509, + "learning_rate": 3.6564806790163833e-07, + "loss": 0.1925, + "step": 18018 + }, + { + "epoch": 0.92, + "grad_norm": 0.9719685244560988, + "learning_rate": 3.6520690738397256e-07, + "loss": 0.1646, + "step": 18019 + }, + { + "epoch": 0.92, + "grad_norm": 1.9468764412733461, + "learning_rate": 3.64766008210613e-07, + "loss": 0.1658, + "step": 18020 + }, + { + "epoch": 0.92, + "grad_norm": 0.7563243675377808, + "learning_rate": 3.643253703935223e-07, + "loss": 0.1605, + "step": 18021 + }, + { + "epoch": 0.92, + "grad_norm": 0.9060599741346881, + "learning_rate": 3.6388499394464874e-07, + "loss": 0.1464, + "step": 18022 + }, + { + "epoch": 0.92, + "grad_norm": 1.2158613459959573, + "learning_rate": 3.6344487887593926e-07, + "loss": 0.17, + "step": 18023 + }, + { + "epoch": 0.92, + "grad_norm": 0.971532594962186, + "learning_rate": 3.630050251993311e-07, + "loss": 0.16, + "step": 18024 + }, + { + "epoch": 0.92, + "grad_norm": 1.1481698919830072, + "learning_rate": 3.6256543292675584e-07, + "loss": 0.1777, + "step": 18025 + }, + { + "epoch": 0.92, + "grad_norm": 1.486437555313577, + "learning_rate": 3.6212610207013943e-07, + "loss": 0.1588, + "step": 18026 + }, + { + "epoch": 0.92, + "grad_norm": 1.1882199195943515, + "learning_rate": 3.616870326413946e-07, + "loss": 0.1843, + "step": 18027 + }, + { + "epoch": 0.92, + "grad_norm": 0.8975928779369106, + "learning_rate": 3.61248224652434e-07, + "loss": 0.1493, + "step": 18028 + }, + { + "epoch": 0.92, + "grad_norm": 1.0875976408022372, + "learning_rate": 3.6080967811515933e-07, + "loss": 0.16, + "step": 18029 + }, + { + "epoch": 0.92, + "grad_norm": 0.9381668102488647, + "learning_rate": 3.603713930414676e-07, + "loss": 0.1578, + "step": 18030 + }, + { + "epoch": 0.92, + "grad_norm": 0.8959578955261757, + "learning_rate": 3.59933369443245e-07, + "loss": 0.158, + "step": 18031 + }, + { + "epoch": 0.92, + "grad_norm": 2.7477054698311623, + "learning_rate": 3.594956073323763e-07, + "loss": 0.1612, + "step": 18032 + }, + { + "epoch": 0.92, + "grad_norm": 2.0899173714504182, + "learning_rate": 3.5905810672073107e-07, + "loss": 0.1803, + "step": 18033 + }, + { + "epoch": 0.92, + "grad_norm": 1.5364206457954415, + "learning_rate": 3.586208676201819e-07, + "loss": 0.1786, + "step": 18034 + }, + { + "epoch": 0.92, + "grad_norm": 1.070889298110779, + "learning_rate": 3.581838900425838e-07, + "loss": 0.1661, + "step": 18035 + }, + { + "epoch": 0.92, + "grad_norm": 0.8732342679329591, + "learning_rate": 3.5774717399979396e-07, + "loss": 0.1585, + "step": 18036 + }, + { + "epoch": 0.92, + "grad_norm": 0.9400795374448295, + "learning_rate": 3.5731071950365625e-07, + "loss": 0.1693, + "step": 18037 + }, + { + "epoch": 0.92, + "grad_norm": 1.1803725967238192, + "learning_rate": 3.5687452656600896e-07, + "loss": 0.1711, + "step": 18038 + }, + { + "epoch": 0.92, + "grad_norm": 1.16940189288093, + "learning_rate": 3.5643859519868594e-07, + "loss": 0.1701, + "step": 18039 + }, + { + "epoch": 0.92, + "grad_norm": 0.8894016048155894, + "learning_rate": 3.5600292541351e-07, + "loss": 0.1652, + "step": 18040 + }, + { + "epoch": 0.92, + "grad_norm": 1.994107188440856, + "learning_rate": 3.5556751722230056e-07, + "loss": 0.143, + "step": 18041 + }, + { + "epoch": 0.92, + "grad_norm": 0.8943863074222658, + "learning_rate": 3.551323706368659e-07, + "loss": 0.1604, + "step": 18042 + }, + { + "epoch": 0.92, + "grad_norm": 0.8372530214153084, + "learning_rate": 3.546974856690111e-07, + "loss": 0.1682, + "step": 18043 + }, + { + "epoch": 0.92, + "grad_norm": 1.4334941554546106, + "learning_rate": 3.542628623305311e-07, + "loss": 0.1692, + "step": 18044 + }, + { + "epoch": 0.92, + "grad_norm": 0.8602732086285463, + "learning_rate": 3.538285006332154e-07, + "loss": 0.1651, + "step": 18045 + }, + { + "epoch": 0.92, + "grad_norm": 3.27004473331578, + "learning_rate": 3.5339440058884565e-07, + "loss": 0.1518, + "step": 18046 + }, + { + "epoch": 0.92, + "grad_norm": 1.2627309938058453, + "learning_rate": 3.529605622092003e-07, + "loss": 0.1388, + "step": 18047 + }, + { + "epoch": 0.92, + "grad_norm": 1.0584636145518418, + "learning_rate": 3.52526985506042e-07, + "loss": 0.1625, + "step": 18048 + }, + { + "epoch": 0.92, + "grad_norm": 0.8819800807661746, + "learning_rate": 3.5209367049113596e-07, + "loss": 0.1535, + "step": 18049 + }, + { + "epoch": 0.92, + "grad_norm": 1.1435665267192858, + "learning_rate": 3.516606171762338e-07, + "loss": 0.1647, + "step": 18050 + }, + { + "epoch": 0.92, + "grad_norm": 1.00416429595634, + "learning_rate": 3.5122782557308163e-07, + "loss": 0.1704, + "step": 18051 + }, + { + "epoch": 0.92, + "grad_norm": 1.040981168819708, + "learning_rate": 3.507952956934202e-07, + "loss": 0.1615, + "step": 18052 + }, + { + "epoch": 0.92, + "grad_norm": 1.1564463804169962, + "learning_rate": 3.503630275489811e-07, + "loss": 0.1751, + "step": 18053 + }, + { + "epoch": 0.92, + "grad_norm": 1.0239001745339626, + "learning_rate": 3.4993102115148947e-07, + "loss": 0.1444, + "step": 18054 + }, + { + "epoch": 0.92, + "grad_norm": 1.6025551048585556, + "learning_rate": 3.494992765126637e-07, + "loss": 0.1668, + "step": 18055 + }, + { + "epoch": 0.92, + "grad_norm": 1.0701646834638574, + "learning_rate": 3.490677936442155e-07, + "loss": 0.1418, + "step": 18056 + }, + { + "epoch": 0.92, + "grad_norm": 1.3906487577365694, + "learning_rate": 3.4863657255785e-07, + "loss": 0.1819, + "step": 18057 + }, + { + "epoch": 0.92, + "grad_norm": 1.2564578605249104, + "learning_rate": 3.482056132652623e-07, + "loss": 0.1609, + "step": 18058 + }, + { + "epoch": 0.92, + "grad_norm": 1.2013413310258307, + "learning_rate": 3.477749157781407e-07, + "loss": 0.1592, + "step": 18059 + }, + { + "epoch": 0.92, + "grad_norm": 2.8924948411927334, + "learning_rate": 3.4734448010817043e-07, + "loss": 0.1743, + "step": 18060 + }, + { + "epoch": 0.92, + "grad_norm": 1.0088436986399607, + "learning_rate": 3.469143062670266e-07, + "loss": 0.1438, + "step": 18061 + }, + { + "epoch": 0.92, + "grad_norm": 1.1062930156517436, + "learning_rate": 3.464843942663776e-07, + "loss": 0.1498, + "step": 18062 + }, + { + "epoch": 0.92, + "grad_norm": 1.2243603416203355, + "learning_rate": 3.4605474411788407e-07, + "loss": 0.1835, + "step": 18063 + }, + { + "epoch": 0.92, + "grad_norm": 1.1303646982159368, + "learning_rate": 3.456253558332001e-07, + "loss": 0.1652, + "step": 18064 + }, + { + "epoch": 0.92, + "grad_norm": 1.644905305055982, + "learning_rate": 3.451962294239741e-07, + "loss": 0.1728, + "step": 18065 + }, + { + "epoch": 0.92, + "grad_norm": 1.4259200726169508, + "learning_rate": 3.4476736490184683e-07, + "loss": 0.1637, + "step": 18066 + }, + { + "epoch": 0.92, + "grad_norm": 0.7865406720386555, + "learning_rate": 3.443387622784489e-07, + "loss": 0.1584, + "step": 18067 + }, + { + "epoch": 0.92, + "grad_norm": 0.7506526478559534, + "learning_rate": 3.439104215654088e-07, + "loss": 0.1562, + "step": 18068 + }, + { + "epoch": 0.92, + "grad_norm": 0.9513526709479354, + "learning_rate": 3.434823427743428e-07, + "loss": 0.1656, + "step": 18069 + }, + { + "epoch": 0.92, + "grad_norm": 0.9825643516548362, + "learning_rate": 3.430545259168638e-07, + "loss": 0.1626, + "step": 18070 + }, + { + "epoch": 0.92, + "grad_norm": 1.8435459556303768, + "learning_rate": 3.42626971004576e-07, + "loss": 0.1509, + "step": 18071 + }, + { + "epoch": 0.92, + "grad_norm": 1.1602326205822766, + "learning_rate": 3.421996780490766e-07, + "loss": 0.1743, + "step": 18072 + }, + { + "epoch": 0.92, + "grad_norm": 1.240986025282152, + "learning_rate": 3.4177264706195754e-07, + "loss": 0.1743, + "step": 18073 + }, + { + "epoch": 0.92, + "grad_norm": 0.9798339208614196, + "learning_rate": 3.413458780548007e-07, + "loss": 0.1678, + "step": 18074 + }, + { + "epoch": 0.92, + "grad_norm": 2.1122500459593923, + "learning_rate": 3.409193710391834e-07, + "loss": 0.1846, + "step": 18075 + }, + { + "epoch": 0.92, + "grad_norm": 1.2219867673175016, + "learning_rate": 3.4049312602667197e-07, + "loss": 0.1601, + "step": 18076 + }, + { + "epoch": 0.92, + "grad_norm": 1.1584272896050063, + "learning_rate": 3.400671430288316e-07, + "loss": 0.1418, + "step": 18077 + }, + { + "epoch": 0.92, + "grad_norm": 0.934316331610779, + "learning_rate": 3.396414220572142e-07, + "loss": 0.1549, + "step": 18078 + }, + { + "epoch": 0.92, + "grad_norm": 1.3242111151036162, + "learning_rate": 3.3921596312336935e-07, + "loss": 0.1588, + "step": 18079 + }, + { + "epoch": 0.92, + "grad_norm": 0.9434150029025636, + "learning_rate": 3.3879076623883677e-07, + "loss": 0.1556, + "step": 18080 + }, + { + "epoch": 0.92, + "grad_norm": 0.9255895715291566, + "learning_rate": 3.3836583141515054e-07, + "loss": 0.1705, + "step": 18081 + }, + { + "epoch": 0.92, + "grad_norm": 2.3659204139409478, + "learning_rate": 3.379411586638359e-07, + "loss": 0.1644, + "step": 18082 + }, + { + "epoch": 0.92, + "grad_norm": 2.2360570568932796, + "learning_rate": 3.3751674799641475e-07, + "loss": 0.1591, + "step": 18083 + }, + { + "epoch": 0.92, + "grad_norm": 1.1750311349980966, + "learning_rate": 3.3709259942439677e-07, + "loss": 0.1692, + "step": 18084 + }, + { + "epoch": 0.92, + "grad_norm": 1.2854669834238228, + "learning_rate": 3.3666871295928826e-07, + "loss": 0.168, + "step": 18085 + }, + { + "epoch": 0.92, + "grad_norm": 1.2254877804588578, + "learning_rate": 3.3624508861258564e-07, + "loss": 0.1592, + "step": 18086 + }, + { + "epoch": 0.92, + "grad_norm": 1.4678012533558435, + "learning_rate": 3.3582172639578304e-07, + "loss": 0.1746, + "step": 18087 + }, + { + "epoch": 0.92, + "grad_norm": 0.9622118655252306, + "learning_rate": 3.35398626320359e-07, + "loss": 0.1668, + "step": 18088 + }, + { + "epoch": 0.92, + "grad_norm": 1.467378103630613, + "learning_rate": 3.3497578839779554e-07, + "loss": 0.166, + "step": 18089 + }, + { + "epoch": 0.92, + "grad_norm": 1.0194618001770677, + "learning_rate": 3.345532126395579e-07, + "loss": 0.1638, + "step": 18090 + }, + { + "epoch": 0.92, + "grad_norm": 1.170019513023722, + "learning_rate": 3.3413089905711127e-07, + "loss": 0.1381, + "step": 18091 + }, + { + "epoch": 0.92, + "grad_norm": 0.936399762071647, + "learning_rate": 3.337088476619099e-07, + "loss": 0.1745, + "step": 18092 + }, + { + "epoch": 0.92, + "grad_norm": 0.9416114993211048, + "learning_rate": 3.332870584654013e-07, + "loss": 0.1697, + "step": 18093 + }, + { + "epoch": 0.92, + "grad_norm": 2.979094340683368, + "learning_rate": 3.3286553147902855e-07, + "loss": 0.1802, + "step": 18094 + }, + { + "epoch": 0.92, + "grad_norm": 0.9993114947029553, + "learning_rate": 3.3244426671422246e-07, + "loss": 0.1716, + "step": 18095 + }, + { + "epoch": 0.92, + "grad_norm": 0.9762340459586974, + "learning_rate": 3.320232641824139e-07, + "loss": 0.1665, + "step": 18096 + }, + { + "epoch": 0.92, + "grad_norm": 1.0886486279912961, + "learning_rate": 3.3160252389501824e-07, + "loss": 0.1652, + "step": 18097 + }, + { + "epoch": 0.92, + "grad_norm": 3.2737911651638636, + "learning_rate": 3.311820458634507e-07, + "loss": 0.1583, + "step": 18098 + }, + { + "epoch": 0.92, + "grad_norm": 0.9989512775252093, + "learning_rate": 3.3076183009911667e-07, + "loss": 0.1443, + "step": 18099 + }, + { + "epoch": 0.92, + "grad_norm": 0.9043083457326804, + "learning_rate": 3.3034187661341476e-07, + "loss": 0.1616, + "step": 18100 + }, + { + "epoch": 0.92, + "grad_norm": 1.511213481907978, + "learning_rate": 3.299221854177337e-07, + "loss": 0.1606, + "step": 18101 + }, + { + "epoch": 0.92, + "grad_norm": 0.8975862189129877, + "learning_rate": 3.29502756523461e-07, + "loss": 0.1452, + "step": 18102 + }, + { + "epoch": 0.92, + "grad_norm": 0.9606207256115495, + "learning_rate": 3.29083589941972e-07, + "loss": 0.1704, + "step": 18103 + }, + { + "epoch": 0.92, + "grad_norm": 0.9537946488192327, + "learning_rate": 3.286646856846376e-07, + "loss": 0.1623, + "step": 18104 + }, + { + "epoch": 0.92, + "grad_norm": 1.274674491839072, + "learning_rate": 3.2824604376281874e-07, + "loss": 0.166, + "step": 18105 + }, + { + "epoch": 0.92, + "grad_norm": 1.082756311497904, + "learning_rate": 3.2782766418787306e-07, + "loss": 0.1535, + "step": 18106 + }, + { + "epoch": 0.92, + "grad_norm": 1.2348526149655745, + "learning_rate": 3.274095469711469e-07, + "loss": 0.1637, + "step": 18107 + }, + { + "epoch": 0.92, + "grad_norm": 1.1677907803256713, + "learning_rate": 3.2699169212398354e-07, + "loss": 0.1689, + "step": 18108 + }, + { + "epoch": 0.92, + "grad_norm": 1.032761047187609, + "learning_rate": 3.2657409965771715e-07, + "loss": 0.1581, + "step": 18109 + }, + { + "epoch": 0.92, + "grad_norm": 1.5161936216848935, + "learning_rate": 3.2615676958367424e-07, + "loss": 0.1738, + "step": 18110 + }, + { + "epoch": 0.92, + "grad_norm": 0.8366912376904033, + "learning_rate": 3.2573970191317693e-07, + "loss": 0.1794, + "step": 18111 + }, + { + "epoch": 0.92, + "grad_norm": 0.8885962028795105, + "learning_rate": 3.2532289665753503e-07, + "loss": 0.1598, + "step": 18112 + }, + { + "epoch": 0.92, + "grad_norm": 1.7663929172818378, + "learning_rate": 3.2490635382805726e-07, + "loss": 0.156, + "step": 18113 + }, + { + "epoch": 0.92, + "grad_norm": 0.9646336089864638, + "learning_rate": 3.244900734360401e-07, + "loss": 0.169, + "step": 18114 + }, + { + "epoch": 0.92, + "grad_norm": 1.0913680494127547, + "learning_rate": 3.2407405549277683e-07, + "loss": 0.1544, + "step": 18115 + }, + { + "epoch": 0.92, + "grad_norm": 4.339377476577004, + "learning_rate": 3.2365830000954945e-07, + "loss": 0.1701, + "step": 18116 + }, + { + "epoch": 0.92, + "grad_norm": 1.1835551634972878, + "learning_rate": 3.23242806997639e-07, + "loss": 0.1798, + "step": 18117 + }, + { + "epoch": 0.92, + "grad_norm": 0.7438641443938219, + "learning_rate": 3.2282757646831306e-07, + "loss": 0.1447, + "step": 18118 + }, + { + "epoch": 0.92, + "grad_norm": 1.2541257573027336, + "learning_rate": 3.224126084328361e-07, + "loss": 0.1617, + "step": 18119 + }, + { + "epoch": 0.92, + "grad_norm": 1.298017080325946, + "learning_rate": 3.219979029024634e-07, + "loss": 0.1719, + "step": 18120 + }, + { + "epoch": 0.92, + "grad_norm": 1.0833901419058496, + "learning_rate": 3.21583459888446e-07, + "loss": 0.1504, + "step": 18121 + }, + { + "epoch": 0.92, + "grad_norm": 1.1311696910451334, + "learning_rate": 3.2116927940202157e-07, + "loss": 0.1725, + "step": 18122 + }, + { + "epoch": 0.92, + "grad_norm": 1.4654054266531409, + "learning_rate": 3.2075536145442897e-07, + "loss": 0.1606, + "step": 18123 + }, + { + "epoch": 0.92, + "grad_norm": 1.540578843845565, + "learning_rate": 3.203417060568925e-07, + "loss": 0.1611, + "step": 18124 + }, + { + "epoch": 0.92, + "grad_norm": 1.5544623075810544, + "learning_rate": 3.1992831322063324e-07, + "loss": 0.1667, + "step": 18125 + }, + { + "epoch": 0.92, + "grad_norm": 0.9578152551564797, + "learning_rate": 3.195151829568666e-07, + "loss": 0.1612, + "step": 18126 + }, + { + "epoch": 0.92, + "grad_norm": 0.9665434629439523, + "learning_rate": 3.191023152767969e-07, + "loss": 0.1497, + "step": 18127 + }, + { + "epoch": 0.92, + "grad_norm": 1.009141454543222, + "learning_rate": 3.1868971019162533e-07, + "loss": 0.1609, + "step": 18128 + }, + { + "epoch": 0.92, + "grad_norm": 1.0150188637483344, + "learning_rate": 3.1827736771253946e-07, + "loss": 0.1751, + "step": 18129 + }, + { + "epoch": 0.92, + "grad_norm": 0.8852232068809844, + "learning_rate": 3.178652878507293e-07, + "loss": 0.1538, + "step": 18130 + }, + { + "epoch": 0.92, + "grad_norm": 0.8192750933961805, + "learning_rate": 3.174534706173682e-07, + "loss": 0.1602, + "step": 18131 + }, + { + "epoch": 0.92, + "grad_norm": 0.8364749466835223, + "learning_rate": 3.170419160236293e-07, + "loss": 0.1617, + "step": 18132 + }, + { + "epoch": 0.92, + "grad_norm": 1.0486731726322975, + "learning_rate": 3.166306240806749e-07, + "loss": 0.1673, + "step": 18133 + }, + { + "epoch": 0.92, + "grad_norm": 0.994510073688784, + "learning_rate": 3.162195947996616e-07, + "loss": 0.1523, + "step": 18134 + }, + { + "epoch": 0.92, + "grad_norm": 1.2142922058584487, + "learning_rate": 3.158088281917393e-07, + "loss": 0.1691, + "step": 18135 + }, + { + "epoch": 0.92, + "grad_norm": 1.0238164627534345, + "learning_rate": 3.153983242680503e-07, + "loss": 0.1507, + "step": 18136 + }, + { + "epoch": 0.92, + "grad_norm": 1.0896154357419239, + "learning_rate": 3.149880830397267e-07, + "loss": 0.1846, + "step": 18137 + }, + { + "epoch": 0.92, + "grad_norm": 1.0416929788195814, + "learning_rate": 3.1457810451790083e-07, + "loss": 0.1621, + "step": 18138 + }, + { + "epoch": 0.92, + "grad_norm": 3.2883004776773355, + "learning_rate": 3.1416838871368925e-07, + "loss": 0.19, + "step": 18139 + }, + { + "epoch": 0.92, + "grad_norm": 0.8616914377273562, + "learning_rate": 3.137589356382076e-07, + "loss": 0.1522, + "step": 18140 + }, + { + "epoch": 0.92, + "grad_norm": 0.8543744553081801, + "learning_rate": 3.1334974530256134e-07, + "loss": 0.1656, + "step": 18141 + }, + { + "epoch": 0.92, + "grad_norm": 1.1054493297414734, + "learning_rate": 3.1294081771785057e-07, + "loss": 0.1475, + "step": 18142 + }, + { + "epoch": 0.92, + "grad_norm": 0.8807211474880493, + "learning_rate": 3.125321528951675e-07, + "loss": 0.1262, + "step": 18143 + }, + { + "epoch": 0.92, + "grad_norm": 0.9575123181423777, + "learning_rate": 3.1212375084559767e-07, + "loss": 0.1529, + "step": 18144 + }, + { + "epoch": 0.92, + "grad_norm": 1.1843135991699991, + "learning_rate": 3.117156115802178e-07, + "loss": 0.1592, + "step": 18145 + }, + { + "epoch": 0.92, + "grad_norm": 1.5669667485440848, + "learning_rate": 3.113077351100979e-07, + "loss": 0.1494, + "step": 18146 + }, + { + "epoch": 0.92, + "grad_norm": 0.9169278577698446, + "learning_rate": 3.1090012144630476e-07, + "loss": 0.1438, + "step": 18147 + }, + { + "epoch": 0.92, + "grad_norm": 1.554679031235613, + "learning_rate": 3.1049277059989167e-07, + "loss": 0.1633, + "step": 18148 + }, + { + "epoch": 0.92, + "grad_norm": 1.2194988964301239, + "learning_rate": 3.1008568258191095e-07, + "loss": 0.1622, + "step": 18149 + }, + { + "epoch": 0.92, + "grad_norm": 1.0482947007736183, + "learning_rate": 3.0967885740340266e-07, + "loss": 0.1358, + "step": 18150 + }, + { + "epoch": 0.92, + "grad_norm": 0.8393879878929953, + "learning_rate": 3.0927229507540126e-07, + "loss": 0.1526, + "step": 18151 + }, + { + "epoch": 0.92, + "grad_norm": 1.1180630860302485, + "learning_rate": 3.088659956089368e-07, + "loss": 0.1662, + "step": 18152 + }, + { + "epoch": 0.92, + "grad_norm": 1.0517176769625036, + "learning_rate": 3.0845995901503167e-07, + "loss": 0.1766, + "step": 18153 + }, + { + "epoch": 0.92, + "grad_norm": 1.3653458310572448, + "learning_rate": 3.080541853046948e-07, + "loss": 0.1944, + "step": 18154 + }, + { + "epoch": 0.92, + "grad_norm": 1.0415239876959776, + "learning_rate": 3.076486744889373e-07, + "loss": 0.1497, + "step": 18155 + }, + { + "epoch": 0.92, + "grad_norm": 1.24217910381336, + "learning_rate": 3.0724342657875604e-07, + "loss": 0.1783, + "step": 18156 + }, + { + "epoch": 0.92, + "grad_norm": 1.07365429308084, + "learning_rate": 3.068384415851455e-07, + "loss": 0.1538, + "step": 18157 + }, + { + "epoch": 0.92, + "grad_norm": 2.9870483996894945, + "learning_rate": 3.0643371951908806e-07, + "loss": 0.1896, + "step": 18158 + }, + { + "epoch": 0.92, + "grad_norm": 0.9650764946172202, + "learning_rate": 3.0602926039156487e-07, + "loss": 0.1665, + "step": 18159 + }, + { + "epoch": 0.92, + "grad_norm": 0.9600897084297326, + "learning_rate": 3.05625064213545e-07, + "loss": 0.1598, + "step": 18160 + }, + { + "epoch": 0.92, + "grad_norm": 0.8412084312803206, + "learning_rate": 3.0522113099599184e-07, + "loss": 0.1736, + "step": 18161 + }, + { + "epoch": 0.92, + "grad_norm": 1.050746648506869, + "learning_rate": 3.048174607498644e-07, + "loss": 0.186, + "step": 18162 + }, + { + "epoch": 0.92, + "grad_norm": 1.1393671291834178, + "learning_rate": 3.044140534861106e-07, + "loss": 0.1768, + "step": 18163 + }, + { + "epoch": 0.92, + "grad_norm": 1.2581635137906724, + "learning_rate": 3.04010909215674e-07, + "loss": 0.1781, + "step": 18164 + }, + { + "epoch": 0.92, + "grad_norm": 1.069811077440688, + "learning_rate": 3.0360802794948687e-07, + "loss": 0.1713, + "step": 18165 + }, + { + "epoch": 0.92, + "grad_norm": 0.8545719708007923, + "learning_rate": 3.032054096984816e-07, + "loss": 0.1502, + "step": 18166 + }, + { + "epoch": 0.92, + "grad_norm": 1.5327808645021874, + "learning_rate": 3.0280305447357607e-07, + "loss": 0.1389, + "step": 18167 + }, + { + "epoch": 0.92, + "grad_norm": 0.9530728622555193, + "learning_rate": 3.0240096228568606e-07, + "loss": 0.1396, + "step": 18168 + }, + { + "epoch": 0.92, + "grad_norm": 1.2315279397059917, + "learning_rate": 3.0199913314571726e-07, + "loss": 0.1459, + "step": 18169 + }, + { + "epoch": 0.92, + "grad_norm": 0.8872656692354135, + "learning_rate": 3.0159756706456987e-07, + "loss": 0.1484, + "step": 18170 + }, + { + "epoch": 0.92, + "grad_norm": 0.8181420046073249, + "learning_rate": 3.011962640531363e-07, + "loss": 0.1527, + "step": 18171 + }, + { + "epoch": 0.92, + "grad_norm": 1.3518825408066595, + "learning_rate": 3.007952241223022e-07, + "loss": 0.1806, + "step": 18172 + }, + { + "epoch": 0.92, + "grad_norm": 1.4158408673441485, + "learning_rate": 3.0039444728294563e-07, + "loss": 0.148, + "step": 18173 + }, + { + "epoch": 0.92, + "grad_norm": 1.1370683227512903, + "learning_rate": 2.999939335459379e-07, + "loss": 0.1633, + "step": 18174 + }, + { + "epoch": 0.92, + "grad_norm": 1.1368399110544725, + "learning_rate": 2.995936829221413e-07, + "loss": 0.1402, + "step": 18175 + }, + { + "epoch": 0.92, + "grad_norm": 0.9983565029532296, + "learning_rate": 2.9919369542241504e-07, + "loss": 0.1586, + "step": 18176 + }, + { + "epoch": 0.92, + "grad_norm": 0.8587644416400668, + "learning_rate": 2.9879397105760597e-07, + "loss": 0.1584, + "step": 18177 + }, + { + "epoch": 0.92, + "grad_norm": 1.1468541444837406, + "learning_rate": 2.9839450983855876e-07, + "loss": 0.1645, + "step": 18178 + }, + { + "epoch": 0.92, + "grad_norm": 1.8486483622789254, + "learning_rate": 2.979953117761103e-07, + "loss": 0.2073, + "step": 18179 + }, + { + "epoch": 0.92, + "grad_norm": 0.8125393213889613, + "learning_rate": 2.975963768810852e-07, + "loss": 0.1432, + "step": 18180 + }, + { + "epoch": 0.92, + "grad_norm": 1.3930677782012002, + "learning_rate": 2.971977051643071e-07, + "loss": 0.1656, + "step": 18181 + }, + { + "epoch": 0.92, + "grad_norm": 1.7007796046851476, + "learning_rate": 2.9679929663658957e-07, + "loss": 0.1799, + "step": 18182 + }, + { + "epoch": 0.92, + "grad_norm": 0.9477805648312042, + "learning_rate": 2.9640115130873835e-07, + "loss": 0.1565, + "step": 18183 + }, + { + "epoch": 0.92, + "grad_norm": 1.1168235302246006, + "learning_rate": 2.9600326919155486e-07, + "loss": 0.168, + "step": 18184 + }, + { + "epoch": 0.92, + "grad_norm": 0.859315154971044, + "learning_rate": 2.956056502958304e-07, + "loss": 0.1585, + "step": 18185 + }, + { + "epoch": 0.92, + "grad_norm": 1.5752916829869035, + "learning_rate": 2.952082946323498e-07, + "loss": 0.1666, + "step": 18186 + }, + { + "epoch": 0.92, + "grad_norm": 1.1845490612936995, + "learning_rate": 2.948112022118932e-07, + "loss": 0.1917, + "step": 18187 + }, + { + "epoch": 0.92, + "grad_norm": 1.7967161879916749, + "learning_rate": 2.944143730452298e-07, + "loss": 0.1686, + "step": 18188 + }, + { + "epoch": 0.92, + "grad_norm": 1.0093817119350061, + "learning_rate": 2.9401780714312657e-07, + "loss": 0.168, + "step": 18189 + }, + { + "epoch": 0.92, + "grad_norm": 0.8868337153966859, + "learning_rate": 2.936215045163371e-07, + "loss": 0.1441, + "step": 18190 + }, + { + "epoch": 0.93, + "grad_norm": 1.0742994689211378, + "learning_rate": 2.932254651756139e-07, + "loss": 0.1519, + "step": 18191 + }, + { + "epoch": 0.93, + "grad_norm": 1.9396325141785344, + "learning_rate": 2.928296891316973e-07, + "loss": 0.1725, + "step": 18192 + }, + { + "epoch": 0.93, + "grad_norm": 1.1127639369893016, + "learning_rate": 2.9243417639532424e-07, + "loss": 0.1516, + "step": 18193 + }, + { + "epoch": 0.93, + "grad_norm": 1.0071558709655952, + "learning_rate": 2.920389269772217e-07, + "loss": 0.1427, + "step": 18194 + }, + { + "epoch": 0.93, + "grad_norm": 1.3870759237473882, + "learning_rate": 2.916439408881111e-07, + "loss": 0.1573, + "step": 18195 + }, + { + "epoch": 0.93, + "grad_norm": 1.2161075701965, + "learning_rate": 2.912492181387072e-07, + "loss": 0.1739, + "step": 18196 + }, + { + "epoch": 0.93, + "grad_norm": 1.1769855123428175, + "learning_rate": 2.9085475873971815e-07, + "loss": 0.1627, + "step": 18197 + }, + { + "epoch": 0.93, + "grad_norm": 1.077500567192318, + "learning_rate": 2.9046056270184197e-07, + "loss": 0.1619, + "step": 18198 + }, + { + "epoch": 0.93, + "grad_norm": 1.067921049897875, + "learning_rate": 2.9006663003576904e-07, + "loss": 0.1553, + "step": 18199 + }, + { + "epoch": 0.93, + "grad_norm": 1.0080994104165948, + "learning_rate": 2.896729607521898e-07, + "loss": 0.1739, + "step": 18200 + }, + { + "epoch": 0.93, + "grad_norm": 1.031766215965229, + "learning_rate": 2.892795548617788e-07, + "loss": 0.1758, + "step": 18201 + }, + { + "epoch": 0.93, + "grad_norm": 1.2709143832706076, + "learning_rate": 2.8888641237520886e-07, + "loss": 0.1829, + "step": 18202 + }, + { + "epoch": 0.93, + "grad_norm": 0.7373246872216361, + "learning_rate": 2.8849353330314247e-07, + "loss": 0.1482, + "step": 18203 + }, + { + "epoch": 0.93, + "grad_norm": 1.6141345512194505, + "learning_rate": 2.881009176562377e-07, + "loss": 0.1835, + "step": 18204 + }, + { + "epoch": 0.93, + "grad_norm": 0.8910948700271394, + "learning_rate": 2.8770856544514393e-07, + "loss": 0.1536, + "step": 18205 + }, + { + "epoch": 0.93, + "grad_norm": 1.0048942029409267, + "learning_rate": 2.8731647668050477e-07, + "loss": 0.1706, + "step": 18206 + }, + { + "epoch": 0.93, + "grad_norm": 1.2715211457408504, + "learning_rate": 2.86924651372954e-07, + "loss": 0.1536, + "step": 18207 + }, + { + "epoch": 0.93, + "grad_norm": 1.185487138272821, + "learning_rate": 2.865330895331209e-07, + "loss": 0.1677, + "step": 18208 + }, + { + "epoch": 0.93, + "grad_norm": 0.8032552185841167, + "learning_rate": 2.861417911716269e-07, + "loss": 0.1383, + "step": 18209 + }, + { + "epoch": 0.93, + "grad_norm": 2.989140575419518, + "learning_rate": 2.8575075629908465e-07, + "loss": 0.1528, + "step": 18210 + }, + { + "epoch": 0.93, + "grad_norm": 1.3954966524477264, + "learning_rate": 2.853599849261024e-07, + "loss": 0.1658, + "step": 18211 + }, + { + "epoch": 0.93, + "grad_norm": 0.9393271717300197, + "learning_rate": 2.849694770632794e-07, + "loss": 0.1639, + "step": 18212 + }, + { + "epoch": 0.93, + "grad_norm": 0.8570807725947537, + "learning_rate": 2.8457923272120715e-07, + "loss": 0.157, + "step": 18213 + }, + { + "epoch": 0.93, + "grad_norm": 0.9916469584513358, + "learning_rate": 2.8418925191047163e-07, + "loss": 0.1492, + "step": 18214 + }, + { + "epoch": 0.93, + "grad_norm": 0.875743607965698, + "learning_rate": 2.8379953464165334e-07, + "loss": 0.1633, + "step": 18215 + }, + { + "epoch": 0.93, + "grad_norm": 1.0349400008204857, + "learning_rate": 2.8341008092531927e-07, + "loss": 0.1557, + "step": 18216 + }, + { + "epoch": 0.93, + "grad_norm": 1.6943081379529428, + "learning_rate": 2.8302089077203776e-07, + "loss": 0.1753, + "step": 18217 + }, + { + "epoch": 0.93, + "grad_norm": 1.3400755030544176, + "learning_rate": 2.826319641923614e-07, + "loss": 0.1639, + "step": 18218 + }, + { + "epoch": 0.93, + "grad_norm": 2.458439540499437, + "learning_rate": 2.8224330119684286e-07, + "loss": 0.1778, + "step": 18219 + }, + { + "epoch": 0.93, + "grad_norm": 1.0458337430912972, + "learning_rate": 2.818549017960237e-07, + "loss": 0.1697, + "step": 18220 + }, + { + "epoch": 0.93, + "grad_norm": 2.1383391293406424, + "learning_rate": 2.8146676600043777e-07, + "loss": 0.1956, + "step": 18221 + }, + { + "epoch": 0.93, + "grad_norm": 1.1962578304457712, + "learning_rate": 2.810788938206155e-07, + "loss": 0.1767, + "step": 18222 + }, + { + "epoch": 0.93, + "grad_norm": 0.9702710290737139, + "learning_rate": 2.8069128526707845e-07, + "loss": 0.178, + "step": 18223 + }, + { + "epoch": 0.93, + "grad_norm": 0.9376196152192133, + "learning_rate": 2.8030394035033827e-07, + "loss": 0.1586, + "step": 18224 + }, + { + "epoch": 0.93, + "grad_norm": 1.05253853210062, + "learning_rate": 2.7991685908090316e-07, + "loss": 0.16, + "step": 18225 + }, + { + "epoch": 0.93, + "grad_norm": 0.939644911654303, + "learning_rate": 2.7953004146927145e-07, + "loss": 0.1365, + "step": 18226 + }, + { + "epoch": 0.93, + "grad_norm": 1.1506454940271946, + "learning_rate": 2.791434875259369e-07, + "loss": 0.1536, + "step": 18227 + }, + { + "epoch": 0.93, + "grad_norm": 0.8693278008577767, + "learning_rate": 2.787571972613845e-07, + "loss": 0.1691, + "step": 18228 + }, + { + "epoch": 0.93, + "grad_norm": 0.8956275554521052, + "learning_rate": 2.7837117068609254e-07, + "loss": 0.1611, + "step": 18229 + }, + { + "epoch": 0.93, + "grad_norm": 1.3026610082054975, + "learning_rate": 2.779854078105304e-07, + "loss": 0.1835, + "step": 18230 + }, + { + "epoch": 0.93, + "grad_norm": 0.8988603613018683, + "learning_rate": 2.7759990864516306e-07, + "loss": 0.1412, + "step": 18231 + }, + { + "epoch": 0.93, + "grad_norm": 1.4338602081572938, + "learning_rate": 2.772146732004488e-07, + "loss": 0.1496, + "step": 18232 + }, + { + "epoch": 0.93, + "grad_norm": 1.7382306793567448, + "learning_rate": 2.768297014868337e-07, + "loss": 0.1581, + "step": 18233 + }, + { + "epoch": 0.93, + "grad_norm": 0.9871482240366329, + "learning_rate": 2.7644499351476396e-07, + "loss": 0.151, + "step": 18234 + }, + { + "epoch": 0.93, + "grad_norm": 1.4205508635886646, + "learning_rate": 2.760605492946722e-07, + "loss": 0.1774, + "step": 18235 + }, + { + "epoch": 0.93, + "grad_norm": 0.9918044669951058, + "learning_rate": 2.756763688369879e-07, + "loss": 0.176, + "step": 18236 + }, + { + "epoch": 0.93, + "grad_norm": 1.07450755147161, + "learning_rate": 2.7529245215213053e-07, + "loss": 0.1628, + "step": 18237 + }, + { + "epoch": 0.93, + "grad_norm": 1.0434581153695943, + "learning_rate": 2.7490879925051397e-07, + "loss": 0.1525, + "step": 18238 + }, + { + "epoch": 0.93, + "grad_norm": 0.9332330854715319, + "learning_rate": 2.745254101425465e-07, + "loss": 0.1678, + "step": 18239 + }, + { + "epoch": 0.93, + "grad_norm": 1.3009425937901884, + "learning_rate": 2.741422848386266e-07, + "loss": 0.1606, + "step": 18240 + }, + { + "epoch": 0.93, + "grad_norm": 1.2168959844464464, + "learning_rate": 2.737594233491458e-07, + "loss": 0.1757, + "step": 18241 + }, + { + "epoch": 0.93, + "grad_norm": 1.052663313889229, + "learning_rate": 2.733768256844915e-07, + "loss": 0.1414, + "step": 18242 + }, + { + "epoch": 0.93, + "grad_norm": 0.9262570446313309, + "learning_rate": 2.729944918550387e-07, + "loss": 0.1429, + "step": 18243 + }, + { + "epoch": 0.93, + "grad_norm": 0.810952260793739, + "learning_rate": 2.726124218711612e-07, + "loss": 0.1595, + "step": 18244 + }, + { + "epoch": 0.93, + "grad_norm": 1.0981094901401551, + "learning_rate": 2.7223061574321975e-07, + "loss": 0.184, + "step": 18245 + }, + { + "epoch": 0.93, + "grad_norm": 0.9443083295769201, + "learning_rate": 2.7184907348157377e-07, + "loss": 0.1632, + "step": 18246 + }, + { + "epoch": 0.93, + "grad_norm": 1.075805429275343, + "learning_rate": 2.714677950965694e-07, + "loss": 0.151, + "step": 18247 + }, + { + "epoch": 0.93, + "grad_norm": 1.1320019826123289, + "learning_rate": 2.7108678059855064e-07, + "loss": 0.1539, + "step": 18248 + }, + { + "epoch": 0.93, + "grad_norm": 0.9815125581439055, + "learning_rate": 2.707060299978537e-07, + "loss": 0.1659, + "step": 18249 + }, + { + "epoch": 0.93, + "grad_norm": 1.0098818213384717, + "learning_rate": 2.7032554330480464e-07, + "loss": 0.1505, + "step": 18250 + }, + { + "epoch": 0.93, + "grad_norm": 1.1575032174458653, + "learning_rate": 2.699453205297253e-07, + "loss": 0.1633, + "step": 18251 + }, + { + "epoch": 0.93, + "grad_norm": 1.2478476951841007, + "learning_rate": 2.6956536168292747e-07, + "loss": 0.1638, + "step": 18252 + }, + { + "epoch": 0.93, + "grad_norm": 1.2882046648564152, + "learning_rate": 2.6918566677471946e-07, + "loss": 0.1565, + "step": 18253 + }, + { + "epoch": 0.93, + "grad_norm": 1.1423199526776788, + "learning_rate": 2.688062358153998e-07, + "loss": 0.1486, + "step": 18254 + }, + { + "epoch": 0.93, + "grad_norm": 0.8599154346841992, + "learning_rate": 2.6842706881526125e-07, + "loss": 0.1521, + "step": 18255 + }, + { + "epoch": 0.93, + "grad_norm": 1.0747232249223346, + "learning_rate": 2.680481657845868e-07, + "loss": 0.1405, + "step": 18256 + }, + { + "epoch": 0.93, + "grad_norm": 1.1667972927867134, + "learning_rate": 2.67669526733656e-07, + "loss": 0.1858, + "step": 18257 + }, + { + "epoch": 0.93, + "grad_norm": 1.3900194062396538, + "learning_rate": 2.6729115167273834e-07, + "loss": 0.1652, + "step": 18258 + }, + { + "epoch": 0.93, + "grad_norm": 1.0342281416929027, + "learning_rate": 2.66913040612099e-07, + "loss": 0.1841, + "step": 18259 + }, + { + "epoch": 0.93, + "grad_norm": 1.1819018631064098, + "learning_rate": 2.665351935619931e-07, + "loss": 0.1683, + "step": 18260 + }, + { + "epoch": 0.93, + "grad_norm": 1.394688171936252, + "learning_rate": 2.661576105326702e-07, + "loss": 0.1641, + "step": 18261 + }, + { + "epoch": 0.93, + "grad_norm": 0.8566435495872279, + "learning_rate": 2.65780291534371e-07, + "loss": 0.1631, + "step": 18262 + }, + { + "epoch": 0.93, + "grad_norm": 1.355142728772295, + "learning_rate": 2.654032365773318e-07, + "loss": 0.1421, + "step": 18263 + }, + { + "epoch": 0.93, + "grad_norm": 1.1998677089669416, + "learning_rate": 2.650264456717788e-07, + "loss": 0.1559, + "step": 18264 + }, + { + "epoch": 0.93, + "grad_norm": 1.2343661872770706, + "learning_rate": 2.646499188279328e-07, + "loss": 0.1569, + "step": 18265 + }, + { + "epoch": 0.93, + "grad_norm": 0.9675566866904394, + "learning_rate": 2.6427365605600883e-07, + "loss": 0.1459, + "step": 18266 + }, + { + "epoch": 0.93, + "grad_norm": 1.2944387885918796, + "learning_rate": 2.638976573662122e-07, + "loss": 0.1758, + "step": 18267 + }, + { + "epoch": 0.93, + "grad_norm": 1.0901993055937835, + "learning_rate": 2.635219227687413e-07, + "loss": 0.1604, + "step": 18268 + }, + { + "epoch": 0.93, + "grad_norm": 0.8275611356887129, + "learning_rate": 2.63146452273787e-07, + "loss": 0.1381, + "step": 18269 + }, + { + "epoch": 0.93, + "grad_norm": 1.1146506553802311, + "learning_rate": 2.6277124589153657e-07, + "loss": 0.1734, + "step": 18270 + }, + { + "epoch": 0.93, + "grad_norm": 1.5159778292664159, + "learning_rate": 2.623963036321642e-07, + "loss": 0.157, + "step": 18271 + }, + { + "epoch": 0.93, + "grad_norm": 1.1055152798151293, + "learning_rate": 2.6202162550584387e-07, + "loss": 0.1712, + "step": 18272 + }, + { + "epoch": 0.93, + "grad_norm": 1.671505281711295, + "learning_rate": 2.6164721152273644e-07, + "loss": 0.1883, + "step": 18273 + }, + { + "epoch": 0.93, + "grad_norm": 0.9763923828625075, + "learning_rate": 2.6127306169299815e-07, + "loss": 0.1563, + "step": 18274 + }, + { + "epoch": 0.93, + "grad_norm": 1.1784812501395956, + "learning_rate": 2.608991760267776e-07, + "loss": 0.1732, + "step": 18275 + }, + { + "epoch": 0.93, + "grad_norm": 0.9906157660421311, + "learning_rate": 2.605255545342178e-07, + "loss": 0.1457, + "step": 18276 + }, + { + "epoch": 0.93, + "grad_norm": 1.1435715096827683, + "learning_rate": 2.6015219722545173e-07, + "loss": 0.1449, + "step": 18277 + }, + { + "epoch": 0.93, + "grad_norm": 1.0535128935316218, + "learning_rate": 2.5977910411060905e-07, + "loss": 0.1861, + "step": 18278 + }, + { + "epoch": 0.93, + "grad_norm": 1.5980771885217533, + "learning_rate": 2.594062751998061e-07, + "loss": 0.1623, + "step": 18279 + }, + { + "epoch": 0.93, + "grad_norm": 1.4537834976700899, + "learning_rate": 2.590337105031604e-07, + "loss": 0.166, + "step": 18280 + }, + { + "epoch": 0.93, + "grad_norm": 1.231681614406639, + "learning_rate": 2.586614100307738e-07, + "loss": 0.1532, + "step": 18281 + }, + { + "epoch": 0.93, + "grad_norm": 1.0291305206451635, + "learning_rate": 2.582893737927472e-07, + "loss": 0.1823, + "step": 18282 + }, + { + "epoch": 0.93, + "grad_norm": 0.8890034877363463, + "learning_rate": 2.5791760179917135e-07, + "loss": 0.1658, + "step": 18283 + }, + { + "epoch": 0.93, + "grad_norm": 1.0296792104223675, + "learning_rate": 2.5754609406013044e-07, + "loss": 0.161, + "step": 18284 + }, + { + "epoch": 0.93, + "grad_norm": 0.8199592585300417, + "learning_rate": 2.5717485058570304e-07, + "loss": 0.1665, + "step": 18285 + }, + { + "epoch": 0.93, + "grad_norm": 2.0453994543406147, + "learning_rate": 2.5680387138595663e-07, + "loss": 0.1708, + "step": 18286 + }, + { + "epoch": 0.93, + "grad_norm": 1.324591897357187, + "learning_rate": 2.5643315647095655e-07, + "loss": 0.1461, + "step": 18287 + }, + { + "epoch": 0.93, + "grad_norm": 1.942078050240307, + "learning_rate": 2.56062705850757e-07, + "loss": 0.1643, + "step": 18288 + }, + { + "epoch": 0.93, + "grad_norm": 1.0278227471601282, + "learning_rate": 2.5569251953540763e-07, + "loss": 0.1624, + "step": 18289 + }, + { + "epoch": 0.93, + "grad_norm": 1.1004008321153336, + "learning_rate": 2.5532259753494825e-07, + "loss": 0.165, + "step": 18290 + }, + { + "epoch": 0.93, + "grad_norm": 1.2990139603467061, + "learning_rate": 2.5495293985941414e-07, + "loss": 0.1631, + "step": 18291 + }, + { + "epoch": 0.93, + "grad_norm": 3.930321254713959, + "learning_rate": 2.5458354651883065e-07, + "loss": 0.1714, + "step": 18292 + }, + { + "epoch": 0.93, + "grad_norm": 1.093198105156117, + "learning_rate": 2.5421441752322086e-07, + "loss": 0.1641, + "step": 18293 + }, + { + "epoch": 0.93, + "grad_norm": 0.9736377661780048, + "learning_rate": 2.538455528825945e-07, + "loss": 0.1577, + "step": 18294 + }, + { + "epoch": 0.93, + "grad_norm": 0.8873869753036656, + "learning_rate": 2.5347695260695805e-07, + "loss": 0.1459, + "step": 18295 + }, + { + "epoch": 0.93, + "grad_norm": 1.216394455946613, + "learning_rate": 2.5310861670631015e-07, + "loss": 0.1644, + "step": 18296 + }, + { + "epoch": 0.93, + "grad_norm": 1.2671143732954295, + "learning_rate": 2.5274054519064175e-07, + "loss": 0.1524, + "step": 18297 + }, + { + "epoch": 0.93, + "grad_norm": 1.0028280164130396, + "learning_rate": 2.523727380699348e-07, + "loss": 0.1736, + "step": 18298 + }, + { + "epoch": 0.93, + "grad_norm": 1.3123715393982744, + "learning_rate": 2.5200519535417024e-07, + "loss": 0.157, + "step": 18299 + }, + { + "epoch": 0.93, + "grad_norm": 0.9316140446756942, + "learning_rate": 2.5163791705331343e-07, + "loss": 0.1543, + "step": 18300 + }, + { + "epoch": 0.93, + "grad_norm": 2.046187498563863, + "learning_rate": 2.5127090317732973e-07, + "loss": 0.1673, + "step": 18301 + }, + { + "epoch": 0.93, + "grad_norm": 1.1597565818611806, + "learning_rate": 2.5090415373617336e-07, + "loss": 0.1635, + "step": 18302 + }, + { + "epoch": 0.93, + "grad_norm": 0.8820846486012719, + "learning_rate": 2.5053766873979204e-07, + "loss": 0.163, + "step": 18303 + }, + { + "epoch": 0.93, + "grad_norm": 0.9880960254385381, + "learning_rate": 2.5017144819812766e-07, + "loss": 0.1483, + "step": 18304 + }, + { + "epoch": 0.93, + "grad_norm": 1.2739535762192775, + "learning_rate": 2.4980549212111236e-07, + "loss": 0.1561, + "step": 18305 + }, + { + "epoch": 0.93, + "grad_norm": 2.084334086475733, + "learning_rate": 2.494398005186749e-07, + "loss": 0.1584, + "step": 18306 + }, + { + "epoch": 0.93, + "grad_norm": 0.933607113556294, + "learning_rate": 2.490743734007328e-07, + "loss": 0.147, + "step": 18307 + }, + { + "epoch": 0.93, + "grad_norm": 1.0565751035924777, + "learning_rate": 2.4870921077720043e-07, + "loss": 0.1712, + "step": 18308 + }, + { + "epoch": 0.93, + "grad_norm": 0.7900491077807519, + "learning_rate": 2.483443126579799e-07, + "loss": 0.135, + "step": 18309 + }, + { + "epoch": 0.93, + "grad_norm": 0.7874166677831361, + "learning_rate": 2.4797967905297093e-07, + "loss": 0.1366, + "step": 18310 + }, + { + "epoch": 0.93, + "grad_norm": 1.1423803695769863, + "learning_rate": 2.476153099720635e-07, + "loss": 0.1542, + "step": 18311 + }, + { + "epoch": 0.93, + "grad_norm": 0.8768095068058398, + "learning_rate": 2.47251205425143e-07, + "loss": 0.1553, + "step": 18312 + }, + { + "epoch": 0.93, + "grad_norm": 1.1056877390442028, + "learning_rate": 2.468873654220838e-07, + "loss": 0.1493, + "step": 18313 + }, + { + "epoch": 0.93, + "grad_norm": 0.7899447781250339, + "learning_rate": 2.465237899727557e-07, + "loss": 0.1755, + "step": 18314 + }, + { + "epoch": 0.93, + "grad_norm": 1.7863005549546918, + "learning_rate": 2.461604790870209e-07, + "loss": 0.1535, + "step": 18315 + }, + { + "epoch": 0.93, + "grad_norm": 1.4838306408263966, + "learning_rate": 2.4579743277473365e-07, + "loss": 0.1582, + "step": 18316 + }, + { + "epoch": 0.93, + "grad_norm": 0.9945185936066469, + "learning_rate": 2.454346510457417e-07, + "loss": 0.1703, + "step": 18317 + }, + { + "epoch": 0.93, + "grad_norm": 1.184309812436225, + "learning_rate": 2.4507213390988604e-07, + "loss": 0.1555, + "step": 18318 + }, + { + "epoch": 0.93, + "grad_norm": 0.7978023591704787, + "learning_rate": 2.4470988137699993e-07, + "loss": 0.1503, + "step": 18319 + }, + { + "epoch": 0.93, + "grad_norm": 0.9209433934114493, + "learning_rate": 2.443478934569088e-07, + "loss": 0.1604, + "step": 18320 + }, + { + "epoch": 0.93, + "grad_norm": 1.5905991257382877, + "learning_rate": 2.439861701594326e-07, + "loss": 0.1831, + "step": 18321 + }, + { + "epoch": 0.93, + "grad_norm": 0.9910997370507246, + "learning_rate": 2.4362471149438237e-07, + "loss": 0.1515, + "step": 18322 + }, + { + "epoch": 0.93, + "grad_norm": 1.50148417664633, + "learning_rate": 2.4326351747156365e-07, + "loss": 0.1458, + "step": 18323 + }, + { + "epoch": 0.93, + "grad_norm": 0.9682930228090741, + "learning_rate": 2.4290258810077183e-07, + "loss": 0.1564, + "step": 18324 + }, + { + "epoch": 0.93, + "grad_norm": 1.0085186474825831, + "learning_rate": 2.4254192339179915e-07, + "loss": 0.1577, + "step": 18325 + }, + { + "epoch": 0.93, + "grad_norm": 1.0797598581900214, + "learning_rate": 2.421815233544267e-07, + "loss": 0.1556, + "step": 18326 + }, + { + "epoch": 0.93, + "grad_norm": 0.8254359964739783, + "learning_rate": 2.418213879984321e-07, + "loss": 0.1522, + "step": 18327 + }, + { + "epoch": 0.93, + "grad_norm": 0.9154350038423441, + "learning_rate": 2.4146151733358327e-07, + "loss": 0.1597, + "step": 18328 + }, + { + "epoch": 0.93, + "grad_norm": 1.8372875062859295, + "learning_rate": 2.4110191136964333e-07, + "loss": 0.1628, + "step": 18329 + }, + { + "epoch": 0.93, + "grad_norm": 1.228360493795967, + "learning_rate": 2.407425701163635e-07, + "loss": 0.1649, + "step": 18330 + }, + { + "epoch": 0.93, + "grad_norm": 1.325196990957879, + "learning_rate": 2.403834935834948e-07, + "loss": 0.1629, + "step": 18331 + }, + { + "epoch": 0.93, + "grad_norm": 1.3452048806832089, + "learning_rate": 2.400246817807728e-07, + "loss": 0.1703, + "step": 18332 + }, + { + "epoch": 0.93, + "grad_norm": 0.9918467529509656, + "learning_rate": 2.396661347179341e-07, + "loss": 0.1876, + "step": 18333 + }, + { + "epoch": 0.93, + "grad_norm": 1.4787781081476281, + "learning_rate": 2.393078524047021e-07, + "loss": 0.1545, + "step": 18334 + }, + { + "epoch": 0.93, + "grad_norm": 1.2674619556440334, + "learning_rate": 2.3894983485079683e-07, + "loss": 0.1668, + "step": 18335 + }, + { + "epoch": 0.93, + "grad_norm": 1.0590605390474992, + "learning_rate": 2.385920820659271e-07, + "loss": 0.1529, + "step": 18336 + }, + { + "epoch": 0.93, + "grad_norm": 1.201334926543082, + "learning_rate": 2.3823459405979854e-07, + "loss": 0.1582, + "step": 18337 + }, + { + "epoch": 0.93, + "grad_norm": 1.1333430396610051, + "learning_rate": 2.3787737084210893e-07, + "loss": 0.1554, + "step": 18338 + }, + { + "epoch": 0.93, + "grad_norm": 0.7409420607842309, + "learning_rate": 2.375204124225472e-07, + "loss": 0.1495, + "step": 18339 + }, + { + "epoch": 0.93, + "grad_norm": 1.0680731399439665, + "learning_rate": 2.3716371881079558e-07, + "loss": 0.1726, + "step": 18340 + }, + { + "epoch": 0.93, + "grad_norm": 1.0948349108272253, + "learning_rate": 2.3680729001652858e-07, + "loss": 0.1696, + "step": 18341 + }, + { + "epoch": 0.93, + "grad_norm": 1.051007648963238, + "learning_rate": 2.3645112604941623e-07, + "loss": 0.1737, + "step": 18342 + }, + { + "epoch": 0.93, + "grad_norm": 1.0056811990684162, + "learning_rate": 2.3609522691911746e-07, + "loss": 0.1605, + "step": 18343 + }, + { + "epoch": 0.93, + "grad_norm": 2.025955618299223, + "learning_rate": 2.3573959263528677e-07, + "loss": 0.1697, + "step": 18344 + }, + { + "epoch": 0.93, + "grad_norm": 1.0134681991322119, + "learning_rate": 2.3538422320757204e-07, + "loss": 0.1532, + "step": 18345 + }, + { + "epoch": 0.93, + "grad_norm": 1.2651014914706251, + "learning_rate": 2.3502911864561218e-07, + "loss": 0.1739, + "step": 18346 + }, + { + "epoch": 0.93, + "grad_norm": 1.0810092009920989, + "learning_rate": 2.346742789590384e-07, + "loss": 0.1534, + "step": 18347 + }, + { + "epoch": 0.93, + "grad_norm": 1.7203593062803417, + "learning_rate": 2.343197041574774e-07, + "loss": 0.1703, + "step": 18348 + }, + { + "epoch": 0.93, + "grad_norm": 0.988197154022997, + "learning_rate": 2.3396539425054376e-07, + "loss": 0.1713, + "step": 18349 + }, + { + "epoch": 0.93, + "grad_norm": 1.1065800214268064, + "learning_rate": 2.3361134924785313e-07, + "loss": 0.1667, + "step": 18350 + }, + { + "epoch": 0.93, + "grad_norm": 0.8575847025459382, + "learning_rate": 2.3325756915900445e-07, + "loss": 0.133, + "step": 18351 + }, + { + "epoch": 0.93, + "grad_norm": 0.8488946851219393, + "learning_rate": 2.329040539935967e-07, + "loss": 0.1523, + "step": 18352 + }, + { + "epoch": 0.93, + "grad_norm": 0.8545552151939171, + "learning_rate": 2.3255080376121676e-07, + "loss": 0.1528, + "step": 18353 + }, + { + "epoch": 0.93, + "grad_norm": 1.7129768044688733, + "learning_rate": 2.3219781847144906e-07, + "loss": 0.1884, + "step": 18354 + }, + { + "epoch": 0.93, + "grad_norm": 1.2532969807361725, + "learning_rate": 2.3184509813386713e-07, + "loss": 0.1554, + "step": 18355 + }, + { + "epoch": 0.93, + "grad_norm": 0.865328382304051, + "learning_rate": 2.3149264275803884e-07, + "loss": 0.1481, + "step": 18356 + }, + { + "epoch": 0.93, + "grad_norm": 0.819398377750584, + "learning_rate": 2.3114045235352433e-07, + "loss": 0.1597, + "step": 18357 + }, + { + "epoch": 0.93, + "grad_norm": 0.9760997913944155, + "learning_rate": 2.3078852692987596e-07, + "loss": 0.1533, + "step": 18358 + }, + { + "epoch": 0.93, + "grad_norm": 0.8124370575612277, + "learning_rate": 2.3043686649664165e-07, + "loss": 0.149, + "step": 18359 + }, + { + "epoch": 0.93, + "grad_norm": 0.9997317244657435, + "learning_rate": 2.3008547106335822e-07, + "loss": 0.1624, + "step": 18360 + }, + { + "epoch": 0.93, + "grad_norm": 1.2057741927307049, + "learning_rate": 2.2973434063955802e-07, + "loss": 0.1463, + "step": 18361 + }, + { + "epoch": 0.93, + "grad_norm": 0.8137021469097535, + "learning_rate": 2.2938347523476568e-07, + "loss": 0.1533, + "step": 18362 + }, + { + "epoch": 0.93, + "grad_norm": 3.0808834168689883, + "learning_rate": 2.2903287485849913e-07, + "loss": 0.1568, + "step": 18363 + }, + { + "epoch": 0.93, + "grad_norm": 0.9410298125961627, + "learning_rate": 2.2868253952026741e-07, + "loss": 0.1491, + "step": 18364 + }, + { + "epoch": 0.93, + "grad_norm": 1.0671457063576877, + "learning_rate": 2.2833246922957408e-07, + "loss": 0.1649, + "step": 18365 + }, + { + "epoch": 0.93, + "grad_norm": 1.163398353724536, + "learning_rate": 2.2798266399591374e-07, + "loss": 0.1666, + "step": 18366 + }, + { + "epoch": 0.93, + "grad_norm": 0.893536336859922, + "learning_rate": 2.2763312382877656e-07, + "loss": 0.1493, + "step": 18367 + }, + { + "epoch": 0.93, + "grad_norm": 1.1466722066958055, + "learning_rate": 2.2728384873764276e-07, + "loss": 0.1667, + "step": 18368 + }, + { + "epoch": 0.93, + "grad_norm": 1.1088802038655343, + "learning_rate": 2.2693483873198584e-07, + "loss": 0.1415, + "step": 18369 + }, + { + "epoch": 0.93, + "grad_norm": 1.1395079425027594, + "learning_rate": 2.2658609382127384e-07, + "loss": 0.1837, + "step": 18370 + }, + { + "epoch": 0.93, + "grad_norm": 1.0489862597909092, + "learning_rate": 2.2623761401496468e-07, + "loss": 0.1582, + "step": 18371 + }, + { + "epoch": 0.93, + "grad_norm": 1.030787526510104, + "learning_rate": 2.2588939932251418e-07, + "loss": 0.1592, + "step": 18372 + }, + { + "epoch": 0.93, + "grad_norm": 0.9229346827535737, + "learning_rate": 2.2554144975336477e-07, + "loss": 0.1509, + "step": 18373 + }, + { + "epoch": 0.93, + "grad_norm": 1.2346282121275605, + "learning_rate": 2.251937653169567e-07, + "loss": 0.1606, + "step": 18374 + }, + { + "epoch": 0.93, + "grad_norm": 0.8715933060605532, + "learning_rate": 2.2484634602271794e-07, + "loss": 0.1683, + "step": 18375 + }, + { + "epoch": 0.93, + "grad_norm": 0.941253110893302, + "learning_rate": 2.244991918800754e-07, + "loss": 0.1521, + "step": 18376 + }, + { + "epoch": 0.93, + "grad_norm": 0.9266730919291768, + "learning_rate": 2.2415230289844382e-07, + "loss": 0.1589, + "step": 18377 + }, + { + "epoch": 0.93, + "grad_norm": 0.9771410611139677, + "learning_rate": 2.238056790872345e-07, + "loss": 0.1724, + "step": 18378 + }, + { + "epoch": 0.93, + "grad_norm": 1.5235936206743117, + "learning_rate": 2.2345932045584662e-07, + "loss": 0.196, + "step": 18379 + }, + { + "epoch": 0.93, + "grad_norm": 0.9841273240110063, + "learning_rate": 2.231132270136771e-07, + "loss": 0.1763, + "step": 18380 + }, + { + "epoch": 0.93, + "grad_norm": 1.2765539802867825, + "learning_rate": 2.22767398770114e-07, + "loss": 0.155, + "step": 18381 + }, + { + "epoch": 0.93, + "grad_norm": 1.1186553643443082, + "learning_rate": 2.2242183573453756e-07, + "loss": 0.1868, + "step": 18382 + }, + { + "epoch": 0.93, + "grad_norm": 1.0287318642723777, + "learning_rate": 2.2207653791632035e-07, + "loss": 0.1417, + "step": 18383 + }, + { + "epoch": 0.93, + "grad_norm": 1.3699336996747302, + "learning_rate": 2.2173150532483035e-07, + "loss": 0.1648, + "step": 18384 + }, + { + "epoch": 0.93, + "grad_norm": 0.9571519442266735, + "learning_rate": 2.2138673796942457e-07, + "loss": 0.1568, + "step": 18385 + }, + { + "epoch": 0.93, + "grad_norm": 2.2746480790935992, + "learning_rate": 2.210422358594566e-07, + "loss": 0.167, + "step": 18386 + }, + { + "epoch": 0.94, + "grad_norm": 1.106230206821423, + "learning_rate": 2.2069799900427014e-07, + "loss": 0.1719, + "step": 18387 + }, + { + "epoch": 0.94, + "grad_norm": 2.791480398398672, + "learning_rate": 2.203540274132021e-07, + "loss": 0.149, + "step": 18388 + }, + { + "epoch": 0.94, + "grad_norm": 1.0617422537257237, + "learning_rate": 2.2001032109558396e-07, + "loss": 0.1616, + "step": 18389 + }, + { + "epoch": 0.94, + "grad_norm": 1.8104952562718482, + "learning_rate": 2.196668800607382e-07, + "loss": 0.1635, + "step": 18390 + }, + { + "epoch": 0.94, + "grad_norm": 0.9657527974823339, + "learning_rate": 2.1932370431798077e-07, + "loss": 0.1476, + "step": 18391 + }, + { + "epoch": 0.94, + "grad_norm": 0.983063383851243, + "learning_rate": 2.1898079387662085e-07, + "loss": 0.1919, + "step": 18392 + }, + { + "epoch": 0.94, + "grad_norm": 2.6038285199676197, + "learning_rate": 2.186381487459588e-07, + "loss": 0.1672, + "step": 18393 + }, + { + "epoch": 0.94, + "grad_norm": 0.8242832192874088, + "learning_rate": 2.1829576893528938e-07, + "loss": 0.1552, + "step": 18394 + }, + { + "epoch": 0.94, + "grad_norm": 2.072684200243416, + "learning_rate": 2.179536544538996e-07, + "loss": 0.1466, + "step": 18395 + }, + { + "epoch": 0.94, + "grad_norm": 0.9253757492899954, + "learning_rate": 2.1761180531106873e-07, + "loss": 0.1728, + "step": 18396 + }, + { + "epoch": 0.94, + "grad_norm": 1.066376706831964, + "learning_rate": 2.1727022151607046e-07, + "loss": 0.1488, + "step": 18397 + }, + { + "epoch": 0.94, + "grad_norm": 2.314431093429177, + "learning_rate": 2.1692890307817073e-07, + "loss": 0.1631, + "step": 18398 + }, + { + "epoch": 0.94, + "grad_norm": 0.8498934849664621, + "learning_rate": 2.1658785000662763e-07, + "loss": 0.1486, + "step": 18399 + }, + { + "epoch": 0.94, + "grad_norm": 1.0378436730938367, + "learning_rate": 2.1624706231068936e-07, + "loss": 0.1573, + "step": 18400 + }, + { + "epoch": 0.94, + "grad_norm": 0.8970051524119859, + "learning_rate": 2.159065399996041e-07, + "loss": 0.1624, + "step": 18401 + }, + { + "epoch": 0.94, + "grad_norm": 0.9922525353722269, + "learning_rate": 2.1556628308260552e-07, + "loss": 0.1818, + "step": 18402 + }, + { + "epoch": 0.94, + "grad_norm": 0.8332483233289575, + "learning_rate": 2.1522629156892516e-07, + "loss": 0.1438, + "step": 18403 + }, + { + "epoch": 0.94, + "grad_norm": 0.8701610657938882, + "learning_rate": 2.1488656546778342e-07, + "loss": 0.1745, + "step": 18404 + }, + { + "epoch": 0.94, + "grad_norm": 0.9793402914905001, + "learning_rate": 2.1454710478839736e-07, + "loss": 0.174, + "step": 18405 + }, + { + "epoch": 0.94, + "grad_norm": 2.817119570072124, + "learning_rate": 2.1420790953997294e-07, + "loss": 0.1722, + "step": 18406 + }, + { + "epoch": 0.94, + "grad_norm": 1.782561768407933, + "learning_rate": 2.138689797317106e-07, + "loss": 0.1741, + "step": 18407 + }, + { + "epoch": 0.94, + "grad_norm": 1.4030966551078101, + "learning_rate": 2.1353031537280743e-07, + "loss": 0.1519, + "step": 18408 + }, + { + "epoch": 0.94, + "grad_norm": 1.4607777385501066, + "learning_rate": 2.1319191647244497e-07, + "loss": 0.1799, + "step": 18409 + }, + { + "epoch": 0.94, + "grad_norm": 0.9416992136910253, + "learning_rate": 2.1285378303980585e-07, + "loss": 0.1441, + "step": 18410 + }, + { + "epoch": 0.94, + "grad_norm": 0.8708030270127827, + "learning_rate": 2.1251591508405945e-07, + "loss": 0.1561, + "step": 18411 + }, + { + "epoch": 0.94, + "grad_norm": 1.04718883719092, + "learning_rate": 2.1217831261437283e-07, + "loss": 0.1471, + "step": 18412 + }, + { + "epoch": 0.94, + "grad_norm": 1.4080305099714423, + "learning_rate": 2.1184097563990204e-07, + "loss": 0.1692, + "step": 18413 + }, + { + "epoch": 0.94, + "grad_norm": 1.3780724995478995, + "learning_rate": 2.115039041697964e-07, + "loss": 0.1723, + "step": 18414 + }, + { + "epoch": 0.94, + "grad_norm": 1.0810492064587849, + "learning_rate": 2.111670982132008e-07, + "loss": 0.1642, + "step": 18415 + }, + { + "epoch": 0.94, + "grad_norm": 1.4676322049548796, + "learning_rate": 2.1083055777925244e-07, + "loss": 0.1595, + "step": 18416 + }, + { + "epoch": 0.94, + "grad_norm": 1.3846819242414135, + "learning_rate": 2.104942828770762e-07, + "loss": 0.172, + "step": 18417 + }, + { + "epoch": 0.94, + "grad_norm": 1.007773102930094, + "learning_rate": 2.1015827351579588e-07, + "loss": 0.1845, + "step": 18418 + }, + { + "epoch": 0.94, + "grad_norm": 0.9008750167154151, + "learning_rate": 2.0982252970452532e-07, + "loss": 0.1526, + "step": 18419 + }, + { + "epoch": 0.94, + "grad_norm": 1.0345250458075697, + "learning_rate": 2.0948705145237168e-07, + "loss": 0.1622, + "step": 18420 + }, + { + "epoch": 0.94, + "grad_norm": 0.982428132994956, + "learning_rate": 2.0915183876843436e-07, + "loss": 0.1604, + "step": 18421 + }, + { + "epoch": 0.94, + "grad_norm": 0.9070283705308484, + "learning_rate": 2.0881689166180718e-07, + "loss": 0.1657, + "step": 18422 + }, + { + "epoch": 0.94, + "grad_norm": 1.2758911966430224, + "learning_rate": 2.0848221014157398e-07, + "loss": 0.1613, + "step": 18423 + }, + { + "epoch": 0.94, + "grad_norm": 1.1603708286280212, + "learning_rate": 2.081477942168142e-07, + "loss": 0.1639, + "step": 18424 + }, + { + "epoch": 0.94, + "grad_norm": 0.9503295180273585, + "learning_rate": 2.078136438965983e-07, + "loss": 0.1695, + "step": 18425 + }, + { + "epoch": 0.94, + "grad_norm": 1.0634378735733225, + "learning_rate": 2.0747975918999018e-07, + "loss": 0.1596, + "step": 18426 + }, + { + "epoch": 0.94, + "grad_norm": 1.3307297893329564, + "learning_rate": 2.0714614010604815e-07, + "loss": 0.1695, + "step": 18427 + }, + { + "epoch": 0.94, + "grad_norm": 1.3323480771188736, + "learning_rate": 2.0681278665381833e-07, + "loss": 0.145, + "step": 18428 + }, + { + "epoch": 0.94, + "grad_norm": 1.1547516123571364, + "learning_rate": 2.0647969884234676e-07, + "loss": 0.162, + "step": 18429 + }, + { + "epoch": 0.94, + "grad_norm": 1.032996754590199, + "learning_rate": 2.0614687668066403e-07, + "loss": 0.161, + "step": 18430 + }, + { + "epoch": 0.94, + "grad_norm": 1.3829844629918333, + "learning_rate": 2.058143201778029e-07, + "loss": 0.1819, + "step": 18431 + }, + { + "epoch": 0.94, + "grad_norm": 1.6345825829161515, + "learning_rate": 2.054820293427795e-07, + "loss": 0.1622, + "step": 18432 + }, + { + "epoch": 0.94, + "grad_norm": 1.13058969646442, + "learning_rate": 2.0515000418460995e-07, + "loss": 0.1688, + "step": 18433 + }, + { + "epoch": 0.94, + "grad_norm": 1.1264091710524033, + "learning_rate": 2.0481824471229927e-07, + "loss": 0.126, + "step": 18434 + }, + { + "epoch": 0.94, + "grad_norm": 0.9524093713904969, + "learning_rate": 2.0448675093484805e-07, + "loss": 0.1369, + "step": 18435 + }, + { + "epoch": 0.94, + "grad_norm": 1.0332060689778997, + "learning_rate": 2.0415552286124685e-07, + "loss": 0.164, + "step": 18436 + }, + { + "epoch": 0.94, + "grad_norm": 1.154708584588988, + "learning_rate": 2.0382456050048073e-07, + "loss": 0.1745, + "step": 18437 + }, + { + "epoch": 0.94, + "grad_norm": 1.9288602652094318, + "learning_rate": 2.034938638615247e-07, + "loss": 0.1668, + "step": 18438 + }, + { + "epoch": 0.94, + "grad_norm": 1.0836570479763294, + "learning_rate": 2.0316343295335272e-07, + "loss": 0.1602, + "step": 18439 + }, + { + "epoch": 0.94, + "grad_norm": 1.0766862097296297, + "learning_rate": 2.028332677849254e-07, + "loss": 0.1405, + "step": 18440 + }, + { + "epoch": 0.94, + "grad_norm": 1.012468415541933, + "learning_rate": 2.025033683651989e-07, + "loss": 0.1652, + "step": 18441 + }, + { + "epoch": 0.94, + "grad_norm": 2.088899367723937, + "learning_rate": 2.0217373470312275e-07, + "loss": 0.163, + "step": 18442 + }, + { + "epoch": 0.94, + "grad_norm": 1.1267366751772347, + "learning_rate": 2.018443668076364e-07, + "loss": 0.158, + "step": 18443 + }, + { + "epoch": 0.94, + "grad_norm": 1.1178922069284865, + "learning_rate": 2.0151526468767502e-07, + "loss": 0.1898, + "step": 18444 + }, + { + "epoch": 0.94, + "grad_norm": 0.9563470216819234, + "learning_rate": 2.0118642835216584e-07, + "loss": 0.1625, + "step": 18445 + }, + { + "epoch": 0.94, + "grad_norm": 0.9093202957962132, + "learning_rate": 2.0085785781002843e-07, + "loss": 0.1569, + "step": 18446 + }, + { + "epoch": 0.94, + "grad_norm": 1.0411398327926424, + "learning_rate": 2.005295530701745e-07, + "loss": 0.1465, + "step": 18447 + }, + { + "epoch": 0.94, + "grad_norm": 1.2165271777702973, + "learning_rate": 2.0020151414151146e-07, + "loss": 0.1421, + "step": 18448 + }, + { + "epoch": 0.94, + "grad_norm": 1.2697362647353578, + "learning_rate": 1.9987374103293433e-07, + "loss": 0.1722, + "step": 18449 + }, + { + "epoch": 0.94, + "grad_norm": 0.9059765341830827, + "learning_rate": 1.9954623375333493e-07, + "loss": 0.1608, + "step": 18450 + }, + { + "epoch": 0.94, + "grad_norm": 0.9790695354920086, + "learning_rate": 1.9921899231159836e-07, + "loss": 0.1538, + "step": 18451 + }, + { + "epoch": 0.94, + "grad_norm": 0.7967358277210099, + "learning_rate": 1.9889201671660084e-07, + "loss": 0.1439, + "step": 18452 + }, + { + "epoch": 0.94, + "grad_norm": 1.1676863492777445, + "learning_rate": 1.9856530697720976e-07, + "loss": 0.1732, + "step": 18453 + }, + { + "epoch": 0.94, + "grad_norm": 0.9026498634322633, + "learning_rate": 1.9823886310228911e-07, + "loss": 0.1515, + "step": 18454 + }, + { + "epoch": 0.94, + "grad_norm": 1.5898085358785705, + "learning_rate": 1.9791268510069184e-07, + "loss": 0.1791, + "step": 18455 + }, + { + "epoch": 0.94, + "grad_norm": 0.8838055911204667, + "learning_rate": 1.975867729812686e-07, + "loss": 0.1318, + "step": 18456 + }, + { + "epoch": 0.94, + "grad_norm": 0.8962589559141396, + "learning_rate": 1.9726112675285568e-07, + "loss": 0.1487, + "step": 18457 + }, + { + "epoch": 0.94, + "grad_norm": 1.1455441239812183, + "learning_rate": 1.9693574642428935e-07, + "loss": 0.1594, + "step": 18458 + }, + { + "epoch": 0.94, + "grad_norm": 0.8697554124013273, + "learning_rate": 1.9661063200439478e-07, + "loss": 0.1747, + "step": 18459 + }, + { + "epoch": 0.94, + "grad_norm": 0.8473698779305922, + "learning_rate": 1.9628578350198933e-07, + "loss": 0.1416, + "step": 18460 + }, + { + "epoch": 0.94, + "grad_norm": 1.1622171600184905, + "learning_rate": 1.959612009258871e-07, + "loss": 0.1898, + "step": 18461 + }, + { + "epoch": 0.94, + "grad_norm": 1.9635651441512039, + "learning_rate": 1.9563688428489103e-07, + "loss": 0.1467, + "step": 18462 + }, + { + "epoch": 0.94, + "grad_norm": 0.9709659456803279, + "learning_rate": 1.953128335877974e-07, + "loss": 0.1551, + "step": 18463 + }, + { + "epoch": 0.94, + "grad_norm": 1.1570414393860904, + "learning_rate": 1.9498904884339697e-07, + "loss": 0.1885, + "step": 18464 + }, + { + "epoch": 0.94, + "grad_norm": 1.0220800299049926, + "learning_rate": 1.9466553006047383e-07, + "loss": 0.1556, + "step": 18465 + }, + { + "epoch": 0.94, + "grad_norm": 1.17271748461141, + "learning_rate": 1.9434227724779987e-07, + "loss": 0.1475, + "step": 18466 + }, + { + "epoch": 0.94, + "grad_norm": 0.9698973968273322, + "learning_rate": 1.9401929041414692e-07, + "loss": 0.2208, + "step": 18467 + }, + { + "epoch": 0.94, + "grad_norm": 0.9539471986986922, + "learning_rate": 1.9369656956827355e-07, + "loss": 0.1628, + "step": 18468 + }, + { + "epoch": 0.94, + "grad_norm": 1.0196693761635764, + "learning_rate": 1.9337411471893498e-07, + "loss": 0.1703, + "step": 18469 + }, + { + "epoch": 0.94, + "grad_norm": 1.1934751615375507, + "learning_rate": 1.9305192587487753e-07, + "loss": 0.1462, + "step": 18470 + }, + { + "epoch": 0.94, + "grad_norm": 1.199618048089345, + "learning_rate": 1.927300030448409e-07, + "loss": 0.1973, + "step": 18471 + }, + { + "epoch": 0.94, + "grad_norm": 1.1152576551484823, + "learning_rate": 1.924083462375559e-07, + "loss": 0.1673, + "step": 18472 + }, + { + "epoch": 0.94, + "grad_norm": 1.0406551584167096, + "learning_rate": 1.9208695546174994e-07, + "loss": 0.1679, + "step": 18473 + }, + { + "epoch": 0.94, + "grad_norm": 1.1662476783946836, + "learning_rate": 1.917658307261383e-07, + "loss": 0.1899, + "step": 18474 + }, + { + "epoch": 0.94, + "grad_norm": 1.0191715895198943, + "learning_rate": 1.914449720394329e-07, + "loss": 0.1636, + "step": 18475 + }, + { + "epoch": 0.94, + "grad_norm": 1.1530279147333242, + "learning_rate": 1.9112437941033567e-07, + "loss": 0.1574, + "step": 18476 + }, + { + "epoch": 0.94, + "grad_norm": 1.1700995784032147, + "learning_rate": 1.908040528475441e-07, + "loss": 0.1662, + "step": 18477 + }, + { + "epoch": 0.94, + "grad_norm": 1.1758161723996137, + "learning_rate": 1.904839923597468e-07, + "loss": 0.1666, + "step": 18478 + }, + { + "epoch": 0.94, + "grad_norm": 1.2056737356424008, + "learning_rate": 1.9016419795562568e-07, + "loss": 0.177, + "step": 18479 + }, + { + "epoch": 0.94, + "grad_norm": 0.8871261305872965, + "learning_rate": 1.8984466964385384e-07, + "loss": 0.1552, + "step": 18480 + }, + { + "epoch": 0.94, + "grad_norm": 1.878197883112823, + "learning_rate": 1.8952540743309988e-07, + "loss": 0.1556, + "step": 18481 + }, + { + "epoch": 0.94, + "grad_norm": 1.0543218480340648, + "learning_rate": 1.8920641133202356e-07, + "loss": 0.1494, + "step": 18482 + }, + { + "epoch": 0.94, + "grad_norm": 0.941396341109944, + "learning_rate": 1.888876813492768e-07, + "loss": 0.1685, + "step": 18483 + }, + { + "epoch": 0.94, + "grad_norm": 0.9301788089742421, + "learning_rate": 1.8856921749350608e-07, + "loss": 0.1506, + "step": 18484 + }, + { + "epoch": 0.94, + "grad_norm": 1.1855751996628046, + "learning_rate": 1.8825101977334891e-07, + "loss": 0.1714, + "step": 18485 + }, + { + "epoch": 0.94, + "grad_norm": 1.7965655623295367, + "learning_rate": 1.8793308819743837e-07, + "loss": 0.1793, + "step": 18486 + }, + { + "epoch": 0.94, + "grad_norm": 1.0720094037637893, + "learning_rate": 1.8761542277439648e-07, + "loss": 0.1738, + "step": 18487 + }, + { + "epoch": 0.94, + "grad_norm": 1.4204108111036178, + "learning_rate": 1.8729802351284077e-07, + "loss": 0.1427, + "step": 18488 + }, + { + "epoch": 0.94, + "grad_norm": 1.0684230576153166, + "learning_rate": 1.869808904213799e-07, + "loss": 0.1785, + "step": 18489 + }, + { + "epoch": 0.94, + "grad_norm": 0.8876069325807378, + "learning_rate": 1.8666402350861701e-07, + "loss": 0.1544, + "step": 18490 + }, + { + "epoch": 0.94, + "grad_norm": 1.6348599888108668, + "learning_rate": 1.8634742278314632e-07, + "loss": 0.1696, + "step": 18491 + }, + { + "epoch": 0.94, + "grad_norm": 1.1283463049697549, + "learning_rate": 1.8603108825355654e-07, + "loss": 0.1829, + "step": 18492 + }, + { + "epoch": 0.94, + "grad_norm": 1.0989175147355235, + "learning_rate": 1.8571501992842634e-07, + "loss": 0.1541, + "step": 18493 + }, + { + "epoch": 0.94, + "grad_norm": 1.5485827407936898, + "learning_rate": 1.8539921781633107e-07, + "loss": 0.1574, + "step": 18494 + }, + { + "epoch": 0.94, + "grad_norm": 1.2035629459141746, + "learning_rate": 1.8508368192583838e-07, + "loss": 0.1599, + "step": 18495 + }, + { + "epoch": 0.94, + "grad_norm": 1.175110774320181, + "learning_rate": 1.8476841226550247e-07, + "loss": 0.166, + "step": 18496 + }, + { + "epoch": 0.94, + "grad_norm": 0.8648008632983784, + "learning_rate": 1.8445340884387986e-07, + "loss": 0.1555, + "step": 18497 + }, + { + "epoch": 0.94, + "grad_norm": 1.186022566237658, + "learning_rate": 1.841386716695115e-07, + "loss": 0.1621, + "step": 18498 + }, + { + "epoch": 0.94, + "grad_norm": 0.966907537171716, + "learning_rate": 1.8382420075093722e-07, + "loss": 0.1663, + "step": 18499 + }, + { + "epoch": 0.94, + "grad_norm": 1.35987273141365, + "learning_rate": 1.8350999609668462e-07, + "loss": 0.1931, + "step": 18500 + }, + { + "epoch": 0.94, + "grad_norm": 1.7185887664553376, + "learning_rate": 1.8319605771527916e-07, + "loss": 0.1508, + "step": 18501 + }, + { + "epoch": 0.94, + "grad_norm": 0.9654463459049943, + "learning_rate": 1.8288238561523397e-07, + "loss": 0.1851, + "step": 18502 + }, + { + "epoch": 0.94, + "grad_norm": 1.5191769294067194, + "learning_rate": 1.8256897980505895e-07, + "loss": 0.1786, + "step": 18503 + }, + { + "epoch": 0.94, + "grad_norm": 1.0547956256766307, + "learning_rate": 1.8225584029325394e-07, + "loss": 0.1451, + "step": 18504 + }, + { + "epoch": 0.94, + "grad_norm": 0.9602419545290328, + "learning_rate": 1.8194296708831548e-07, + "loss": 0.1654, + "step": 18505 + }, + { + "epoch": 0.94, + "grad_norm": 0.9905278074256648, + "learning_rate": 1.8163036019872682e-07, + "loss": 0.1699, + "step": 18506 + }, + { + "epoch": 0.94, + "grad_norm": 0.9878465962130885, + "learning_rate": 1.8131801963297112e-07, + "loss": 0.1574, + "step": 18507 + }, + { + "epoch": 0.94, + "grad_norm": 2.202120778350107, + "learning_rate": 1.810059453995172e-07, + "loss": 0.1595, + "step": 18508 + }, + { + "epoch": 0.94, + "grad_norm": 1.5424916385881604, + "learning_rate": 1.8069413750683274e-07, + "loss": 0.1531, + "step": 18509 + }, + { + "epoch": 0.94, + "grad_norm": 1.2032151268116966, + "learning_rate": 1.8038259596337316e-07, + "loss": 0.1527, + "step": 18510 + }, + { + "epoch": 0.94, + "grad_norm": 0.9692901581171245, + "learning_rate": 1.8007132077759059e-07, + "loss": 0.1611, + "step": 18511 + }, + { + "epoch": 0.94, + "grad_norm": 1.2384022796992629, + "learning_rate": 1.7976031195792942e-07, + "loss": 0.1616, + "step": 18512 + }, + { + "epoch": 0.94, + "grad_norm": 0.9518845029745795, + "learning_rate": 1.794495695128229e-07, + "loss": 0.1513, + "step": 18513 + }, + { + "epoch": 0.94, + "grad_norm": 1.3445048561231276, + "learning_rate": 1.7913909345070202e-07, + "loss": 0.1584, + "step": 18514 + }, + { + "epoch": 0.94, + "grad_norm": 0.8910941735915785, + "learning_rate": 1.7882888377998787e-07, + "loss": 0.1346, + "step": 18515 + }, + { + "epoch": 0.94, + "grad_norm": 1.058905833198794, + "learning_rate": 1.7851894050909479e-07, + "loss": 0.1772, + "step": 18516 + }, + { + "epoch": 0.94, + "grad_norm": 0.889457926665445, + "learning_rate": 1.7820926364643054e-07, + "loss": 0.1433, + "step": 18517 + }, + { + "epoch": 0.94, + "grad_norm": 1.1393201867917613, + "learning_rate": 1.7789985320039505e-07, + "loss": 0.1922, + "step": 18518 + }, + { + "epoch": 0.94, + "grad_norm": 1.1461805626320478, + "learning_rate": 1.7759070917937937e-07, + "loss": 0.1691, + "step": 18519 + }, + { + "epoch": 0.94, + "grad_norm": 1.9552372498782165, + "learning_rate": 1.7728183159177126e-07, + "loss": 0.1574, + "step": 18520 + }, + { + "epoch": 0.94, + "grad_norm": 1.317634139784253, + "learning_rate": 1.7697322044594846e-07, + "loss": 0.1621, + "step": 18521 + }, + { + "epoch": 0.94, + "grad_norm": 1.5026527540251304, + "learning_rate": 1.76664875750282e-07, + "loss": 0.1535, + "step": 18522 + }, + { + "epoch": 0.94, + "grad_norm": 0.9529695966362643, + "learning_rate": 1.7635679751313529e-07, + "loss": 0.1604, + "step": 18523 + }, + { + "epoch": 0.94, + "grad_norm": 1.362781251116253, + "learning_rate": 1.7604898574286488e-07, + "loss": 0.1676, + "step": 18524 + }, + { + "epoch": 0.94, + "grad_norm": 1.794298494281626, + "learning_rate": 1.7574144044782083e-07, + "loss": 0.1632, + "step": 18525 + }, + { + "epoch": 0.94, + "grad_norm": 1.3543147984644823, + "learning_rate": 1.754341616363464e-07, + "loss": 0.1539, + "step": 18526 + }, + { + "epoch": 0.94, + "grad_norm": 1.8098682812727673, + "learning_rate": 1.7512714931677387e-07, + "loss": 0.1744, + "step": 18527 + }, + { + "epoch": 0.94, + "grad_norm": 1.0965203264417904, + "learning_rate": 1.7482040349743323e-07, + "loss": 0.1719, + "step": 18528 + }, + { + "epoch": 0.94, + "grad_norm": 1.7137149759900159, + "learning_rate": 1.7451392418664227e-07, + "loss": 0.1512, + "step": 18529 + }, + { + "epoch": 0.94, + "grad_norm": 0.8312516693775386, + "learning_rate": 1.7420771139271765e-07, + "loss": 0.1578, + "step": 18530 + }, + { + "epoch": 0.94, + "grad_norm": 1.941353238215755, + "learning_rate": 1.7390176512396384e-07, + "loss": 0.1575, + "step": 18531 + }, + { + "epoch": 0.94, + "grad_norm": 0.9570430798627169, + "learning_rate": 1.7359608538867868e-07, + "loss": 0.164, + "step": 18532 + }, + { + "epoch": 0.94, + "grad_norm": 1.2019136658490237, + "learning_rate": 1.732906721951555e-07, + "loss": 0.1681, + "step": 18533 + }, + { + "epoch": 0.94, + "grad_norm": 2.6202511232946657, + "learning_rate": 1.729855255516777e-07, + "loss": 0.1778, + "step": 18534 + }, + { + "epoch": 0.94, + "grad_norm": 1.134760261935929, + "learning_rate": 1.7268064546652308e-07, + "loss": 0.1998, + "step": 18535 + }, + { + "epoch": 0.94, + "grad_norm": 0.8102705538864348, + "learning_rate": 1.7237603194795948e-07, + "loss": 0.1581, + "step": 18536 + }, + { + "epoch": 0.94, + "grad_norm": 0.8801215439384064, + "learning_rate": 1.7207168500425142e-07, + "loss": 0.1523, + "step": 18537 + }, + { + "epoch": 0.94, + "grad_norm": 0.8586275430989962, + "learning_rate": 1.7176760464365449e-07, + "loss": 0.1519, + "step": 18538 + }, + { + "epoch": 0.94, + "grad_norm": 1.9018279699676834, + "learning_rate": 1.7146379087441655e-07, + "loss": 0.1914, + "step": 18539 + }, + { + "epoch": 0.94, + "grad_norm": 1.5663057154553512, + "learning_rate": 1.711602437047788e-07, + "loss": 0.1854, + "step": 18540 + }, + { + "epoch": 0.94, + "grad_norm": 1.0250812414102985, + "learning_rate": 1.708569631429746e-07, + "loss": 0.1579, + "step": 18541 + }, + { + "epoch": 0.94, + "grad_norm": 1.4051658931794355, + "learning_rate": 1.7055394919722856e-07, + "loss": 0.1577, + "step": 18542 + }, + { + "epoch": 0.94, + "grad_norm": 0.7920877521485331, + "learning_rate": 1.7025120187576406e-07, + "loss": 0.1664, + "step": 18543 + }, + { + "epoch": 0.94, + "grad_norm": 1.0138473039450129, + "learning_rate": 1.6994872118679006e-07, + "loss": 0.1551, + "step": 18544 + }, + { + "epoch": 0.94, + "grad_norm": 0.8477002854411233, + "learning_rate": 1.6964650713851228e-07, + "loss": 0.1292, + "step": 18545 + }, + { + "epoch": 0.94, + "grad_norm": 1.2484301198202659, + "learning_rate": 1.6934455973912744e-07, + "loss": 0.1529, + "step": 18546 + }, + { + "epoch": 0.94, + "grad_norm": 6.576777670975822, + "learning_rate": 1.690428789968268e-07, + "loss": 0.1777, + "step": 18547 + }, + { + "epoch": 0.94, + "grad_norm": 0.91852055430131, + "learning_rate": 1.6874146491979493e-07, + "loss": 0.1524, + "step": 18548 + }, + { + "epoch": 0.94, + "grad_norm": 0.9500129432796155, + "learning_rate": 1.6844031751620414e-07, + "loss": 0.1788, + "step": 18549 + }, + { + "epoch": 0.94, + "grad_norm": 1.7767599888761454, + "learning_rate": 1.6813943679422684e-07, + "loss": 0.1486, + "step": 18550 + }, + { + "epoch": 0.94, + "grad_norm": 0.9680141309651069, + "learning_rate": 1.678388227620209e-07, + "loss": 0.1601, + "step": 18551 + }, + { + "epoch": 0.94, + "grad_norm": 1.0630914404291878, + "learning_rate": 1.6753847542774315e-07, + "loss": 0.1498, + "step": 18552 + }, + { + "epoch": 0.94, + "grad_norm": 1.5211018887090226, + "learning_rate": 1.6723839479953929e-07, + "loss": 0.1689, + "step": 18553 + }, + { + "epoch": 0.94, + "grad_norm": 2.468104798877948, + "learning_rate": 1.669385808855495e-07, + "loss": 0.1652, + "step": 18554 + }, + { + "epoch": 0.94, + "grad_norm": 1.001087755929558, + "learning_rate": 1.666390336939061e-07, + "loss": 0.1736, + "step": 18555 + }, + { + "epoch": 0.94, + "grad_norm": 0.8453852231440885, + "learning_rate": 1.6633975323273376e-07, + "loss": 0.1438, + "step": 18556 + }, + { + "epoch": 0.94, + "grad_norm": 1.0352472415273843, + "learning_rate": 1.6604073951015154e-07, + "loss": 0.1709, + "step": 18557 + }, + { + "epoch": 0.94, + "grad_norm": 0.9459021394612915, + "learning_rate": 1.6574199253426958e-07, + "loss": 0.1691, + "step": 18558 + }, + { + "epoch": 0.94, + "grad_norm": 1.8641474095406372, + "learning_rate": 1.6544351231319145e-07, + "loss": 0.1436, + "step": 18559 + }, + { + "epoch": 0.94, + "grad_norm": 1.1152650363612875, + "learning_rate": 1.6514529885501397e-07, + "loss": 0.1423, + "step": 18560 + }, + { + "epoch": 0.94, + "grad_norm": 1.2606785611258506, + "learning_rate": 1.648473521678251e-07, + "loss": 0.1796, + "step": 18561 + }, + { + "epoch": 0.94, + "grad_norm": 1.298872067227049, + "learning_rate": 1.645496722597084e-07, + "loss": 0.1715, + "step": 18562 + }, + { + "epoch": 0.94, + "grad_norm": 0.8674647051865187, + "learning_rate": 1.642522591387352e-07, + "loss": 0.1452, + "step": 18563 + }, + { + "epoch": 0.94, + "grad_norm": 1.0411258487676804, + "learning_rate": 1.6395511281297682e-07, + "loss": 0.1454, + "step": 18564 + }, + { + "epoch": 0.94, + "grad_norm": 1.2294606917015065, + "learning_rate": 1.6365823329049124e-07, + "loss": 0.1551, + "step": 18565 + }, + { + "epoch": 0.94, + "grad_norm": 1.2626410672708395, + "learning_rate": 1.633616205793309e-07, + "loss": 0.1647, + "step": 18566 + }, + { + "epoch": 0.94, + "grad_norm": 0.8820935476550902, + "learning_rate": 1.6306527468754384e-07, + "loss": 0.1538, + "step": 18567 + }, + { + "epoch": 0.94, + "grad_norm": 1.076423859688967, + "learning_rate": 1.6276919562316475e-07, + "loss": 0.1607, + "step": 18568 + }, + { + "epoch": 0.94, + "grad_norm": 1.058599832374519, + "learning_rate": 1.6247338339422823e-07, + "loss": 0.1652, + "step": 18569 + }, + { + "epoch": 0.94, + "grad_norm": 0.9862983112602691, + "learning_rate": 1.6217783800875576e-07, + "loss": 0.1525, + "step": 18570 + }, + { + "epoch": 0.94, + "grad_norm": 1.1039987539964633, + "learning_rate": 1.618825594747664e-07, + "loss": 0.1691, + "step": 18571 + }, + { + "epoch": 0.94, + "grad_norm": 1.1355796680292811, + "learning_rate": 1.615875478002671e-07, + "loss": 0.1444, + "step": 18572 + }, + { + "epoch": 0.94, + "grad_norm": 1.1295020685609354, + "learning_rate": 1.6129280299326144e-07, + "loss": 0.1779, + "step": 18573 + }, + { + "epoch": 0.94, + "grad_norm": 1.1228313989287582, + "learning_rate": 1.6099832506174419e-07, + "loss": 0.1544, + "step": 18574 + }, + { + "epoch": 0.94, + "grad_norm": 1.3138722407016137, + "learning_rate": 1.6070411401370335e-07, + "loss": 0.1783, + "step": 18575 + }, + { + "epoch": 0.94, + "grad_norm": 1.1715432450412604, + "learning_rate": 1.6041016985711923e-07, + "loss": 0.161, + "step": 18576 + }, + { + "epoch": 0.94, + "grad_norm": 1.3627767559615154, + "learning_rate": 1.6011649259996541e-07, + "loss": 0.172, + "step": 18577 + }, + { + "epoch": 0.94, + "grad_norm": 1.1528273504272677, + "learning_rate": 1.598230822502067e-07, + "loss": 0.1569, + "step": 18578 + }, + { + "epoch": 0.94, + "grad_norm": 1.5197766356269795, + "learning_rate": 1.5952993881580336e-07, + "loss": 0.1562, + "step": 18579 + }, + { + "epoch": 0.94, + "grad_norm": 1.6041305313914618, + "learning_rate": 1.592370623047046e-07, + "loss": 0.1507, + "step": 18580 + }, + { + "epoch": 0.94, + "grad_norm": 0.8985954018484826, + "learning_rate": 1.5894445272485736e-07, + "loss": 0.1629, + "step": 18581 + }, + { + "epoch": 0.94, + "grad_norm": 1.0707360930908472, + "learning_rate": 1.586521100841987e-07, + "loss": 0.1543, + "step": 18582 + }, + { + "epoch": 0.94, + "grad_norm": 1.3859762485376421, + "learning_rate": 1.583600343906566e-07, + "loss": 0.1772, + "step": 18583 + }, + { + "epoch": 0.95, + "grad_norm": 0.9957403277629844, + "learning_rate": 1.5806822565215373e-07, + "loss": 0.1627, + "step": 18584 + }, + { + "epoch": 0.95, + "grad_norm": 1.4046400140650128, + "learning_rate": 1.5777668387660706e-07, + "loss": 0.1851, + "step": 18585 + }, + { + "epoch": 0.95, + "grad_norm": 0.891535151011113, + "learning_rate": 1.5748540907192356e-07, + "loss": 0.2006, + "step": 18586 + }, + { + "epoch": 0.95, + "grad_norm": 1.5090297703983357, + "learning_rate": 1.571944012460036e-07, + "loss": 0.1712, + "step": 18587 + }, + { + "epoch": 0.95, + "grad_norm": 0.9003120205448941, + "learning_rate": 1.569036604067431e-07, + "loss": 0.1649, + "step": 18588 + }, + { + "epoch": 0.95, + "grad_norm": 1.1480753994260278, + "learning_rate": 1.566131865620246e-07, + "loss": 0.158, + "step": 18589 + }, + { + "epoch": 0.95, + "grad_norm": 0.9565359080976353, + "learning_rate": 1.5632297971972966e-07, + "loss": 0.1721, + "step": 18590 + }, + { + "epoch": 0.95, + "grad_norm": 1.0272957499468065, + "learning_rate": 1.5603303988773078e-07, + "loss": 0.1378, + "step": 18591 + }, + { + "epoch": 0.95, + "grad_norm": 0.9377209184491478, + "learning_rate": 1.5574336707389171e-07, + "loss": 0.1706, + "step": 18592 + }, + { + "epoch": 0.95, + "grad_norm": 0.7620016285666292, + "learning_rate": 1.554539612860695e-07, + "loss": 0.14, + "step": 18593 + }, + { + "epoch": 0.95, + "grad_norm": 0.9909527907118318, + "learning_rate": 1.551648225321145e-07, + "loss": 0.1582, + "step": 18594 + }, + { + "epoch": 0.95, + "grad_norm": 0.9335941141651837, + "learning_rate": 1.548759508198694e-07, + "loss": 0.1469, + "step": 18595 + }, + { + "epoch": 0.95, + "grad_norm": 0.7752699845301906, + "learning_rate": 1.545873461571712e-07, + "loss": 0.1461, + "step": 18596 + }, + { + "epoch": 0.95, + "grad_norm": 1.9913274331233224, + "learning_rate": 1.54299008551847e-07, + "loss": 0.1474, + "step": 18597 + }, + { + "epoch": 0.95, + "grad_norm": 1.2023120555067652, + "learning_rate": 1.5401093801171828e-07, + "loss": 0.1873, + "step": 18598 + }, + { + "epoch": 0.95, + "grad_norm": 1.071690745494057, + "learning_rate": 1.5372313454459887e-07, + "loss": 0.1433, + "step": 18599 + }, + { + "epoch": 0.95, + "grad_norm": 9.053479587695811, + "learning_rate": 1.5343559815829468e-07, + "loss": 0.1503, + "step": 18600 + }, + { + "epoch": 0.95, + "grad_norm": 0.9531335525572882, + "learning_rate": 1.5314832886060727e-07, + "loss": 0.1471, + "step": 18601 + }, + { + "epoch": 0.95, + "grad_norm": 1.036438135675386, + "learning_rate": 1.5286132665932706e-07, + "loss": 0.1672, + "step": 18602 + }, + { + "epoch": 0.95, + "grad_norm": 1.9933283290691903, + "learning_rate": 1.525745915622401e-07, + "loss": 0.1593, + "step": 18603 + }, + { + "epoch": 0.95, + "grad_norm": 1.1621345115927901, + "learning_rate": 1.5228812357712231e-07, + "loss": 0.1512, + "step": 18604 + }, + { + "epoch": 0.95, + "grad_norm": 0.860499166036676, + "learning_rate": 1.520019227117464e-07, + "loss": 0.167, + "step": 18605 + }, + { + "epoch": 0.95, + "grad_norm": 1.1558556579174626, + "learning_rate": 1.5171598897387395e-07, + "loss": 0.154, + "step": 18606 + }, + { + "epoch": 0.95, + "grad_norm": 0.9880628780813716, + "learning_rate": 1.514303223712621e-07, + "loss": 0.1725, + "step": 18607 + }, + { + "epoch": 0.95, + "grad_norm": 1.1171775767513268, + "learning_rate": 1.5114492291165794e-07, + "loss": 0.1747, + "step": 18608 + }, + { + "epoch": 0.95, + "grad_norm": 1.1338849876747592, + "learning_rate": 1.508597906028053e-07, + "loss": 0.1733, + "step": 18609 + }, + { + "epoch": 0.95, + "grad_norm": 1.1072484917924614, + "learning_rate": 1.505749254524358e-07, + "loss": 0.1475, + "step": 18610 + }, + { + "epoch": 0.95, + "grad_norm": 0.9919583007519222, + "learning_rate": 1.5029032746827875e-07, + "loss": 0.1575, + "step": 18611 + }, + { + "epoch": 0.95, + "grad_norm": 1.8184994189922, + "learning_rate": 1.500059966580525e-07, + "loss": 0.1411, + "step": 18612 + }, + { + "epoch": 0.95, + "grad_norm": 1.398520475493659, + "learning_rate": 1.497219330294708e-07, + "loss": 0.1708, + "step": 18613 + }, + { + "epoch": 0.95, + "grad_norm": 1.0964778320968296, + "learning_rate": 1.4943813659023753e-07, + "loss": 0.1693, + "step": 18614 + }, + { + "epoch": 0.95, + "grad_norm": 1.6528757245841295, + "learning_rate": 1.49154607348051e-07, + "loss": 0.1722, + "step": 18615 + }, + { + "epoch": 0.95, + "grad_norm": 0.9692735415387215, + "learning_rate": 1.4887134531060165e-07, + "loss": 0.1523, + "step": 18616 + }, + { + "epoch": 0.95, + "grad_norm": 1.0528475743073271, + "learning_rate": 1.485883504855734e-07, + "loss": 0.1507, + "step": 18617 + }, + { + "epoch": 0.95, + "grad_norm": 1.9032628322727292, + "learning_rate": 1.4830562288064344e-07, + "loss": 0.1719, + "step": 18618 + }, + { + "epoch": 0.95, + "grad_norm": 1.0427542611832965, + "learning_rate": 1.4802316250347893e-07, + "loss": 0.1806, + "step": 18619 + }, + { + "epoch": 0.95, + "grad_norm": 0.9516894573352137, + "learning_rate": 1.4774096936174376e-07, + "loss": 0.1616, + "step": 18620 + }, + { + "epoch": 0.95, + "grad_norm": 1.189078582059111, + "learning_rate": 1.474590434630907e-07, + "loss": 0.1558, + "step": 18621 + }, + { + "epoch": 0.95, + "grad_norm": 1.0663919954221222, + "learning_rate": 1.4717738481516808e-07, + "loss": 0.1742, + "step": 18622 + }, + { + "epoch": 0.95, + "grad_norm": 1.0112513802414314, + "learning_rate": 1.4689599342561423e-07, + "loss": 0.1508, + "step": 18623 + }, + { + "epoch": 0.95, + "grad_norm": 0.9221034568216941, + "learning_rate": 1.4661486930206415e-07, + "loss": 0.1558, + "step": 18624 + }, + { + "epoch": 0.95, + "grad_norm": 2.141316296046559, + "learning_rate": 1.4633401245214064e-07, + "loss": 0.156, + "step": 18625 + }, + { + "epoch": 0.95, + "grad_norm": 1.0125885082259565, + "learning_rate": 1.4605342288346536e-07, + "loss": 0.1609, + "step": 18626 + }, + { + "epoch": 0.95, + "grad_norm": 2.2259660659391147, + "learning_rate": 1.4577310060364558e-07, + "loss": 0.1641, + "step": 18627 + }, + { + "epoch": 0.95, + "grad_norm": 1.0070231515282992, + "learning_rate": 1.4549304562028966e-07, + "loss": 0.1706, + "step": 18628 + }, + { + "epoch": 0.95, + "grad_norm": 0.8620628643100052, + "learning_rate": 1.4521325794098928e-07, + "loss": 0.1614, + "step": 18629 + }, + { + "epoch": 0.95, + "grad_norm": 0.9305244147767624, + "learning_rate": 1.449337375733373e-07, + "loss": 0.1551, + "step": 18630 + }, + { + "epoch": 0.95, + "grad_norm": 2.5742305713151112, + "learning_rate": 1.4465448452491315e-07, + "loss": 0.1686, + "step": 18631 + }, + { + "epoch": 0.95, + "grad_norm": 1.2104559628853415, + "learning_rate": 1.4437549880329415e-07, + "loss": 0.16, + "step": 18632 + }, + { + "epoch": 0.95, + "grad_norm": 1.239220991219772, + "learning_rate": 1.4409678041604426e-07, + "loss": 0.1666, + "step": 18633 + }, + { + "epoch": 0.95, + "grad_norm": 0.786747929668593, + "learning_rate": 1.4381832937072737e-07, + "loss": 0.1394, + "step": 18634 + }, + { + "epoch": 0.95, + "grad_norm": 1.2145595739356847, + "learning_rate": 1.4354014567489528e-07, + "loss": 0.1649, + "step": 18635 + }, + { + "epoch": 0.95, + "grad_norm": 1.9754466948839298, + "learning_rate": 1.432622293360919e-07, + "loss": 0.1846, + "step": 18636 + }, + { + "epoch": 0.95, + "grad_norm": 1.0548122289678061, + "learning_rate": 1.4298458036185903e-07, + "loss": 0.1495, + "step": 18637 + }, + { + "epoch": 0.95, + "grad_norm": 1.403480202485536, + "learning_rate": 1.4270719875972506e-07, + "loss": 0.1592, + "step": 18638 + }, + { + "epoch": 0.95, + "grad_norm": 4.063392804506902, + "learning_rate": 1.424300845372162e-07, + "loss": 0.1699, + "step": 18639 + }, + { + "epoch": 0.95, + "grad_norm": 1.2431406279033599, + "learning_rate": 1.4215323770184642e-07, + "loss": 0.1733, + "step": 18640 + }, + { + "epoch": 0.95, + "grad_norm": 0.9805121919052685, + "learning_rate": 1.418766582611286e-07, + "loss": 0.1477, + "step": 18641 + }, + { + "epoch": 0.95, + "grad_norm": 1.1772098397613278, + "learning_rate": 1.4160034622256125e-07, + "loss": 0.1678, + "step": 18642 + }, + { + "epoch": 0.95, + "grad_norm": 1.2741303245522206, + "learning_rate": 1.4132430159364273e-07, + "loss": 0.154, + "step": 18643 + }, + { + "epoch": 0.95, + "grad_norm": 1.0401178606701769, + "learning_rate": 1.4104852438185823e-07, + "loss": 0.1571, + "step": 18644 + }, + { + "epoch": 0.95, + "grad_norm": 1.3489124060031128, + "learning_rate": 1.4077301459469062e-07, + "loss": 0.1594, + "step": 18645 + }, + { + "epoch": 0.95, + "grad_norm": 1.1101616198916429, + "learning_rate": 1.404977722396106e-07, + "loss": 0.1495, + "step": 18646 + }, + { + "epoch": 0.95, + "grad_norm": 1.266653077152881, + "learning_rate": 1.4022279732408661e-07, + "loss": 0.1544, + "step": 18647 + }, + { + "epoch": 0.95, + "grad_norm": 1.3044435715874434, + "learning_rate": 1.3994808985557497e-07, + "loss": 0.1595, + "step": 18648 + }, + { + "epoch": 0.95, + "grad_norm": 0.865404252667061, + "learning_rate": 1.3967364984152965e-07, + "loss": 0.1569, + "step": 18649 + }, + { + "epoch": 0.95, + "grad_norm": 1.1375610633591677, + "learning_rate": 1.393994772893925e-07, + "loss": 0.1546, + "step": 18650 + }, + { + "epoch": 0.95, + "grad_norm": 1.3318447589511289, + "learning_rate": 1.3912557220660206e-07, + "loss": 0.1477, + "step": 18651 + }, + { + "epoch": 0.95, + "grad_norm": 1.155841518237319, + "learning_rate": 1.3885193460058676e-07, + "loss": 0.1355, + "step": 18652 + }, + { + "epoch": 0.95, + "grad_norm": 1.148309023639851, + "learning_rate": 1.3857856447876962e-07, + "loss": 0.1764, + "step": 18653 + }, + { + "epoch": 0.95, + "grad_norm": 1.087533529247148, + "learning_rate": 1.3830546184856687e-07, + "loss": 0.1581, + "step": 18654 + }, + { + "epoch": 0.95, + "grad_norm": 1.167221914578398, + "learning_rate": 1.380326267173848e-07, + "loss": 0.1784, + "step": 18655 + }, + { + "epoch": 0.95, + "grad_norm": 0.8385862560019581, + "learning_rate": 1.3776005909262423e-07, + "loss": 0.1415, + "step": 18656 + }, + { + "epoch": 0.95, + "grad_norm": 1.1988900522919759, + "learning_rate": 1.374877589816792e-07, + "loss": 0.1595, + "step": 18657 + }, + { + "epoch": 0.95, + "grad_norm": 2.5518572845652296, + "learning_rate": 1.3721572639193714e-07, + "loss": 0.1569, + "step": 18658 + }, + { + "epoch": 0.95, + "grad_norm": 1.3188198072119175, + "learning_rate": 1.3694396133077436e-07, + "loss": 0.1524, + "step": 18659 + }, + { + "epoch": 0.95, + "grad_norm": 1.3273904922071935, + "learning_rate": 1.3667246380556386e-07, + "loss": 0.1603, + "step": 18660 + }, + { + "epoch": 0.95, + "grad_norm": 0.9632445855587911, + "learning_rate": 1.3640123382366977e-07, + "loss": 0.1464, + "step": 18661 + }, + { + "epoch": 0.95, + "grad_norm": 1.1982344972555086, + "learning_rate": 1.361302713924495e-07, + "loss": 0.1712, + "step": 18662 + }, + { + "epoch": 0.95, + "grad_norm": 2.628306264095215, + "learning_rate": 1.3585957651925274e-07, + "loss": 0.1512, + "step": 18663 + }, + { + "epoch": 0.95, + "grad_norm": 2.7487833080827206, + "learning_rate": 1.355891492114214e-07, + "loss": 0.1611, + "step": 18664 + }, + { + "epoch": 0.95, + "grad_norm": 0.9688560463974245, + "learning_rate": 1.3531898947629296e-07, + "loss": 0.161, + "step": 18665 + }, + { + "epoch": 0.95, + "grad_norm": 1.263410858922274, + "learning_rate": 1.3504909732119266e-07, + "loss": 0.1701, + "step": 18666 + }, + { + "epoch": 0.95, + "grad_norm": 1.545720766035433, + "learning_rate": 1.347794727534435e-07, + "loss": 0.1691, + "step": 18667 + }, + { + "epoch": 0.95, + "grad_norm": 1.280358835845226, + "learning_rate": 1.3451011578035856e-07, + "loss": 0.1576, + "step": 18668 + }, + { + "epoch": 0.95, + "grad_norm": 1.018370216604535, + "learning_rate": 1.3424102640924307e-07, + "loss": 0.1566, + "step": 18669 + }, + { + "epoch": 0.95, + "grad_norm": 3.649240179706749, + "learning_rate": 1.339722046473979e-07, + "loss": 0.1624, + "step": 18670 + }, + { + "epoch": 0.95, + "grad_norm": 1.585373607840606, + "learning_rate": 1.3370365050211387e-07, + "loss": 0.1445, + "step": 18671 + }, + { + "epoch": 0.95, + "grad_norm": 0.9880163861646013, + "learning_rate": 1.3343536398067513e-07, + "loss": 0.1682, + "step": 18672 + }, + { + "epoch": 0.95, + "grad_norm": 1.1272581002377124, + "learning_rate": 1.3316734509035922e-07, + "loss": 0.1606, + "step": 18673 + }, + { + "epoch": 0.95, + "grad_norm": 0.9879019391840964, + "learning_rate": 1.3289959383843698e-07, + "loss": 0.1414, + "step": 18674 + }, + { + "epoch": 0.95, + "grad_norm": 0.9941590979821857, + "learning_rate": 1.3263211023217038e-07, + "loss": 0.1523, + "step": 18675 + }, + { + "epoch": 0.95, + "grad_norm": 1.0029671973975969, + "learning_rate": 1.323648942788147e-07, + "loss": 0.1646, + "step": 18676 + }, + { + "epoch": 0.95, + "grad_norm": 0.9179981656127324, + "learning_rate": 1.3209794598561864e-07, + "loss": 0.1772, + "step": 18677 + }, + { + "epoch": 0.95, + "grad_norm": 0.9915365118700741, + "learning_rate": 1.3183126535982306e-07, + "loss": 0.1552, + "step": 18678 + }, + { + "epoch": 0.95, + "grad_norm": 1.6295534645843557, + "learning_rate": 1.3156485240866213e-07, + "loss": 0.1526, + "step": 18679 + }, + { + "epoch": 0.95, + "grad_norm": 1.1206894933896525, + "learning_rate": 1.312987071393612e-07, + "loss": 0.165, + "step": 18680 + }, + { + "epoch": 0.95, + "grad_norm": 1.1012662595538143, + "learning_rate": 1.310328295591412e-07, + "loss": 0.171, + "step": 18681 + }, + { + "epoch": 0.95, + "grad_norm": 1.112825275192214, + "learning_rate": 1.30767219675213e-07, + "loss": 0.1534, + "step": 18682 + }, + { + "epoch": 0.95, + "grad_norm": 1.042100924816748, + "learning_rate": 1.3050187749478192e-07, + "loss": 0.1683, + "step": 18683 + }, + { + "epoch": 0.95, + "grad_norm": 3.4419620222320186, + "learning_rate": 1.3023680302504338e-07, + "loss": 0.1751, + "step": 18684 + }, + { + "epoch": 0.95, + "grad_norm": 0.9906677459433227, + "learning_rate": 1.2997199627319047e-07, + "loss": 0.1514, + "step": 18685 + }, + { + "epoch": 0.95, + "grad_norm": 0.9638069201467271, + "learning_rate": 1.29707457246403e-07, + "loss": 0.1733, + "step": 18686 + }, + { + "epoch": 0.95, + "grad_norm": 1.1689158639836776, + "learning_rate": 1.2944318595185855e-07, + "loss": 0.1571, + "step": 18687 + }, + { + "epoch": 0.95, + "grad_norm": 0.9465121862654976, + "learning_rate": 1.29179182396727e-07, + "loss": 0.1454, + "step": 18688 + }, + { + "epoch": 0.95, + "grad_norm": 1.490398094105073, + "learning_rate": 1.289154465881659e-07, + "loss": 0.1399, + "step": 18689 + }, + { + "epoch": 0.95, + "grad_norm": 1.0151831663290165, + "learning_rate": 1.2865197853333179e-07, + "loss": 0.1606, + "step": 18690 + }, + { + "epoch": 0.95, + "grad_norm": 1.0566143989903571, + "learning_rate": 1.283887782393689e-07, + "loss": 0.1635, + "step": 18691 + }, + { + "epoch": 0.95, + "grad_norm": 1.450115053552093, + "learning_rate": 1.2812584571341936e-07, + "loss": 0.1679, + "step": 18692 + }, + { + "epoch": 0.95, + "grad_norm": 1.32553033080943, + "learning_rate": 1.2786318096261298e-07, + "loss": 0.1729, + "step": 18693 + }, + { + "epoch": 0.95, + "grad_norm": 1.0099512930173309, + "learning_rate": 1.2760078399407626e-07, + "loss": 0.1483, + "step": 18694 + }, + { + "epoch": 0.95, + "grad_norm": 1.0971077485331466, + "learning_rate": 1.273386548149247e-07, + "loss": 0.1406, + "step": 18695 + }, + { + "epoch": 0.95, + "grad_norm": 0.9603529997571104, + "learning_rate": 1.270767934322703e-07, + "loss": 0.1662, + "step": 18696 + }, + { + "epoch": 0.95, + "grad_norm": 1.7585292023883128, + "learning_rate": 1.2681519985321522e-07, + "loss": 0.1954, + "step": 18697 + }, + { + "epoch": 0.95, + "grad_norm": 1.3029394904526037, + "learning_rate": 1.2655387408485597e-07, + "loss": 0.1469, + "step": 18698 + }, + { + "epoch": 0.95, + "grad_norm": 0.9517084868877973, + "learning_rate": 1.2629281613428024e-07, + "loss": 0.1617, + "step": 18699 + }, + { + "epoch": 0.95, + "grad_norm": 0.8858492726075213, + "learning_rate": 1.260320260085701e-07, + "loss": 0.1501, + "step": 18700 + }, + { + "epoch": 0.95, + "grad_norm": 1.061871176835859, + "learning_rate": 1.2577150371479884e-07, + "loss": 0.1422, + "step": 18701 + }, + { + "epoch": 0.95, + "grad_norm": 1.2908260521128192, + "learning_rate": 1.25511249260033e-07, + "loss": 0.165, + "step": 18702 + }, + { + "epoch": 0.95, + "grad_norm": 1.1293124330242905, + "learning_rate": 1.2525126265133137e-07, + "loss": 0.1873, + "step": 18703 + }, + { + "epoch": 0.95, + "grad_norm": 1.085955564601038, + "learning_rate": 1.249915438957483e-07, + "loss": 0.1599, + "step": 18704 + }, + { + "epoch": 0.95, + "grad_norm": 1.021664792923056, + "learning_rate": 1.2473209300032706e-07, + "loss": 0.1607, + "step": 18705 + }, + { + "epoch": 0.95, + "grad_norm": 1.3178651427571242, + "learning_rate": 1.2447290997210426e-07, + "loss": 0.1552, + "step": 18706 + }, + { + "epoch": 0.95, + "grad_norm": 3.5516786445283004, + "learning_rate": 1.2421399481811313e-07, + "loss": 0.1702, + "step": 18707 + }, + { + "epoch": 0.95, + "grad_norm": 1.329356585607747, + "learning_rate": 1.2395534754537475e-07, + "loss": 0.1616, + "step": 18708 + }, + { + "epoch": 0.95, + "grad_norm": 1.1807373706580682, + "learning_rate": 1.236969681609057e-07, + "loss": 0.1495, + "step": 18709 + }, + { + "epoch": 0.95, + "grad_norm": 1.5458064139687657, + "learning_rate": 1.2343885667171373e-07, + "loss": 0.1954, + "step": 18710 + }, + { + "epoch": 0.95, + "grad_norm": 0.94979791393677, + "learning_rate": 1.23181013084801e-07, + "loss": 0.1673, + "step": 18711 + }, + { + "epoch": 0.95, + "grad_norm": 1.3667713860642676, + "learning_rate": 1.2292343740715973e-07, + "loss": 0.1695, + "step": 18712 + }, + { + "epoch": 0.95, + "grad_norm": 0.8269227975604101, + "learning_rate": 1.2266612964577984e-07, + "loss": 0.1403, + "step": 18713 + }, + { + "epoch": 0.95, + "grad_norm": 1.1395132914278172, + "learning_rate": 1.22409089807638e-07, + "loss": 0.1547, + "step": 18714 + }, + { + "epoch": 0.95, + "grad_norm": 1.3599861623733471, + "learning_rate": 1.221523178997075e-07, + "loss": 0.1406, + "step": 18715 + }, + { + "epoch": 0.95, + "grad_norm": 1.1998746875068946, + "learning_rate": 1.2189581392895388e-07, + "loss": 0.1695, + "step": 18716 + }, + { + "epoch": 0.95, + "grad_norm": 1.0641164731288262, + "learning_rate": 1.2163957790233382e-07, + "loss": 0.1664, + "step": 18717 + }, + { + "epoch": 0.95, + "grad_norm": 1.188292593607651, + "learning_rate": 1.2138360982679842e-07, + "loss": 0.1845, + "step": 18718 + }, + { + "epoch": 0.95, + "grad_norm": 1.253121358483849, + "learning_rate": 1.21127909709291e-07, + "loss": 0.1712, + "step": 18719 + }, + { + "epoch": 0.95, + "grad_norm": 0.9717715577170544, + "learning_rate": 1.2087247755674603e-07, + "loss": 0.1675, + "step": 18720 + }, + { + "epoch": 0.95, + "grad_norm": 1.258572516156622, + "learning_rate": 1.206173133760935e-07, + "loss": 0.1513, + "step": 18721 + }, + { + "epoch": 0.95, + "grad_norm": 1.0221084000142802, + "learning_rate": 1.2036241717425456e-07, + "loss": 0.1663, + "step": 18722 + }, + { + "epoch": 0.95, + "grad_norm": 1.2012471054237224, + "learning_rate": 1.2010778895814258e-07, + "loss": 0.1562, + "step": 18723 + }, + { + "epoch": 0.95, + "grad_norm": 1.594584266408908, + "learning_rate": 1.1985342873466532e-07, + "loss": 0.178, + "step": 18724 + }, + { + "epoch": 0.95, + "grad_norm": 2.201800662850403, + "learning_rate": 1.1959933651072065e-07, + "loss": 0.1562, + "step": 18725 + }, + { + "epoch": 0.95, + "grad_norm": 0.910808606905359, + "learning_rate": 1.1934551229320413e-07, + "loss": 0.1586, + "step": 18726 + }, + { + "epoch": 0.95, + "grad_norm": 1.004299274296927, + "learning_rate": 1.1909195608899694e-07, + "loss": 0.1672, + "step": 18727 + }, + { + "epoch": 0.95, + "grad_norm": 1.0095444901687516, + "learning_rate": 1.1883866790497911e-07, + "loss": 0.1509, + "step": 18728 + }, + { + "epoch": 0.95, + "grad_norm": 0.8761406814468267, + "learning_rate": 1.1858564774802073e-07, + "loss": 0.1589, + "step": 18729 + }, + { + "epoch": 0.95, + "grad_norm": 1.034883019557606, + "learning_rate": 1.1833289562498406e-07, + "loss": 0.1614, + "step": 18730 + }, + { + "epoch": 0.95, + "grad_norm": 1.1760568222090761, + "learning_rate": 1.1808041154272587e-07, + "loss": 0.1413, + "step": 18731 + }, + { + "epoch": 0.95, + "grad_norm": 0.8505003865615471, + "learning_rate": 1.178281955080951e-07, + "loss": 0.1571, + "step": 18732 + }, + { + "epoch": 0.95, + "grad_norm": 0.9717764432364998, + "learning_rate": 1.1757624752793184e-07, + "loss": 0.1698, + "step": 18733 + }, + { + "epoch": 0.95, + "grad_norm": 1.1320680343929483, + "learning_rate": 1.1732456760907174e-07, + "loss": 0.1648, + "step": 18734 + }, + { + "epoch": 0.95, + "grad_norm": 0.9709646357714493, + "learning_rate": 1.1707315575834044e-07, + "loss": 0.1525, + "step": 18735 + }, + { + "epoch": 0.95, + "grad_norm": 0.8168264220250449, + "learning_rate": 1.1682201198255916e-07, + "loss": 0.1456, + "step": 18736 + }, + { + "epoch": 0.95, + "grad_norm": 0.953615837573349, + "learning_rate": 1.16571136288538e-07, + "loss": 0.1477, + "step": 18737 + }, + { + "epoch": 0.95, + "grad_norm": 1.1923327503063306, + "learning_rate": 1.1632052868308375e-07, + "loss": 0.1763, + "step": 18738 + }, + { + "epoch": 0.95, + "grad_norm": 1.0083921240352136, + "learning_rate": 1.1607018917299207e-07, + "loss": 0.1484, + "step": 18739 + }, + { + "epoch": 0.95, + "grad_norm": 1.3179243790087516, + "learning_rate": 1.158201177650553e-07, + "loss": 0.1573, + "step": 18740 + }, + { + "epoch": 0.95, + "grad_norm": 1.2500627713525843, + "learning_rate": 1.1557031446605693e-07, + "loss": 0.1505, + "step": 18741 + }, + { + "epoch": 0.95, + "grad_norm": 1.239246970452079, + "learning_rate": 1.1532077928277152e-07, + "loss": 0.1716, + "step": 18742 + }, + { + "epoch": 0.95, + "grad_norm": 0.8462484777876413, + "learning_rate": 1.1507151222196811e-07, + "loss": 0.1404, + "step": 18743 + }, + { + "epoch": 0.95, + "grad_norm": 1.059018702238596, + "learning_rate": 1.1482251329040795e-07, + "loss": 0.1819, + "step": 18744 + }, + { + "epoch": 0.95, + "grad_norm": 1.4355337392067744, + "learning_rate": 1.1457378249484674e-07, + "loss": 0.1705, + "step": 18745 + }, + { + "epoch": 0.95, + "grad_norm": 0.9243722354509532, + "learning_rate": 1.143253198420291e-07, + "loss": 0.1648, + "step": 18746 + }, + { + "epoch": 0.95, + "grad_norm": 1.7227231193621009, + "learning_rate": 1.1407712533869519e-07, + "loss": 0.152, + "step": 18747 + }, + { + "epoch": 0.95, + "grad_norm": 1.1571786371702186, + "learning_rate": 1.1382919899157852e-07, + "loss": 0.1618, + "step": 18748 + }, + { + "epoch": 0.95, + "grad_norm": 1.0542535821163654, + "learning_rate": 1.1358154080740147e-07, + "loss": 0.1438, + "step": 18749 + }, + { + "epoch": 0.95, + "grad_norm": 2.031459382470392, + "learning_rate": 1.1333415079288424e-07, + "loss": 0.1706, + "step": 18750 + }, + { + "epoch": 0.95, + "grad_norm": 1.1376771610307743, + "learning_rate": 1.130870289547381e-07, + "loss": 0.1569, + "step": 18751 + }, + { + "epoch": 0.95, + "grad_norm": 1.1361995973113577, + "learning_rate": 1.1284017529966329e-07, + "loss": 0.1689, + "step": 18752 + }, + { + "epoch": 0.95, + "grad_norm": 0.9430514064426113, + "learning_rate": 1.1259358983435775e-07, + "loss": 0.1825, + "step": 18753 + }, + { + "epoch": 0.95, + "grad_norm": 1.0092812343386752, + "learning_rate": 1.1234727256550837e-07, + "loss": 0.1763, + "step": 18754 + }, + { + "epoch": 0.95, + "grad_norm": 0.8164572156621775, + "learning_rate": 1.1210122349979979e-07, + "loss": 0.154, + "step": 18755 + }, + { + "epoch": 0.95, + "grad_norm": 1.327723266994876, + "learning_rate": 1.1185544264390225e-07, + "loss": 0.1733, + "step": 18756 + }, + { + "epoch": 0.95, + "grad_norm": 1.6508728885834751, + "learning_rate": 1.1160993000448372e-07, + "loss": 0.1735, + "step": 18757 + }, + { + "epoch": 0.95, + "grad_norm": 0.9180275397058472, + "learning_rate": 1.1136468558820668e-07, + "loss": 0.1582, + "step": 18758 + }, + { + "epoch": 0.95, + "grad_norm": 1.101850420181442, + "learning_rate": 1.1111970940171912e-07, + "loss": 0.1516, + "step": 18759 + }, + { + "epoch": 0.95, + "grad_norm": 1.1555854613546752, + "learning_rate": 1.1087500145166908e-07, + "loss": 0.1411, + "step": 18760 + }, + { + "epoch": 0.95, + "grad_norm": 1.2756787789402384, + "learning_rate": 1.1063056174469234e-07, + "loss": 0.1396, + "step": 18761 + }, + { + "epoch": 0.95, + "grad_norm": 1.4078874087782576, + "learning_rate": 1.1038639028742138e-07, + "loss": 0.194, + "step": 18762 + }, + { + "epoch": 0.95, + "grad_norm": 0.8496709042888108, + "learning_rate": 1.1014248708647645e-07, + "loss": 0.1247, + "step": 18763 + }, + { + "epoch": 0.95, + "grad_norm": 0.8844652923011713, + "learning_rate": 1.0989885214847673e-07, + "loss": 0.1586, + "step": 18764 + }, + { + "epoch": 0.95, + "grad_norm": 1.001028423834968, + "learning_rate": 1.0965548548002802e-07, + "loss": 0.1641, + "step": 18765 + }, + { + "epoch": 0.95, + "grad_norm": 0.865470853867464, + "learning_rate": 1.0941238708773283e-07, + "loss": 0.1806, + "step": 18766 + }, + { + "epoch": 0.95, + "grad_norm": 1.408923759241474, + "learning_rate": 1.0916955697818587e-07, + "loss": 0.1504, + "step": 18767 + }, + { + "epoch": 0.95, + "grad_norm": 1.2024386779570817, + "learning_rate": 1.0892699515797411e-07, + "loss": 0.1791, + "step": 18768 + }, + { + "epoch": 0.95, + "grad_norm": 1.0923015100986826, + "learning_rate": 1.086847016336745e-07, + "loss": 0.1392, + "step": 18769 + }, + { + "epoch": 0.95, + "grad_norm": 1.8647220030074458, + "learning_rate": 1.084426764118629e-07, + "loss": 0.1486, + "step": 18770 + }, + { + "epoch": 0.95, + "grad_norm": 1.7755338691389857, + "learning_rate": 1.0820091949910072e-07, + "loss": 0.1672, + "step": 18771 + }, + { + "epoch": 0.95, + "grad_norm": 0.8298954613901326, + "learning_rate": 1.0795943090194827e-07, + "loss": 0.1649, + "step": 18772 + }, + { + "epoch": 0.95, + "grad_norm": 0.9795338903069379, + "learning_rate": 1.0771821062695476e-07, + "loss": 0.1489, + "step": 18773 + }, + { + "epoch": 0.95, + "grad_norm": 1.0089380968964687, + "learning_rate": 1.0747725868066383e-07, + "loss": 0.151, + "step": 18774 + }, + { + "epoch": 0.95, + "grad_norm": 1.2822032644898074, + "learning_rate": 1.0723657506961027e-07, + "loss": 0.172, + "step": 18775 + }, + { + "epoch": 0.95, + "grad_norm": 1.7322298865698078, + "learning_rate": 1.069961598003233e-07, + "loss": 0.1599, + "step": 18776 + }, + { + "epoch": 0.95, + "grad_norm": 2.046246918063842, + "learning_rate": 1.0675601287932547e-07, + "loss": 0.1966, + "step": 18777 + }, + { + "epoch": 0.95, + "grad_norm": 2.328086242939519, + "learning_rate": 1.0651613431312824e-07, + "loss": 0.1464, + "step": 18778 + }, + { + "epoch": 0.95, + "grad_norm": 1.294347819359744, + "learning_rate": 1.0627652410823975e-07, + "loss": 0.1369, + "step": 18779 + }, + { + "epoch": 0.95, + "grad_norm": 0.7989102185537711, + "learning_rate": 1.0603718227116034e-07, + "loss": 0.1613, + "step": 18780 + }, + { + "epoch": 0.96, + "grad_norm": 1.036952896112565, + "learning_rate": 1.0579810880838037e-07, + "loss": 0.1639, + "step": 18781 + }, + { + "epoch": 0.96, + "grad_norm": 1.2476287988888424, + "learning_rate": 1.0555930372638578e-07, + "loss": 0.1744, + "step": 18782 + }, + { + "epoch": 0.96, + "grad_norm": 3.37060572453245, + "learning_rate": 1.0532076703165362e-07, + "loss": 0.1899, + "step": 18783 + }, + { + "epoch": 0.96, + "grad_norm": 0.9596618628758464, + "learning_rate": 1.0508249873065424e-07, + "loss": 0.15, + "step": 18784 + }, + { + "epoch": 0.96, + "grad_norm": 1.185689083060517, + "learning_rate": 1.0484449882985138e-07, + "loss": 0.1541, + "step": 18785 + }, + { + "epoch": 0.96, + "grad_norm": 1.8349670685171569, + "learning_rate": 1.0460676733570096e-07, + "loss": 0.166, + "step": 18786 + }, + { + "epoch": 0.96, + "grad_norm": 3.0295126853380023, + "learning_rate": 1.0436930425465008e-07, + "loss": 0.1874, + "step": 18787 + }, + { + "epoch": 0.96, + "grad_norm": 1.0593615747004992, + "learning_rate": 1.0413210959314135e-07, + "loss": 0.1595, + "step": 18788 + }, + { + "epoch": 0.96, + "grad_norm": 2.168075177134784, + "learning_rate": 1.038951833576074e-07, + "loss": 0.1723, + "step": 18789 + }, + { + "epoch": 0.96, + "grad_norm": 1.1218155857630336, + "learning_rate": 1.0365852555447642e-07, + "loss": 0.1753, + "step": 18790 + }, + { + "epoch": 0.96, + "grad_norm": 0.8050559077961082, + "learning_rate": 1.0342213619016661e-07, + "loss": 0.1587, + "step": 18791 + }, + { + "epoch": 0.96, + "grad_norm": 0.9181660602621501, + "learning_rate": 1.0318601527108952e-07, + "loss": 0.1496, + "step": 18792 + }, + { + "epoch": 0.96, + "grad_norm": 1.1033877829228513, + "learning_rate": 1.0295016280365111e-07, + "loss": 0.1379, + "step": 18793 + }, + { + "epoch": 0.96, + "grad_norm": 0.9590634235469165, + "learning_rate": 1.0271457879424851e-07, + "loss": 0.1546, + "step": 18794 + }, + { + "epoch": 0.96, + "grad_norm": 1.952709206529511, + "learning_rate": 1.0247926324927215e-07, + "loss": 0.1687, + "step": 18795 + }, + { + "epoch": 0.96, + "grad_norm": 1.1759733030062076, + "learning_rate": 1.0224421617510471e-07, + "loss": 0.1583, + "step": 18796 + }, + { + "epoch": 0.96, + "grad_norm": 1.2763520910574768, + "learning_rate": 1.020094375781222e-07, + "loss": 0.1571, + "step": 18797 + }, + { + "epoch": 0.96, + "grad_norm": 1.7511259754137039, + "learning_rate": 1.0177492746469286e-07, + "loss": 0.1676, + "step": 18798 + }, + { + "epoch": 0.96, + "grad_norm": 1.148516859847391, + "learning_rate": 1.0154068584117716e-07, + "loss": 0.1773, + "step": 18799 + }, + { + "epoch": 0.96, + "grad_norm": 0.9627330663214961, + "learning_rate": 1.0130671271392889e-07, + "loss": 0.141, + "step": 18800 + }, + { + "epoch": 0.96, + "grad_norm": 0.9745604777780793, + "learning_rate": 1.0107300808929522e-07, + "loss": 0.1718, + "step": 18801 + }, + { + "epoch": 0.96, + "grad_norm": 0.8737543564782753, + "learning_rate": 1.008395719736166e-07, + "loss": 0.1648, + "step": 18802 + }, + { + "epoch": 0.96, + "grad_norm": 1.0772967372661506, + "learning_rate": 1.0060640437322244e-07, + "loss": 0.1645, + "step": 18803 + }, + { + "epoch": 0.96, + "grad_norm": 1.4726543395679172, + "learning_rate": 1.003735052944399e-07, + "loss": 0.1848, + "step": 18804 + }, + { + "epoch": 0.96, + "grad_norm": 1.169918964022628, + "learning_rate": 1.0014087474358392e-07, + "loss": 0.1543, + "step": 18805 + }, + { + "epoch": 0.96, + "grad_norm": 2.216989418510401, + "learning_rate": 9.990851272696722e-08, + "loss": 0.1576, + "step": 18806 + }, + { + "epoch": 0.96, + "grad_norm": 1.1686919692769921, + "learning_rate": 9.967641925089033e-08, + "loss": 0.173, + "step": 18807 + }, + { + "epoch": 0.96, + "grad_norm": 1.4996681879717797, + "learning_rate": 9.944459432165044e-08, + "loss": 0.1664, + "step": 18808 + }, + { + "epoch": 0.96, + "grad_norm": 1.1045459523269272, + "learning_rate": 9.92130379455336e-08, + "loss": 0.1522, + "step": 18809 + }, + { + "epoch": 0.96, + "grad_norm": 0.9910006468676175, + "learning_rate": 9.89817501288226e-08, + "loss": 0.1373, + "step": 18810 + }, + { + "epoch": 0.96, + "grad_norm": 1.1419457094601917, + "learning_rate": 9.875073087779241e-08, + "loss": 0.1856, + "step": 18811 + }, + { + "epoch": 0.96, + "grad_norm": 1.0739981086468395, + "learning_rate": 9.85199801987069e-08, + "loss": 0.1716, + "step": 18812 + }, + { + "epoch": 0.96, + "grad_norm": 0.9680114664815531, + "learning_rate": 9.828949809782662e-08, + "loss": 0.1678, + "step": 18813 + }, + { + "epoch": 0.96, + "grad_norm": 1.0786498296590985, + "learning_rate": 9.805928458140212e-08, + "loss": 0.1591, + "step": 18814 + }, + { + "epoch": 0.96, + "grad_norm": 1.64021686468835, + "learning_rate": 9.782933965567953e-08, + "loss": 0.1626, + "step": 18815 + }, + { + "epoch": 0.96, + "grad_norm": 1.1278692654493154, + "learning_rate": 9.759966332689497e-08, + "loss": 0.1598, + "step": 18816 + }, + { + "epoch": 0.96, + "grad_norm": 1.1266880264484171, + "learning_rate": 9.737025560127899e-08, + "loss": 0.1598, + "step": 18817 + }, + { + "epoch": 0.96, + "grad_norm": 1.1920107831865574, + "learning_rate": 9.714111648505442e-08, + "loss": 0.1857, + "step": 18818 + }, + { + "epoch": 0.96, + "grad_norm": 1.1175544268483446, + "learning_rate": 9.691224598443515e-08, + "loss": 0.1506, + "step": 18819 + }, + { + "epoch": 0.96, + "grad_norm": 1.0988902168208523, + "learning_rate": 9.66836441056318e-08, + "loss": 0.1727, + "step": 18820 + }, + { + "epoch": 0.96, + "grad_norm": 0.826008174194182, + "learning_rate": 9.645531085484383e-08, + "loss": 0.1501, + "step": 18821 + }, + { + "epoch": 0.96, + "grad_norm": 4.292188236065506, + "learning_rate": 9.622724623826407e-08, + "loss": 0.1771, + "step": 18822 + }, + { + "epoch": 0.96, + "grad_norm": 1.4458397407326542, + "learning_rate": 9.59994502620809e-08, + "loss": 0.1767, + "step": 18823 + }, + { + "epoch": 0.96, + "grad_norm": 1.2335768493154349, + "learning_rate": 9.57719229324705e-08, + "loss": 0.17, + "step": 18824 + }, + { + "epoch": 0.96, + "grad_norm": 0.9917330277547299, + "learning_rate": 9.554466425560793e-08, + "loss": 0.1501, + "step": 18825 + }, + { + "epoch": 0.96, + "grad_norm": 0.9023946695552103, + "learning_rate": 9.531767423765381e-08, + "loss": 0.1746, + "step": 18826 + }, + { + "epoch": 0.96, + "grad_norm": 1.1679938172110933, + "learning_rate": 9.509095288476767e-08, + "loss": 0.1623, + "step": 18827 + }, + { + "epoch": 0.96, + "grad_norm": 1.3085712269367609, + "learning_rate": 9.486450020310011e-08, + "loss": 0.1536, + "step": 18828 + }, + { + "epoch": 0.96, + "grad_norm": 1.1529943097730873, + "learning_rate": 9.463831619879183e-08, + "loss": 0.1537, + "step": 18829 + }, + { + "epoch": 0.96, + "grad_norm": 1.0872766259105977, + "learning_rate": 9.441240087797787e-08, + "loss": 0.172, + "step": 18830 + }, + { + "epoch": 0.96, + "grad_norm": 1.0909881135030688, + "learning_rate": 9.41867542467878e-08, + "loss": 0.1737, + "step": 18831 + }, + { + "epoch": 0.96, + "grad_norm": 0.8379483903814336, + "learning_rate": 9.396137631134116e-08, + "loss": 0.1478, + "step": 18832 + }, + { + "epoch": 0.96, + "grad_norm": 1.3761318998390024, + "learning_rate": 9.373626707775196e-08, + "loss": 0.1652, + "step": 18833 + }, + { + "epoch": 0.96, + "grad_norm": 1.2279841676392231, + "learning_rate": 9.351142655212642e-08, + "loss": 0.1867, + "step": 18834 + }, + { + "epoch": 0.96, + "grad_norm": 1.0752650044873828, + "learning_rate": 9.328685474056187e-08, + "loss": 0.1638, + "step": 18835 + }, + { + "epoch": 0.96, + "grad_norm": 1.2379873953884408, + "learning_rate": 9.306255164915123e-08, + "loss": 0.1366, + "step": 18836 + }, + { + "epoch": 0.96, + "grad_norm": 1.2130097579213732, + "learning_rate": 9.283851728397853e-08, + "loss": 0.1628, + "step": 18837 + }, + { + "epoch": 0.96, + "grad_norm": 0.8947669982322405, + "learning_rate": 9.26147516511211e-08, + "loss": 0.1507, + "step": 18838 + }, + { + "epoch": 0.96, + "grad_norm": 1.0036150550246519, + "learning_rate": 9.239125475664746e-08, + "loss": 0.159, + "step": 18839 + }, + { + "epoch": 0.96, + "grad_norm": 1.0657812870787666, + "learning_rate": 9.216802660662161e-08, + "loss": 0.1743, + "step": 18840 + }, + { + "epoch": 0.96, + "grad_norm": 0.9154068268279001, + "learning_rate": 9.194506720709651e-08, + "loss": 0.1747, + "step": 18841 + }, + { + "epoch": 0.96, + "grad_norm": 0.9264247298220879, + "learning_rate": 9.172237656412175e-08, + "loss": 0.1458, + "step": 18842 + }, + { + "epoch": 0.96, + "grad_norm": 0.9801921323891072, + "learning_rate": 9.149995468373696e-08, + "loss": 0.1621, + "step": 18843 + }, + { + "epoch": 0.96, + "grad_norm": 1.1632621863958537, + "learning_rate": 9.127780157197619e-08, + "loss": 0.1676, + "step": 18844 + }, + { + "epoch": 0.96, + "grad_norm": 1.1317424244696488, + "learning_rate": 9.105591723486352e-08, + "loss": 0.1542, + "step": 18845 + }, + { + "epoch": 0.96, + "grad_norm": 1.0736880815431518, + "learning_rate": 9.083430167841856e-08, + "loss": 0.1511, + "step": 18846 + }, + { + "epoch": 0.96, + "grad_norm": 2.0161957878733108, + "learning_rate": 9.061295490865429e-08, + "loss": 0.1674, + "step": 18847 + }, + { + "epoch": 0.96, + "grad_norm": 0.7928674453212935, + "learning_rate": 9.039187693157147e-08, + "loss": 0.1398, + "step": 18848 + }, + { + "epoch": 0.96, + "grad_norm": 0.9045398606870544, + "learning_rate": 9.017106775317086e-08, + "loss": 0.1855, + "step": 18849 + }, + { + "epoch": 0.96, + "grad_norm": 1.0535097383038936, + "learning_rate": 8.995052737943766e-08, + "loss": 0.163, + "step": 18850 + }, + { + "epoch": 0.96, + "grad_norm": 1.4496573427182988, + "learning_rate": 8.973025581635819e-08, + "loss": 0.1817, + "step": 18851 + }, + { + "epoch": 0.96, + "grad_norm": 1.0777957220050927, + "learning_rate": 8.951025306990324e-08, + "loss": 0.1563, + "step": 18852 + }, + { + "epoch": 0.96, + "grad_norm": 1.0506377567143983, + "learning_rate": 8.929051914604359e-08, + "loss": 0.1916, + "step": 18853 + }, + { + "epoch": 0.96, + "grad_norm": 3.002996556765619, + "learning_rate": 8.907105405073779e-08, + "loss": 0.1759, + "step": 18854 + }, + { + "epoch": 0.96, + "grad_norm": 0.998730610245173, + "learning_rate": 8.885185778993999e-08, + "loss": 0.1495, + "step": 18855 + }, + { + "epoch": 0.96, + "grad_norm": 1.2495765988017125, + "learning_rate": 8.863293036959431e-08, + "loss": 0.165, + "step": 18856 + }, + { + "epoch": 0.96, + "grad_norm": 1.0130819408578617, + "learning_rate": 8.841427179564154e-08, + "loss": 0.146, + "step": 18857 + }, + { + "epoch": 0.96, + "grad_norm": 1.8272516627383144, + "learning_rate": 8.819588207401142e-08, + "loss": 0.1715, + "step": 18858 + }, + { + "epoch": 0.96, + "grad_norm": 0.9948517329878664, + "learning_rate": 8.797776121062696e-08, + "loss": 0.1534, + "step": 18859 + }, + { + "epoch": 0.96, + "grad_norm": 1.3309641860820505, + "learning_rate": 8.775990921140565e-08, + "loss": 0.1656, + "step": 18860 + }, + { + "epoch": 0.96, + "grad_norm": 1.444462781797579, + "learning_rate": 8.754232608225722e-08, + "loss": 0.1625, + "step": 18861 + }, + { + "epoch": 0.96, + "grad_norm": 0.8860214555939581, + "learning_rate": 8.732501182908249e-08, + "loss": 0.1835, + "step": 18862 + }, + { + "epoch": 0.96, + "grad_norm": 1.300447118415056, + "learning_rate": 8.710796645777674e-08, + "loss": 0.1569, + "step": 18863 + }, + { + "epoch": 0.96, + "grad_norm": 1.011564783440716, + "learning_rate": 8.689118997422752e-08, + "loss": 0.1413, + "step": 18864 + }, + { + "epoch": 0.96, + "grad_norm": 2.543969469262822, + "learning_rate": 8.667468238431453e-08, + "loss": 0.151, + "step": 18865 + }, + { + "epoch": 0.96, + "grad_norm": 1.6519119012480639, + "learning_rate": 8.645844369391088e-08, + "loss": 0.155, + "step": 18866 + }, + { + "epoch": 0.96, + "grad_norm": 0.790938070722103, + "learning_rate": 8.624247390888186e-08, + "loss": 0.1392, + "step": 18867 + }, + { + "epoch": 0.96, + "grad_norm": 2.509865696614438, + "learning_rate": 8.602677303508611e-08, + "loss": 0.1826, + "step": 18868 + }, + { + "epoch": 0.96, + "grad_norm": 1.4021515201833346, + "learning_rate": 8.581134107837341e-08, + "loss": 0.1487, + "step": 18869 + }, + { + "epoch": 0.96, + "grad_norm": 1.6080350955718345, + "learning_rate": 8.559617804458798e-08, + "loss": 0.1771, + "step": 18870 + }, + { + "epoch": 0.96, + "grad_norm": 0.9752358089227612, + "learning_rate": 8.538128393956624e-08, + "loss": 0.1662, + "step": 18871 + }, + { + "epoch": 0.96, + "grad_norm": 1.3198701123217553, + "learning_rate": 8.516665876913687e-08, + "loss": 0.1725, + "step": 18872 + }, + { + "epoch": 0.96, + "grad_norm": 0.9611312290629432, + "learning_rate": 8.49523025391219e-08, + "loss": 0.1626, + "step": 18873 + }, + { + "epoch": 0.96, + "grad_norm": 1.2329227273918022, + "learning_rate": 8.473821525533665e-08, + "loss": 0.1747, + "step": 18874 + }, + { + "epoch": 0.96, + "grad_norm": 0.9490149018446066, + "learning_rate": 8.452439692358649e-08, + "loss": 0.172, + "step": 18875 + }, + { + "epoch": 0.96, + "grad_norm": 1.0099470775860748, + "learning_rate": 8.431084754967345e-08, + "loss": 0.1378, + "step": 18876 + }, + { + "epoch": 0.96, + "grad_norm": 0.7531190523770079, + "learning_rate": 8.409756713938733e-08, + "loss": 0.1473, + "step": 18877 + }, + { + "epoch": 0.96, + "grad_norm": 1.0347859261232761, + "learning_rate": 8.388455569851461e-08, + "loss": 0.1719, + "step": 18878 + }, + { + "epoch": 0.96, + "grad_norm": 0.8585341785400499, + "learning_rate": 8.367181323283402e-08, + "loss": 0.1678, + "step": 18879 + }, + { + "epoch": 0.96, + "grad_norm": 0.9211259975521868, + "learning_rate": 8.345933974811537e-08, + "loss": 0.1711, + "step": 18880 + }, + { + "epoch": 0.96, + "grad_norm": 1.208502773218923, + "learning_rate": 8.324713525012185e-08, + "loss": 0.1451, + "step": 18881 + }, + { + "epoch": 0.96, + "grad_norm": 0.854440510579316, + "learning_rate": 8.303519974461106e-08, + "loss": 0.1565, + "step": 18882 + }, + { + "epoch": 0.96, + "grad_norm": 0.9862239698761738, + "learning_rate": 8.28235332373295e-08, + "loss": 0.1941, + "step": 18883 + }, + { + "epoch": 0.96, + "grad_norm": 1.5255599255822914, + "learning_rate": 8.261213573402038e-08, + "loss": 0.159, + "step": 18884 + }, + { + "epoch": 0.96, + "grad_norm": 2.6668881972462515, + "learning_rate": 8.240100724041689e-08, + "loss": 0.1566, + "step": 18885 + }, + { + "epoch": 0.96, + "grad_norm": 0.9173579531264667, + "learning_rate": 8.219014776224555e-08, + "loss": 0.1594, + "step": 18886 + }, + { + "epoch": 0.96, + "grad_norm": 1.4391060778764997, + "learning_rate": 8.197955730522733e-08, + "loss": 0.1432, + "step": 18887 + }, + { + "epoch": 0.96, + "grad_norm": 1.7719058427891368, + "learning_rate": 8.176923587507434e-08, + "loss": 0.1642, + "step": 18888 + }, + { + "epoch": 0.96, + "grad_norm": 0.9890541847862871, + "learning_rate": 8.155918347748981e-08, + "loss": 0.15, + "step": 18889 + }, + { + "epoch": 0.96, + "grad_norm": 1.2971062282347046, + "learning_rate": 8.13494001181725e-08, + "loss": 0.1396, + "step": 18890 + }, + { + "epoch": 0.96, + "grad_norm": 0.89297258092725, + "learning_rate": 8.113988580281451e-08, + "loss": 0.1593, + "step": 18891 + }, + { + "epoch": 0.96, + "grad_norm": 1.1064976604626795, + "learning_rate": 8.093064053709576e-08, + "loss": 0.1901, + "step": 18892 + }, + { + "epoch": 0.96, + "grad_norm": 0.9866375316992192, + "learning_rate": 8.072166432669503e-08, + "loss": 0.1433, + "step": 18893 + }, + { + "epoch": 0.96, + "grad_norm": 2.1940034352979834, + "learning_rate": 8.051295717727892e-08, + "loss": 0.1657, + "step": 18894 + }, + { + "epoch": 0.96, + "grad_norm": 1.1200969835311203, + "learning_rate": 8.030451909450842e-08, + "loss": 0.1698, + "step": 18895 + }, + { + "epoch": 0.96, + "grad_norm": 1.4641811009107635, + "learning_rate": 8.009635008403904e-08, + "loss": 0.1399, + "step": 18896 + }, + { + "epoch": 0.96, + "grad_norm": 0.9307287020748609, + "learning_rate": 7.988845015151513e-08, + "loss": 0.1617, + "step": 18897 + }, + { + "epoch": 0.96, + "grad_norm": 1.2935854651952166, + "learning_rate": 7.968081930257887e-08, + "loss": 0.1524, + "step": 18898 + }, + { + "epoch": 0.96, + "grad_norm": 1.1529460938502556, + "learning_rate": 7.947345754285906e-08, + "loss": 0.1661, + "step": 18899 + }, + { + "epoch": 0.96, + "grad_norm": 1.6332754029727536, + "learning_rate": 7.926636487798345e-08, + "loss": 0.1597, + "step": 18900 + }, + { + "epoch": 0.96, + "grad_norm": 0.9341773562612538, + "learning_rate": 7.905954131356752e-08, + "loss": 0.1714, + "step": 18901 + }, + { + "epoch": 0.96, + "grad_norm": 0.7598173351821158, + "learning_rate": 7.885298685522235e-08, + "loss": 0.1494, + "step": 18902 + }, + { + "epoch": 0.96, + "grad_norm": 0.9162616507880349, + "learning_rate": 7.864670150854903e-08, + "loss": 0.1413, + "step": 18903 + }, + { + "epoch": 0.96, + "grad_norm": 0.9287105319838184, + "learning_rate": 7.844068527914528e-08, + "loss": 0.1839, + "step": 18904 + }, + { + "epoch": 0.96, + "grad_norm": 1.144598902170974, + "learning_rate": 7.823493817259776e-08, + "loss": 0.1772, + "step": 18905 + }, + { + "epoch": 0.96, + "grad_norm": 1.147242897737404, + "learning_rate": 7.802946019448864e-08, + "loss": 0.1854, + "step": 18906 + }, + { + "epoch": 0.96, + "grad_norm": 1.2450550086501007, + "learning_rate": 7.782425135039018e-08, + "loss": 0.1666, + "step": 18907 + }, + { + "epoch": 0.96, + "grad_norm": 1.0442645415313312, + "learning_rate": 7.7619311645869e-08, + "loss": 0.1513, + "step": 18908 + }, + { + "epoch": 0.96, + "grad_norm": 0.9659641393581991, + "learning_rate": 7.741464108648511e-08, + "loss": 0.1479, + "step": 18909 + }, + { + "epoch": 0.96, + "grad_norm": 1.0802928177377895, + "learning_rate": 7.721023967778962e-08, + "loss": 0.1478, + "step": 18910 + }, + { + "epoch": 0.96, + "grad_norm": 1.2944293111076215, + "learning_rate": 7.700610742532588e-08, + "loss": 0.165, + "step": 18911 + }, + { + "epoch": 0.96, + "grad_norm": 1.1730839439653735, + "learning_rate": 7.68022443346328e-08, + "loss": 0.1776, + "step": 18912 + }, + { + "epoch": 0.96, + "grad_norm": 0.9910193731834973, + "learning_rate": 7.659865041123926e-08, + "loss": 0.1662, + "step": 18913 + }, + { + "epoch": 0.96, + "grad_norm": 1.056809813408625, + "learning_rate": 7.639532566066755e-08, + "loss": 0.1683, + "step": 18914 + }, + { + "epoch": 0.96, + "grad_norm": 1.0770889532907564, + "learning_rate": 7.619227008843322e-08, + "loss": 0.1468, + "step": 18915 + }, + { + "epoch": 0.96, + "grad_norm": 1.033875052227631, + "learning_rate": 7.598948370004412e-08, + "loss": 0.1914, + "step": 18916 + }, + { + "epoch": 0.96, + "grad_norm": 1.0503186323084912, + "learning_rate": 7.578696650100025e-08, + "loss": 0.1652, + "step": 18917 + }, + { + "epoch": 0.96, + "grad_norm": 1.061638624902994, + "learning_rate": 7.558471849679616e-08, + "loss": 0.1609, + "step": 18918 + }, + { + "epoch": 0.96, + "grad_norm": 0.874059333965608, + "learning_rate": 7.538273969291743e-08, + "loss": 0.1628, + "step": 18919 + }, + { + "epoch": 0.96, + "grad_norm": 0.9470801203586013, + "learning_rate": 7.518103009484079e-08, + "loss": 0.1403, + "step": 18920 + }, + { + "epoch": 0.96, + "grad_norm": 1.200812196227737, + "learning_rate": 7.497958970804076e-08, + "loss": 0.1667, + "step": 18921 + }, + { + "epoch": 0.96, + "grad_norm": 1.0078825382674224, + "learning_rate": 7.477841853797963e-08, + "loss": 0.1674, + "step": 18922 + }, + { + "epoch": 0.96, + "grad_norm": 1.1759053898946894, + "learning_rate": 7.457751659011414e-08, + "loss": 0.1676, + "step": 18923 + }, + { + "epoch": 0.96, + "grad_norm": 0.8222529337220591, + "learning_rate": 7.437688386989438e-08, + "loss": 0.1599, + "step": 18924 + }, + { + "epoch": 0.96, + "grad_norm": 0.8533396031334325, + "learning_rate": 7.417652038276157e-08, + "loss": 0.15, + "step": 18925 + }, + { + "epoch": 0.96, + "grad_norm": 1.1056761413519485, + "learning_rate": 7.397642613415245e-08, + "loss": 0.1585, + "step": 18926 + }, + { + "epoch": 0.96, + "grad_norm": 1.3920512039464155, + "learning_rate": 7.37766011294927e-08, + "loss": 0.1554, + "step": 18927 + }, + { + "epoch": 0.96, + "grad_norm": 1.5934123887083915, + "learning_rate": 7.357704537420351e-08, + "loss": 0.1708, + "step": 18928 + }, + { + "epoch": 0.96, + "grad_norm": 1.2091051382121403, + "learning_rate": 7.337775887369725e-08, + "loss": 0.1646, + "step": 18929 + }, + { + "epoch": 0.96, + "grad_norm": 1.148339743268307, + "learning_rate": 7.317874163338068e-08, + "loss": 0.1679, + "step": 18930 + }, + { + "epoch": 0.96, + "grad_norm": 1.0786303167088929, + "learning_rate": 7.29799936586506e-08, + "loss": 0.1727, + "step": 18931 + }, + { + "epoch": 0.96, + "grad_norm": 1.132990580590024, + "learning_rate": 7.278151495489938e-08, + "loss": 0.1621, + "step": 18932 + }, + { + "epoch": 0.96, + "grad_norm": 0.839523001421132, + "learning_rate": 7.258330552750936e-08, + "loss": 0.1708, + "step": 18933 + }, + { + "epoch": 0.96, + "grad_norm": 2.1078873139031478, + "learning_rate": 7.238536538185848e-08, + "loss": 0.1629, + "step": 18934 + }, + { + "epoch": 0.96, + "grad_norm": 1.149341920558963, + "learning_rate": 7.218769452331576e-08, + "loss": 0.1722, + "step": 18935 + }, + { + "epoch": 0.96, + "grad_norm": 1.08472091194159, + "learning_rate": 7.199029295724247e-08, + "loss": 0.1674, + "step": 18936 + }, + { + "epoch": 0.96, + "grad_norm": 1.229463761384355, + "learning_rate": 7.179316068899211e-08, + "loss": 0.1659, + "step": 18937 + }, + { + "epoch": 0.96, + "grad_norm": 1.4155063754897228, + "learning_rate": 7.159629772391485e-08, + "loss": 0.1902, + "step": 18938 + }, + { + "epoch": 0.96, + "grad_norm": 0.9655177285562941, + "learning_rate": 7.139970406734753e-08, + "loss": 0.1753, + "step": 18939 + }, + { + "epoch": 0.96, + "grad_norm": 1.118473850390554, + "learning_rate": 7.120337972462365e-08, + "loss": 0.1529, + "step": 18940 + }, + { + "epoch": 0.96, + "grad_norm": 1.241518882169247, + "learning_rate": 7.100732470107008e-08, + "loss": 0.156, + "step": 18941 + }, + { + "epoch": 0.96, + "grad_norm": 0.913325622858883, + "learning_rate": 7.081153900200255e-08, + "loss": 0.1557, + "step": 18942 + }, + { + "epoch": 0.96, + "grad_norm": 1.2350006356643224, + "learning_rate": 7.061602263273237e-08, + "loss": 0.172, + "step": 18943 + }, + { + "epoch": 0.96, + "grad_norm": 1.1409555515039296, + "learning_rate": 7.04207755985653e-08, + "loss": 0.159, + "step": 18944 + }, + { + "epoch": 0.96, + "grad_norm": 0.8621408823943831, + "learning_rate": 7.022579790479378e-08, + "loss": 0.1437, + "step": 18945 + }, + { + "epoch": 0.96, + "grad_norm": 0.8696956434949746, + "learning_rate": 7.003108955670911e-08, + "loss": 0.1808, + "step": 18946 + }, + { + "epoch": 0.96, + "grad_norm": 1.0370223435433659, + "learning_rate": 6.983665055959154e-08, + "loss": 0.1462, + "step": 18947 + }, + { + "epoch": 0.96, + "grad_norm": 1.0713365572057763, + "learning_rate": 6.964248091871683e-08, + "loss": 0.1703, + "step": 18948 + }, + { + "epoch": 0.96, + "grad_norm": 0.9865314911287251, + "learning_rate": 6.944858063934967e-08, + "loss": 0.1432, + "step": 18949 + }, + { + "epoch": 0.96, + "grad_norm": 1.0416169563655078, + "learning_rate": 6.925494972675029e-08, + "loss": 0.1425, + "step": 18950 + }, + { + "epoch": 0.96, + "grad_norm": 1.2588921376373747, + "learning_rate": 6.906158818617226e-08, + "loss": 0.1717, + "step": 18951 + }, + { + "epoch": 0.96, + "grad_norm": 1.0330787653455085, + "learning_rate": 6.886849602285916e-08, + "loss": 0.1566, + "step": 18952 + }, + { + "epoch": 0.96, + "grad_norm": 1.0131289400465249, + "learning_rate": 6.867567324204905e-08, + "loss": 0.1638, + "step": 18953 + }, + { + "epoch": 0.96, + "grad_norm": 0.8849718675951277, + "learning_rate": 6.848311984897216e-08, + "loss": 0.1354, + "step": 18954 + }, + { + "epoch": 0.96, + "grad_norm": 1.2590838070351569, + "learning_rate": 6.82908358488521e-08, + "loss": 0.1656, + "step": 18955 + }, + { + "epoch": 0.96, + "grad_norm": 2.542926066213727, + "learning_rate": 6.809882124690358e-08, + "loss": 0.1639, + "step": 18956 + }, + { + "epoch": 0.96, + "grad_norm": 1.3956886472564511, + "learning_rate": 6.79070760483358e-08, + "loss": 0.1853, + "step": 18957 + }, + { + "epoch": 0.96, + "grad_norm": 1.3101703160806484, + "learning_rate": 6.771560025834901e-08, + "loss": 0.1963, + "step": 18958 + }, + { + "epoch": 0.96, + "grad_norm": 1.5623705229321303, + "learning_rate": 6.752439388213682e-08, + "loss": 0.1965, + "step": 18959 + }, + { + "epoch": 0.96, + "grad_norm": 1.12056540940211, + "learning_rate": 6.733345692488736e-08, + "loss": 0.1622, + "step": 18960 + }, + { + "epoch": 0.96, + "grad_norm": 1.0543695819488192, + "learning_rate": 6.714278939177976e-08, + "loss": 0.1504, + "step": 18961 + }, + { + "epoch": 0.96, + "grad_norm": 1.2197686288091252, + "learning_rate": 6.695239128798325e-08, + "loss": 0.1545, + "step": 18962 + }, + { + "epoch": 0.96, + "grad_norm": 0.819372886682206, + "learning_rate": 6.676226261866591e-08, + "loss": 0.1587, + "step": 18963 + }, + { + "epoch": 0.96, + "grad_norm": 1.1616646556897137, + "learning_rate": 6.657240338898141e-08, + "loss": 0.1658, + "step": 18964 + }, + { + "epoch": 0.96, + "grad_norm": 1.4269534756249098, + "learning_rate": 6.638281360408339e-08, + "loss": 0.1673, + "step": 18965 + }, + { + "epoch": 0.96, + "grad_norm": 1.578288524551596, + "learning_rate": 6.619349326911218e-08, + "loss": 0.143, + "step": 18966 + }, + { + "epoch": 0.96, + "grad_norm": 1.2967738604121724, + "learning_rate": 6.600444238920256e-08, + "loss": 0.1589, + "step": 18967 + }, + { + "epoch": 0.96, + "grad_norm": 0.9378892547102705, + "learning_rate": 6.581566096948377e-08, + "loss": 0.1514, + "step": 18968 + }, + { + "epoch": 0.96, + "grad_norm": 1.0222674207787896, + "learning_rate": 6.562714901507616e-08, + "loss": 0.1551, + "step": 18969 + }, + { + "epoch": 0.96, + "grad_norm": 1.4522802005044373, + "learning_rate": 6.543890653109341e-08, + "loss": 0.1993, + "step": 18970 + }, + { + "epoch": 0.96, + "grad_norm": 1.9929201580900462, + "learning_rate": 6.525093352264145e-08, + "loss": 0.1565, + "step": 18971 + }, + { + "epoch": 0.96, + "grad_norm": 1.2075074305444102, + "learning_rate": 6.506322999481951e-08, + "loss": 0.1568, + "step": 18972 + }, + { + "epoch": 0.96, + "grad_norm": 0.9119396547683619, + "learning_rate": 6.487579595271798e-08, + "loss": 0.1385, + "step": 18973 + }, + { + "epoch": 0.96, + "grad_norm": 0.9468835404050955, + "learning_rate": 6.468863140142279e-08, + "loss": 0.1522, + "step": 18974 + }, + { + "epoch": 0.96, + "grad_norm": 1.0360263939289716, + "learning_rate": 6.450173634600876e-08, + "loss": 0.1544, + "step": 18975 + }, + { + "epoch": 0.96, + "grad_norm": 0.9384445234929053, + "learning_rate": 6.431511079154517e-08, + "loss": 0.1421, + "step": 18976 + }, + { + "epoch": 0.97, + "grad_norm": 0.9717830054475515, + "learning_rate": 6.412875474309688e-08, + "loss": 0.143, + "step": 18977 + }, + { + "epoch": 0.97, + "grad_norm": 1.242839667500306, + "learning_rate": 6.394266820571649e-08, + "loss": 0.1657, + "step": 18978 + }, + { + "epoch": 0.97, + "grad_norm": 0.8385638418607924, + "learning_rate": 6.375685118445329e-08, + "loss": 0.1526, + "step": 18979 + }, + { + "epoch": 0.97, + "grad_norm": 1.0274904096006774, + "learning_rate": 6.35713036843455e-08, + "loss": 0.1462, + "step": 18980 + }, + { + "epoch": 0.97, + "grad_norm": 0.955038906916783, + "learning_rate": 6.338602571042795e-08, + "loss": 0.1813, + "step": 18981 + }, + { + "epoch": 0.97, + "grad_norm": 1.4956850451648465, + "learning_rate": 6.320101726772665e-08, + "loss": 0.1594, + "step": 18982 + }, + { + "epoch": 0.97, + "grad_norm": 1.2377378344046592, + "learning_rate": 6.301627836125757e-08, + "loss": 0.1779, + "step": 18983 + }, + { + "epoch": 0.97, + "grad_norm": 0.9260561719978769, + "learning_rate": 6.283180899603447e-08, + "loss": 0.171, + "step": 18984 + }, + { + "epoch": 0.97, + "grad_norm": 0.9400856349983138, + "learning_rate": 6.264760917705782e-08, + "loss": 0.1605, + "step": 18985 + }, + { + "epoch": 0.97, + "grad_norm": 1.4391237828833978, + "learning_rate": 6.246367890932803e-08, + "loss": 0.1747, + "step": 18986 + }, + { + "epoch": 0.97, + "grad_norm": 1.0650353212909858, + "learning_rate": 6.228001819783113e-08, + "loss": 0.1503, + "step": 18987 + }, + { + "epoch": 0.97, + "grad_norm": 1.2264828665242962, + "learning_rate": 6.209662704754982e-08, + "loss": 0.1679, + "step": 18988 + }, + { + "epoch": 0.97, + "grad_norm": 1.2608897282732527, + "learning_rate": 6.191350546346008e-08, + "loss": 0.1607, + "step": 18989 + }, + { + "epoch": 0.97, + "grad_norm": 1.0195823417707264, + "learning_rate": 6.173065345052687e-08, + "loss": 0.1584, + "step": 18990 + }, + { + "epoch": 0.97, + "grad_norm": 3.3217468456080654, + "learning_rate": 6.154807101371063e-08, + "loss": 0.1415, + "step": 18991 + }, + { + "epoch": 0.97, + "grad_norm": 1.3982193219953636, + "learning_rate": 6.136575815796519e-08, + "loss": 0.1604, + "step": 18992 + }, + { + "epoch": 0.97, + "grad_norm": 1.039154864355294, + "learning_rate": 6.118371488823439e-08, + "loss": 0.171, + "step": 18993 + }, + { + "epoch": 0.97, + "grad_norm": 1.449132551538369, + "learning_rate": 6.100194120945645e-08, + "loss": 0.185, + "step": 18994 + }, + { + "epoch": 0.97, + "grad_norm": 1.7339355163445271, + "learning_rate": 6.082043712656305e-08, + "loss": 0.1715, + "step": 18995 + }, + { + "epoch": 0.97, + "grad_norm": 2.6051479995279934, + "learning_rate": 6.063920264447576e-08, + "loss": 0.1468, + "step": 18996 + }, + { + "epoch": 0.97, + "grad_norm": 1.0033063919052598, + "learning_rate": 6.045823776811177e-08, + "loss": 0.1525, + "step": 18997 + }, + { + "epoch": 0.97, + "grad_norm": 0.9764565540340898, + "learning_rate": 6.027754250237938e-08, + "loss": 0.1755, + "step": 18998 + }, + { + "epoch": 0.97, + "grad_norm": 1.3922419999429214, + "learning_rate": 6.009711685218134e-08, + "loss": 0.1729, + "step": 18999 + }, + { + "epoch": 0.97, + "grad_norm": 1.1355283592252898, + "learning_rate": 5.991696082240817e-08, + "loss": 0.1741, + "step": 19000 + }, + { + "epoch": 0.97, + "grad_norm": 1.0226456454781145, + "learning_rate": 5.97370744179504e-08, + "loss": 0.1596, + "step": 19001 + }, + { + "epoch": 0.97, + "grad_norm": 1.022674255084776, + "learning_rate": 5.9557457643685257e-08, + "loss": 0.1534, + "step": 19002 + }, + { + "epoch": 0.97, + "grad_norm": 1.3787117157844775, + "learning_rate": 5.93781105044855e-08, + "loss": 0.1637, + "step": 19003 + }, + { + "epoch": 0.97, + "grad_norm": 1.5550964475153704, + "learning_rate": 5.9199033005217233e-08, + "loss": 0.157, + "step": 19004 + }, + { + "epoch": 0.97, + "grad_norm": 1.2498322296452542, + "learning_rate": 5.9020225150735466e-08, + "loss": 0.1518, + "step": 19005 + }, + { + "epoch": 0.97, + "grad_norm": 2.5263437302645886, + "learning_rate": 5.8841686945891874e-08, + "loss": 0.1414, + "step": 19006 + }, + { + "epoch": 0.97, + "grad_norm": 1.0709694859747907, + "learning_rate": 5.866341839552814e-08, + "loss": 0.1782, + "step": 19007 + }, + { + "epoch": 0.97, + "grad_norm": 1.4903520322067012, + "learning_rate": 5.848541950448261e-08, + "loss": 0.1648, + "step": 19008 + }, + { + "epoch": 0.97, + "grad_norm": 1.9290190400309384, + "learning_rate": 5.830769027757921e-08, + "loss": 0.1842, + "step": 19009 + }, + { + "epoch": 0.97, + "grad_norm": 1.1113056250270656, + "learning_rate": 5.813023071964296e-08, + "loss": 0.1524, + "step": 19010 + }, + { + "epoch": 0.97, + "grad_norm": 0.8503388097141104, + "learning_rate": 5.795304083548559e-08, + "loss": 0.1451, + "step": 19011 + }, + { + "epoch": 0.97, + "grad_norm": 1.0071646211516418, + "learning_rate": 5.777612062991211e-08, + "loss": 0.1571, + "step": 19012 + }, + { + "epoch": 0.97, + "grad_norm": 0.9059092713987779, + "learning_rate": 5.759947010772426e-08, + "loss": 0.1576, + "step": 19013 + }, + { + "epoch": 0.97, + "grad_norm": 0.9766952791494546, + "learning_rate": 5.742308927371154e-08, + "loss": 0.1701, + "step": 19014 + }, + { + "epoch": 0.97, + "grad_norm": 0.9185439725383592, + "learning_rate": 5.7246978132659e-08, + "loss": 0.148, + "step": 19015 + }, + { + "epoch": 0.97, + "grad_norm": 1.419729476220642, + "learning_rate": 5.707113668934505e-08, + "loss": 0.1695, + "step": 19016 + }, + { + "epoch": 0.97, + "grad_norm": 1.1401721563452156, + "learning_rate": 5.6895564948536984e-08, + "loss": 0.1611, + "step": 19017 + }, + { + "epoch": 0.97, + "grad_norm": 1.053616248533265, + "learning_rate": 5.672026291499877e-08, + "loss": 0.1598, + "step": 19018 + }, + { + "epoch": 0.97, + "grad_norm": 1.463192697506729, + "learning_rate": 5.6545230593484376e-08, + "loss": 0.1678, + "step": 19019 + }, + { + "epoch": 0.97, + "grad_norm": 0.9548696767406545, + "learning_rate": 5.637046798874335e-08, + "loss": 0.1494, + "step": 19020 + }, + { + "epoch": 0.97, + "grad_norm": 1.0769999525229907, + "learning_rate": 5.619597510551411e-08, + "loss": 0.1516, + "step": 19021 + }, + { + "epoch": 0.97, + "grad_norm": 1.1269093531874508, + "learning_rate": 5.6021751948530656e-08, + "loss": 0.1603, + "step": 19022 + }, + { + "epoch": 0.97, + "grad_norm": 0.9860154062923643, + "learning_rate": 5.58477985225192e-08, + "loss": 0.1511, + "step": 19023 + }, + { + "epoch": 0.97, + "grad_norm": 1.3408468901522745, + "learning_rate": 5.567411483219709e-08, + "loss": 0.1514, + "step": 19024 + }, + { + "epoch": 0.97, + "grad_norm": 1.0965825124126323, + "learning_rate": 5.550070088227721e-08, + "loss": 0.1573, + "step": 19025 + }, + { + "epoch": 0.97, + "grad_norm": 1.023824284466048, + "learning_rate": 5.532755667746248e-08, + "loss": 0.1946, + "step": 19026 + }, + { + "epoch": 0.97, + "grad_norm": 1.0445023365005794, + "learning_rate": 5.515468222244913e-08, + "loss": 0.1519, + "step": 19027 + }, + { + "epoch": 0.97, + "grad_norm": 0.7889943973151463, + "learning_rate": 5.4982077521926744e-08, + "loss": 0.1603, + "step": 19028 + }, + { + "epoch": 0.97, + "grad_norm": 1.113921879902848, + "learning_rate": 5.4809742580577144e-08, + "loss": 0.1807, + "step": 19029 + }, + { + "epoch": 0.97, + "grad_norm": 1.2384246609536174, + "learning_rate": 5.4637677403074355e-08, + "loss": 0.1507, + "step": 19030 + }, + { + "epoch": 0.97, + "grad_norm": 1.1053062410513481, + "learning_rate": 5.4465881994087976e-08, + "loss": 0.159, + "step": 19031 + }, + { + "epoch": 0.97, + "grad_norm": 1.317934675993351, + "learning_rate": 5.4294356358274294e-08, + "loss": 0.1587, + "step": 19032 + }, + { + "epoch": 0.97, + "grad_norm": 1.0281026923641639, + "learning_rate": 5.4123100500289574e-08, + "loss": 0.1613, + "step": 19033 + }, + { + "epoch": 0.97, + "grad_norm": 1.385724210989773, + "learning_rate": 5.395211442477677e-08, + "loss": 0.1577, + "step": 19034 + }, + { + "epoch": 0.97, + "grad_norm": 1.002683221703717, + "learning_rate": 5.3781398136374394e-08, + "loss": 0.1447, + "step": 19035 + }, + { + "epoch": 0.97, + "grad_norm": 1.3531188851630518, + "learning_rate": 5.3610951639714305e-08, + "loss": 0.1619, + "step": 19036 + }, + { + "epoch": 0.97, + "grad_norm": 1.7890219249827146, + "learning_rate": 5.344077493941835e-08, + "loss": 0.1771, + "step": 19037 + }, + { + "epoch": 0.97, + "grad_norm": 1.1012008101422308, + "learning_rate": 5.327086804010284e-08, + "loss": 0.1614, + "step": 19038 + }, + { + "epoch": 0.97, + "grad_norm": 1.0428208001568597, + "learning_rate": 5.3101230946376314e-08, + "loss": 0.1754, + "step": 19039 + }, + { + "epoch": 0.97, + "grad_norm": 1.026144332646727, + "learning_rate": 5.2931863662841755e-08, + "loss": 0.1688, + "step": 19040 + }, + { + "epoch": 0.97, + "grad_norm": 1.603608003297128, + "learning_rate": 5.276276619409215e-08, + "loss": 0.1589, + "step": 19041 + }, + { + "epoch": 0.97, + "grad_norm": 1.1969975578810148, + "learning_rate": 5.259393854471384e-08, + "loss": 0.1777, + "step": 19042 + }, + { + "epoch": 0.97, + "grad_norm": 1.0041848312681114, + "learning_rate": 5.242538071928649e-08, + "loss": 0.174, + "step": 19043 + }, + { + "epoch": 0.97, + "grad_norm": 1.2724602233865854, + "learning_rate": 5.225709272238311e-08, + "loss": 0.1433, + "step": 19044 + }, + { + "epoch": 0.97, + "grad_norm": 1.0626391350708608, + "learning_rate": 5.208907455856782e-08, + "loss": 0.1853, + "step": 19045 + }, + { + "epoch": 0.97, + "grad_norm": 0.9280447894665182, + "learning_rate": 5.19213262323981e-08, + "loss": 0.1554, + "step": 19046 + }, + { + "epoch": 0.97, + "grad_norm": 1.4995114449645333, + "learning_rate": 5.1753847748424735e-08, + "loss": 0.1609, + "step": 19047 + }, + { + "epoch": 0.97, + "grad_norm": 0.8714223222188999, + "learning_rate": 5.158663911118966e-08, + "loss": 0.1459, + "step": 19048 + }, + { + "epoch": 0.97, + "grad_norm": 1.1422759045113517, + "learning_rate": 5.141970032522925e-08, + "loss": 0.1769, + "step": 19049 + }, + { + "epoch": 0.97, + "grad_norm": 0.9078013270129156, + "learning_rate": 5.1253031395070985e-08, + "loss": 0.1561, + "step": 19050 + }, + { + "epoch": 0.97, + "grad_norm": 1.4725725563008483, + "learning_rate": 5.1086632325236804e-08, + "loss": 0.177, + "step": 19051 + }, + { + "epoch": 0.97, + "grad_norm": 1.0447898313350958, + "learning_rate": 5.0920503120239775e-08, + "loss": 0.1582, + "step": 19052 + }, + { + "epoch": 0.97, + "grad_norm": 2.403622306022244, + "learning_rate": 5.075464378458517e-08, + "loss": 0.1592, + "step": 19053 + }, + { + "epoch": 0.97, + "grad_norm": 1.2749760646824433, + "learning_rate": 5.0589054322772725e-08, + "loss": 0.1795, + "step": 19054 + }, + { + "epoch": 0.97, + "grad_norm": 1.151845211047926, + "learning_rate": 5.04237347392944e-08, + "loss": 0.1592, + "step": 19055 + }, + { + "epoch": 0.97, + "grad_norm": 1.0014197338720585, + "learning_rate": 5.0258685038634406e-08, + "loss": 0.151, + "step": 19056 + }, + { + "epoch": 0.97, + "grad_norm": 1.381306982875491, + "learning_rate": 5.009390522526914e-08, + "loss": 0.1544, + "step": 19057 + }, + { + "epoch": 0.97, + "grad_norm": 1.6958918491548332, + "learning_rate": 4.992939530366947e-08, + "loss": 0.1547, + "step": 19058 + }, + { + "epoch": 0.97, + "grad_norm": 2.4727261828084512, + "learning_rate": 4.9765155278296284e-08, + "loss": 0.1544, + "step": 19059 + }, + { + "epoch": 0.97, + "grad_norm": 2.328137183771419, + "learning_rate": 4.96011851536049e-08, + "loss": 0.169, + "step": 19060 + }, + { + "epoch": 0.97, + "grad_norm": 1.162867331490707, + "learning_rate": 4.9437484934043987e-08, + "loss": 0.1507, + "step": 19061 + }, + { + "epoch": 0.97, + "grad_norm": 0.9319785655569531, + "learning_rate": 4.927405462405332e-08, + "loss": 0.1596, + "step": 19062 + }, + { + "epoch": 0.97, + "grad_norm": 1.1297107255840486, + "learning_rate": 4.911089422806603e-08, + "loss": 0.1448, + "step": 19063 + }, + { + "epoch": 0.97, + "grad_norm": 1.0959641369353819, + "learning_rate": 4.8948003750507455e-08, + "loss": 0.1556, + "step": 19064 + }, + { + "epoch": 0.97, + "grad_norm": 1.4195341111796325, + "learning_rate": 4.878538319579629e-08, + "loss": 0.1801, + "step": 19065 + }, + { + "epoch": 0.97, + "grad_norm": 1.3786838067360543, + "learning_rate": 4.862303256834344e-08, + "loss": 0.173, + "step": 19066 + }, + { + "epoch": 0.97, + "grad_norm": 2.551126828679602, + "learning_rate": 4.846095187255318e-08, + "loss": 0.1725, + "step": 19067 + }, + { + "epoch": 0.97, + "grad_norm": 1.0936716110028684, + "learning_rate": 4.8299141112821966e-08, + "loss": 0.1677, + "step": 19068 + }, + { + "epoch": 0.97, + "grad_norm": 1.1227017299371511, + "learning_rate": 4.8137600293538536e-08, + "loss": 0.1773, + "step": 19069 + }, + { + "epoch": 0.97, + "grad_norm": 1.021515630146002, + "learning_rate": 4.7976329419084924e-08, + "loss": 0.1722, + "step": 19070 + }, + { + "epoch": 0.97, + "grad_norm": 1.1159643409886524, + "learning_rate": 4.781532849383541e-08, + "loss": 0.1559, + "step": 19071 + }, + { + "epoch": 0.97, + "grad_norm": 2.508985334043882, + "learning_rate": 4.765459752215651e-08, + "loss": 0.1624, + "step": 19072 + }, + { + "epoch": 0.97, + "grad_norm": 1.4314591906020395, + "learning_rate": 4.749413650840917e-08, + "loss": 0.1639, + "step": 19073 + }, + { + "epoch": 0.97, + "grad_norm": 1.1081591206295462, + "learning_rate": 4.7333945456945474e-08, + "loss": 0.1629, + "step": 19074 + }, + { + "epoch": 0.97, + "grad_norm": 0.9484977524062326, + "learning_rate": 4.717402437211083e-08, + "loss": 0.1546, + "step": 19075 + }, + { + "epoch": 0.97, + "grad_norm": 1.032321978750882, + "learning_rate": 4.701437325824287e-08, + "loss": 0.1548, + "step": 19076 + }, + { + "epoch": 0.97, + "grad_norm": 1.0338695817692656, + "learning_rate": 4.6854992119671484e-08, + "loss": 0.1546, + "step": 19077 + }, + { + "epoch": 0.97, + "grad_norm": 0.8415263499415587, + "learning_rate": 4.6695880960722085e-08, + "loss": 0.1634, + "step": 19078 + }, + { + "epoch": 0.97, + "grad_norm": 2.1299997587136064, + "learning_rate": 4.653703978570789e-08, + "loss": 0.1507, + "step": 19079 + }, + { + "epoch": 0.97, + "grad_norm": 3.050521335679299, + "learning_rate": 4.6378468598938794e-08, + "loss": 0.1812, + "step": 19080 + }, + { + "epoch": 0.97, + "grad_norm": 1.361031673379913, + "learning_rate": 4.62201674047158e-08, + "loss": 0.1644, + "step": 19081 + }, + { + "epoch": 0.97, + "grad_norm": 0.9886945750398041, + "learning_rate": 4.606213620733324e-08, + "loss": 0.1619, + "step": 19082 + }, + { + "epoch": 0.97, + "grad_norm": 1.1377023965397417, + "learning_rate": 4.59043750110777e-08, + "loss": 0.1677, + "step": 19083 + }, + { + "epoch": 0.97, + "grad_norm": 1.3184405498120437, + "learning_rate": 4.574688382022796e-08, + "loss": 0.1714, + "step": 19084 + }, + { + "epoch": 0.97, + "grad_norm": 0.8280553700474405, + "learning_rate": 4.5589662639056175e-08, + "loss": 0.1735, + "step": 19085 + }, + { + "epoch": 0.97, + "grad_norm": 1.005031500161824, + "learning_rate": 4.5432711471826704e-08, + "loss": 0.159, + "step": 19086 + }, + { + "epoch": 0.97, + "grad_norm": 1.1721295729425816, + "learning_rate": 4.527603032279726e-08, + "loss": 0.1513, + "step": 19087 + }, + { + "epoch": 0.97, + "grad_norm": 1.7713609394550736, + "learning_rate": 4.511961919621888e-08, + "loss": 0.1603, + "step": 19088 + }, + { + "epoch": 0.97, + "grad_norm": 4.941106674306136, + "learning_rate": 4.496347809633261e-08, + "loss": 0.1454, + "step": 19089 + }, + { + "epoch": 0.97, + "grad_norm": 0.824939947863532, + "learning_rate": 4.480760702737286e-08, + "loss": 0.137, + "step": 19090 + }, + { + "epoch": 0.97, + "grad_norm": 1.275105006746803, + "learning_rate": 4.465200599356956e-08, + "loss": 0.1616, + "step": 19091 + }, + { + "epoch": 0.97, + "grad_norm": 1.1215004426931332, + "learning_rate": 4.4496674999143786e-08, + "loss": 0.1473, + "step": 19092 + }, + { + "epoch": 0.97, + "grad_norm": 1.4440259031152056, + "learning_rate": 4.434161404830772e-08, + "loss": 0.1563, + "step": 19093 + }, + { + "epoch": 0.97, + "grad_norm": 2.7954277937536793, + "learning_rate": 4.41868231452669e-08, + "loss": 0.1615, + "step": 19094 + }, + { + "epoch": 0.97, + "grad_norm": 1.1079319217667891, + "learning_rate": 4.40323022942224e-08, + "loss": 0.1961, + "step": 19095 + }, + { + "epoch": 0.97, + "grad_norm": 1.3025911019288117, + "learning_rate": 4.387805149936197e-08, + "loss": 0.1534, + "step": 19096 + }, + { + "epoch": 0.97, + "grad_norm": 0.9653298618682854, + "learning_rate": 4.3724070764873396e-08, + "loss": 0.1516, + "step": 19097 + }, + { + "epoch": 0.97, + "grad_norm": 1.1497810602577705, + "learning_rate": 4.35703600949311e-08, + "loss": 0.158, + "step": 19098 + }, + { + "epoch": 0.97, + "grad_norm": 0.9031827811840533, + "learning_rate": 4.341691949370508e-08, + "loss": 0.1581, + "step": 19099 + }, + { + "epoch": 0.97, + "grad_norm": 1.0561575396976002, + "learning_rate": 4.326374896535757e-08, + "loss": 0.1816, + "step": 19100 + }, + { + "epoch": 0.97, + "grad_norm": 1.2191283042704604, + "learning_rate": 4.311084851404301e-08, + "loss": 0.1577, + "step": 19101 + }, + { + "epoch": 0.97, + "grad_norm": 1.7094001015154057, + "learning_rate": 4.2958218143909214e-08, + "loss": 0.1614, + "step": 19102 + }, + { + "epoch": 0.97, + "grad_norm": 1.0578646131166962, + "learning_rate": 4.280585785909619e-08, + "loss": 0.1417, + "step": 19103 + }, + { + "epoch": 0.97, + "grad_norm": 0.9456307144413473, + "learning_rate": 4.265376766373619e-08, + "loss": 0.1486, + "step": 19104 + }, + { + "epoch": 0.97, + "grad_norm": 0.9368889064225784, + "learning_rate": 4.2501947561955914e-08, + "loss": 0.1611, + "step": 19105 + }, + { + "epoch": 0.97, + "grad_norm": 0.9330791875403646, + "learning_rate": 4.235039755787318e-08, + "loss": 0.1539, + "step": 19106 + }, + { + "epoch": 0.97, + "grad_norm": 0.9165604635550397, + "learning_rate": 4.2199117655596924e-08, + "loss": 0.1688, + "step": 19107 + }, + { + "epoch": 0.97, + "grad_norm": 0.9110528720041452, + "learning_rate": 4.204810785923275e-08, + "loss": 0.1742, + "step": 19108 + }, + { + "epoch": 0.97, + "grad_norm": 1.9730364455869631, + "learning_rate": 4.1897368172875156e-08, + "loss": 0.146, + "step": 19109 + }, + { + "epoch": 0.97, + "grad_norm": 1.5814289522865332, + "learning_rate": 4.174689860061532e-08, + "loss": 0.1815, + "step": 19110 + }, + { + "epoch": 0.97, + "grad_norm": 1.543582728055052, + "learning_rate": 4.159669914653219e-08, + "loss": 0.1706, + "step": 19111 + }, + { + "epoch": 0.97, + "grad_norm": 1.1285363465670573, + "learning_rate": 4.144676981470142e-08, + "loss": 0.1716, + "step": 19112 + }, + { + "epoch": 0.97, + "grad_norm": 1.3445171338848143, + "learning_rate": 4.1297110609189726e-08, + "loss": 0.1576, + "step": 19113 + }, + { + "epoch": 0.97, + "grad_norm": 0.8753483217197732, + "learning_rate": 4.1147721534056106e-08, + "loss": 0.1479, + "step": 19114 + }, + { + "epoch": 0.97, + "grad_norm": 0.9138793875980022, + "learning_rate": 4.099860259335287e-08, + "loss": 0.1736, + "step": 19115 + }, + { + "epoch": 0.97, + "grad_norm": 1.1239955889645314, + "learning_rate": 4.084975379112566e-08, + "loss": 0.1492, + "step": 19116 + }, + { + "epoch": 0.97, + "grad_norm": 0.875353207317943, + "learning_rate": 4.070117513141014e-08, + "loss": 0.1633, + "step": 19117 + }, + { + "epoch": 0.97, + "grad_norm": 0.9480898716991211, + "learning_rate": 4.055286661823976e-08, + "loss": 0.1524, + "step": 19118 + }, + { + "epoch": 0.97, + "grad_norm": 1.0478210911230053, + "learning_rate": 4.040482825563352e-08, + "loss": 0.1739, + "step": 19119 + }, + { + "epoch": 0.97, + "grad_norm": 0.9717161847311233, + "learning_rate": 4.025706004760932e-08, + "loss": 0.1709, + "step": 19120 + }, + { + "epoch": 0.97, + "grad_norm": 0.9507746435779297, + "learning_rate": 4.010956199817506e-08, + "loss": 0.1628, + "step": 19121 + }, + { + "epoch": 0.97, + "grad_norm": 0.8743938983011655, + "learning_rate": 3.996233411133199e-08, + "loss": 0.1664, + "step": 19122 + }, + { + "epoch": 0.97, + "grad_norm": 1.0571851837849218, + "learning_rate": 3.981537639107247e-08, + "loss": 0.166, + "step": 19123 + }, + { + "epoch": 0.97, + "grad_norm": 1.0122948012698287, + "learning_rate": 3.966868884138442e-08, + "loss": 0.1588, + "step": 19124 + }, + { + "epoch": 0.97, + "grad_norm": 0.8442422327482219, + "learning_rate": 3.952227146624465e-08, + "loss": 0.161, + "step": 19125 + }, + { + "epoch": 0.97, + "grad_norm": 1.0852162833362355, + "learning_rate": 3.937612426962556e-08, + "loss": 0.1804, + "step": 19126 + }, + { + "epoch": 0.97, + "grad_norm": 0.9181077560303157, + "learning_rate": 3.923024725549285e-08, + "loss": 0.164, + "step": 19127 + }, + { + "epoch": 0.97, + "grad_norm": 1.2820486158378837, + "learning_rate": 3.908464042780114e-08, + "loss": 0.173, + "step": 19128 + }, + { + "epoch": 0.97, + "grad_norm": 1.136925404121932, + "learning_rate": 3.8939303790501706e-08, + "loss": 0.1761, + "step": 19129 + }, + { + "epoch": 0.97, + "grad_norm": 1.1490818691885378, + "learning_rate": 3.879423734753585e-08, + "loss": 0.1519, + "step": 19130 + }, + { + "epoch": 0.97, + "grad_norm": 1.4095179194478367, + "learning_rate": 3.864944110284041e-08, + "loss": 0.1591, + "step": 19131 + }, + { + "epoch": 0.97, + "grad_norm": 1.0354443872710228, + "learning_rate": 3.850491506034004e-08, + "loss": 0.1597, + "step": 19132 + }, + { + "epoch": 0.97, + "grad_norm": 0.927952075094008, + "learning_rate": 3.8360659223957155e-08, + "loss": 0.1622, + "step": 19133 + }, + { + "epoch": 0.97, + "grad_norm": 1.0288648343438391, + "learning_rate": 3.821667359760306e-08, + "loss": 0.1674, + "step": 19134 + }, + { + "epoch": 0.97, + "grad_norm": 0.9923904976945497, + "learning_rate": 3.8072958185184635e-08, + "loss": 0.1688, + "step": 19135 + }, + { + "epoch": 0.97, + "grad_norm": 0.9784718164833021, + "learning_rate": 3.7929512990600995e-08, + "loss": 0.1649, + "step": 19136 + }, + { + "epoch": 0.97, + "grad_norm": 12.11685222340869, + "learning_rate": 3.778633801774123e-08, + "loss": 0.1644, + "step": 19137 + }, + { + "epoch": 0.97, + "grad_norm": 0.8243294680976933, + "learning_rate": 3.764343327048892e-08, + "loss": 0.1577, + "step": 19138 + }, + { + "epoch": 0.97, + "grad_norm": 0.9167141022086956, + "learning_rate": 3.750079875272206e-08, + "loss": 0.1739, + "step": 19139 + }, + { + "epoch": 0.97, + "grad_norm": 1.072275320165148, + "learning_rate": 3.735843446830867e-08, + "loss": 0.1399, + "step": 19140 + }, + { + "epoch": 0.97, + "grad_norm": 0.9768691762964762, + "learning_rate": 3.721634042111011e-08, + "loss": 0.165, + "step": 19141 + }, + { + "epoch": 0.97, + "grad_norm": 0.838464973730494, + "learning_rate": 3.707451661498107e-08, + "loss": 0.1578, + "step": 19142 + }, + { + "epoch": 0.97, + "grad_norm": 0.9608964549999129, + "learning_rate": 3.693296305376959e-08, + "loss": 0.1492, + "step": 19143 + }, + { + "epoch": 0.97, + "grad_norm": 1.228769947210927, + "learning_rate": 3.679167974131259e-08, + "loss": 0.1453, + "step": 19144 + }, + { + "epoch": 0.97, + "grad_norm": 1.1040416367837809, + "learning_rate": 3.665066668144479e-08, + "loss": 0.1522, + "step": 19145 + }, + { + "epoch": 0.97, + "grad_norm": 0.8744148989593945, + "learning_rate": 3.65099238779909e-08, + "loss": 0.1632, + "step": 19146 + }, + { + "epoch": 0.97, + "grad_norm": 0.922122532580692, + "learning_rate": 3.636945133476677e-08, + "loss": 0.1846, + "step": 19147 + }, + { + "epoch": 0.97, + "grad_norm": 1.0865718018611439, + "learning_rate": 3.622924905558489e-08, + "loss": 0.1593, + "step": 19148 + }, + { + "epoch": 0.97, + "grad_norm": 0.973217513769555, + "learning_rate": 3.608931704424778e-08, + "loss": 0.1602, + "step": 19149 + }, + { + "epoch": 0.97, + "grad_norm": 1.2178422284953592, + "learning_rate": 3.5949655304550193e-08, + "loss": 0.1646, + "step": 19150 + }, + { + "epoch": 0.97, + "grad_norm": 0.8812935813974213, + "learning_rate": 3.581026384028019e-08, + "loss": 0.1523, + "step": 19151 + }, + { + "epoch": 0.97, + "grad_norm": 3.8532607015421814, + "learning_rate": 3.567114265522031e-08, + "loss": 0.1392, + "step": 19152 + }, + { + "epoch": 0.97, + "grad_norm": 1.116891255212077, + "learning_rate": 3.55322917531431e-08, + "loss": 0.161, + "step": 19153 + }, + { + "epoch": 0.97, + "grad_norm": 1.0735371441706794, + "learning_rate": 3.539371113781553e-08, + "loss": 0.1443, + "step": 19154 + }, + { + "epoch": 0.97, + "grad_norm": 0.995049097027222, + "learning_rate": 3.525540081299683e-08, + "loss": 0.1607, + "step": 19155 + }, + { + "epoch": 0.97, + "grad_norm": 1.372855437481362, + "learning_rate": 3.511736078243844e-08, + "loss": 0.1941, + "step": 19156 + }, + { + "epoch": 0.97, + "grad_norm": 0.9548197354253422, + "learning_rate": 3.497959104988291e-08, + "loss": 0.1552, + "step": 19157 + }, + { + "epoch": 0.97, + "grad_norm": 1.171880449888493, + "learning_rate": 3.4842091619070594e-08, + "loss": 0.1554, + "step": 19158 + }, + { + "epoch": 0.97, + "grad_norm": 1.0613535166121013, + "learning_rate": 3.470486249372851e-08, + "loss": 0.1488, + "step": 19159 + }, + { + "epoch": 0.97, + "grad_norm": 1.0495709235018573, + "learning_rate": 3.456790367757923e-08, + "loss": 0.1404, + "step": 19160 + }, + { + "epoch": 0.97, + "grad_norm": 0.9657074899743266, + "learning_rate": 3.4431215174338675e-08, + "loss": 0.1589, + "step": 19161 + }, + { + "epoch": 0.97, + "grad_norm": 1.615432278184323, + "learning_rate": 3.429479698771499e-08, + "loss": 0.1848, + "step": 19162 + }, + { + "epoch": 0.97, + "grad_norm": 0.8700346511067772, + "learning_rate": 3.415864912140743e-08, + "loss": 0.143, + "step": 19163 + }, + { + "epoch": 0.97, + "grad_norm": 0.998348237566735, + "learning_rate": 3.402277157910971e-08, + "loss": 0.1488, + "step": 19164 + }, + { + "epoch": 0.97, + "grad_norm": 1.1098996119917584, + "learning_rate": 3.3887164364506676e-08, + "loss": 0.156, + "step": 19165 + }, + { + "epoch": 0.97, + "grad_norm": 0.9244051009838743, + "learning_rate": 3.375182748127759e-08, + "loss": 0.1975, + "step": 19166 + }, + { + "epoch": 0.97, + "grad_norm": 1.0719850242465663, + "learning_rate": 3.361676093309285e-08, + "loss": 0.1649, + "step": 19167 + }, + { + "epoch": 0.97, + "grad_norm": 1.4384884218914096, + "learning_rate": 3.34819647236162e-08, + "loss": 0.1911, + "step": 19168 + }, + { + "epoch": 0.97, + "grad_norm": 1.6804235632026177, + "learning_rate": 3.334743885650471e-08, + "loss": 0.1742, + "step": 19169 + }, + { + "epoch": 0.97, + "grad_norm": 1.5391305538864972, + "learning_rate": 3.321318333540546e-08, + "loss": 0.1745, + "step": 19170 + }, + { + "epoch": 0.97, + "grad_norm": 0.821442405909109, + "learning_rate": 3.307919816396332e-08, + "loss": 0.1707, + "step": 19171 + }, + { + "epoch": 0.97, + "grad_norm": 0.9376385559888619, + "learning_rate": 3.294548334580982e-08, + "loss": 0.1571, + "step": 19172 + }, + { + "epoch": 0.97, + "grad_norm": 1.0596456111225416, + "learning_rate": 3.2812038884573185e-08, + "loss": 0.1722, + "step": 19173 + }, + { + "epoch": 0.98, + "grad_norm": 1.512084876890349, + "learning_rate": 3.267886478387383e-08, + "loss": 0.1509, + "step": 19174 + }, + { + "epoch": 0.98, + "grad_norm": 0.9142596774458099, + "learning_rate": 3.254596104732222e-08, + "loss": 0.1706, + "step": 19175 + }, + { + "epoch": 0.98, + "grad_norm": 1.227178005224331, + "learning_rate": 3.241332767852545e-08, + "loss": 0.1699, + "step": 19176 + }, + { + "epoch": 0.98, + "grad_norm": 0.988269507234306, + "learning_rate": 3.228096468107844e-08, + "loss": 0.1521, + "step": 19177 + }, + { + "epoch": 0.98, + "grad_norm": 1.0376160910233094, + "learning_rate": 3.214887205857387e-08, + "loss": 0.1781, + "step": 19178 + }, + { + "epoch": 0.98, + "grad_norm": 1.0444876319228076, + "learning_rate": 3.201704981459441e-08, + "loss": 0.1569, + "step": 19179 + }, + { + "epoch": 0.98, + "grad_norm": 1.151779282207466, + "learning_rate": 3.188549795271612e-08, + "loss": 0.1567, + "step": 19180 + }, + { + "epoch": 0.98, + "grad_norm": 1.1160958146018711, + "learning_rate": 3.175421647650612e-08, + "loss": 0.1439, + "step": 19181 + }, + { + "epoch": 0.98, + "grad_norm": 1.1713617665246183, + "learning_rate": 3.1623205389526015e-08, + "loss": 0.1822, + "step": 19182 + }, + { + "epoch": 0.98, + "grad_norm": 1.1129384740967305, + "learning_rate": 3.1492464695328517e-08, + "loss": 0.1473, + "step": 19183 + }, + { + "epoch": 0.98, + "grad_norm": 1.1710919498361196, + "learning_rate": 3.1361994397463015e-08, + "loss": 0.1641, + "step": 19184 + }, + { + "epoch": 0.98, + "grad_norm": 2.47322006099083, + "learning_rate": 3.123179449946445e-08, + "loss": 0.1597, + "step": 19185 + }, + { + "epoch": 0.98, + "grad_norm": 1.0377026560653144, + "learning_rate": 3.110186500486778e-08, + "loss": 0.1523, + "step": 19186 + }, + { + "epoch": 0.98, + "grad_norm": 0.8996411914981404, + "learning_rate": 3.097220591719574e-08, + "loss": 0.1602, + "step": 19187 + }, + { + "epoch": 0.98, + "grad_norm": 1.1501427842312555, + "learning_rate": 3.084281723996552e-08, + "loss": 0.1571, + "step": 19188 + }, + { + "epoch": 0.98, + "grad_norm": 1.0734279089738592, + "learning_rate": 3.071369897668652e-08, + "loss": 0.1622, + "step": 19189 + }, + { + "epoch": 0.98, + "grad_norm": 0.9380478409377784, + "learning_rate": 3.0584851130861516e-08, + "loss": 0.1438, + "step": 19190 + }, + { + "epoch": 0.98, + "grad_norm": 1.1629182446898398, + "learning_rate": 3.0456273705986585e-08, + "loss": 0.1703, + "step": 19191 + }, + { + "epoch": 0.98, + "grad_norm": 1.1467098491253804, + "learning_rate": 3.032796670554783e-08, + "loss": 0.166, + "step": 19192 + }, + { + "epoch": 0.98, + "grad_norm": 0.8112423150515589, + "learning_rate": 3.01999301330258e-08, + "loss": 0.1689, + "step": 19193 + }, + { + "epoch": 0.98, + "grad_norm": 0.9835760332721184, + "learning_rate": 3.007216399189328e-08, + "loss": 0.1427, + "step": 19194 + }, + { + "epoch": 0.98, + "grad_norm": 1.0386973083640156, + "learning_rate": 2.9944668285617486e-08, + "loss": 0.16, + "step": 19195 + }, + { + "epoch": 0.98, + "grad_norm": 1.2121770744729188, + "learning_rate": 2.981744301765454e-08, + "loss": 0.1702, + "step": 19196 + }, + { + "epoch": 0.98, + "grad_norm": 1.1948688625434023, + "learning_rate": 2.9690488191457256e-08, + "loss": 0.1715, + "step": 19197 + }, + { + "epoch": 0.98, + "grad_norm": 1.0920289287961265, + "learning_rate": 2.9563803810468417e-08, + "loss": 0.1407, + "step": 19198 + }, + { + "epoch": 0.98, + "grad_norm": 1.6118283072831598, + "learning_rate": 2.943738987812528e-08, + "loss": 0.1629, + "step": 19199 + }, + { + "epoch": 0.98, + "grad_norm": 1.0650245468422912, + "learning_rate": 2.9311246397855097e-08, + "loss": 0.1664, + "step": 19200 + }, + { + "epoch": 0.98, + "grad_norm": 0.9317474614007701, + "learning_rate": 2.9185373373080694e-08, + "loss": 0.1704, + "step": 19201 + }, + { + "epoch": 0.98, + "grad_norm": 1.0744841689521663, + "learning_rate": 2.9059770807217114e-08, + "loss": 0.1483, + "step": 19202 + }, + { + "epoch": 0.98, + "grad_norm": 1.0447025231148186, + "learning_rate": 2.893443870366941e-08, + "loss": 0.1695, + "step": 19203 + }, + { + "epoch": 0.98, + "grad_norm": 1.0831017244705572, + "learning_rate": 2.8809377065838195e-08, + "loss": 0.1731, + "step": 19204 + }, + { + "epoch": 0.98, + "grad_norm": 0.957580011722551, + "learning_rate": 2.8684585897116313e-08, + "loss": 0.1442, + "step": 19205 + }, + { + "epoch": 0.98, + "grad_norm": 1.1950503252036402, + "learning_rate": 2.856006520088772e-08, + "loss": 0.1447, + "step": 19206 + }, + { + "epoch": 0.98, + "grad_norm": 1.0911236067523062, + "learning_rate": 2.843581498053083e-08, + "loss": 0.154, + "step": 19207 + }, + { + "epoch": 0.98, + "grad_norm": 1.3110033762325044, + "learning_rate": 2.8311835239415166e-08, + "loss": 0.166, + "step": 19208 + }, + { + "epoch": 0.98, + "grad_norm": 0.8672953809905334, + "learning_rate": 2.8188125980904702e-08, + "loss": 0.144, + "step": 19209 + }, + { + "epoch": 0.98, + "grad_norm": 1.1167029052994197, + "learning_rate": 2.8064687208354534e-08, + "loss": 0.1766, + "step": 19210 + }, + { + "epoch": 0.98, + "grad_norm": 1.0134270323241488, + "learning_rate": 2.7941518925113098e-08, + "loss": 0.1565, + "step": 19211 + }, + { + "epoch": 0.98, + "grad_norm": 1.3869272247853248, + "learning_rate": 2.7818621134521052e-08, + "loss": 0.1532, + "step": 19212 + }, + { + "epoch": 0.98, + "grad_norm": 3.1589186608695665, + "learning_rate": 2.7695993839912394e-08, + "loss": 0.1417, + "step": 19213 + }, + { + "epoch": 0.98, + "grad_norm": 0.8894785662670377, + "learning_rate": 2.7573637044612245e-08, + "loss": 0.1661, + "step": 19214 + }, + { + "epoch": 0.98, + "grad_norm": 1.039413572033424, + "learning_rate": 2.7451550751941282e-08, + "loss": 0.1762, + "step": 19215 + }, + { + "epoch": 0.98, + "grad_norm": 1.1886792996172786, + "learning_rate": 2.7329734965210185e-08, + "loss": 0.1754, + "step": 19216 + }, + { + "epoch": 0.98, + "grad_norm": 0.9696007012494356, + "learning_rate": 2.7208189687722986e-08, + "loss": 0.1551, + "step": 19217 + }, + { + "epoch": 0.98, + "grad_norm": 1.037129191027604, + "learning_rate": 2.708691492277704e-08, + "loss": 0.1538, + "step": 19218 + }, + { + "epoch": 0.98, + "grad_norm": 1.0038345821338919, + "learning_rate": 2.6965910673661945e-08, + "loss": 0.1361, + "step": 19219 + }, + { + "epoch": 0.98, + "grad_norm": 1.0297748191335272, + "learning_rate": 2.684517694365951e-08, + "loss": 0.1653, + "step": 19220 + }, + { + "epoch": 0.98, + "grad_norm": 0.9427532899833283, + "learning_rate": 2.6724713736044904e-08, + "loss": 0.1593, + "step": 19221 + }, + { + "epoch": 0.98, + "grad_norm": 1.2120297659638368, + "learning_rate": 2.6604521054085507e-08, + "loss": 0.1655, + "step": 19222 + }, + { + "epoch": 0.98, + "grad_norm": 1.0842678478940873, + "learning_rate": 2.6484598901042047e-08, + "loss": 0.1572, + "step": 19223 + }, + { + "epoch": 0.98, + "grad_norm": 1.0850936239860367, + "learning_rate": 2.6364947280167474e-08, + "loss": 0.1567, + "step": 19224 + }, + { + "epoch": 0.98, + "grad_norm": 1.1589672563300961, + "learning_rate": 2.6245566194706973e-08, + "loss": 0.1565, + "step": 19225 + }, + { + "epoch": 0.98, + "grad_norm": 0.9041978598526125, + "learning_rate": 2.6126455647899063e-08, + "loss": 0.1401, + "step": 19226 + }, + { + "epoch": 0.98, + "grad_norm": 1.1471922837033912, + "learning_rate": 2.6007615642973383e-08, + "loss": 0.18, + "step": 19227 + }, + { + "epoch": 0.98, + "grad_norm": 0.9977763246122786, + "learning_rate": 2.588904618315513e-08, + "loss": 0.1533, + "step": 19228 + }, + { + "epoch": 0.98, + "grad_norm": 0.9580627412826506, + "learning_rate": 2.5770747271659512e-08, + "loss": 0.1619, + "step": 19229 + }, + { + "epoch": 0.98, + "grad_norm": 0.992200161492214, + "learning_rate": 2.5652718911696185e-08, + "loss": 0.1521, + "step": 19230 + }, + { + "epoch": 0.98, + "grad_norm": 1.0007268238753533, + "learning_rate": 2.5534961106465918e-08, + "loss": 0.1644, + "step": 19231 + }, + { + "epoch": 0.98, + "grad_norm": 0.8550931452612291, + "learning_rate": 2.5417473859162823e-08, + "loss": 0.147, + "step": 19232 + }, + { + "epoch": 0.98, + "grad_norm": 0.8227788485716492, + "learning_rate": 2.530025717297546e-08, + "loss": 0.1548, + "step": 19233 + }, + { + "epoch": 0.98, + "grad_norm": 1.1593496255775817, + "learning_rate": 2.5183311051080184e-08, + "loss": 0.1691, + "step": 19234 + }, + { + "epoch": 0.98, + "grad_norm": 2.1956712286111624, + "learning_rate": 2.5066635496652225e-08, + "loss": 0.1716, + "step": 19235 + }, + { + "epoch": 0.98, + "grad_norm": 0.9729699845844008, + "learning_rate": 2.4950230512854612e-08, + "loss": 0.1862, + "step": 19236 + }, + { + "epoch": 0.98, + "grad_norm": 1.0720151399962266, + "learning_rate": 2.4834096102845927e-08, + "loss": 0.167, + "step": 19237 + }, + { + "epoch": 0.98, + "grad_norm": 1.9591798117726331, + "learning_rate": 2.4718232269774767e-08, + "loss": 0.1603, + "step": 19238 + }, + { + "epoch": 0.98, + "grad_norm": 1.343576188483149, + "learning_rate": 2.460263901678639e-08, + "loss": 0.1562, + "step": 19239 + }, + { + "epoch": 0.98, + "grad_norm": 1.252390946762163, + "learning_rate": 2.4487316347013845e-08, + "loss": 0.1719, + "step": 19240 + }, + { + "epoch": 0.98, + "grad_norm": 1.156646101992709, + "learning_rate": 2.4372264263586852e-08, + "loss": 0.1698, + "step": 19241 + }, + { + "epoch": 0.98, + "grad_norm": 1.081421366726149, + "learning_rate": 2.425748276962514e-08, + "loss": 0.1574, + "step": 19242 + }, + { + "epoch": 0.98, + "grad_norm": 0.9253255105133054, + "learning_rate": 2.414297186824288e-08, + "loss": 0.1552, + "step": 19243 + }, + { + "epoch": 0.98, + "grad_norm": 0.8335899901045356, + "learning_rate": 2.402873156254537e-08, + "loss": 0.1423, + "step": 19244 + }, + { + "epoch": 0.98, + "grad_norm": 0.9997764295713396, + "learning_rate": 2.3914761855632353e-08, + "loss": 0.1778, + "step": 19245 + }, + { + "epoch": 0.98, + "grad_norm": 1.0051440205824753, + "learning_rate": 2.3801062750595794e-08, + "loss": 0.1605, + "step": 19246 + }, + { + "epoch": 0.98, + "grad_norm": 1.016491155010764, + "learning_rate": 2.3687634250517676e-08, + "loss": 0.1457, + "step": 19247 + }, + { + "epoch": 0.98, + "grad_norm": 0.9058016869766914, + "learning_rate": 2.357447635847554e-08, + "loss": 0.1631, + "step": 19248 + }, + { + "epoch": 0.98, + "grad_norm": 1.9075130738463206, + "learning_rate": 2.3461589077540258e-08, + "loss": 0.1589, + "step": 19249 + }, + { + "epoch": 0.98, + "grad_norm": 0.9980310571120593, + "learning_rate": 2.3348972410772718e-08, + "loss": 0.1454, + "step": 19250 + }, + { + "epoch": 0.98, + "grad_norm": 1.0595001302371383, + "learning_rate": 2.3236626361227145e-08, + "loss": 0.1514, + "step": 19251 + }, + { + "epoch": 0.98, + "grad_norm": 1.5416509882505045, + "learning_rate": 2.3124550931952205e-08, + "loss": 0.1594, + "step": 19252 + }, + { + "epoch": 0.98, + "grad_norm": 0.8453900521274997, + "learning_rate": 2.3012746125987695e-08, + "loss": 0.1525, + "step": 19253 + }, + { + "epoch": 0.98, + "grad_norm": 0.7985186282417956, + "learning_rate": 2.290121194636452e-08, + "loss": 0.1423, + "step": 19254 + }, + { + "epoch": 0.98, + "grad_norm": 0.8561020250996914, + "learning_rate": 2.278994839611026e-08, + "loss": 0.151, + "step": 19255 + }, + { + "epoch": 0.98, + "grad_norm": 1.27649659032301, + "learning_rate": 2.2678955478242504e-08, + "loss": 0.1467, + "step": 19256 + }, + { + "epoch": 0.98, + "grad_norm": 0.8932878466540868, + "learning_rate": 2.256823319577217e-08, + "loss": 0.1622, + "step": 19257 + }, + { + "epoch": 0.98, + "grad_norm": 0.9914841386610725, + "learning_rate": 2.2457781551700198e-08, + "loss": 0.1424, + "step": 19258 + }, + { + "epoch": 0.98, + "grad_norm": 1.0687596475817032, + "learning_rate": 2.2347600549025294e-08, + "loss": 0.1477, + "step": 19259 + }, + { + "epoch": 0.98, + "grad_norm": 1.8674035862177825, + "learning_rate": 2.2237690190736183e-08, + "loss": 0.1522, + "step": 19260 + }, + { + "epoch": 0.98, + "grad_norm": 1.1072519797129452, + "learning_rate": 2.2128050479812703e-08, + "loss": 0.1686, + "step": 19261 + }, + { + "epoch": 0.98, + "grad_norm": 0.8107126930749041, + "learning_rate": 2.2018681419229138e-08, + "loss": 0.1415, + "step": 19262 + }, + { + "epoch": 0.98, + "grad_norm": 1.0972507429958072, + "learning_rate": 2.1909583011952007e-08, + "loss": 0.1542, + "step": 19263 + }, + { + "epoch": 0.98, + "grad_norm": 0.9810825168290226, + "learning_rate": 2.1800755260942276e-08, + "loss": 0.1549, + "step": 19264 + }, + { + "epoch": 0.98, + "grad_norm": 1.7842847357181404, + "learning_rate": 2.169219816914869e-08, + "loss": 0.1415, + "step": 19265 + }, + { + "epoch": 0.98, + "grad_norm": 0.8654898219260009, + "learning_rate": 2.1583911739518904e-08, + "loss": 0.1685, + "step": 19266 + }, + { + "epoch": 0.98, + "grad_norm": 1.3440330628325314, + "learning_rate": 2.1475895974989446e-08, + "loss": 0.1689, + "step": 19267 + }, + { + "epoch": 0.98, + "grad_norm": 1.3828300319608515, + "learning_rate": 2.13681508784902e-08, + "loss": 0.1501, + "step": 19268 + }, + { + "epoch": 0.98, + "grad_norm": 1.462503265865077, + "learning_rate": 2.1260676452942164e-08, + "loss": 0.1763, + "step": 19269 + }, + { + "epoch": 0.98, + "grad_norm": 0.9461929891605176, + "learning_rate": 2.1153472701263e-08, + "loss": 0.1723, + "step": 19270 + }, + { + "epoch": 0.98, + "grad_norm": 1.2117507915141943, + "learning_rate": 2.1046539626359274e-08, + "loss": 0.1671, + "step": 19271 + }, + { + "epoch": 0.98, + "grad_norm": 1.0426408318344735, + "learning_rate": 2.093987723113089e-08, + "loss": 0.1633, + "step": 19272 + }, + { + "epoch": 0.98, + "grad_norm": 1.0395904044309374, + "learning_rate": 2.0833485518473305e-08, + "loss": 0.1538, + "step": 19273 + }, + { + "epoch": 0.98, + "grad_norm": 1.0043395663080972, + "learning_rate": 2.0727364491269773e-08, + "loss": 0.1555, + "step": 19274 + }, + { + "epoch": 0.98, + "grad_norm": 1.3419287721746127, + "learning_rate": 2.0621514152401323e-08, + "loss": 0.1582, + "step": 19275 + }, + { + "epoch": 0.98, + "grad_norm": 1.1290578600458498, + "learning_rate": 2.0515934504736768e-08, + "loss": 0.1365, + "step": 19276 + }, + { + "epoch": 0.98, + "grad_norm": 2.6278224170282116, + "learning_rate": 2.0410625551141594e-08, + "loss": 0.1416, + "step": 19277 + }, + { + "epoch": 0.98, + "grad_norm": 1.5792626413568063, + "learning_rate": 2.0305587294472408e-08, + "loss": 0.1629, + "step": 19278 + }, + { + "epoch": 0.98, + "grad_norm": 1.1919448565898834, + "learning_rate": 2.0200819737576926e-08, + "loss": 0.1526, + "step": 19279 + }, + { + "epoch": 0.98, + "grad_norm": 1.179775286640012, + "learning_rate": 2.0096322883298435e-08, + "loss": 0.1533, + "step": 19280 + }, + { + "epoch": 0.98, + "grad_norm": 1.0520319965190825, + "learning_rate": 1.9992096734471333e-08, + "loss": 0.1556, + "step": 19281 + }, + { + "epoch": 0.98, + "grad_norm": 1.2946183283037571, + "learning_rate": 1.988814129392114e-08, + "loss": 0.1473, + "step": 19282 + }, + { + "epoch": 0.98, + "grad_norm": 1.0521064047313005, + "learning_rate": 1.9784456564470035e-08, + "loss": 0.1545, + "step": 19283 + }, + { + "epoch": 0.98, + "grad_norm": 1.2723479920478658, + "learning_rate": 1.9681042548928e-08, + "loss": 0.1575, + "step": 19284 + }, + { + "epoch": 0.98, + "grad_norm": 0.9780784388943833, + "learning_rate": 1.9577899250101674e-08, + "loss": 0.1556, + "step": 19285 + }, + { + "epoch": 0.98, + "grad_norm": 0.9001383282508998, + "learning_rate": 1.9475026670788822e-08, + "loss": 0.1594, + "step": 19286 + }, + { + "epoch": 0.98, + "grad_norm": 1.193731060326972, + "learning_rate": 1.9372424813779432e-08, + "loss": 0.1526, + "step": 19287 + }, + { + "epoch": 0.98, + "grad_norm": 1.1469736780532818, + "learning_rate": 1.9270093681856837e-08, + "loss": 0.1938, + "step": 19288 + }, + { + "epoch": 0.98, + "grad_norm": 2.133842881745061, + "learning_rate": 1.9168033277796595e-08, + "loss": 0.1514, + "step": 19289 + }, + { + "epoch": 0.98, + "grad_norm": 2.4315479417567807, + "learning_rate": 1.9066243604367595e-08, + "loss": 0.1574, + "step": 19290 + }, + { + "epoch": 0.98, + "grad_norm": 1.1500103445417296, + "learning_rate": 1.896472466432986e-08, + "loss": 0.1771, + "step": 19291 + }, + { + "epoch": 0.98, + "grad_norm": 1.1258307209006846, + "learning_rate": 1.8863476460437847e-08, + "loss": 0.1519, + "step": 19292 + }, + { + "epoch": 0.98, + "grad_norm": 1.2529336735398418, + "learning_rate": 1.876249899543825e-08, + "loss": 0.1799, + "step": 19293 + }, + { + "epoch": 0.98, + "grad_norm": 0.8555299298372947, + "learning_rate": 1.866179227206888e-08, + "loss": 0.1459, + "step": 19294 + }, + { + "epoch": 0.98, + "grad_norm": 1.0055165033399343, + "learning_rate": 1.8561356293061995e-08, + "loss": 0.1629, + "step": 19295 + }, + { + "epoch": 0.98, + "grad_norm": 0.8929016490331868, + "learning_rate": 1.846119106114319e-08, + "loss": 0.1629, + "step": 19296 + }, + { + "epoch": 0.98, + "grad_norm": 1.4282189392830789, + "learning_rate": 1.8361296579026965e-08, + "loss": 0.173, + "step": 19297 + }, + { + "epoch": 0.98, + "grad_norm": 1.3290545789755563, + "learning_rate": 1.8261672849425593e-08, + "loss": 0.1651, + "step": 19298 + }, + { + "epoch": 0.98, + "grad_norm": 1.506479397433706, + "learning_rate": 1.8162319875040247e-08, + "loss": 0.1668, + "step": 19299 + }, + { + "epoch": 0.98, + "grad_norm": 1.2558990219488457, + "learning_rate": 1.8063237658564325e-08, + "loss": 0.1554, + "step": 19300 + }, + { + "epoch": 0.98, + "grad_norm": 1.2926613820482529, + "learning_rate": 1.7964426202687902e-08, + "loss": 0.1858, + "step": 19301 + }, + { + "epoch": 0.98, + "grad_norm": 0.9604834521488833, + "learning_rate": 1.7865885510089943e-08, + "loss": 0.1639, + "step": 19302 + }, + { + "epoch": 0.98, + "grad_norm": 0.8900020514217135, + "learning_rate": 1.7767615583443865e-08, + "loss": 0.1464, + "step": 19303 + }, + { + "epoch": 0.98, + "grad_norm": 3.1007333087388065, + "learning_rate": 1.7669616425414203e-08, + "loss": 0.1428, + "step": 19304 + }, + { + "epoch": 0.98, + "grad_norm": 1.255905003951268, + "learning_rate": 1.7571888038661056e-08, + "loss": 0.1575, + "step": 19305 + }, + { + "epoch": 0.98, + "grad_norm": 1.2103777182423623, + "learning_rate": 1.747443042583341e-08, + "loss": 0.1729, + "step": 19306 + }, + { + "epoch": 0.98, + "grad_norm": 0.8329630456146007, + "learning_rate": 1.737724358957582e-08, + "loss": 0.1635, + "step": 19307 + }, + { + "epoch": 0.98, + "grad_norm": 1.1297998214124445, + "learning_rate": 1.7280327532525066e-08, + "loss": 0.1525, + "step": 19308 + }, + { + "epoch": 0.98, + "grad_norm": 1.6314144516197124, + "learning_rate": 1.7183682257309043e-08, + "loss": 0.1684, + "step": 19309 + }, + { + "epoch": 0.98, + "grad_norm": 1.3026685814979018, + "learning_rate": 1.708730776654899e-08, + "loss": 0.1768, + "step": 19310 + }, + { + "epoch": 0.98, + "grad_norm": 0.8763899372471403, + "learning_rate": 1.6991204062859478e-08, + "loss": 0.145, + "step": 19311 + }, + { + "epoch": 0.98, + "grad_norm": 1.0921780067084008, + "learning_rate": 1.6895371148847316e-08, + "loss": 0.1505, + "step": 19312 + }, + { + "epoch": 0.98, + "grad_norm": 1.0594109223251242, + "learning_rate": 1.6799809027112645e-08, + "loss": 0.1549, + "step": 19313 + }, + { + "epoch": 0.98, + "grad_norm": 0.9525505933657265, + "learning_rate": 1.6704517700246725e-08, + "loss": 0.1458, + "step": 19314 + }, + { + "epoch": 0.98, + "grad_norm": 1.0551914198110557, + "learning_rate": 1.6609497170834154e-08, + "loss": 0.1677, + "step": 19315 + }, + { + "epoch": 0.98, + "grad_norm": 1.7643407849401922, + "learning_rate": 1.6514747441453983e-08, + "loss": 0.1662, + "step": 19316 + }, + { + "epoch": 0.98, + "grad_norm": 1.6991980825364204, + "learning_rate": 1.6420268514674153e-08, + "loss": 0.1714, + "step": 19317 + }, + { + "epoch": 0.98, + "grad_norm": 0.8688835282260733, + "learning_rate": 1.6326060393058174e-08, + "loss": 0.1686, + "step": 19318 + }, + { + "epoch": 0.98, + "grad_norm": 1.7232939795009312, + "learning_rate": 1.6232123079162887e-08, + "loss": 0.1594, + "step": 19319 + }, + { + "epoch": 0.98, + "grad_norm": 1.1199605143149651, + "learning_rate": 1.6138456575534034e-08, + "loss": 0.1828, + "step": 19320 + }, + { + "epoch": 0.98, + "grad_norm": 0.9937593945033635, + "learning_rate": 1.6045060884714027e-08, + "loss": 0.1543, + "step": 19321 + }, + { + "epoch": 0.98, + "grad_norm": 1.0113342428247933, + "learning_rate": 1.5951936009235282e-08, + "loss": 0.1813, + "step": 19322 + }, + { + "epoch": 0.98, + "grad_norm": 0.9151153716972111, + "learning_rate": 1.5859081951624668e-08, + "loss": 0.1466, + "step": 19323 + }, + { + "epoch": 0.98, + "grad_norm": 1.10118412807842, + "learning_rate": 1.5766498714400168e-08, + "loss": 0.1761, + "step": 19324 + }, + { + "epoch": 0.98, + "grad_norm": 1.1592831567954685, + "learning_rate": 1.5674186300073113e-08, + "loss": 0.1723, + "step": 19325 + }, + { + "epoch": 0.98, + "grad_norm": 1.0636078897730215, + "learning_rate": 1.5582144711148163e-08, + "loss": 0.1616, + "step": 19326 + }, + { + "epoch": 0.98, + "grad_norm": 1.157339395167924, + "learning_rate": 1.5490373950121097e-08, + "loss": 0.1554, + "step": 19327 + }, + { + "epoch": 0.98, + "grad_norm": 0.8743647019705796, + "learning_rate": 1.5398874019481037e-08, + "loss": 0.1766, + "step": 19328 + }, + { + "epoch": 0.98, + "grad_norm": 0.8649892968907565, + "learning_rate": 1.5307644921710442e-08, + "loss": 0.1613, + "step": 19329 + }, + { + "epoch": 0.98, + "grad_norm": 1.0898910343208772, + "learning_rate": 1.5216686659285106e-08, + "loss": 0.1548, + "step": 19330 + }, + { + "epoch": 0.98, + "grad_norm": 1.1210862777680415, + "learning_rate": 1.512599923467084e-08, + "loss": 0.1539, + "step": 19331 + }, + { + "epoch": 0.98, + "grad_norm": 0.8436561419359566, + "learning_rate": 1.5035582650326786e-08, + "loss": 0.1484, + "step": 19332 + }, + { + "epoch": 0.98, + "grad_norm": 1.0534986774048938, + "learning_rate": 1.4945436908707645e-08, + "loss": 0.1465, + "step": 19333 + }, + { + "epoch": 0.98, + "grad_norm": 1.1097802246883643, + "learning_rate": 1.4855562012257019e-08, + "loss": 0.1513, + "step": 19334 + }, + { + "epoch": 0.98, + "grad_norm": 0.9406791484763374, + "learning_rate": 1.4765957963412957e-08, + "loss": 0.1457, + "step": 19335 + }, + { + "epoch": 0.98, + "grad_norm": 0.9979331385771827, + "learning_rate": 1.467662476460574e-08, + "loss": 0.1778, + "step": 19336 + }, + { + "epoch": 0.98, + "grad_norm": 0.9978593170264436, + "learning_rate": 1.4587562418260092e-08, + "loss": 0.1857, + "step": 19337 + }, + { + "epoch": 0.98, + "grad_norm": 1.0801844337278366, + "learning_rate": 1.449877092679075e-08, + "loss": 0.1514, + "step": 19338 + }, + { + "epoch": 0.98, + "grad_norm": 0.9235210259060828, + "learning_rate": 1.4410250292605788e-08, + "loss": 0.1797, + "step": 19339 + }, + { + "epoch": 0.98, + "grad_norm": 1.5939303682114307, + "learning_rate": 1.4322000518106616e-08, + "loss": 0.168, + "step": 19340 + }, + { + "epoch": 0.98, + "grad_norm": 1.0240846558467789, + "learning_rate": 1.4234021605687987e-08, + "loss": 0.1645, + "step": 19341 + }, + { + "epoch": 0.98, + "grad_norm": 1.1303492834383382, + "learning_rate": 1.414631355773466e-08, + "loss": 0.1592, + "step": 19342 + }, + { + "epoch": 0.98, + "grad_norm": 1.0958579086559461, + "learning_rate": 1.405887637662695e-08, + "loss": 0.1577, + "step": 19343 + }, + { + "epoch": 0.98, + "grad_norm": 1.3423002977177827, + "learning_rate": 1.3971710064736299e-08, + "loss": 0.1589, + "step": 19344 + }, + { + "epoch": 0.98, + "grad_norm": 0.9730615483497294, + "learning_rate": 1.3884814624427478e-08, + "loss": 0.1797, + "step": 19345 + }, + { + "epoch": 0.98, + "grad_norm": 0.8705667810277697, + "learning_rate": 1.3798190058056383e-08, + "loss": 0.143, + "step": 19346 + }, + { + "epoch": 0.98, + "grad_norm": 1.0582401666488022, + "learning_rate": 1.3711836367973353e-08, + "loss": 0.1499, + "step": 19347 + }, + { + "epoch": 0.98, + "grad_norm": 1.2066413755415508, + "learning_rate": 1.362575355652096e-08, + "loss": 0.155, + "step": 19348 + }, + { + "epoch": 0.98, + "grad_norm": 1.6819423962631999, + "learning_rate": 1.3539941626034003e-08, + "loss": 0.1695, + "step": 19349 + }, + { + "epoch": 0.98, + "grad_norm": 1.8172723617428077, + "learning_rate": 1.345440057884062e-08, + "loss": 0.1632, + "step": 19350 + }, + { + "epoch": 0.98, + "grad_norm": 0.8142788410916596, + "learning_rate": 1.3369130417260067e-08, + "loss": 0.1624, + "step": 19351 + }, + { + "epoch": 0.98, + "grad_norm": 0.973732870831668, + "learning_rate": 1.3284131143606049e-08, + "loss": 0.1711, + "step": 19352 + }, + { + "epoch": 0.98, + "grad_norm": 0.9400105566861664, + "learning_rate": 1.3199402760184499e-08, + "loss": 0.1644, + "step": 19353 + }, + { + "epoch": 0.98, + "grad_norm": 1.0797258749734406, + "learning_rate": 1.3114945269292468e-08, + "loss": 0.1795, + "step": 19354 + }, + { + "epoch": 0.98, + "grad_norm": 1.1797224027283473, + "learning_rate": 1.3030758673221456e-08, + "loss": 0.1561, + "step": 19355 + }, + { + "epoch": 0.98, + "grad_norm": 1.1102196689897572, + "learning_rate": 1.2946842974256301e-08, + "loss": 0.1649, + "step": 19356 + }, + { + "epoch": 0.98, + "grad_norm": 1.6320482620567274, + "learning_rate": 1.2863198174671853e-08, + "loss": 0.1408, + "step": 19357 + }, + { + "epoch": 0.98, + "grad_norm": 1.3116515018242307, + "learning_rate": 1.2779824276736298e-08, + "loss": 0.159, + "step": 19358 + }, + { + "epoch": 0.98, + "grad_norm": 0.76738250679809, + "learning_rate": 1.2696721282712266e-08, + "loss": 0.1355, + "step": 19359 + }, + { + "epoch": 0.98, + "grad_norm": 1.1489894563325833, + "learning_rate": 1.2613889194854623e-08, + "loss": 0.1742, + "step": 19360 + }, + { + "epoch": 0.98, + "grad_norm": 0.876093932556513, + "learning_rate": 1.253132801540935e-08, + "loss": 0.1541, + "step": 19361 + }, + { + "epoch": 0.98, + "grad_norm": 1.1655631323365128, + "learning_rate": 1.2449037746614657e-08, + "loss": 0.1569, + "step": 19362 + }, + { + "epoch": 0.98, + "grad_norm": 1.3719793760387626, + "learning_rate": 1.2367018390704311e-08, + "loss": 0.1394, + "step": 19363 + }, + { + "epoch": 0.98, + "grad_norm": 1.1327152068419841, + "learning_rate": 1.22852699499032e-08, + "loss": 0.1777, + "step": 19364 + }, + { + "epoch": 0.98, + "grad_norm": 1.3962861506837203, + "learning_rate": 1.220379242642844e-08, + "loss": 0.1614, + "step": 19365 + }, + { + "epoch": 0.98, + "grad_norm": 0.9929737564903329, + "learning_rate": 1.2122585822489374e-08, + "loss": 0.1599, + "step": 19366 + }, + { + "epoch": 0.98, + "grad_norm": 0.9344319731990219, + "learning_rate": 1.2041650140289796e-08, + "loss": 0.1611, + "step": 19367 + }, + { + "epoch": 0.98, + "grad_norm": 0.9210789177024352, + "learning_rate": 1.1960985382024615e-08, + "loss": 0.1423, + "step": 19368 + }, + { + "epoch": 0.98, + "grad_norm": 1.2308739590673927, + "learning_rate": 1.188059154988097e-08, + "loss": 0.1977, + "step": 19369 + }, + { + "epoch": 0.98, + "grad_norm": 0.9493833067382035, + "learning_rate": 1.1800468646041563e-08, + "loss": 0.1815, + "step": 19370 + }, + { + "epoch": 0.99, + "grad_norm": 0.8787135242048683, + "learning_rate": 1.1720616672676876e-08, + "loss": 0.1478, + "step": 19371 + }, + { + "epoch": 0.99, + "grad_norm": 1.0307096213171172, + "learning_rate": 1.1641035631956288e-08, + "loss": 0.1711, + "step": 19372 + }, + { + "epoch": 0.99, + "grad_norm": 1.2366301080695932, + "learning_rate": 1.156172552603585e-08, + "loss": 0.1562, + "step": 19373 + }, + { + "epoch": 0.99, + "grad_norm": 0.9931567520163074, + "learning_rate": 1.1482686357068284e-08, + "loss": 0.1371, + "step": 19374 + }, + { + "epoch": 0.99, + "grad_norm": 0.9024408195387488, + "learning_rate": 1.1403918127196323e-08, + "loss": 0.173, + "step": 19375 + }, + { + "epoch": 0.99, + "grad_norm": 1.0171629994494382, + "learning_rate": 1.1325420838558254e-08, + "loss": 0.1705, + "step": 19376 + }, + { + "epoch": 0.99, + "grad_norm": 1.7020732645492647, + "learning_rate": 1.1247194493281266e-08, + "loss": 0.1393, + "step": 19377 + }, + { + "epoch": 0.99, + "grad_norm": 0.9275481098725942, + "learning_rate": 1.1169239093489214e-08, + "loss": 0.1614, + "step": 19378 + }, + { + "epoch": 0.99, + "grad_norm": 1.0968512398274697, + "learning_rate": 1.1091554641294854e-08, + "loss": 0.146, + "step": 19379 + }, + { + "epoch": 0.99, + "grad_norm": 1.1071662113271206, + "learning_rate": 1.10141411388065e-08, + "loss": 0.1928, + "step": 19380 + }, + { + "epoch": 0.99, + "grad_norm": 1.0772512461637493, + "learning_rate": 1.0936998588124693e-08, + "loss": 0.145, + "step": 19381 + }, + { + "epoch": 0.99, + "grad_norm": 1.1481733489822117, + "learning_rate": 1.0860126991339982e-08, + "loss": 0.1415, + "step": 19382 + }, + { + "epoch": 0.99, + "grad_norm": 0.8551816592286479, + "learning_rate": 1.0783526350538476e-08, + "loss": 0.1608, + "step": 19383 + }, + { + "epoch": 0.99, + "grad_norm": 0.9707542368946773, + "learning_rate": 1.0707196667798513e-08, + "loss": 0.1539, + "step": 19384 + }, + { + "epoch": 0.99, + "grad_norm": 1.5645121903262291, + "learning_rate": 1.063113794518955e-08, + "loss": 0.1612, + "step": 19385 + }, + { + "epoch": 0.99, + "grad_norm": 1.3182108529350907, + "learning_rate": 1.0555350184775493e-08, + "loss": 0.1936, + "step": 19386 + }, + { + "epoch": 0.99, + "grad_norm": 0.9540155428082053, + "learning_rate": 1.047983338861136e-08, + "loss": 0.1589, + "step": 19387 + }, + { + "epoch": 0.99, + "grad_norm": 0.9832984324694535, + "learning_rate": 1.0404587558746626e-08, + "loss": 0.1438, + "step": 19388 + }, + { + "epoch": 0.99, + "grad_norm": 0.9006067158466624, + "learning_rate": 1.032961269722077e-08, + "loss": 0.1776, + "step": 19389 + }, + { + "epoch": 0.99, + "grad_norm": 4.244743279426564, + "learning_rate": 1.0254908806068831e-08, + "loss": 0.1625, + "step": 19390 + }, + { + "epoch": 0.99, + "grad_norm": 0.9579273824568465, + "learning_rate": 1.0180475887316966e-08, + "loss": 0.1696, + "step": 19391 + }, + { + "epoch": 0.99, + "grad_norm": 1.0104067076033303, + "learning_rate": 1.010631394298467e-08, + "loss": 0.1642, + "step": 19392 + }, + { + "epoch": 0.99, + "grad_norm": 0.9938051532179932, + "learning_rate": 1.0032422975081446e-08, + "loss": 0.1532, + "step": 19393 + }, + { + "epoch": 0.99, + "grad_norm": 0.8113243547723034, + "learning_rate": 9.95880298561347e-09, + "loss": 0.1686, + "step": 19394 + }, + { + "epoch": 0.99, + "grad_norm": 1.6819415896919474, + "learning_rate": 9.88545397657692e-09, + "loss": 0.1578, + "step": 19395 + }, + { + "epoch": 0.99, + "grad_norm": 1.6053397774824008, + "learning_rate": 9.812375949962426e-09, + "loss": 0.1852, + "step": 19396 + }, + { + "epoch": 0.99, + "grad_norm": 1.088283665461383, + "learning_rate": 9.739568907750629e-09, + "loss": 0.136, + "step": 19397 + }, + { + "epoch": 0.99, + "grad_norm": 0.9290486862927769, + "learning_rate": 9.667032851917723e-09, + "loss": 0.167, + "step": 19398 + }, + { + "epoch": 0.99, + "grad_norm": 1.1691011391086281, + "learning_rate": 9.594767784431025e-09, + "loss": 0.139, + "step": 19399 + }, + { + "epoch": 0.99, + "grad_norm": 0.8467822619073552, + "learning_rate": 9.522773707250077e-09, + "loss": 0.1709, + "step": 19400 + }, + { + "epoch": 0.99, + "grad_norm": 1.471340651344358, + "learning_rate": 9.451050622328873e-09, + "loss": 0.16, + "step": 19401 + }, + { + "epoch": 0.99, + "grad_norm": 1.273843813651816, + "learning_rate": 9.379598531611412e-09, + "loss": 0.157, + "step": 19402 + }, + { + "epoch": 0.99, + "grad_norm": 1.5288014134271248, + "learning_rate": 9.308417437037254e-09, + "loss": 0.1755, + "step": 19403 + }, + { + "epoch": 0.99, + "grad_norm": 0.9807843288626784, + "learning_rate": 9.237507340535968e-09, + "loss": 0.1631, + "step": 19404 + }, + { + "epoch": 0.99, + "grad_norm": 0.9258704245017536, + "learning_rate": 9.166868244031568e-09, + "loss": 0.1685, + "step": 19405 + }, + { + "epoch": 0.99, + "grad_norm": 1.1879016453660538, + "learning_rate": 9.096500149440302e-09, + "loss": 0.1534, + "step": 19406 + }, + { + "epoch": 0.99, + "grad_norm": 0.866655679800848, + "learning_rate": 9.02640305867064e-09, + "loss": 0.1266, + "step": 19407 + }, + { + "epoch": 0.99, + "grad_norm": 1.0303505197830716, + "learning_rate": 8.956576973624398e-09, + "loss": 0.1772, + "step": 19408 + }, + { + "epoch": 0.99, + "grad_norm": 1.0919726382863375, + "learning_rate": 8.887021896195614e-09, + "loss": 0.1592, + "step": 19409 + }, + { + "epoch": 0.99, + "grad_norm": 0.986364397224031, + "learning_rate": 8.817737828269446e-09, + "loss": 0.1547, + "step": 19410 + }, + { + "epoch": 0.99, + "grad_norm": 1.5733663737118595, + "learning_rate": 8.748724771727724e-09, + "loss": 0.1783, + "step": 19411 + }, + { + "epoch": 0.99, + "grad_norm": 1.1926579268620583, + "learning_rate": 8.679982728440061e-09, + "loss": 0.1575, + "step": 19412 + }, + { + "epoch": 0.99, + "grad_norm": 1.0728604790120746, + "learning_rate": 8.611511700272746e-09, + "loss": 0.1656, + "step": 19413 + }, + { + "epoch": 0.99, + "grad_norm": 0.9749045658625051, + "learning_rate": 8.543311689083177e-09, + "loss": 0.142, + "step": 19414 + }, + { + "epoch": 0.99, + "grad_norm": 0.9908315305488623, + "learning_rate": 8.47538269671988e-09, + "loss": 0.1469, + "step": 19415 + }, + { + "epoch": 0.99, + "grad_norm": 1.0193752846011748, + "learning_rate": 8.407724725025823e-09, + "loss": 0.1627, + "step": 19416 + }, + { + "epoch": 0.99, + "grad_norm": 1.770904345379466, + "learning_rate": 8.340337775837316e-09, + "loss": 0.1468, + "step": 19417 + }, + { + "epoch": 0.99, + "grad_norm": 0.9309726993901151, + "learning_rate": 8.273221850980673e-09, + "loss": 0.1453, + "step": 19418 + }, + { + "epoch": 0.99, + "grad_norm": 1.1105475116378467, + "learning_rate": 8.206376952277772e-09, + "loss": 0.1756, + "step": 19419 + }, + { + "epoch": 0.99, + "grad_norm": 1.4908709404855125, + "learning_rate": 8.139803081540499e-09, + "loss": 0.1683, + "step": 19420 + }, + { + "epoch": 0.99, + "grad_norm": 0.939681217803894, + "learning_rate": 8.073500240576292e-09, + "loss": 0.1752, + "step": 19421 + }, + { + "epoch": 0.99, + "grad_norm": 0.9335727034124158, + "learning_rate": 8.007468431182609e-09, + "loss": 0.1528, + "step": 19422 + }, + { + "epoch": 0.99, + "grad_norm": 0.8478138172436791, + "learning_rate": 7.941707655150233e-09, + "loss": 0.1453, + "step": 19423 + }, + { + "epoch": 0.99, + "grad_norm": 0.9989968451138482, + "learning_rate": 7.876217914264406e-09, + "loss": 0.1616, + "step": 19424 + }, + { + "epoch": 0.99, + "grad_norm": 1.419711096650346, + "learning_rate": 7.810999210299263e-09, + "loss": 0.1573, + "step": 19425 + }, + { + "epoch": 0.99, + "grad_norm": 0.8033453385822357, + "learning_rate": 7.746051545025613e-09, + "loss": 0.1636, + "step": 19426 + }, + { + "epoch": 0.99, + "grad_norm": 1.2856780985716851, + "learning_rate": 7.681374920205375e-09, + "loss": 0.1671, + "step": 19427 + }, + { + "epoch": 0.99, + "grad_norm": 1.021890655817455, + "learning_rate": 7.616969337591595e-09, + "loss": 0.1625, + "step": 19428 + }, + { + "epoch": 0.99, + "grad_norm": 1.200794353149773, + "learning_rate": 7.552834798931763e-09, + "loss": 0.162, + "step": 19429 + }, + { + "epoch": 0.99, + "grad_norm": 0.9148040032386322, + "learning_rate": 7.488971305965598e-09, + "loss": 0.1706, + "step": 19430 + }, + { + "epoch": 0.99, + "grad_norm": 0.9170760257674808, + "learning_rate": 7.425378860425048e-09, + "loss": 0.1772, + "step": 19431 + }, + { + "epoch": 0.99, + "grad_norm": 1.1806206086463553, + "learning_rate": 7.36205746403651e-09, + "loss": 0.1531, + "step": 19432 + }, + { + "epoch": 0.99, + "grad_norm": 0.8990746509040047, + "learning_rate": 7.299007118516388e-09, + "loss": 0.1441, + "step": 19433 + }, + { + "epoch": 0.99, + "grad_norm": 1.1953447975827665, + "learning_rate": 7.236227825574427e-09, + "loss": 0.1672, + "step": 19434 + }, + { + "epoch": 0.99, + "grad_norm": 1.1967523379896308, + "learning_rate": 7.173719586914818e-09, + "loss": 0.1612, + "step": 19435 + }, + { + "epoch": 0.99, + "grad_norm": 1.091873566076658, + "learning_rate": 7.111482404231762e-09, + "loss": 0.1483, + "step": 19436 + }, + { + "epoch": 0.99, + "grad_norm": 1.3397730582631586, + "learning_rate": 7.049516279215018e-09, + "loss": 0.1588, + "step": 19437 + }, + { + "epoch": 0.99, + "grad_norm": 1.2216177742464538, + "learning_rate": 6.987821213544355e-09, + "loss": 0.1521, + "step": 19438 + }, + { + "epoch": 0.99, + "grad_norm": 1.2590376447335887, + "learning_rate": 6.926397208892877e-09, + "loss": 0.1614, + "step": 19439 + }, + { + "epoch": 0.99, + "grad_norm": 1.5650243589807098, + "learning_rate": 6.8652442669281394e-09, + "loss": 0.1776, + "step": 19440 + }, + { + "epoch": 0.99, + "grad_norm": 0.9726278730987528, + "learning_rate": 6.804362389306596e-09, + "loss": 0.185, + "step": 19441 + }, + { + "epoch": 0.99, + "grad_norm": 0.9403291135586567, + "learning_rate": 6.743751577682478e-09, + "loss": 0.1619, + "step": 19442 + }, + { + "epoch": 0.99, + "grad_norm": 0.9548342447709992, + "learning_rate": 6.683411833697806e-09, + "loss": 0.1671, + "step": 19443 + }, + { + "epoch": 0.99, + "grad_norm": 2.2582047847405757, + "learning_rate": 6.623343158990159e-09, + "loss": 0.1707, + "step": 19444 + }, + { + "epoch": 0.99, + "grad_norm": 0.8723806454457383, + "learning_rate": 6.563545555189343e-09, + "loss": 0.1594, + "step": 19445 + }, + { + "epoch": 0.99, + "grad_norm": 1.4617744785679774, + "learning_rate": 6.504019023916286e-09, + "loss": 0.1528, + "step": 19446 + }, + { + "epoch": 0.99, + "grad_norm": 0.967536921726817, + "learning_rate": 6.444763566786361e-09, + "loss": 0.1491, + "step": 19447 + }, + { + "epoch": 0.99, + "grad_norm": 1.3426213804774425, + "learning_rate": 6.385779185407171e-09, + "loss": 0.1664, + "step": 19448 + }, + { + "epoch": 0.99, + "grad_norm": 1.0871880246818988, + "learning_rate": 6.327065881377437e-09, + "loss": 0.1806, + "step": 19449 + }, + { + "epoch": 0.99, + "grad_norm": 1.8191155304043163, + "learning_rate": 6.2686236562903294e-09, + "loss": 0.1644, + "step": 19450 + }, + { + "epoch": 0.99, + "grad_norm": 0.8209864529546275, + "learning_rate": 6.210452511731246e-09, + "loss": 0.1499, + "step": 19451 + }, + { + "epoch": 0.99, + "grad_norm": 0.9778755032833552, + "learning_rate": 6.152552449278925e-09, + "loss": 0.1752, + "step": 19452 + }, + { + "epoch": 0.99, + "grad_norm": 0.9975335549067313, + "learning_rate": 6.094923470502112e-09, + "loss": 0.185, + "step": 19453 + }, + { + "epoch": 0.99, + "grad_norm": 0.8685575465714487, + "learning_rate": 6.03756557696511e-09, + "loss": 0.1568, + "step": 19454 + }, + { + "epoch": 0.99, + "grad_norm": 0.9752549270513626, + "learning_rate": 5.980478770224452e-09, + "loss": 0.1645, + "step": 19455 + }, + { + "epoch": 0.99, + "grad_norm": 1.002505500250835, + "learning_rate": 5.923663051826678e-09, + "loss": 0.154, + "step": 19456 + }, + { + "epoch": 0.99, + "grad_norm": 1.6090929823221471, + "learning_rate": 5.867118423314999e-09, + "loss": 0.1711, + "step": 19457 + }, + { + "epoch": 0.99, + "grad_norm": 1.1652972258786725, + "learning_rate": 5.810844886221523e-09, + "loss": 0.1514, + "step": 19458 + }, + { + "epoch": 0.99, + "grad_norm": 0.9219302327292703, + "learning_rate": 5.754842442073916e-09, + "loss": 0.1825, + "step": 19459 + }, + { + "epoch": 0.99, + "grad_norm": 1.075261151804333, + "learning_rate": 5.699111092389853e-09, + "loss": 0.146, + "step": 19460 + }, + { + "epoch": 0.99, + "grad_norm": 0.8768162885658606, + "learning_rate": 5.643650838682568e-09, + "loss": 0.1537, + "step": 19461 + }, + { + "epoch": 0.99, + "grad_norm": 1.0508940154630682, + "learning_rate": 5.588461682455304e-09, + "loss": 0.1637, + "step": 19462 + }, + { + "epoch": 0.99, + "grad_norm": 1.346320532701835, + "learning_rate": 5.53354362520575e-09, + "loss": 0.1735, + "step": 19463 + }, + { + "epoch": 0.99, + "grad_norm": 1.0066814892312403, + "learning_rate": 5.478896668423828e-09, + "loss": 0.1506, + "step": 19464 + }, + { + "epoch": 0.99, + "grad_norm": 1.0829562803149226, + "learning_rate": 5.4245208135905725e-09, + "loss": 0.1518, + "step": 19465 + }, + { + "epoch": 0.99, + "grad_norm": 1.2651296228774314, + "learning_rate": 5.370416062181472e-09, + "loss": 0.1628, + "step": 19466 + }, + { + "epoch": 0.99, + "grad_norm": 0.9646946338958992, + "learning_rate": 5.31658241566535e-09, + "loss": 0.1564, + "step": 19467 + }, + { + "epoch": 0.99, + "grad_norm": 3.692309308973504, + "learning_rate": 5.2630198754999304e-09, + "loss": 0.1626, + "step": 19468 + }, + { + "epoch": 0.99, + "grad_norm": 1.0675900182191573, + "learning_rate": 5.209728443140716e-09, + "loss": 0.1559, + "step": 19469 + }, + { + "epoch": 0.99, + "grad_norm": 1.0059903839169024, + "learning_rate": 5.156708120032106e-09, + "loss": 0.1625, + "step": 19470 + }, + { + "epoch": 0.99, + "grad_norm": 1.4617637087129334, + "learning_rate": 5.103958907611839e-09, + "loss": 0.1519, + "step": 19471 + }, + { + "epoch": 0.99, + "grad_norm": 1.1075830315746373, + "learning_rate": 5.051480807312103e-09, + "loss": 0.1766, + "step": 19472 + }, + { + "epoch": 0.99, + "grad_norm": 1.1481469399042683, + "learning_rate": 4.999273820553985e-09, + "loss": 0.1509, + "step": 19473 + }, + { + "epoch": 0.99, + "grad_norm": 1.1386676238961613, + "learning_rate": 4.947337948756348e-09, + "loss": 0.1763, + "step": 19474 + }, + { + "epoch": 0.99, + "grad_norm": 1.072465063966951, + "learning_rate": 4.895673193325845e-09, + "loss": 0.1605, + "step": 19475 + }, + { + "epoch": 0.99, + "grad_norm": 0.8936326244245394, + "learning_rate": 4.8442795556657984e-09, + "loss": 0.1624, + "step": 19476 + }, + { + "epoch": 0.99, + "grad_norm": 0.9424084453406708, + "learning_rate": 4.793157037168428e-09, + "loss": 0.1414, + "step": 19477 + }, + { + "epoch": 0.99, + "grad_norm": 1.0914969007378266, + "learning_rate": 4.742305639221512e-09, + "loss": 0.1686, + "step": 19478 + }, + { + "epoch": 0.99, + "grad_norm": 1.2472389434505498, + "learning_rate": 4.6917253632039475e-09, + "loss": 0.1683, + "step": 19479 + }, + { + "epoch": 0.99, + "grad_norm": 1.0606679946176014, + "learning_rate": 4.64141621048797e-09, + "loss": 0.1839, + "step": 19480 + }, + { + "epoch": 0.99, + "grad_norm": 1.1715536705735645, + "learning_rate": 4.591378182438044e-09, + "loss": 0.1557, + "step": 19481 + }, + { + "epoch": 0.99, + "grad_norm": 1.2390844663128597, + "learning_rate": 4.541611280410862e-09, + "loss": 0.1746, + "step": 19482 + }, + { + "epoch": 0.99, + "grad_norm": 0.8912082389176219, + "learning_rate": 4.492115505757566e-09, + "loss": 0.1536, + "step": 19483 + }, + { + "epoch": 0.99, + "grad_norm": 0.905352948158156, + "learning_rate": 4.442890859820414e-09, + "loss": 0.1621, + "step": 19484 + }, + { + "epoch": 0.99, + "grad_norm": 1.0956200005472319, + "learning_rate": 4.393937343933896e-09, + "loss": 0.1582, + "step": 19485 + }, + { + "epoch": 0.99, + "grad_norm": 0.8536426185940816, + "learning_rate": 4.345254959426948e-09, + "loss": 0.1571, + "step": 19486 + }, + { + "epoch": 0.99, + "grad_norm": 1.229837492542071, + "learning_rate": 4.296843707619625e-09, + "loss": 0.1574, + "step": 19487 + }, + { + "epoch": 0.99, + "grad_norm": 0.95821743748653, + "learning_rate": 4.2487035898242106e-09, + "loss": 0.161, + "step": 19488 + }, + { + "epoch": 0.99, + "grad_norm": 1.3127235685010321, + "learning_rate": 4.200834607348547e-09, + "loss": 0.1621, + "step": 19489 + }, + { + "epoch": 0.99, + "grad_norm": 1.2313703140225252, + "learning_rate": 4.153236761488266e-09, + "loss": 0.173, + "step": 19490 + }, + { + "epoch": 0.99, + "grad_norm": 1.4217470558916299, + "learning_rate": 4.105910053536777e-09, + "loss": 0.1551, + "step": 19491 + }, + { + "epoch": 0.99, + "grad_norm": 1.0070122397761707, + "learning_rate": 4.058854484777497e-09, + "loss": 0.1475, + "step": 19492 + }, + { + "epoch": 0.99, + "grad_norm": 2.1355502059955587, + "learning_rate": 4.012070056484963e-09, + "loss": 0.1322, + "step": 19493 + }, + { + "epoch": 0.99, + "grad_norm": 1.0643535971160303, + "learning_rate": 3.965556769930379e-09, + "loss": 0.1466, + "step": 19494 + }, + { + "epoch": 0.99, + "grad_norm": 0.9244812085559133, + "learning_rate": 3.9193146263749595e-09, + "loss": 0.1604, + "step": 19495 + }, + { + "epoch": 0.99, + "grad_norm": 1.3720180064592766, + "learning_rate": 3.873343627073256e-09, + "loss": 0.1453, + "step": 19496 + }, + { + "epoch": 0.99, + "grad_norm": 1.1020529971092021, + "learning_rate": 3.827643773270939e-09, + "loss": 0.1655, + "step": 19497 + }, + { + "epoch": 0.99, + "grad_norm": 0.8882727079169191, + "learning_rate": 3.782215066208128e-09, + "loss": 0.1532, + "step": 19498 + }, + { + "epoch": 0.99, + "grad_norm": 0.8608176999058977, + "learning_rate": 3.73705750711717e-09, + "loss": 0.138, + "step": 19499 + }, + { + "epoch": 0.99, + "grad_norm": 1.6619832340491163, + "learning_rate": 3.692171097223751e-09, + "loss": 0.1516, + "step": 19500 + }, + { + "epoch": 0.99, + "grad_norm": 0.9864753391422965, + "learning_rate": 3.647555837744676e-09, + "loss": 0.1703, + "step": 19501 + }, + { + "epoch": 0.99, + "grad_norm": 0.8821308274352536, + "learning_rate": 3.603211729890088e-09, + "loss": 0.1549, + "step": 19502 + }, + { + "epoch": 0.99, + "grad_norm": 1.0563855911607007, + "learning_rate": 3.5591387748634687e-09, + "loss": 0.1603, + "step": 19503 + }, + { + "epoch": 0.99, + "grad_norm": 1.144958580490145, + "learning_rate": 3.5153369738583078e-09, + "loss": 0.156, + "step": 19504 + }, + { + "epoch": 0.99, + "grad_norm": 1.4642207318698972, + "learning_rate": 3.471806328065874e-09, + "loss": 0.1537, + "step": 19505 + }, + { + "epoch": 0.99, + "grad_norm": 2.2838464816610706, + "learning_rate": 3.428546838664115e-09, + "loss": 0.1542, + "step": 19506 + }, + { + "epoch": 0.99, + "grad_norm": 1.1749911154214572, + "learning_rate": 3.3855585068287564e-09, + "loss": 0.1595, + "step": 19507 + }, + { + "epoch": 0.99, + "grad_norm": 0.9491328852325978, + "learning_rate": 3.3428413337244224e-09, + "loss": 0.145, + "step": 19508 + }, + { + "epoch": 0.99, + "grad_norm": 1.0850935098083156, + "learning_rate": 3.3003953205101857e-09, + "loss": 0.1725, + "step": 19509 + }, + { + "epoch": 0.99, + "grad_norm": 1.13078152777645, + "learning_rate": 3.2582204683362372e-09, + "loss": 0.1457, + "step": 19510 + }, + { + "epoch": 0.99, + "grad_norm": 1.3915491422700175, + "learning_rate": 3.216316778348327e-09, + "loss": 0.1465, + "step": 19511 + }, + { + "epoch": 0.99, + "grad_norm": 1.3624287232659398, + "learning_rate": 3.1746842516833243e-09, + "loss": 0.1536, + "step": 19512 + }, + { + "epoch": 0.99, + "grad_norm": 0.9363607116738734, + "learning_rate": 3.1333228894692147e-09, + "loss": 0.1725, + "step": 19513 + }, + { + "epoch": 0.99, + "grad_norm": 1.1789056989836593, + "learning_rate": 3.092232692827324e-09, + "loss": 0.1567, + "step": 19514 + }, + { + "epoch": 0.99, + "grad_norm": 1.1716631678255858, + "learning_rate": 3.0514136628745363e-09, + "loss": 0.1741, + "step": 19515 + }, + { + "epoch": 0.99, + "grad_norm": 0.9295582498624071, + "learning_rate": 3.0108658007155235e-09, + "loss": 0.1647, + "step": 19516 + }, + { + "epoch": 0.99, + "grad_norm": 1.4339393414590325, + "learning_rate": 2.970589107452737e-09, + "loss": 0.1936, + "step": 19517 + }, + { + "epoch": 0.99, + "grad_norm": 1.183292046360521, + "learning_rate": 2.930583584176416e-09, + "loss": 0.1558, + "step": 19518 + }, + { + "epoch": 0.99, + "grad_norm": 0.7651834428055864, + "learning_rate": 2.890849231973469e-09, + "loss": 0.1538, + "step": 19519 + }, + { + "epoch": 0.99, + "grad_norm": 1.5131433513827075, + "learning_rate": 2.851386051919702e-09, + "loss": 0.1729, + "step": 19520 + }, + { + "epoch": 0.99, + "grad_norm": 1.6187138358686646, + "learning_rate": 2.8121940450875907e-09, + "loss": 0.1875, + "step": 19521 + }, + { + "epoch": 0.99, + "grad_norm": 1.12632087757575, + "learning_rate": 2.7732732125396177e-09, + "loss": 0.168, + "step": 19522 + }, + { + "epoch": 0.99, + "grad_norm": 1.022487523217451, + "learning_rate": 2.7346235553304955e-09, + "loss": 0.1642, + "step": 19523 + }, + { + "epoch": 0.99, + "grad_norm": 1.059501541651189, + "learning_rate": 2.696245074509385e-09, + "loss": 0.1501, + "step": 19524 + }, + { + "epoch": 0.99, + "grad_norm": 0.9766296923988161, + "learning_rate": 2.6581377711176747e-09, + "loss": 0.158, + "step": 19525 + }, + { + "epoch": 0.99, + "grad_norm": 1.0403942539814275, + "learning_rate": 2.620301646188983e-09, + "loss": 0.194, + "step": 19526 + }, + { + "epoch": 0.99, + "grad_norm": 1.1544990838019817, + "learning_rate": 2.5827367007491555e-09, + "loss": 0.1649, + "step": 19527 + }, + { + "epoch": 0.99, + "grad_norm": 1.7138085897003958, + "learning_rate": 2.545442935816267e-09, + "loss": 0.1536, + "step": 19528 + }, + { + "epoch": 0.99, + "grad_norm": 0.8899480974681093, + "learning_rate": 2.5084203524039507e-09, + "loss": 0.157, + "step": 19529 + }, + { + "epoch": 0.99, + "grad_norm": 0.9575638072030332, + "learning_rate": 2.4716689515147386e-09, + "loss": 0.1568, + "step": 19530 + }, + { + "epoch": 0.99, + "grad_norm": 2.5434583311996226, + "learning_rate": 2.4351887341467206e-09, + "loss": 0.148, + "step": 19531 + }, + { + "epoch": 0.99, + "grad_norm": 2.157022762265355, + "learning_rate": 2.3989797012879957e-09, + "loss": 0.1785, + "step": 19532 + }, + { + "epoch": 0.99, + "grad_norm": 1.2032012954869988, + "learning_rate": 2.363041853922221e-09, + "loss": 0.1566, + "step": 19533 + }, + { + "epoch": 0.99, + "grad_norm": 1.016008852523729, + "learning_rate": 2.327375193024173e-09, + "loss": 0.1645, + "step": 19534 + }, + { + "epoch": 0.99, + "grad_norm": 0.8736000709689397, + "learning_rate": 2.291979719559745e-09, + "loss": 0.1517, + "step": 19535 + }, + { + "epoch": 0.99, + "grad_norm": 1.8238877899880166, + "learning_rate": 2.25685543449039e-09, + "loss": 0.1519, + "step": 19536 + }, + { + "epoch": 0.99, + "grad_norm": 1.0281866407460118, + "learning_rate": 2.2220023387686805e-09, + "loss": 0.1596, + "step": 19537 + }, + { + "epoch": 0.99, + "grad_norm": 0.8196043357201952, + "learning_rate": 2.1874204333394157e-09, + "loss": 0.1506, + "step": 19538 + }, + { + "epoch": 0.99, + "grad_norm": 1.5190087216009494, + "learning_rate": 2.1531097191418438e-09, + "loss": 0.1812, + "step": 19539 + }, + { + "epoch": 0.99, + "grad_norm": 2.078921323523072, + "learning_rate": 2.1190701971052218e-09, + "loss": 0.1842, + "step": 19540 + }, + { + "epoch": 0.99, + "grad_norm": 1.16461521444874, + "learning_rate": 2.0853018681532557e-09, + "loss": 0.1488, + "step": 19541 + }, + { + "epoch": 0.99, + "grad_norm": 1.0488166753749382, + "learning_rate": 2.051804733202989e-09, + "loss": 0.1616, + "step": 19542 + }, + { + "epoch": 0.99, + "grad_norm": 0.9667513288660236, + "learning_rate": 2.018578793161474e-09, + "loss": 0.1601, + "step": 19543 + }, + { + "epoch": 0.99, + "grad_norm": 0.9142594167680658, + "learning_rate": 1.985624048931323e-09, + "loss": 0.1453, + "step": 19544 + }, + { + "epoch": 0.99, + "grad_norm": 0.9533579569326212, + "learning_rate": 1.952940501405154e-09, + "loss": 0.176, + "step": 19545 + }, + { + "epoch": 0.99, + "grad_norm": 0.8812100501930893, + "learning_rate": 1.9205281514700356e-09, + "loss": 0.1419, + "step": 19546 + }, + { + "epoch": 0.99, + "grad_norm": 2.502962961960015, + "learning_rate": 1.8883870000063753e-09, + "loss": 0.154, + "step": 19547 + }, + { + "epoch": 0.99, + "grad_norm": 1.0899422449191924, + "learning_rate": 1.856517047883477e-09, + "loss": 0.1636, + "step": 19548 + }, + { + "epoch": 0.99, + "grad_norm": 0.9187958817022984, + "learning_rate": 1.8249182959684253e-09, + "loss": 0.1584, + "step": 19549 + }, + { + "epoch": 0.99, + "grad_norm": 0.8635351997259035, + "learning_rate": 1.7935907451172019e-09, + "loss": 0.1413, + "step": 19550 + }, + { + "epoch": 0.99, + "grad_norm": 1.0210498380084243, + "learning_rate": 1.7625343961791275e-09, + "loss": 0.1537, + "step": 19551 + }, + { + "epoch": 0.99, + "grad_norm": 1.689591606566535, + "learning_rate": 1.7317492499968614e-09, + "loss": 0.165, + "step": 19552 + }, + { + "epoch": 0.99, + "grad_norm": 1.3730827056860522, + "learning_rate": 1.7012353074052912e-09, + "loss": 0.1652, + "step": 19553 + }, + { + "epoch": 0.99, + "grad_norm": 0.9159153423477189, + "learning_rate": 1.6709925692326435e-09, + "loss": 0.1768, + "step": 19554 + }, + { + "epoch": 0.99, + "grad_norm": 1.328847538504899, + "learning_rate": 1.6410210362993729e-09, + "loss": 0.1754, + "step": 19555 + }, + { + "epoch": 0.99, + "grad_norm": 0.9906761771767876, + "learning_rate": 1.6113207094181626e-09, + "loss": 0.1257, + "step": 19556 + }, + { + "epoch": 0.99, + "grad_norm": 0.8607195647789683, + "learning_rate": 1.5818915893939246e-09, + "loss": 0.152, + "step": 19557 + }, + { + "epoch": 0.99, + "grad_norm": 0.877998645264861, + "learning_rate": 1.5527336770260193e-09, + "loss": 0.1637, + "step": 19558 + }, + { + "epoch": 0.99, + "grad_norm": 1.148237424551445, + "learning_rate": 1.5238469731049254e-09, + "loss": 0.1579, + "step": 19559 + }, + { + "epoch": 0.99, + "grad_norm": 1.3199269131975926, + "learning_rate": 1.4952314784144606e-09, + "loss": 0.1764, + "step": 19560 + }, + { + "epoch": 0.99, + "grad_norm": 1.5449672703207997, + "learning_rate": 1.4668871937306706e-09, + "loss": 0.1767, + "step": 19561 + }, + { + "epoch": 0.99, + "grad_norm": 0.9551654351745247, + "learning_rate": 1.4388141198218297e-09, + "loss": 0.1352, + "step": 19562 + }, + { + "epoch": 0.99, + "grad_norm": 0.9266927361479481, + "learning_rate": 1.4110122574506612e-09, + "loss": 0.1586, + "step": 19563 + }, + { + "epoch": 0.99, + "grad_norm": 0.8555127548715358, + "learning_rate": 1.3834816073687862e-09, + "loss": 0.1689, + "step": 19564 + }, + { + "epoch": 0.99, + "grad_norm": 0.947450765781288, + "learning_rate": 1.3562221703267153e-09, + "loss": 0.1448, + "step": 19565 + }, + { + "epoch": 0.99, + "grad_norm": 1.5330077115508238, + "learning_rate": 1.3292339470605264e-09, + "loss": 0.168, + "step": 19566 + }, + { + "epoch": 1.0, + "grad_norm": 1.185404398834737, + "learning_rate": 1.302516938304077e-09, + "loss": 0.1527, + "step": 19567 + }, + { + "epoch": 1.0, + "grad_norm": 1.3458661533631802, + "learning_rate": 1.2760711447812324e-09, + "loss": 0.1749, + "step": 19568 + }, + { + "epoch": 1.0, + "grad_norm": 0.8806027444964389, + "learning_rate": 1.249896567210307e-09, + "loss": 0.1538, + "step": 19569 + }, + { + "epoch": 1.0, + "grad_norm": 1.3762344261508295, + "learning_rate": 1.2239932062996229e-09, + "loss": 0.1651, + "step": 19570 + }, + { + "epoch": 1.0, + "grad_norm": 1.3219725268163043, + "learning_rate": 1.198361062754172e-09, + "loss": 0.1637, + "step": 19571 + }, + { + "epoch": 1.0, + "grad_norm": 1.108314030942384, + "learning_rate": 1.1730001372667332e-09, + "loss": 0.1611, + "step": 19572 + }, + { + "epoch": 1.0, + "grad_norm": 1.1122036596028475, + "learning_rate": 1.1479104305267553e-09, + "loss": 0.1613, + "step": 19573 + }, + { + "epoch": 1.0, + "grad_norm": 0.960709597396094, + "learning_rate": 1.1230919432148046e-09, + "loss": 0.1858, + "step": 19574 + }, + { + "epoch": 1.0, + "grad_norm": 1.322649053633847, + "learning_rate": 1.0985446760036766e-09, + "loss": 0.1546, + "step": 19575 + }, + { + "epoch": 1.0, + "grad_norm": 1.0212454361300904, + "learning_rate": 1.074268629559505e-09, + "loss": 0.1615, + "step": 19576 + }, + { + "epoch": 1.0, + "grad_norm": 0.9147610824352448, + "learning_rate": 1.050263804539542e-09, + "loss": 0.1607, + "step": 19577 + }, + { + "epoch": 1.0, + "grad_norm": 1.155984862637736, + "learning_rate": 1.0265302015965984e-09, + "loss": 0.1655, + "step": 19578 + }, + { + "epoch": 1.0, + "grad_norm": 4.158209229203261, + "learning_rate": 1.0030678213746037e-09, + "loss": 0.1675, + "step": 19579 + }, + { + "epoch": 1.0, + "grad_norm": 1.0781213576452073, + "learning_rate": 9.798766645074953e-10, + "loss": 0.1765, + "step": 19580 + }, + { + "epoch": 1.0, + "grad_norm": 0.9225119895905177, + "learning_rate": 9.569567316269901e-10, + "loss": 0.162, + "step": 19581 + }, + { + "epoch": 1.0, + "grad_norm": 1.3798220622535513, + "learning_rate": 9.343080233537028e-10, + "loss": 0.1559, + "step": 19582 + }, + { + "epoch": 1.0, + "grad_norm": 1.5883719794381368, + "learning_rate": 9.119305403015865e-10, + "loss": 0.1681, + "step": 19583 + }, + { + "epoch": 1.0, + "grad_norm": 0.9920192181846645, + "learning_rate": 8.898242830779336e-10, + "loss": 0.1702, + "step": 19584 + }, + { + "epoch": 1.0, + "grad_norm": 1.011582731494466, + "learning_rate": 8.679892522833744e-10, + "loss": 0.152, + "step": 19585 + }, + { + "epoch": 1.0, + "grad_norm": 0.8377799966259223, + "learning_rate": 8.464254485096579e-10, + "loss": 0.1716, + "step": 19586 + }, + { + "epoch": 1.0, + "grad_norm": 1.0654245746696522, + "learning_rate": 8.251328723407615e-10, + "loss": 0.2018, + "step": 19587 + }, + { + "epoch": 1.0, + "grad_norm": 1.0281990884378034, + "learning_rate": 8.04111524354001e-10, + "loss": 0.1515, + "step": 19588 + }, + { + "epoch": 1.0, + "grad_norm": 1.0625756720875446, + "learning_rate": 7.833614051222515e-10, + "loss": 0.1688, + "step": 19589 + }, + { + "epoch": 1.0, + "grad_norm": 1.0201727001752259, + "learning_rate": 7.628825152050656e-10, + "loss": 0.1583, + "step": 19590 + }, + { + "epoch": 1.0, + "grad_norm": 0.9110095625867899, + "learning_rate": 7.426748551597751e-10, + "loss": 0.1504, + "step": 19591 + }, + { + "epoch": 1.0, + "grad_norm": 1.1998072018181751, + "learning_rate": 7.227384255348302e-10, + "loss": 0.1589, + "step": 19592 + }, + { + "epoch": 1.0, + "grad_norm": 1.8852849233106013, + "learning_rate": 7.030732268697993e-10, + "loss": 0.2041, + "step": 19593 + }, + { + "epoch": 1.0, + "grad_norm": 1.0241911357987625, + "learning_rate": 6.836792596986996e-10, + "loss": 0.1654, + "step": 19594 + }, + { + "epoch": 1.0, + "grad_norm": 1.08609554904953, + "learning_rate": 6.64556524547777e-10, + "loss": 0.1553, + "step": 19595 + }, + { + "epoch": 1.0, + "grad_norm": 1.56946727399011, + "learning_rate": 6.457050219355054e-10, + "loss": 0.1606, + "step": 19596 + }, + { + "epoch": 1.0, + "grad_norm": 1.5602719586550857, + "learning_rate": 6.271247523736978e-10, + "loss": 0.1621, + "step": 19597 + }, + { + "epoch": 1.0, + "grad_norm": 1.318659581015296, + "learning_rate": 6.088157163652853e-10, + "loss": 0.1669, + "step": 19598 + }, + { + "epoch": 1.0, + "grad_norm": 0.9424409032077522, + "learning_rate": 5.907779144076475e-10, + "loss": 0.1562, + "step": 19599 + }, + { + "epoch": 1.0, + "grad_norm": 0.9982593233791778, + "learning_rate": 5.730113469903931e-10, + "loss": 0.1598, + "step": 19600 + }, + { + "epoch": 1.0, + "grad_norm": 1.5222263838126218, + "learning_rate": 5.555160145942485e-10, + "loss": 0.1492, + "step": 19601 + }, + { + "epoch": 1.0, + "grad_norm": 1.0665216789776741, + "learning_rate": 5.38291917694389e-10, + "loss": 0.1641, + "step": 19602 + }, + { + "epoch": 1.0, + "grad_norm": 1.2836376729407852, + "learning_rate": 5.21339056759329e-10, + "loss": 0.1614, + "step": 19603 + }, + { + "epoch": 1.0, + "grad_norm": 0.8861752562480091, + "learning_rate": 5.046574322464803e-10, + "loss": 0.1605, + "step": 19604 + }, + { + "epoch": 1.0, + "grad_norm": 1.573271589681279, + "learning_rate": 4.882470446099241e-10, + "loss": 0.1373, + "step": 19605 + }, + { + "epoch": 1.0, + "grad_norm": 1.3650632783304832, + "learning_rate": 4.721078942948598e-10, + "loss": 0.1643, + "step": 19606 + }, + { + "epoch": 1.0, + "grad_norm": 0.9852357360213777, + "learning_rate": 4.562399817376051e-10, + "loss": 0.1739, + "step": 19607 + }, + { + "epoch": 1.0, + "grad_norm": 0.9335982121563716, + "learning_rate": 4.406433073711469e-10, + "loss": 0.1748, + "step": 19608 + }, + { + "epoch": 1.0, + "grad_norm": 1.2664477625014325, + "learning_rate": 4.253178716162598e-10, + "loss": 0.1724, + "step": 19609 + }, + { + "epoch": 1.0, + "grad_norm": 0.9898241612198758, + "learning_rate": 4.1026367488927745e-10, + "loss": 0.1871, + "step": 19610 + }, + { + "epoch": 1.0, + "grad_norm": 0.8668942232628565, + "learning_rate": 3.9548071759876185e-10, + "loss": 0.1578, + "step": 19611 + }, + { + "epoch": 1.0, + "grad_norm": 1.016572741939258, + "learning_rate": 3.809690001455035e-10, + "loss": 0.152, + "step": 19612 + }, + { + "epoch": 1.0, + "grad_norm": 1.0372104824857318, + "learning_rate": 3.667285229236317e-10, + "loss": 0.1711, + "step": 19613 + }, + { + "epoch": 1.0, + "grad_norm": 1.2856759711969599, + "learning_rate": 3.5275928631839375e-10, + "loss": 0.1505, + "step": 19614 + }, + { + "epoch": 1.0, + "grad_norm": 1.2258181466335094, + "learning_rate": 3.390612907094859e-10, + "loss": 0.1553, + "step": 19615 + }, + { + "epoch": 1.0, + "grad_norm": 0.9124728749818874, + "learning_rate": 3.256345364688329e-10, + "loss": 0.1467, + "step": 19616 + }, + { + "epoch": 1.0, + "grad_norm": 0.9079859042074966, + "learning_rate": 3.124790239594777e-10, + "loss": 0.1459, + "step": 19617 + }, + { + "epoch": 1.0, + "grad_norm": 0.8911178077678168, + "learning_rate": 2.995947535389121e-10, + "loss": 0.1631, + "step": 19618 + }, + { + "epoch": 1.0, + "grad_norm": 2.200090105682176, + "learning_rate": 2.8698172555685634e-10, + "loss": 0.1464, + "step": 19619 + }, + { + "epoch": 1.0, + "grad_norm": 0.945105130525938, + "learning_rate": 2.746399403552591e-10, + "loss": 0.1725, + "step": 19620 + }, + { + "epoch": 1.0, + "grad_norm": 1.1593818063618362, + "learning_rate": 2.6256939826940774e-10, + "loss": 0.1595, + "step": 19621 + }, + { + "epoch": 1.0, + "grad_norm": 1.080925000211112, + "learning_rate": 2.5077009962570784e-10, + "loss": 0.1645, + "step": 19622 + }, + { + "epoch": 1.0, + "grad_norm": 0.9285369238641832, + "learning_rate": 2.392420447450139e-10, + "loss": 0.1516, + "step": 19623 + }, + { + "epoch": 1.0, + "grad_norm": 1.3598322147140447, + "learning_rate": 2.279852339392985e-10, + "loss": 0.1464, + "step": 19624 + }, + { + "epoch": 1.0, + "grad_norm": 1.0434638990818221, + "learning_rate": 2.1699966751387303e-10, + "loss": 0.1612, + "step": 19625 + }, + { + "epoch": 1.0, + "grad_norm": 0.9800285397183612, + "learning_rate": 2.0628534576738746e-10, + "loss": 0.1652, + "step": 19626 + }, + { + "epoch": 1.0, + "grad_norm": 1.5006060025917467, + "learning_rate": 1.9584226898961e-10, + "loss": 0.1796, + "step": 19627 + }, + { + "epoch": 1.0, + "grad_norm": 1.118456926792813, + "learning_rate": 1.85670437465868e-10, + "loss": 0.1532, + "step": 19628 + }, + { + "epoch": 1.0, + "grad_norm": 1.3154930824584454, + "learning_rate": 1.757698514692763e-10, + "loss": 0.1737, + "step": 19629 + }, + { + "epoch": 1.0, + "grad_norm": 1.2594644655238652, + "learning_rate": 1.6614051127072929e-10, + "loss": 0.1776, + "step": 19630 + }, + { + "epoch": 1.0, + "grad_norm": 1.2309343752309614, + "learning_rate": 1.567824171300192e-10, + "loss": 0.1664, + "step": 19631 + }, + { + "epoch": 1.0, + "grad_norm": 1.4860321980415303, + "learning_rate": 1.4769556930138707e-10, + "loss": 0.1441, + "step": 19632 + }, + { + "epoch": 1.0, + "grad_norm": 0.9653230918868438, + "learning_rate": 1.3887996803130242e-10, + "loss": 0.1638, + "step": 19633 + }, + { + "epoch": 1.0, + "grad_norm": 2.5851984809016977, + "learning_rate": 1.3033561355846324e-10, + "loss": 0.1601, + "step": 19634 + }, + { + "epoch": 1.0, + "grad_norm": 1.1925293052215677, + "learning_rate": 1.2206250611490612e-10, + "loss": 0.1721, + "step": 19635 + }, + { + "epoch": 1.0, + "grad_norm": 1.0617452077781544, + "learning_rate": 1.1406064592600629e-10, + "loss": 0.1635, + "step": 19636 + }, + { + "epoch": 1.0, + "grad_norm": 1.0323618468439235, + "learning_rate": 1.063300332082573e-10, + "loss": 0.1347, + "step": 19637 + }, + { + "epoch": 1.0, + "grad_norm": 1.2798116231535444, + "learning_rate": 9.887066817038105e-11, + "loss": 0.1562, + "step": 19638 + }, + { + "epoch": 1.0, + "grad_norm": 0.9078167713684868, + "learning_rate": 9.168255101554835e-11, + "loss": 0.148, + "step": 19639 + }, + { + "epoch": 1.0, + "grad_norm": 1.9109382670862405, + "learning_rate": 8.476568193804824e-11, + "loss": 0.1563, + "step": 19640 + }, + { + "epoch": 1.0, + "grad_norm": 0.9894577234866273, + "learning_rate": 7.812006112661863e-11, + "loss": 0.1612, + "step": 19641 + }, + { + "epoch": 1.0, + "grad_norm": 1.0206860112046388, + "learning_rate": 7.174568876111565e-11, + "loss": 0.1626, + "step": 19642 + }, + { + "epoch": 1.0, + "grad_norm": 1.3743462768995467, + "learning_rate": 6.56425650147341e-11, + "loss": 0.1498, + "step": 19643 + }, + { + "epoch": 1.0, + "grad_norm": 0.8318224179362256, + "learning_rate": 5.981069005178697e-11, + "loss": 0.1659, + "step": 19644 + }, + { + "epoch": 1.0, + "grad_norm": 1.247774085231404, + "learning_rate": 5.425006403214639e-11, + "loss": 0.1671, + "step": 19645 + }, + { + "epoch": 1.0, + "grad_norm": 1.2683171725518405, + "learning_rate": 4.8960687104582235e-11, + "loss": 0.17, + "step": 19646 + }, + { + "epoch": 1.0, + "grad_norm": 2.001840363433565, + "learning_rate": 4.3942559414533734e-11, + "loss": 0.1918, + "step": 19647 + }, + { + "epoch": 1.0, + "grad_norm": 1.2938793167991183, + "learning_rate": 3.919568109744809e-11, + "loss": 0.181, + "step": 19648 + }, + { + "epoch": 1.0, + "grad_norm": 1.0800015855464375, + "learning_rate": 3.472005228211117e-11, + "loss": 0.1809, + "step": 19649 + }, + { + "epoch": 1.0, + "grad_norm": 1.0588591780403915, + "learning_rate": 3.051567308953729e-11, + "loss": 0.1805, + "step": 19650 + }, + { + "epoch": 1.0, + "grad_norm": 1.2033718217226848, + "learning_rate": 2.6582543634079416e-11, + "loss": 0.1423, + "step": 19651 + }, + { + "epoch": 1.0, + "grad_norm": 1.4721066909443041, + "learning_rate": 2.292066402120874e-11, + "loss": 0.1634, + "step": 19652 + }, + { + "epoch": 1.0, + "grad_norm": 6.723042537451307, + "learning_rate": 1.9530034353065775e-11, + "loss": 0.1664, + "step": 19653 + }, + { + "epoch": 1.0, + "grad_norm": 0.9442132049513063, + "learning_rate": 1.6410654719578588e-11, + "loss": 0.1605, + "step": 19654 + }, + { + "epoch": 1.0, + "grad_norm": 0.9540795193476649, + "learning_rate": 1.3562525205124134e-11, + "loss": 0.1505, + "step": 19655 + }, + { + "epoch": 1.0, + "grad_norm": 1.1669133271587695, + "learning_rate": 1.0985645887418017e-11, + "loss": 0.1659, + "step": 19656 + }, + { + "epoch": 1.0, + "grad_norm": 1.0352443925181587, + "learning_rate": 8.680016837514516e-12, + "loss": 0.1578, + "step": 19657 + }, + { + "epoch": 1.0, + "grad_norm": 1.2268663110505775, + "learning_rate": 6.645638116475894e-12, + "loss": 0.1621, + "step": 19658 + }, + { + "epoch": 1.0, + "grad_norm": 1.1273871387987642, + "learning_rate": 4.882509779813305e-12, + "loss": 0.1569, + "step": 19659 + }, + { + "epoch": 1.0, + "grad_norm": 1.3269521982201904, + "learning_rate": 3.39063187637656e-12, + "loss": 0.171, + "step": 19660 + }, + { + "epoch": 1.0, + "grad_norm": 1.2765834298934498, + "learning_rate": 2.1700044450234657e-12, + "loss": 0.1679, + "step": 19661 + }, + { + "epoch": 1.0, + "grad_norm": 0.9155996421638892, + "learning_rate": 1.2206275190607132e-12, + "loss": 0.1621, + "step": 19662 + }, + { + "epoch": 1.0, + "grad_norm": 0.8992063234029176, + "learning_rate": 5.425011262438773e-13, + "loss": 0.137, + "step": 19663 + }, + { + "epoch": 1.0, + "grad_norm": 1.401467580475726, + "learning_rate": 1.3562528211608085e-13, + "loss": 0.1682, + "step": 19664 + }, + { + "epoch": 1.0, + "grad_norm": 1.0157853651703213, + "learning_rate": 0.0, + "loss": 0.1681, + "step": 19665 + }, + { + "epoch": 1.0, + "step": 19665, + "total_flos": 4376372611039232.0, + "train_loss": 0.19586870987702348, + "train_runtime": 255341.6294, + "train_samples_per_second": 19.716, + "train_steps_per_second": 0.077 + } + ], + "logging_steps": 1.0, + "max_steps": 19665, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "total_flos": 4376372611039232.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}